From a5f919f15e6bd74dbaac53f877ec02ee33990d43 Mon Sep 17 00:00:00 2001 From: malteos Date: Mon, 29 Jul 2024 15:39:37 +0200 Subject: [PATCH] datatrove integration docs added --- docs/integration-with-other-frameworks.md | 29 +++++++++++++++++++++++ mkdocs.yml | 1 + 2 files changed, 30 insertions(+) create mode 100644 docs/integration-with-other-frameworks.md diff --git a/docs/integration-with-other-frameworks.md b/docs/integration-with-other-frameworks.md new file mode 100644 index 0000000..bebe97d --- /dev/null +++ b/docs/integration-with-other-frameworks.md @@ -0,0 +1,29 @@ +# Integration with other frameworks + +LLM-Datasets can be used in combination with our own processing pipelines or integrated into other frameworks, for example with [Hugging Face's DataTrove](https://github.com/huggingface/datatrove). + +## DataTrove integration + +Hugging Face's DataTrove is a library to process, filter and deduplicate text data at a very large scale. +All datasets implemented within LLM-Datasets can be processed with DataTrove. +To do so, you can use the `LLMDatasetsDatatroveReader` class as input for any DataTrove pipeline. 
+The `LLMDatasetsDatatroveReader` class takes a list of dataset ID(s) and/or [config files](config-files.md) as arguments, as shown in the example below: + +```python +from datatrove.pipeline.filters import SamplerFilter +from datatrove.pipeline.writers import JsonlWriter + +from llm_datasets.datatrove_reader import LLMDatasetsDatatroveReader +from llm_datasets.utils.config import Config, get_config_from_paths + +llmds_config: Config = get_config_from_paths(["path/to/my/config.yaml"]) + +pipeline = [ + LLMDatasetsDatatroveReader("legal_mc4_en", llmds_config), + SamplerFilter(rate=0.5), + JsonlWriter( + output_folder="/my/output/path" + ) +] +``` + diff --git a/mkdocs.yml b/mkdocs.yml index d3ceab0..cccd856 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -16,6 +16,7 @@ nav: - "Extract text data": extract-text-data.md - "Adding your own data": add-your-own-data.md - "Compose training and validation dataset": compose-train-validation-data.md + - "Integration with other frameworks": integration-with-other-frameworks.md - "Related work": related-work.md - "API reference": - "BaseDataset": api/base_dataset.md