From fb43614771ad37b70e3a62a926192a839e885561 Mon Sep 17 00:00:00 2001 From: Daniel Smilkov Date: Tue, 7 Nov 2023 09:35:06 -0500 Subject: [PATCH] Parquet source: When `pseudo_shuffle=True`, limit the number of shards we read from (#827) --- .vscode/extensions.json | 3 +- .vscode/settings.json | 1 - docs/datasets/dataset_load.md | 12 +++-- lilac/sources/parquet_source.py | 22 +++++--- lilac/sources/parquet_source_test.py | 16 +++--- poetry.lock | 78 ++++++++++++---------------- 6 files changed, 63 insertions(+), 69 deletions(-) diff --git a/.vscode/extensions.json b/.vscode/extensions.json index 20bf27899..aa94e7ab2 100644 --- a/.vscode/extensions.json +++ b/.vscode/extensions.json @@ -18,7 +18,8 @@ "ZixuanChen.vitest-explorer", "ryanluker.vscode-coverage-gutters", "bradlc.vscode-tailwindcss", - "svelte.svelte-vscode" + "svelte.svelte-vscode", + "ms-python.mypy-type-checker" ], // List of extensions recommended by VS Code that should not be recommended for users of this workspace. "unwantedRecommendations": [] diff --git a/.vscode/settings.json b/.vscode/settings.json index 3cd340bde..bba71c1f3 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -61,7 +61,6 @@ "eslint.workingDirectories": ["auto"], "eslint.validate": ["typescript", "svelte"], "python.envFile": "${workspaceFolder}/.venv", - "python.linting.mypyEnabled": true, "python.defaultInterpreterPath": "${workspaceFolder}/.venv/bin/python", "git.enableSmartCommit": true, "git.confirmSync": false, diff --git a/docs/datasets/dataset_load.md b/docs/datasets/dataset_load.md index 8c5b4344e..e741d1580 100644 --- a/docs/datasets/dataset_load.md +++ b/docs/datasets/dataset_load.md @@ -117,17 +117,19 @@ use a glob pattern to load multiple files. The `ParquetSource` takes a few optional arguments related to sampling: - `sample_size`, the number of rows to sample. -- `approximate_shuffle`, defaulting to `False`. When `False`, we take an entire pass over the - dataset with reservoir sampling. When `True`, we read a fraction of rows from the start of each - shard, to avoid shard skew, without doing a full pass over the entire dataset. This is useful when - your dataset is very large and consists of a large number of shards. +- `pseudo_shuffle`, defaulting to `False`. When `False`, we take an entire pass over the dataset + with reservoir sampling. When `True`, we read a fraction of rows from the start of each shard, to + avoid shard skew, without doing a full pass over the entire dataset. This is useful when your + dataset is very large and consists of a large number of shards. +- `pseudo_shuffle_num_shards`, the maximum number of shards to read from when `pseudo_shuffle` is + `True`. Defaults to `10`. - `seed`, the random seed to use for sampling. ```python source = ll.ParquetSource( filepaths=['s3://lilac-public-data/test-*.parquet'], sample_size=100, - approximate_shuffle=True) + pseudo_shuffle=True) config = ll.DatasetConfig(namespace='local', name='parquet-test', source=source) dataset = ll.create_dataset(config) ``` diff --git a/lilac/sources/parquet_source.py b/lilac/sources/parquet_source.py index 2979decd4..0e1a7ee4d 100644 --- a/lilac/sources/parquet_source.py +++ b/lilac/sources/parquet_source.py @@ -34,11 +34,14 @@ class ParquetSource(Source): sample_size: Optional[int] = Field( title='Sample size', description='Number of rows to sample from the dataset', default=None ) - approximate_shuffle: bool = Field( + pseudo_shuffle: bool = Field( default=False, description='If true, the reader will read a fraction of rows from each shard, ' 'avoiding a pass over the entire dataset.', ) + pseudo_shuffle_num_shards: int = Field( + default=10, description='Number of shards to sample from when using pseudo shuffle.' + ) _source_schema: Optional[SourceSchema] = None _readers: list[pa.RecordBatchReader] = [] @@ -60,23 +63,26 @@ def validate_sample_size(cls, sample_size: int) -> int: raise ValueError('sample_size must be greater than 0.') return sample_size - @field_validator('approximate_shuffle') + @field_validator('pseudo_shuffle') @classmethod - def validate_approximate_shuffle(cls, approximate_shuffle: bool, info: ValidationInfo) -> bool: + def validate_pseudo_shuffle(cls, pseudo_shuffle: bool, info: ValidationInfo) -> bool: """Validate shuffle before sampling.""" - if approximate_shuffle and not info.data['sample_size']: - raise ValueError('`approximate_shuffle` requires `sample_size` to be set.') - return approximate_shuffle + if pseudo_shuffle and not info.data['sample_size']: + raise ValueError('`pseudo_shuffle` requires `sample_size` to be set.') + return pseudo_shuffle def _setup_sampling(self, duckdb_paths: list[str]) -> Schema: assert self._con, 'setup() must be called first.' - if self.approximate_shuffle: - assert self.sample_size, 'approximate_shuffle requires sample_size to be set.' + if self.pseudo_shuffle: + assert self.sample_size, 'pseudo_shuffle requires sample_size to be set.' # Find each individual file. glob_rows: list[tuple[str]] = self._con.execute( f'SELECT * FROM GLOB({duckdb_paths})' ).fetchall() duckdb_files: list[str] = list(set([row[0] for row in glob_rows])) + # Sub-sample shards so we don't open too many files. + num_shards = min(self.pseudo_shuffle_num_shards, len(duckdb_files)) + duckdb_files = random.sample(duckdb_files, num_shards) batch_size = max(1, min(self.sample_size // len(duckdb_files), ROWS_PER_BATCH_READ)) for duckdb_file in duckdb_files: # Since we are not fetching the entire results immediately, we need a seperate cursor diff --git a/lilac/sources/parquet_source_test.py b/lilac/sources/parquet_source_test.py index 780c58fe7..aa388c319 100644 --- a/lilac/sources/parquet_source_test.py +++ b/lilac/sources/parquet_source_test.py @@ -48,7 +48,7 @@ def test_single_shard_with_sampling(tmp_path: pathlib.Path) -> None: assert len(items) == min(sample_size, len(source_items)) -def test_single_shard_approximate_shuffle(tmp_path: pathlib.Path) -> None: +def test_single_shard_pseudo_shuffle(tmp_path: pathlib.Path) -> None: source_items = [{'name': 'a', 'age': 1}, {'name': 'b', 'age': 2}, {'name': 'c', 'age': 3}] table = pa.Table.from_pylist(source_items) @@ -57,7 +57,7 @@ def test_single_shard_approximate_shuffle(tmp_path: pathlib.Path) -> None: # Test sampling with different sample sizes, including sample size > num_items. for sample_size in range(1, 5): - source = ParquetSource(filepaths=[out_file], sample_size=sample_size, approximate_shuffle=True) + source = ParquetSource(filepaths=[out_file], sample_size=sample_size, pseudo_shuffle=True) source.setup() items = list(source.process()) assert len(items) == min(sample_size, len(source_items)) @@ -103,7 +103,7 @@ def test_multi_shard_approx_shuffle(tmp_path: pathlib.Path) -> None: for sample_size in range(1, 5): source = ParquetSource( filepaths=[str(tmp_path / 'test-*.parquet')], - approximate_shuffle=True, + pseudo_shuffle=True, sample_size=sample_size, ) source.setup() @@ -111,7 +111,7 @@ def test_multi_shard_approx_shuffle(tmp_path: pathlib.Path) -> None: assert len(items) == min(sample_size, len(source_items)) -def test_uniform_shards_approximate_shuffle(tmp_path: pathlib.Path) -> None: +def test_uniform_shards_pseudo_shuffle(tmp_path: pathlib.Path) -> None: source_items = [{'index': i} for i in range(100)] for i, chunk in enumerate(chunks(source_items, 10)): table = pa.Table.from_pylist(chunk) @@ -119,14 +119,14 @@ def test_uniform_shards_approximate_shuffle(tmp_path: pathlib.Path) -> None: pq.write_table(table, out_file) source = ParquetSource( - filepaths=[str(tmp_path / 'test-*.parquet')], approximate_shuffle=True, sample_size=20 + filepaths=[str(tmp_path / 'test-*.parquet')], pseudo_shuffle=True, sample_size=20 ) source.setup() items = list(source.process()) assert len(items) == 20 -def test_nonuniform_shards_approximate_shuffle(tmp_path: pathlib.Path) -> None: +def test_nonuniform_shards_pseudo_shuffle(tmp_path: pathlib.Path) -> None: source_items = [{'index': i} for i in range(100)] shard_sizes = [49, 1, 40, 10] for i, shard_size in enumerate(shard_sizes): @@ -137,7 +137,7 @@ def test_nonuniform_shards_approximate_shuffle(tmp_path: pathlib.Path) -> None: pq.write_table(table, out_file) source = ParquetSource( - filepaths=[str(tmp_path / 'test-*.parquet')], approximate_shuffle=True, sample_size=20 + filepaths=[str(tmp_path / 'test-*.parquet')], pseudo_shuffle=True, sample_size=20 ) source.setup() items = list(source.process()) @@ -165,7 +165,7 @@ def test_approx_shuffle_with_seed(tmp_path: pathlib.Path) -> None: pq.write_table(table, out_file) source = ParquetSource( - filepaths=[str(tmp_path / 'test-*.parquet')], approximate_shuffle=True, sample_size=20, seed=42 + filepaths=[str(tmp_path / 'test-*.parquet')], pseudo_shuffle=True, sample_size=20, seed=42 ) source.setup() items = list(source.process()) diff --git a/poetry.lock b/poetry.lock index 59c6aea34..be249d815 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. [[package]] name = "aiohttp" @@ -352,7 +352,7 @@ css = ["tinycss2 (>=1.1.0,<1.2)"] name = "blis" version = "0.7.11" description = "The Blis BLAS-like linear algebra library, as a self-contained C-extension." -optional = true +optional = false python-versions = "*" files = [ {file = "blis-0.7.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:cd5fba34c5775e4c440d80e4dea8acb40e2d3855b546e07c4e21fad8f972404c"}, @@ -429,7 +429,7 @@ files = [ name = "catalogue" version = "2.0.10" description = "Super lightweight function registries for your library" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "catalogue-2.0.10-py3-none-any.whl", hash = "sha256:58c2de0020aa90f4a2da7dfad161bf7b3b054c86a5f09fcedc0b2b740c109a9f"}, @@ -683,7 +683,7 @@ typing = ["mypy (>=0.990)"] name = "confection" version = "0.1.3" description = "The sweetest config system for Python" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "confection-0.1.3-py3-none-any.whl", hash = "sha256:58b125c9bc6786f32e37fe4d98bc3a03e5f509a4b9de02541b99c559f2026092"}, @@ -954,7 +954,7 @@ tests = ["pytest", "pytest-cov", "pytest-xdist"] name = "cymem" version = "2.0.8" description = "Manage calls to calloc/free through Cython" -optional = true +optional = false python-versions = "*" files = [ {file = "cymem-2.0.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:77b5d3a73c41a394efd5913ab7e48512054cd2dabb9582d489535456641c7666"}, @@ -1753,11 +1753,11 @@ files = [ google-auth = ">=2.14.1,<3.0.dev0" googleapis-common-protos = ">=1.56.2,<2.0.dev0" grpcio = [ - {version = ">=1.33.2,<2.0dev", optional = true, markers = "extra == \"grpc\""}, + {version = ">=1.33.2,<2.0dev", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""}, {version = ">=1.49.1,<2.0dev", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}, ] grpcio-status = [ - {version = ">=1.33.2,<2.0.dev0", optional = true, markers = "extra == \"grpc\""}, + {version = ">=1.33.2,<2.0.dev0", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""}, {version = ">=1.49.1,<2.0.dev0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}, ] protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<5.0.0.dev0" @@ -2148,7 +2148,7 @@ files = [ {file = "greenlet-3.0.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0b72b802496cccbd9b31acea72b6f87e7771ccfd7f7927437d592e5c92ed703c"}, {file = "greenlet-3.0.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:527cd90ba3d8d7ae7dceb06fda619895768a46a1b4e423bdb24c1969823b8362"}, {file = "greenlet-3.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:37f60b3a42d8b5499be910d1267b24355c495064f271cfe74bf28b17b099133c"}, - {file = "greenlet-3.0.0-cp311-universal2-macosx_10_9_universal2.whl", hash = "sha256:c3692ecf3fe754c8c0f2c95ff19626584459eab110eaab66413b1e7425cd84e9"}, + {file = "greenlet-3.0.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:1482fba7fbed96ea7842b5a7fc11d61727e8be75a077e603e8ab49d24e234383"}, {file = "greenlet-3.0.0-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:be557119bf467d37a8099d91fbf11b2de5eb1fd5fc5b91598407574848dc910f"}, {file = "greenlet-3.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:73b2f1922a39d5d59cc0e597987300df3396b148a9bd10b76a058a2f2772fc04"}, {file = "greenlet-3.0.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d1e22c22f7826096ad503e9bb681b05b8c1f5a8138469b255eb91f26a76634f2"}, @@ -2158,7 +2158,6 @@ files = [ {file = "greenlet-3.0.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:952256c2bc5b4ee8df8dfc54fc4de330970bf5d79253c863fb5e6761f00dda35"}, {file = "greenlet-3.0.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:269d06fa0f9624455ce08ae0179430eea61085e3cf6457f05982b37fd2cefe17"}, {file = "greenlet-3.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:9adbd8ecf097e34ada8efde9b6fec4dd2a903b1e98037adf72d12993a1c80b51"}, - {file = "greenlet-3.0.0-cp312-universal2-macosx_10_9_universal2.whl", hash = "sha256:553d6fb2324e7f4f0899e5ad2c427a4579ed4873f42124beba763f16032959af"}, {file = "greenlet-3.0.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c6b5ce7f40f0e2f8b88c28e6691ca6806814157ff05e794cdd161be928550f4c"}, {file = "greenlet-3.0.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ecf94aa539e97a8411b5ea52fc6ccd8371be9550c4041011a091eb8b3ca1d810"}, {file = "greenlet-3.0.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:80dcd3c938cbcac986c5c92779db8e8ce51a89a849c135172c88ecbdc8c056b7"}, @@ -3133,7 +3132,7 @@ text-helpers = ["chardet (>=5.1.0,<6.0.0)"] name = "langcodes" version = "3.3.0" description = "Tools for labeling human languages with IETF language tags" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "langcodes-3.3.0-py3-none-any.whl", hash = "sha256:4d89fc9acb6e9c8fdef70bcdf376113a3db09b67285d9e1d534de6d8818e7e69"}, @@ -3652,7 +3651,7 @@ dill = ">=0.3.7" name = "murmurhash" version = "1.0.10" description = "Cython bindings for MurmurHash" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "murmurhash-1.0.10-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3e90eef568adca5e17a91f96975e9a782ace3a617bbb3f8c8c2d917096e9bfeb"}, @@ -4379,7 +4378,7 @@ complete = ["blosc", "numpy (>=1.9.0)", "pandas (>=0.19.0)", "pyzmq"] name = "pathy" version = "0.10.2" description = "pathlib.Path subclasses for local and cloud bucket storage" -optional = true +optional = false python-versions = ">= 3.6" files = [ {file = "pathy-0.10.2-py3-none-any.whl", hash = "sha256:681bc98dbff28e7de3e50efa8246910f727e8ac254c4318c47ce341f7c1ce21d"}, @@ -4535,7 +4534,7 @@ testing = ["pytest", "pytest-benchmark"] name = "preshed" version = "3.0.9" description = "Cython hash table that trusts the keys are pre-hashed" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "preshed-3.0.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4f96ef4caf9847b2bb9868574dcbe2496f974e41c2b83d6621c24fb4c3fc57e3"}, @@ -5627,9 +5626,6 @@ files = [ {file = "safetensors-0.3.3-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:17f41344d9a075f2f21b289a49a62e98baff54b5754240ba896063bce31626bf"}, {file = "safetensors-0.3.3-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:f1045f798e1a16a6ced98d6a42ec72936d367a2eec81dc5fade6ed54638cd7d2"}, {file = "safetensors-0.3.3-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:eaf0e4bc91da13f21ac846a39429eb3f3b7ed06295a32321fa3eb1a59b5c70f3"}, - {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25149180d4dc8ca48bac2ac3852a9424b466e36336a39659b35b21b2116f96fc"}, - {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c9e943bf78c39de8865398a71818315e7d5d1af93c7b30d4da3fc852e62ad9bc"}, - {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cccfcac04a010354e87c7a2fe16a1ff004fc4f6e7ef8efc966ed30122ce00bc7"}, {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a07121f427e646a50d18c1be0fa1a2cbf6398624c31149cd7e6b35486d72189e"}, {file = "safetensors-0.3.3-cp310-cp310-win32.whl", hash = "sha256:a85e29cbfddfea86453cc0f4889b4bcc6b9c155be9a60e27be479a34e199e7ef"}, {file = "safetensors-0.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:e13adad4a3e591378f71068d14e92343e626cf698ff805f61cdb946e684a218e"}, @@ -5638,9 +5634,6 @@ files = [ {file = "safetensors-0.3.3-cp311-cp311-macosx_12_0_universal2.whl", hash = "sha256:f84a74cbe9859b28e3d6d7715ac1dd3097bebf8d772694098f6d42435245860c"}, {file = "safetensors-0.3.3-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:10d637423d98ab2e6a4ad96abf4534eb26fcaf8ca3115623e64c00759374e90d"}, {file = "safetensors-0.3.3-cp311-cp311-macosx_13_0_universal2.whl", hash = "sha256:3b46f5de8b44084aff2e480874c550c399c730c84b2e8ad1bddb062c94aa14e9"}, - {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e76da691a82dfaf752854fa6d17c8eba0c8466370c5ad8cf1bfdf832d3c7ee17"}, - {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4e342fd54e66aa9512dd13e410f791e47aa4feeb5f4c9a20882c72f3d272f29"}, - {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:178fd30b5dc73bce14a39187d948cedd0e5698e2f055b7ea16b5a96c9b17438e"}, {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e8fdf7407dba44587ed5e79d5de3533d242648e1f2041760b21474bd5ea5c8c"}, {file = "safetensors-0.3.3-cp311-cp311-win32.whl", hash = "sha256:7d3b744cee8d7a46ffa68db1a2ff1a1a432488e3f7a5a97856fe69e22139d50c"}, {file = "safetensors-0.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f579877d30feec9b6ba409d05fa174633a4fc095675a4a82971d831a8bb60b97"}, @@ -5648,9 +5641,6 @@ files = [ {file = "safetensors-0.3.3-cp37-cp37m-macosx_11_0_x86_64.whl", hash = "sha256:41adb1d39e8aad04b16879e3e0cbcb849315999fad73bc992091a01e379cb058"}, {file = "safetensors-0.3.3-cp37-cp37m-macosx_12_0_x86_64.whl", hash = "sha256:0f2b404250b3b877b11d34afcc30d80e7035714a1116a3df56acaca6b6c00096"}, {file = "safetensors-0.3.3-cp37-cp37m-macosx_13_0_x86_64.whl", hash = "sha256:b43956ef20e9f4f2e648818a9e7b3499edd6b753a0f5526d4f6a6826fbee8446"}, - {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d61a99b34169981f088ccfbb2c91170843efc869a0a0532f422db7211bf4f474"}, - {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c0008aab36cd20e9a051a68563c6f80d40f238c2611811d7faa5a18bf3fd3984"}, - {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:93d54166072b143084fdcd214a080a088050c1bb1651016b55942701b31334e4"}, {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c32ee08f61cea56a5d62bbf94af95df6040c8ab574afffaeb7b44ae5da1e9e3"}, {file = "safetensors-0.3.3-cp37-cp37m-win32.whl", hash = "sha256:351600f367badd59f7bfe86d317bb768dd8c59c1561c6fac43cafbd9c1af7827"}, {file = "safetensors-0.3.3-cp37-cp37m-win_amd64.whl", hash = "sha256:034717e297849dae1af0a7027a14b8647bd2e272c24106dced64d83e10d468d1"}, @@ -5660,9 +5650,6 @@ files = [ {file = "safetensors-0.3.3-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:69ccee8d05f55cdf76f7e6c87d2bdfb648c16778ef8acfd2ecc495e273e9233e"}, {file = "safetensors-0.3.3-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:c08a9a4b7a4ca389232fa8d097aebc20bbd4f61e477abc7065b5c18b8202dede"}, {file = "safetensors-0.3.3-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:a002868d2e3f49bbe81bee2655a411c24fa1f8e68b703dec6629cb989d6ae42e"}, - {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3bd2704cb41faa44d3ec23e8b97330346da0395aec87f8eaf9c9e2c086cdbf13"}, - {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4b2951bf3f0ad63df5e6a95263652bd6c194a6eb36fd4f2d29421cd63424c883"}, - {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:07114cec116253ca2e7230fdea30acf76828f21614afd596d7b5438a2f719bd8"}, {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ab43aeeb9eadbb6b460df3568a662e6f1911ecc39387f8752afcb6a7d96c087"}, {file = "safetensors-0.3.3-cp38-cp38-win32.whl", hash = "sha256:f2f59fce31dd3429daca7269a6b06f65e6547a0c248f5116976c3f1e9b73f251"}, {file = "safetensors-0.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:c31ca0d8610f57799925bf08616856b39518ab772c65093ef1516762e796fde4"}, @@ -5672,9 +5659,6 @@ files = [ {file = "safetensors-0.3.3-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:12b83f1986cd16ea0454c636c37b11e819d60dd952c26978310a0835133480b7"}, {file = "safetensors-0.3.3-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:f439175c827c2f1bbd54df42789c5204a10983a30bc4242bc7deaf854a24f3f0"}, {file = "safetensors-0.3.3-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:0085be33b8cbcb13079b3a8e131656e05b0bc5e6970530d4c24150f7afd76d70"}, - {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e3ec70c87b1e910769034206ad5efc051069b105aac1687f6edcd02526767f4"}, - {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f490132383e5e490e710608f4acffcb98ed37f91b885c7217d3f9f10aaff9048"}, - {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:79d1b6c7ed5596baf79c80fbce5198c3cdcc521ae6a157699f427aba1a90082d"}, {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad3cc8006e7a86ee7c88bd2813ec59cd7cc75b03e6fa4af89b9c7b235b438d68"}, {file = "safetensors-0.3.3-cp39-cp39-win32.whl", hash = "sha256:ab29f54c6b8c301ca05fa014728996bd83aac6e21528f893aaf8945c71f42b6d"}, {file = "safetensors-0.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:0fa82004eae1a71e2aa29843ef99de9350e459a0fc2f65fc6ee0da9690933d2d"}, @@ -5711,11 +5695,6 @@ files = [ {file = "scikit_learn-1.3.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f66eddfda9d45dd6cadcd706b65669ce1df84b8549875691b1f403730bdef217"}, {file = "scikit_learn-1.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6448c37741145b241eeac617028ba6ec2119e1339b1385c9720dae31367f2be"}, {file = "scikit_learn-1.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:c413c2c850241998168bbb3bd1bb59ff03b1195a53864f0b80ab092071af6028"}, - {file = "scikit_learn-1.3.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ef540e09873e31569bc8b02c8a9f745ee04d8e1263255a15c9969f6f5caa627f"}, - {file = "scikit_learn-1.3.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:9147a3a4df4d401e618713880be023e36109c85d8569b3bf5377e6cd3fecdeac"}, - {file = "scikit_learn-1.3.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2cd3634695ad192bf71645702b3df498bd1e246fc2d529effdb45a06ab028b4"}, - {file = "scikit_learn-1.3.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c275a06c5190c5ce00af0acbb61c06374087949f643ef32d355ece12c4db043"}, - {file = "scikit_learn-1.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:0e1aa8f206d0de814b81b41d60c1ce31f7f2c7354597af38fae46d9c47c45122"}, {file = "scikit_learn-1.3.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:52b77cc08bd555969ec5150788ed50276f5ef83abb72e6f469c5b91a0009bbca"}, {file = "scikit_learn-1.3.1-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:a683394bc3f80b7c312c27f9b14ebea7766b1f0a34faf1a2e9158d80e860ec26"}, {file = "scikit_learn-1.3.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15d964d9eb181c79c190d3dbc2fff7338786bf017e9039571418a1d53dab236"}, @@ -5980,7 +5959,7 @@ files = [ name = "smart-open" version = "6.4.0" description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)" -optional = true +optional = false python-versions = ">=3.6,<4.0" files = [ {file = "smart_open-6.4.0-py3-none-any.whl", hash = "sha256:8d3ef7e6997e8e42dd55c74166ed21e6ac70664caa32dd940b26d54a8f6b4142"}, @@ -6034,7 +6013,7 @@ files = [ name = "spacy" version = "3.6.1" description = "Industrial-strength Natural Language Processing (NLP) in Python" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "spacy-3.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2fb23b9af51ee8baeea4920d6ffc8ef85bc3ea7a6338dbf330a0626cf6ac6ea9"}, @@ -6121,7 +6100,7 @@ transformers = ["spacy-transformers (>=1.1.2,<1.3.0)"] name = "spacy-legacy" version = "3.0.12" description = "Legacy registered functions for spaCy backwards compatibility" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "spacy-legacy-3.0.12.tar.gz", hash = "sha256:b37d6e0c9b6e1d7ca1cf5bc7152ab64a4c4671f59c85adaf7a3fcb870357a774"}, @@ -6132,7 +6111,7 @@ files = [ name = "spacy-loggers" version = "1.0.5" description = "Logging utilities for SpaCy" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "spacy-loggers-1.0.5.tar.gz", hash = "sha256:d60b0bdbf915a60e516cc2e653baeff946f0cfc461b452d11a4d5458c6fe5f24"}, @@ -6162,6 +6141,14 @@ files = [ {file = "SQLAlchemy-2.0.21-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:b69f1f754d92eb1cc6b50938359dead36b96a1dcf11a8670bff65fd9b21a4b09"}, {file = "SQLAlchemy-2.0.21-cp311-cp311-win32.whl", hash = "sha256:af520a730d523eab77d754f5cf44cc7dd7ad2d54907adeb3233177eeb22f271b"}, {file = "SQLAlchemy-2.0.21-cp311-cp311-win_amd64.whl", hash = "sha256:141675dae56522126986fa4ca713739d00ed3a6f08f3c2eb92c39c6dfec463ce"}, + {file = "SQLAlchemy-2.0.21-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:56628ca27aa17b5890391ded4e385bf0480209726f198799b7e980c6bd473bd7"}, + {file = "SQLAlchemy-2.0.21-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:db726be58837fe5ac39859e0fa40baafe54c6d54c02aba1d47d25536170b690f"}, + {file = "SQLAlchemy-2.0.21-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e7421c1bfdbb7214313919472307be650bd45c4dc2fcb317d64d078993de045b"}, + {file = "SQLAlchemy-2.0.21-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:632784f7a6f12cfa0e84bf2a5003b07660addccf5563c132cd23b7cc1d7371a9"}, + {file = "SQLAlchemy-2.0.21-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:f6f7276cf26145a888f2182a98f204541b519d9ea358a65d82095d9c9e22f917"}, + {file = "SQLAlchemy-2.0.21-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2a1f7ffac934bc0ea717fa1596f938483fb8c402233f9b26679b4f7b38d6ab6e"}, + {file = "SQLAlchemy-2.0.21-cp312-cp312-win32.whl", hash = "sha256:bfece2f7cec502ec5f759bbc09ce711445372deeac3628f6fa1c16b7fb45b682"}, + {file = "SQLAlchemy-2.0.21-cp312-cp312-win_amd64.whl", hash = "sha256:526b869a0f4f000d8d8ee3409d0becca30ae73f494cbb48801da0129601f72c6"}, {file = "SQLAlchemy-2.0.21-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:7614f1eab4336df7dd6bee05bc974f2b02c38d3d0c78060c5faa4cd1ca2af3b8"}, {file = "SQLAlchemy-2.0.21-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d59cb9e20d79686aa473e0302e4a82882d7118744d30bb1dfb62d3c47141b3ec"}, {file = "SQLAlchemy-2.0.21-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a95aa0672e3065d43c8aa80080cdd5cc40fe92dc873749e6c1cf23914c4b83af"}, @@ -6190,7 +6177,7 @@ files = [ ] [package.dependencies] -greenlet = {version = "!=0.4.17", markers = "platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\""} +greenlet = {version = "!=0.4.17", markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\""} typing-extensions = ">=4.2.0" [package.extras] @@ -6221,7 +6208,7 @@ sqlcipher = ["sqlcipher3-binary"] name = "srsly" version = "2.4.8" description = "Modern high-performance serialization utilities for Python" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "srsly-2.4.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:17f3bcb418bb4cf443ed3d4dcb210e491bd9c1b7b0185e6ab10b6af3271e63b2"}, @@ -6396,7 +6383,7 @@ viz = ["matplotlib (>=3.0,<4.0)"] name = "thinc" version = "8.1.12" description = "A refreshing functional take on deep learning, compatible with your favorite libraries" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "thinc-8.1.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:efda431bc1513e81e457dbff4ef1610592569ddc362f8df24422628b195d51f4"}, @@ -7241,7 +7228,7 @@ test = ["Cython (>=0.29.32,<0.30.0)", "aiohttp", "flake8 (>=3.9.2,<3.10.0)", "my name = "wasabi" version = "1.1.2" description = "A lightweight console printing and formatting toolkit" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "wasabi-1.1.2-py3-none-any.whl", hash = "sha256:0a3f933c4bf0ed3f93071132c1b87549733256d6c8de6473c5f7ed2e171b5cf9"}, @@ -7681,7 +7668,7 @@ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.link testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1)", "pytest-ruff"] [extras] -all = ["cohere", "detect-secrets", "email-reply-parser", "google-api-python-client", "google-auth-httplib2", "google-auth-oauthlib", "google-cloud-aiplatform", "google-generativeai", "langdetect", "langsmith", "llama-index", "openai", "sentence-transformers", "spacy", "textacy"] +all = ["cohere", "detect-secrets", "email-reply-parser", "google-api-python-client", "google-auth-httplib2", "google-auth-oauthlib", "google-cloud-aiplatform", "google-generativeai", "langdetect", "langsmith", "llama-index", "openai", "sentence-transformers", "textacy"] cohere = ["cohere"] embeddings = ["cohere", "google-cloud-aiplatform", "google-generativeai", "openai", "sentence-transformers"] github = ["llama-index"] @@ -7690,16 +7677,15 @@ gte = ["sentence-transformers"] lang-detection = ["langdetect"] langsmith = ["langsmith"] llms = ["openai"] -ner = ["spacy"] openai = ["openai"] palm = ["google-cloud-aiplatform", "google-generativeai"] pii = ["detect-secrets", "regex"] sbert = ["sentence-transformers"] -signals = ["detect-secrets", "langdetect", "spacy", "textacy"] +signals = ["detect-secrets", "langdetect", "textacy"] sources = ["langsmith"] -text-stats = ["spacy", "textacy"] +text-stats = ["textacy"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<4.0" -content-hash = "995a0e4f82378b7dd6eebaa9b6cd753395acc358d5943557f2f003043ed3666a" +content-hash = "e240c8b47743483b2ca861f77f1061e053e5a8c21dd0444eb9d7558e77b26c73"