From 87aa24cb4486f10b5bc3f8dacb9d713d18325bab Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Tue, 20 Feb 2024 16:55:17 +0100 Subject: [PATCH 1/9] Use ruff --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index fd1efbe..a26891f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -black==23.3.0 -isort==5.12.0 +ruff==0.2.2 + linetimer plotnine plotly From 0ea9e002a0908443c0c30928b7fbf7a453e4c7a4 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Tue, 20 Feb 2024 16:58:56 +0100 Subject: [PATCH 2/9] Run ruff format --- polars_queries/q12.py | 4 +--- polars_queries/q21.py | 14 ++++++-------- prepare_files.py | 32 ++++++++------------------------ prepare_large_files.py | 32 ++++++++------------------------ vaex_queries/q2.py | 4 +--- 5 files changed, 24 insertions(+), 62 deletions(-) diff --git a/polars_queries/q12.py b/polars_queries/q12.py index d667ffd..f47a6c7 100644 --- a/polars_queries/q12.py +++ b/polars_queries/q12.py @@ -28,9 +28,7 @@ def q(): .then(1) .otherwise(0) .alias("high_line_count"), - pl.when( - pl.col("o_orderpriority").is_in(["1-URGENT", "2-HIGH"]).not_() - ) + pl.when(pl.col("o_orderpriority").is_in(["1-URGENT", "2-HIGH"]).not_()) .then(1) .otherwise(0) .alias("low_line_count"), diff --git a/polars_queries/q21.py b/polars_queries/q21.py index 5968cab..3cdc69e 100644 --- a/polars_queries/q21.py +++ b/polars_queries/q21.py @@ -14,14 +14,12 @@ def q(): var_1 = "SAUDI ARABIA" res_1 = ( - ( - line_item_ds.group_by("l_orderkey") - .agg(pl.col("l_suppkey").n_unique().alias("nunique_col")) - .filter(pl.col("nunique_col") > 1) - .join( - line_item_ds.filter(pl.col("l_receiptdate") > pl.col("l_commitdate")), - on="l_orderkey", - ) + line_item_ds.group_by("l_orderkey") + .agg(pl.col("l_suppkey").n_unique().alias("nunique_col")) + .filter(pl.col("nunique_col") > 1) + .join( + line_item_ds.filter(pl.col("l_receiptdate") > pl.col("l_commitdate")), + on="l_orderkey", ) ).cache() diff --git a/prepare_files.py b/prepare_files.py index 641f358..3806ee8 100644 --- a/prepare_files.py +++ b/prepare_files.py @@ -7,15 +7,11 @@ h_nation = """n_nationkey n_name n_regionkey -n_comment""".split( - "\n" -) +n_comment""".split("\n") h_region = """r_regionkey r_name -r_comment""".split( - "\n" -) +r_comment""".split("\n") h_part = """p_partkey p_name @@ -25,9 +21,7 @@ p_size p_container p_retailprice -p_comment""".split( - "\n" -) +p_comment""".split("\n") h_supplier = """s_suppkey s_name @@ -35,17 +29,13 @@ s_nationkey s_phone s_acctbal -s_comment""".split( - "\n" -) +s_comment""".split("\n") h_partsupp = """ps_partkey ps_suppkey ps_availqty ps_supplycost -ps_comment""".split( - "\n" -) +ps_comment""".split("\n") h_customer = """c_custkey c_name @@ -54,9 +44,7 @@ c_phone c_acctbal c_mktsegment -c_comment""".split( - "\n" -) +c_comment""".split("\n") h_orders = """o_orderkey o_custkey @@ -66,9 +54,7 @@ o_orderpriority o_clerk o_shippriority -o_comment""".split( - "\n" -) +o_comment""".split("\n") h_lineitem = """l_orderkey l_partkey @@ -85,9 +71,7 @@ l_receiptdate l_shipinstruct l_shipmode -comments""".split( - "\n" -) +comments""".split("\n") for name in [ "nation", diff --git a/prepare_large_files.py b/prepare_large_files.py index 06a6823..db9f36a 100644 --- a/prepare_large_files.py +++ b/prepare_large_files.py @@ -8,16 +8,12 @@ n_nationkey n_name n_regionkey -n_comment""".split( - "\n" -) +n_comment""".split("\n") h_region = """ r_regionkey r_name -r_comment""".split( - "\n" -) +r_comment""".split("\n") h_part = """ p_partkey @@ -28,9 +24,7 @@ p_size p_container p_retailprice -p_comment""".split( - "\n" -) +p_comment""".split("\n") h_supplier = """ s_suppkey @@ -39,18 +33,14 @@ s_nationkey s_phone s_acctbal -s_comment""".split( - "\n" -) +s_comment""".split("\n") h_partsupp = """ ps_partkey ps_suppkey ps_availqty ps_supplycost -ps_comment""".split( - "\n" -) +ps_comment""".split("\n") h_customer = """ c_custkey @@ -60,9 +50,7 @@ c_phone c_acctbal c_mktsegment -c_comment""".split( - "\n" -) +c_comment""".split("\n") h_orders = """ o_orderkey @@ -73,9 +61,7 @@ o_orderpriority o_clerk o_shippriority -o_comment""".split( - "\n" -) +o_comment""".split("\n") h_lineitem = """ l_orderkey @@ -93,9 +79,7 @@ l_receiptdate l_shipinstruct l_shipmode -comments""".split( - "\n" -) +comments""".split("\n") for name in [ "nation", diff --git a/vaex_queries/q2.py b/vaex_queries/q2.py index 185d215..cac4c24 100644 --- a/vaex_queries/q2.py +++ b/vaex_queries/q2.py @@ -151,9 +151,7 @@ def query(): True, True, ], - )[ - :100 - ] + )[:100] return result_df From 99b0312b23a519ce3316854b56c19e7b8dedd987 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Tue, 20 Feb 2024 17:01:17 +0100 Subject: [PATCH 3/9] Copy over ruff settings from main repo --- pyproject.toml | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b6ec71c --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,72 @@ +[tool.ruff] +line-length = 88 +fix = true + +[tool.ruff.lint] +select = [ + "E", # pycodestyle + "W", # pycodestyle + "F", # Pyflakes + "B", # flake8-bugbear + "C4", # flake8-comprehensions + "D", # flake8-docstrings + "D213", # Augment NumPy docstring convention: Multi-line docstring summary should start at the second line + "D417", # Augment NumPy docstring convention: Missing argument descriptions + "I", # isort + "SIM", # flake8-simplify + "TCH", # flake8-type-checking + "TID", # flake8-tidy-imports + "UP", # pyupgrade + "PT", # flake8-pytest-style + "RUF", # Ruff-specific rules + "PTH", # flake8-use-pathlib + "FA", # flake8-future-annotations + "PIE", # flake8-pie + "TD", # flake8-todos + "TRY", # tryceratops + "EM", # flake8-errmsg + "FBT001", # flake8-boolean-trap +] + +ignore = [ + # Line length regulated by formatter + "E501", + # pydocstyle: http://www.pydocstyle.org/en/stable/error_codes.html + "D401", # Relax NumPy docstring convention: First line should be in imperative mood + # flake8-pytest-style: + "PT011", # pytest.raises({exception}) is too broad, set the match parameter or use a more specific exception + # flake8-simplify + "SIM102", # Use a single `if` statement instead of nested `if` statements + "SIM108", # Use ternary operator + # ruff + "RUF005", # unpack-instead-of-concatenating-to-collection-literal + # pycodestyle + # TODO: Remove errors below to further improve docstring linting + # Ordered from most common to least common errors. + "D105", # Missing docstring in magic method + "D100", # Missing docstring in public module + "D104", # Missing docstring in public package + # flake8-todos + "TD002", # Missing author in TODO + "TD003", # Missing issue link on the line following this TODO + # tryceratops + "TRY003", # Avoid specifying long messages outside the exception class + # Lints below are turned off because of conflicts with the ruff formatter + "D206", + "W191", +] + +[tool.ruff.lint.pycodestyle] +max-doc-length = 88 + +[tool.ruff.lint.pydocstyle] +convention = "numpy" + +[tool.ruff.lint.flake8-tidy-imports] +ban-relative-imports = "all" + +[tool.ruff.lint.flake8-type-checking] +strict = true + +[tool.ruff.format] +docstring-code-format = true From e8fd4cf9153a7f4f28399471bc1b3f740d390793 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Tue, 20 Feb 2024 20:57:12 +0100 Subject: [PATCH 4/9] Update makefile --- Makefile | 107 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 70 insertions(+), 37 deletions(-) diff --git a/Makefile b/Makefile index 1f2321b..89c2173 100644 --- a/Makefile +++ b/Makefile @@ -1,62 +1,95 @@ -SHELL=/bin/bash -PYTHON=.venv/bin/python +.DEFAULT_GOAL := help -.venv: - @python -m venv .venv - @.venv/bin/pip install -U pip - @.venv/bin/pip install --no-cache-dir -r requirements.txt +PYTHONPATH= +SHELL=/bin/bash +VENV=.venv +VENV_BIN=$(VENV)/bin -clean-tpch-dbgen: - $(MAKE) -C tpch-dbgen clean +.venv: ## Set up Python virtual environment and install requirements + python3 -m venv $(VENV) + $(MAKE) requirements -clean-venv: - rm -r .venv +.PHONY: requirements +requirements: .venv ## Update Python project requirements + $(VENV_BIN)/python -m pip install --upgrade pip + $(VENV_BIN)/pip install --upgrade -r requirements.txt -clean-tables: - rm -r tables_scale_* +.PHONY: fmt +fmt: ## Run autoformatting and linting + $(VENV_BIN)/ruff check + $(VENV_BIN)/ruff format -clean: clean-tpch-dbgen clean-venv +.PHONY: pre-commit +pre-commit: fmt ## Run all code quality checks -tables_scale_1: .venv +.PHONY: tables-scale-1 +tables-scale-1: .venv ## Generate data tables $(MAKE) -C tpch-dbgen all cd tpch-dbgen && ./dbgen -vf -s 1 && cd .. mkdir -p "tables_scale_1" mv tpch-dbgen/*.tbl tables_scale_1/ - .venv/bin/python prepare_files.py 1 + $(VENV_BIN)/python prepare_files.py 1 -tables_scale_10: .venv +.PHONY: tables-scale-10 +tables-scale-10: .venv ## Generate bigger data tables $(MAKE) -C tpch-dbgen all cd tpch-dbgen && ./dbgen -vf -s 10 && cd .. mkdir -p "tables_scale_10" mv tpch-dbgen/*.tbl tables_scale_10/ - .venv/bin/python prepare_files.py 10 + $(VENV_BIN)/python prepare_files.py 10 + +.PHONY: run-polars +run-polars: .venv ## Run polars benchmarks + $(VENV_BIN)/python -m polars_queries.executor + +.PHONY: run-pandas +run-pandas: .venv ## Run pandas benchmarks + $(VENV_BIN)/python -m pandas_queries.executor + +.PHONY: run-pyspark +run-pyspark: .venv ## Run pyspark benchmarks + $(VENV_BIN)/python -m spark_queries.executor + +.PHONY: run-dask +run-dask: .venv ## Run dask benchmarks + $(VENV_BIN)/python -m dask_queries.executor + +.PHONY: run-duckdb +run-duckdb: .venv ## Run duckdb benchmarks + $(VENV_BIN)/python -m duckdb_queries.executor + +.PHONY: run-vaex +run-vaex: .venv ## Run vaex benchmarks + $(VENV_BIN)/python -m vaex_queries.executor -run_polars: .venv - .venv/bin/python -m polars_queries.executor +.PHONY: run-modin +run-modin: .venv ## Run modin benchmarks + $(VENV_BIN)/python -m modin_queries.executor -run_pandas: .venv - .venv/bin/python -m pandas_queries.executor +.PHONY: run-all +run-all: run-polars run-pandas run-pyspark run-dask run-duckdb run-vaex run-modin ## Run all benchmarks -run_dask: .venv - .venv/bin/python -m dask_queries.executor +.PHONY: plot +plot: .venv ## Plot results + $(VENV_BIN)/python -m scripts.plot_results -run_modin: .venv - .venv/bin/python -m modin_queries.executor -run_vaex: .venv - .venv/bin/python -m vaex_queries.executor +.PHONY: clean +clean: clean-tpch-dbgen clean-tables ## Clean up everything + @rm -rf .ruff_cache/ + @rm -rf .venv/ -run_spark: .venv - .venv/bin/python -m spark_queries.executor +.PHONY: clean-tpch-dbgen +clean-tpch-dbgen: ## Clean up TPC-H folder + @$(MAKE) -C tpch-dbgen clean -run_duckdb: .venv - .venv/bin/python -m duckdb_queries.executor +.PHONY: clean-tables +clean-tables: ## Clean up data tables + @rm -rf tables_scale_* -plot_results: .venv - .venv/bin/python -m scripts.plot_results -run_all: run_polars run_pandas run_vaex run_dask run_modin run_spark +.PHONY: help +help: ## Display this help screen + @echo -e "\033[1mAvailable commands:\033[0m" + @grep -E '^[a-z.A-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-22s\033[0m %s\n", $$1, $$2}' | sort -pre-commit: - .venv/bin/python -m isort . - .venv/bin/python -m black . From fc79313326dd75b2fcee075c17d6db34369d326d Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Tue, 20 Feb 2024 20:57:35 +0100 Subject: [PATCH 5/9] Add lint workflow --- .github/workflows/lint.yml | 30 ++++++++++++++++++++++++++++++ Makefile | 2 +- 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/lint.yml diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..1fead85 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,30 @@ +name: Lint + +on: + pull_request: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + ruff: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Get ruff version from requirements file + id: version + run: | + VERSION=$(grep -m 1 -oP 'ruff==\K(.*)' requirements-lint.txt) + echo "version=$VERSION" >> $GITHUB_OUTPUT + + - uses: chartboost/ruff-action@v1 + with: + version: ${{ steps.version.outputs.version }} + args: check --no-fix + + - uses: chartboost/ruff-action@v1 + with: + version: ${{ steps.version.outputs.version }} + args: format --diff diff --git a/Makefile b/Makefile index 89c2173..4e2a827 100644 --- a/Makefile +++ b/Makefile @@ -91,5 +91,5 @@ clean-tables: ## Clean up data tables .PHONY: help help: ## Display this help screen @echo -e "\033[1mAvailable commands:\033[0m" - @grep -E '^[a-z.A-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-22s\033[0m %s\n", $$1, $$2}' | sort + @grep -E '^[a-z.A-Z_0-9-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-22s\033[0m %s\n", $$1, $$2}' | sort From 187b87fffe795f29c243b3aff67fb58451793e17 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Tue, 20 Feb 2024 21:28:39 +0100 Subject: [PATCH 6/9] Update ruff settings --- pyproject.toml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b6ec71c..e9e0170 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,11 +41,7 @@ ignore = [ # ruff "RUF005", # unpack-instead-of-concatenating-to-collection-literal # pycodestyle - # TODO: Remove errors below to further improve docstring linting - # Ordered from most common to least common errors. - "D105", # Missing docstring in magic method - "D100", # Missing docstring in public module - "D104", # Missing docstring in public package + "D1", # Missing docstring # flake8-todos "TD002", # Missing author in TODO "TD003", # Missing issue link on the line following this TODO From 272884735ef5452126ba29414a1225a07fe09128 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Wed, 21 Feb 2024 02:17:32 +0100 Subject: [PATCH 7/9] Update make command in run.sh --- run.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/run.sh b/run.sh index 9aed1ef..4a8331b 100755 --- a/run.sh +++ b/run.sh @@ -2,10 +2,10 @@ export LOG_TIMINGS=1 export WRITE_PLOT=1 echo run with cached IO -make run_all -make plot_results +make run-all +make plot echo run with IO export INCLUDE_IO=1 -make run_all -make plot_results +make run-all +make plot From 4e62efc4c6d2aec1cc17857d7d99adebd0360ded Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Wed, 21 Feb 2024 02:19:55 +0100 Subject: [PATCH 8/9] Disable ruff lint for now --- .github/workflows/lint.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 1fead85..033e567 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -19,10 +19,10 @@ jobs: VERSION=$(grep -m 1 -oP 'ruff==\K(.*)' requirements-lint.txt) echo "version=$VERSION" >> $GITHUB_OUTPUT - - uses: chartboost/ruff-action@v1 - with: - version: ${{ steps.version.outputs.version }} - args: check --no-fix + # - uses: chartboost/ruff-action@v1 + # with: + # version: ${{ steps.version.outputs.version }} + # args: check --no-fix - uses: chartboost/ruff-action@v1 with: From 2ed58c18ee5a9d69e8b386b125d376e329e0c78e Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Wed, 21 Feb 2024 02:24:23 +0100 Subject: [PATCH 9/9] Fix workflow --- .github/workflows/lint.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 033e567..26e4c5b 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -16,7 +16,7 @@ jobs: - name: Get ruff version from requirements file id: version run: | - VERSION=$(grep -m 1 -oP 'ruff==\K(.*)' requirements-lint.txt) + VERSION=$(grep -m 1 -oP 'ruff==\K(.*)' requirements.txt) echo "version=$VERSION" >> $GITHUB_OUTPUT # - uses: chartboost/ruff-action@v1