From 87aa24cb4486f10b5bc3f8dacb9d713d18325bab Mon Sep 17 00:00:00 2001
From: Stijn de Gooijer <stijndegooijer@gmail.com>
Date: Tue, 20 Feb 2024 16:55:17 +0100
Subject: [PATCH 1/9] Use ruff

---
 requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index fd1efbe..a26891f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
-black==23.3.0
-isort==5.12.0
+ruff==0.2.2
+
 linetimer
 plotnine
 plotly

From 0ea9e002a0908443c0c30928b7fbf7a453e4c7a4 Mon Sep 17 00:00:00 2001
From: Stijn de Gooijer <stijndegooijer@gmail.com>
Date: Tue, 20 Feb 2024 16:58:56 +0100
Subject: [PATCH 2/9] Run ruff format

---
 polars_queries/q12.py  |  4 +---
 polars_queries/q21.py  | 14 ++++++--------
 prepare_files.py       | 32 ++++++++------------------------
 prepare_large_files.py | 32 ++++++++------------------------
 vaex_queries/q2.py     |  4 +---
 5 files changed, 24 insertions(+), 62 deletions(-)

diff --git a/polars_queries/q12.py b/polars_queries/q12.py
index d667ffd..f47a6c7 100644
--- a/polars_queries/q12.py
+++ b/polars_queries/q12.py
@@ -28,9 +28,7 @@ def q():
                 .then(1)
                 .otherwise(0)
                 .alias("high_line_count"),
-                pl.when(
-                    pl.col("o_orderpriority").is_in(["1-URGENT", "2-HIGH"]).not_()
-                )
+                pl.when(pl.col("o_orderpriority").is_in(["1-URGENT", "2-HIGH"]).not_())
                 .then(1)
                 .otherwise(0)
                 .alias("low_line_count"),
diff --git a/polars_queries/q21.py b/polars_queries/q21.py
index 5968cab..3cdc69e 100644
--- a/polars_queries/q21.py
+++ b/polars_queries/q21.py
@@ -14,14 +14,12 @@ def q():
     var_1 = "SAUDI ARABIA"
 
     res_1 = (
-        (
-            line_item_ds.group_by("l_orderkey")
-            .agg(pl.col("l_suppkey").n_unique().alias("nunique_col"))
-            .filter(pl.col("nunique_col") > 1)
-            .join(
-                line_item_ds.filter(pl.col("l_receiptdate") > pl.col("l_commitdate")),
-                on="l_orderkey",
-            )
+        line_item_ds.group_by("l_orderkey")
+        .agg(pl.col("l_suppkey").n_unique().alias("nunique_col"))
+        .filter(pl.col("nunique_col") > 1)
+        .join(
+            line_item_ds.filter(pl.col("l_receiptdate") > pl.col("l_commitdate")),
+            on="l_orderkey",
         )
     ).cache()
 
diff --git a/prepare_files.py b/prepare_files.py
index 641f358..3806ee8 100644
--- a/prepare_files.py
+++ b/prepare_files.py
@@ -7,15 +7,11 @@
 h_nation = """n_nationkey
 n_name
 n_regionkey
-n_comment""".split(
-    "\n"
-)
+n_comment""".split("\n")
 
 h_region = """r_regionkey
 r_name
-r_comment""".split(
-    "\n"
-)
+r_comment""".split("\n")
 
 h_part = """p_partkey
 p_name
@@ -25,9 +21,7 @@
 p_size
 p_container
 p_retailprice
-p_comment""".split(
-    "\n"
-)
+p_comment""".split("\n")
 
 h_supplier = """s_suppkey
 s_name
@@ -35,17 +29,13 @@
 s_nationkey
 s_phone
 s_acctbal
-s_comment""".split(
-    "\n"
-)
+s_comment""".split("\n")
 
 h_partsupp = """ps_partkey
 ps_suppkey
 ps_availqty
 ps_supplycost
-ps_comment""".split(
-    "\n"
-)
+ps_comment""".split("\n")
 
 h_customer = """c_custkey
 c_name
@@ -54,9 +44,7 @@
 c_phone
 c_acctbal
 c_mktsegment
-c_comment""".split(
-    "\n"
-)
+c_comment""".split("\n")
 
 h_orders = """o_orderkey
 o_custkey
@@ -66,9 +54,7 @@
 o_orderpriority
 o_clerk
 o_shippriority
-o_comment""".split(
-    "\n"
-)
+o_comment""".split("\n")
 
 h_lineitem = """l_orderkey
 l_partkey
@@ -85,9 +71,7 @@
 l_receiptdate
 l_shipinstruct
 l_shipmode
-comments""".split(
-    "\n"
-)
+comments""".split("\n")
 
 for name in [
     "nation",
diff --git a/prepare_large_files.py b/prepare_large_files.py
index 06a6823..db9f36a 100644
--- a/prepare_large_files.py
+++ b/prepare_large_files.py
@@ -8,16 +8,12 @@
 n_nationkey
 n_name
 n_regionkey
-n_comment""".split(
-    "\n"
-)
+n_comment""".split("\n")
 
 h_region = """
 r_regionkey
 r_name
-r_comment""".split(
-    "\n"
-)
+r_comment""".split("\n")
 
 h_part = """
 p_partkey
@@ -28,9 +24,7 @@
 p_size
 p_container
 p_retailprice
-p_comment""".split(
-    "\n"
-)
+p_comment""".split("\n")
 
 h_supplier = """
 s_suppkey
@@ -39,18 +33,14 @@
 s_nationkey
 s_phone
 s_acctbal
-s_comment""".split(
-    "\n"
-)
+s_comment""".split("\n")
 
 h_partsupp = """
 ps_partkey
 ps_suppkey
 ps_availqty
 ps_supplycost
-ps_comment""".split(
-    "\n"
-)
+ps_comment""".split("\n")
 
 h_customer = """
 c_custkey
@@ -60,9 +50,7 @@
 c_phone
 c_acctbal
 c_mktsegment
-c_comment""".split(
-    "\n"
-)
+c_comment""".split("\n")
 
 h_orders = """
 o_orderkey
@@ -73,9 +61,7 @@
 o_orderpriority
 o_clerk
 o_shippriority
-o_comment""".split(
-    "\n"
-)
+o_comment""".split("\n")
 
 h_lineitem = """
 l_orderkey
@@ -93,9 +79,7 @@
 l_receiptdate
 l_shipinstruct
 l_shipmode
-comments""".split(
-    "\n"
-)
+comments""".split("\n")
 
 for name in [
     "nation",
diff --git a/vaex_queries/q2.py b/vaex_queries/q2.py
index 185d215..cac4c24 100644
--- a/vaex_queries/q2.py
+++ b/vaex_queries/q2.py
@@ -151,9 +151,7 @@ def query():
                 True,
                 True,
             ],
-        )[
-            :100
-        ]
+        )[:100]
 
         return result_df
 

From 99b0312b23a519ce3316854b56c19e7b8dedd987 Mon Sep 17 00:00:00 2001
From: Stijn de Gooijer <stijndegooijer@gmail.com>
Date: Tue, 20 Feb 2024 17:01:17 +0100
Subject: [PATCH 3/9] Copy over ruff settings from main repo

---
 pyproject.toml | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 pyproject.toml

diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..b6ec71c
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,72 @@
+[tool.ruff]
+line-length = 88
+fix = true
+
+[tool.ruff.lint]
+select = [
+  "E", # pycodestyle
+  "W", # pycodestyle
+  "F", # Pyflakes
+  "B", # flake8-bugbear
+  "C4", # flake8-comprehensions
+  "D", # flake8-docstrings
+  "D213", # Augment NumPy docstring convention: Multi-line docstring summary should start at the second line
+  "D417", # Augment NumPy docstring convention: Missing argument descriptions
+  "I", # isort
+  "SIM", # flake8-simplify
+  "TCH", # flake8-type-checking
+  "TID", # flake8-tidy-imports
+  "UP", # pyupgrade
+  "PT", # flake8-pytest-style
+  "RUF", # Ruff-specific rules
+  "PTH", # flake8-use-pathlib
+  "FA", # flake8-future-annotations
+  "PIE", # flake8-pie
+  "TD", # flake8-todos
+  "TRY", # tryceratops
+  "EM", # flake8-errmsg
+  "FBT001", # flake8-boolean-trap
+]
+
+ignore = [
+  # Line length regulated by formatter
+  "E501",
+  # pydocstyle: http://www.pydocstyle.org/en/stable/error_codes.html
+  "D401", # Relax NumPy docstring convention: First line should be in imperative mood
+  # flake8-pytest-style:
+  "PT011", # pytest.raises({exception}) is too broad, set the match parameter or use a more specific exception
+  # flake8-simplify
+  "SIM102", # Use a single `if` statement instead of nested `if` statements
+  "SIM108", # Use ternary operator
+  # ruff
+  "RUF005", # unpack-instead-of-concatenating-to-collection-literal
+  # pycodestyle
+  # TODO: Remove errors below to further improve docstring linting
+  # Ordered from most common to least common errors.
+  "D105", # Missing docstring in magic method
+  "D100", # Missing docstring in public module
+  "D104", # Missing docstring in public package
+  # flake8-todos
+  "TD002", # Missing author in TODO
+  "TD003", # Missing issue link on the line following this TODO
+  # tryceratops
+  "TRY003", # Avoid specifying long messages outside the exception class
+  # Lints below are turned off because of conflicts with the ruff formatter
+  "D206",
+  "W191",
+]
+
+[tool.ruff.lint.pycodestyle]
+max-doc-length = 88
+
+[tool.ruff.lint.pydocstyle]
+convention = "numpy"
+
+[tool.ruff.lint.flake8-tidy-imports]
+ban-relative-imports = "all"
+
+[tool.ruff.lint.flake8-type-checking]
+strict = true
+
+[tool.ruff.format]
+docstring-code-format = true

From e8fd4cf9153a7f4f28399471bc1b3f740d390793 Mon Sep 17 00:00:00 2001
From: Stijn de Gooijer <stijndegooijer@gmail.com>
Date: Tue, 20 Feb 2024 20:57:12 +0100
Subject: [PATCH 4/9] Update Makefile

---
 Makefile | 107 ++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 70 insertions(+), 37 deletions(-)

diff --git a/Makefile b/Makefile
index 1f2321b..89c2173 100644
--- a/Makefile
+++ b/Makefile
@@ -1,62 +1,95 @@
-SHELL=/bin/bash
-PYTHON=.venv/bin/python
+.DEFAULT_GOAL := help
 
-.venv:
-	@python -m venv .venv
-	@.venv/bin/pip install -U pip
-	@.venv/bin/pip install --no-cache-dir -r requirements.txt
+PYTHONPATH=
+SHELL=/bin/bash
+VENV=.venv
+VENV_BIN=$(VENV)/bin
 
-clean-tpch-dbgen:
-	$(MAKE) -C tpch-dbgen clean
+.venv:  ## Set up Python virtual environment and install requirements
+	python3 -m venv $(VENV)
+	$(MAKE) requirements
 
-clean-venv:
-	rm -r .venv
+.PHONY: requirements
+requirements: .venv  ## Update Python project requirements
+	$(VENV_BIN)/python -m pip install --upgrade pip
+	$(VENV_BIN)/pip install --upgrade -r requirements.txt
 
-clean-tables:
-	rm -r tables_scale_*
+.PHONY: fmt
+fmt:  ## Run autoformatting and linting
+	$(VENV_BIN)/ruff check
+	$(VENV_BIN)/ruff format
 
-clean: clean-tpch-dbgen clean-venv
+.PHONY: pre-commit
+pre-commit: fmt  ## Run all code quality checks
 
-tables_scale_1: .venv
+.PHONY: tables-scale-1
+tables-scale-1: .venv  ## Generate data tables
 	$(MAKE) -C tpch-dbgen all
 	cd tpch-dbgen && ./dbgen -vf -s 1 && cd ..
 	mkdir -p "tables_scale_1"
 	mv tpch-dbgen/*.tbl tables_scale_1/
-	.venv/bin/python prepare_files.py 1
+	$(VENV_BIN)/python prepare_files.py 1
 
-tables_scale_10: .venv
+.PHONY: tables-scale-10
+tables-scale-10: .venv  ## Generate bigger data tables
 	$(MAKE) -C tpch-dbgen all
 	cd tpch-dbgen && ./dbgen -vf -s 10 && cd ..
 	mkdir -p "tables_scale_10"
 	mv tpch-dbgen/*.tbl tables_scale_10/
-	.venv/bin/python prepare_files.py 10
+	$(VENV_BIN)/python prepare_files.py 10
+
+.PHONY: run-polars
+run-polars: .venv  ## Run polars benchmarks
+	$(VENV_BIN)/python -m polars_queries.executor
+
+.PHONY: run-pandas
+run-pandas: .venv  ## Run pandas benchmarks
+	$(VENV_BIN)/python -m pandas_queries.executor
+
+.PHONY: run-pyspark
+run-pyspark: .venv  ## Run pyspark benchmarks
+	$(VENV_BIN)/python -m spark_queries.executor
+
+.PHONY: run-dask
+run-dask: .venv  ## Run dask benchmarks
+	$(VENV_BIN)/python -m dask_queries.executor
+
+.PHONY: run-duckdb
+run-duckdb: .venv  ## Run duckdb benchmarks
+	$(VENV_BIN)/python -m duckdb_queries.executor
+
+.PHONY: run-vaex
+run-vaex: .venv  ## Run vaex benchmarks
+	$(VENV_BIN)/python -m vaex_queries.executor
 
-run_polars: .venv
-	.venv/bin/python -m polars_queries.executor
+.PHONY: run-modin
+run-modin: .venv  ## Run modin benchmarks
+	$(VENV_BIN)/python -m modin_queries.executor
 
-run_pandas: .venv
-	.venv/bin/python -m pandas_queries.executor
+.PHONY: run-all
+run-all: run-polars run-pandas run-pyspark run-dask run-duckdb run-vaex run-modin   ## Run all benchmarks
 
-run_dask: .venv
-	.venv/bin/python -m dask_queries.executor
+.PHONY: plot
+plot: .venv  ## Plot results
+	$(VENV_BIN)/python -m scripts.plot_results
 
-run_modin: .venv
-	.venv/bin/python -m modin_queries.executor
 
-run_vaex: .venv
-	.venv/bin/python -m vaex_queries.executor
+.PHONY: clean
+clean:  clean-tpch-dbgen clean-tables  ## Clean up everything
+	@rm -rf .ruff_cache/
+	@rm -rf .venv/
 
-run_spark: .venv
-	.venv/bin/python -m spark_queries.executor
+.PHONY: clean-tpch-dbgen
+clean-tpch-dbgen:  ## Clean up TPC-H folder
+	@$(MAKE) -C tpch-dbgen clean
 
-run_duckdb: .venv
-	.venv/bin/python -m duckdb_queries.executor
+.PHONY: clean-tables
+clean-tables:  ## Clean up data tables
+	@rm -rf tables_scale_*
 
-plot_results: .venv
-	.venv/bin/python -m scripts.plot_results
 
-run_all: run_polars run_pandas run_vaex run_dask run_modin run_spark
+.PHONY: help
+help:  ## Display this help screen
+	@echo -e "\033[1mAvailable commands:\033[0m"
+	@grep -E '^[a-z.A-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "  \033[36m%-22s\033[0m %s\n", $$1, $$2}' | sort
 
-pre-commit:
-	.venv/bin/python -m isort .
-	.venv/bin/python -m black .

From fc79313326dd75b2fcee075c17d6db34369d326d Mon Sep 17 00:00:00 2001
From: Stijn de Gooijer <stijndegooijer@gmail.com>
Date: Tue, 20 Feb 2024 20:57:35 +0100
Subject: [PATCH 5/9] Add lint workflow

---
 .github/workflows/lint.yml | 30 ++++++++++++++++++++++++++++++
 Makefile                   |  2 +-
 2 files changed, 31 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/lint.yml

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
new file mode 100644
index 0000000..1fead85
--- /dev/null
+++ b/.github/workflows/lint.yml
@@ -0,0 +1,30 @@
+name: Lint
+
+on:
+  pull_request:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  ruff:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Get ruff version from requirements file
+        id: version
+        run: |
+          VERSION=$(grep -m 1 -oP 'ruff==\K(.*)' requirements-lint.txt)
+          echo "version=$VERSION" >> $GITHUB_OUTPUT
+
+      - uses: chartboost/ruff-action@v1
+        with:
+          version: ${{ steps.version.outputs.version }}
+          args: check --no-fix
+
+      - uses: chartboost/ruff-action@v1
+        with:
+          version: ${{ steps.version.outputs.version }}
+          args: format --diff
diff --git a/Makefile b/Makefile
index 89c2173..4e2a827 100644
--- a/Makefile
+++ b/Makefile
@@ -91,5 +91,5 @@ clean-tables:  ## Clean up data tables
 .PHONY: help
 help:  ## Display this help screen
 	@echo -e "\033[1mAvailable commands:\033[0m"
-	@grep -E '^[a-z.A-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "  \033[36m%-22s\033[0m %s\n", $$1, $$2}' | sort
+	@grep -E '^[a-z.A-Z_0-9-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "  \033[36m%-22s\033[0m %s\n", $$1, $$2}' | sort
 

From 187b87fffe795f29c243b3aff67fb58451793e17 Mon Sep 17 00:00:00 2001
From: Stijn de Gooijer <stijndegooijer@gmail.com>
Date: Tue, 20 Feb 2024 21:28:39 +0100
Subject: [PATCH 6/9] Update ruff settings

---
 pyproject.toml | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index b6ec71c..e9e0170 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,11 +41,7 @@ ignore = [
   # ruff
   "RUF005", # unpack-instead-of-concatenating-to-collection-literal
   # pycodestyle
-  # TODO: Remove errors below to further improve docstring linting
-  # Ordered from most common to least common errors.
-  "D105", # Missing docstring in magic method
-  "D100", # Missing docstring in public module
-  "D104", # Missing docstring in public package
+  "D1", # Missing docstring
   # flake8-todos
   "TD002", # Missing author in TODO
   "TD003", # Missing issue link on the line following this TODO

From 272884735ef5452126ba29414a1225a07fe09128 Mon Sep 17 00:00:00 2001
From: Stijn de Gooijer <stijndegooijer@gmail.com>
Date: Wed, 21 Feb 2024 02:17:32 +0100
Subject: [PATCH 7/9] Update make command in run.sh

---
 run.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/run.sh b/run.sh
index 9aed1ef..4a8331b 100755
--- a/run.sh
+++ b/run.sh
@@ -2,10 +2,10 @@ export LOG_TIMINGS=1
 export WRITE_PLOT=1
 
 echo run with cached IO
-make run_all
-make plot_results
+make run-all
+make plot
 
 echo run with IO
 export INCLUDE_IO=1
-make run_all
-make plot_results
+make run-all
+make plot

From 4e62efc4c6d2aec1cc17857d7d99adebd0360ded Mon Sep 17 00:00:00 2001
From: Stijn de Gooijer <stijndegooijer@gmail.com>
Date: Wed, 21 Feb 2024 02:19:55 +0100
Subject: [PATCH 8/9] Disable ruff check step in lint workflow for now

---
 .github/workflows/lint.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 1fead85..033e567 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -19,10 +19,10 @@ jobs:
           VERSION=$(grep -m 1 -oP 'ruff==\K(.*)' requirements-lint.txt)
           echo "version=$VERSION" >> $GITHUB_OUTPUT
 
-      - uses: chartboost/ruff-action@v1
-        with:
-          version: ${{ steps.version.outputs.version }}
-          args: check --no-fix
+      # - uses: chartboost/ruff-action@v1
+      #   with:
+      #     version: ${{ steps.version.outputs.version }}
+      #     args: check --no-fix
 
       - uses: chartboost/ruff-action@v1
         with:

From 2ed58c18ee5a9d69e8b386b125d376e329e0c78e Mon Sep 17 00:00:00 2001
From: Stijn de Gooijer <stijndegooijer@gmail.com>
Date: Wed, 21 Feb 2024 02:24:23 +0100
Subject: [PATCH 9/9] Fix lint workflow to read ruff version from requirements.txt

---
 .github/workflows/lint.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 033e567..26e4c5b 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -16,7 +16,7 @@ jobs:
       - name: Get ruff version from requirements file
         id: version
         run: |
-          VERSION=$(grep -m 1 -oP 'ruff==\K(.*)' requirements-lint.txt)
+          VERSION=$(grep -m 1 -oP 'ruff==\K(.*)' requirements.txt)
           echo "version=$VERSION" >> $GITHUB_OUTPUT
 
       # - uses: chartboost/ruff-action@v1