forked from pola-rs/polars-benchmark
-
Notifications
You must be signed in to change notification settings - Fork 0
/
settings.py
86 lines (63 loc) · 2.58 KB
/
settings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import os
from pathlib import Path
from typing import Literal, TypeAlias

from pydantic import AliasChoices, Field, computed_field
from pydantic_settings import BaseSettings, SettingsConfigDict
IoType: TypeAlias = Literal["skip", "parquet", "feather", "csv"]
class Paths(BaseSettings):
    """Filesystem locations used by the project.

    Fields are loaded with the ``path_`` environment prefix and from the
    ``.env`` file; unknown entries are ignored.
    """

    model_config = SettingsConfigDict(
        env_prefix="path_",
        env_file=".env",
        extra="ignore",
    )

    # Input data locations.
    answers: Path = Path("data", "answers")
    tables: Path = Path("data", "tables")

    # Output locations for timing results and plots.
    timings: Path = Path("output", "run")
    timings_filename: str = "timings.csv"
    plots: Path = Path("output", "plot")
class Run(BaseSettings):
    """Options controlling how a benchmark run is executed.

    Fields are loaded with the ``run_`` environment prefix and from the
    ``.env`` file; unknown entries are ignored.  The three Polars engine
    toggles (``polars_eager``, ``polars_gpu``, ``polars_streaming``) also
    accept their unprefixed names, e.g. ``POLARS_EAGER``.
    """

    io_type: IoType = "parquet"

    log_timings: bool = True
    show_results: bool = False
    check_results: bool = False  # Only available for SCALE_FACTOR=1

    polars_show_plan: bool = False

    # The engine toggles are resolved by the settings machinery when the model
    # is instantiated, instead of snapshotting ``os.environ`` while the class
    # body executes, so the declared default is a real ``bool``.
    # ``env_prefix`` is not applied to fields that declare a validation alias,
    # hence both the prefixed and the bare spelling are listed explicitly.
    polars_eager: bool = Field(
        default=False,
        validation_alias=AliasChoices("run_polars_eager", "polars_eager"),
    )
    polars_gpu: bool = Field(
        default=False,
        validation_alias=AliasChoices("run_polars_gpu", "polars_gpu"),
    )
    polars_streaming: bool = Field(
        default=False,
        validation_alias=AliasChoices("run_polars_streaming", "polars_streaming"),
    )

    polars_gpu_device: int = 0  # The GPU device to run on for polars GPU

    # Which style of GPU memory resource to use
    # cuda -> cudaMalloc
    # cuda-pool -> Pool suballocator wrapped around cudaMalloc
    # managed -> cudaMallocManaged
    # managed-pool -> Pool suballocator wrapped around cudaMallocManaged
    # cuda-async -> cudaMallocAsync (comes with pool)
    # See https://docs.rapids.ai/api/rmm/stable/ for details on RMM memory resources
    use_rmm_mr: Literal[
        "cuda", "cuda-pool", "managed", "managed-pool", "cuda-async"
    ] = "cuda-async"

    modin_memory: int = 8_000_000_000  # Tune as needed for optimal performance

    spark_driver_memory: str = "2g"  # Tune as needed for optimal performance
    spark_executor_memory: str = "1g"  # Tune as needed for optimal performance
    spark_log_level: str = "ERROR"

    @computed_field  # type: ignore[misc]
    @property
    def include_io(self) -> bool:
        """Return ``True`` unless ``io_type`` is ``"skip"``."""
        return self.io_type != "skip"

    model_config = SettingsConfigDict(
        env_prefix="run_", env_file=".env", extra="ignore"
    )
class Plot(BaseSettings):
    """Plotting options.

    Fields are loaded with the ``plot_`` environment prefix and from the
    ``.env`` file; unknown entries are ignored.
    """

    model_config = SettingsConfigDict(
        env_prefix="plot_",
        env_file=".env",
        extra="ignore",
    )

    show: bool = False
    n_queries: int = 22
    # ``None`` means no limit value has been configured.
    y_limit: float | None = None
class Settings(BaseSettings):
    """Top-level settings bundling the ``paths``, ``plot`` and ``run`` sections.

    Values are also loaded from the ``.env`` file; unknown entries are ignored.
    """

    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    scale_factor: float = 1.0

    # Each section defaults to an instance created while this class body runs.
    paths: Paths = Paths()
    plot: Plot = Plot()
    run: Run = Run()

    @computed_field  # type: ignore[misc]
    @property
    def dataset_base_dir(self) -> Path:
        """Return the ``scale-<scale_factor>`` directory under ``paths.tables``."""
        scale_dir = f"scale-{self.scale_factor}"
        return self.paths.tables / scale_dir