-
Notifications
You must be signed in to change notification settings - Fork 0
/
run.py
1592 lines (1396 loc) · 66.4 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
import argparse
import configparser
import datetime
import enum
import fnmatch
import getpass
import json
import logging
import math
import os
import platform
import pprint
import random
import re
import statistics
import subprocess
import sys
import time
import webbrowser
from abc import ABC, abstractmethod
from dataclasses import dataclass
from glob import glob
from typing import Any, Generator
import dateutil.parser as dateutil_parser
import docker
import prometheus_client
import prometheus_client.parser as prometheus_parser
import psutil
import requests
import yaml
from service import schemas
from benchmark_service_client import BenchmarkServiceClient
from github_client import GithubClient
logger = logging.getLogger(__name__)
# Number of warmup iterations — presumably run before the timed query
# iterations; usage is further down the file (TODO confirm).
WARMUP_ITER = 1
# Any failed query whose response contains one of those strings will
# be retried indefinitely.
RETRY_ON_FAILED_RESPONSE_SUBSTR = [
    "there are no available searcher nodes in the pool"
]
# Number of retries for errors that don't match
# RETRY_ON_FAILED_RESPONSE_SUBSTR.
NUM_QUERY_RETRIES = 4
# File where the JWT token will be cached across invocations of this
# tool.
JWT_TOKEN_FILENAME = "~/.jwt_token_benchmark_service.txt"
# Path to the optional runner config file. If it exists, it should be
# an ini config file with sections:
# - a 'paths' section mapping path placeholder names (e.g. 'qwdata')
#   to the actual data path to use. Used for resolving placeholders in
#   --engine-data-dir.
# - an 'engine_env' section with env variables to pass to the engine
#   at startup.
RUNNER_CONFIG_FILENAME = "~/.qw_benchmarks_runner.txt"
# Special value for the instance flag asking resolve_instance() to query
# the GCP metadata server for the actual machine type.
AUTODETECT_GCP_INSTANCE_PLACEHOLDER = '{autodetect_gcp}'
class BenchType(enum.StrEnum):
    """The two kinds of benchmark this runner can execute."""
    INDEXING = "indexing"
    SEARCH = "search"
def read_runner_config(runner_config_path: str):
    """Loads the runner ini config file.

    Raises:
      ValueError: if the file cannot be read.
    """
    config = configparser.ConfigParser()
    expanded_path = os.path.expanduser(runner_config_path)
    # ConfigParser.read() returns the list of files successfully parsed;
    # an empty list means the file could not be opened.
    if not config.read(expanded_path):
        raise ValueError(
            f"Runner config ({runner_config_path}) could not be opened.")
    return config
def resolve_instance(instance_or_placeholder: str | None) -> str | None:
    """Resolves the instance name, auto-detecting the GCP machine type.

    Anything other than the autodetect placeholder is returned unchanged.
    On the placeholder, the GCP metadata server is queried; if it cannot
    be reached, a sentinel value is returned instead.
    """
    if instance_or_placeholder != AUTODETECT_GCP_INSTANCE_PLACEHOLDER:
        return instance_or_placeholder
    try:
        response = requests.get(
            "http://metadata.google.internal/computeMetadata/v1/instance/machine-type",
            headers={"Metadata-Flavor": "Google"})
        # The metadata value is a full resource path; keep the last component.
        return response.text.split('/')[-1]
    except requests.exceptions.RequestException as ex:
        logging.info("Could not get GCP machine type: %s", ex)
        return "GCP_UNKNOWN"
def resolve_engine_data_dir(engine: str, data_dir: str | None, runner_config_path: str) -> str | None:
"""Returns the data dir to use for an engine.
Args:
engine: name of the engine, e.g. quickwit.
data_dir: Optional data directory to use for the engine. It can
be a placholder, e.g. '{qwdata}' that will then be resolved
using the config.
runner_config_path: Path to an ini config file containing a 'paths'
section mapping placeholder names (e.g. 'qwdata') to the
actual data path to use.
Returns:
The data path to use for the engine.
"""
if not data_dir:
return os.path.join(os.getcwd(), "engines", engine, "data")
if data_dir[0] != '{' or data_dir[-1] != '}':
return os.path.abspath(os.path.expanduser(data_dir))
# Placeholder that must be resolved using the config.
try:
config = read_runner_config(runner_config_path)
except ValueError as ex:
raise ValueError(
f"Path placeholder was passed ({data_dir}) but the config to resolve placeholders could not be opened: {ex}.")
try:
return config.get("paths", data_dir[1:-1])
except configparser.NoOptionError as ex:
raise ValueError(
f"Path placeholder was passed ({data_dir}) but the config "
f"({runner_config_path}) did not include a mapping for this placeholder")
def resolve_engine_config_filename(engine: str, config_filename: str | None) -> str:
if engine != "quickwit":
raise ValueError("Only quickwit is supported in resolve_engine_config_filename()")
if config_filename:
return os.path.abspath(os.path.expanduser(config_filename))
return os.path.join(os.getcwd(), "engines", engine, "configs", "quickwit.yaml")
def get_engine_env(runner_config_path: str) -> dict[str, str]:
    """Returns additional engine env variables if present in the runner cfg."""
    try:
        config = read_runner_config(runner_config_path)
    except ValueError as ex:
        # A missing runner config is a normal situation: env vars are optional.
        logging.info(f"Could not read runner config {runner_config_path}. This is not necessarily a problem. Original exception {ex}")
        return {}
    if "engine_env" not in config:
        return {}
    # Env variable names are conventionally upper-case; configparser
    # lower-cases keys, so normalize them back.
    env_vars: dict[str, str] = {}
    for key, value in config.items("engine_env"):
        env_vars[key.upper()] = value
    return env_vars
def get_docker_info(container_name: str):
    """Returns the image label and hash of a container, or {} if not found."""
    client = docker.from_env()
    try:
        container = client.containers.get(container_name)
    except docker.errors.NotFound as ex:
        logging.error("Could not fetch the docker container: %s", ex)
        return {}
    attrs = container.attrs
    return {
        "image_label": attrs["Config"]["Image"],
        "image_hash": attrs["Image"],
    }
def find_process(process_name, cmdline_component: str | None = None) -> psutil.Process | None:
    """Finds a process by name and optionally cmdline component.

    Returns None when no process matches.

    Raises:
      ValueError: if more than one process matches.
    """
    found = None
    for candidate in psutil.process_iter(['pid', 'name', 'cmdline']):
        if cmdline_component is not None and cmdline_component not in candidate.cmdline():
            continue
        if candidate.name() != process_name:
            continue
        if found is not None:
            raise ValueError(
                f'Found multiple processes with name {process_name}: {found.pid}, {candidate.pid}')
        found = candidate
    return found
@dataclass
class WatchedMetric:
    """A Prometheus metric (name + label filter) tracked by a ProcessMonitor."""
    name: str  # e.g "object_storage_fetch_requests"
    labels: dict[str, str]  # e.g. {'operation': 'GET', 'status_code': '200'}
    factor: float = 1.  # Multiplier applied to the raw sample value.

    def sample_matches(self, sample: prometheus_client.samples.Sample) -> bool:
        """True when `sample` has this metric name and all expected labels."""
        if sample.name != self.name:
            return False
        return all(
            sample.labels.get(label_name) == label_value
            for label_name, label_value in self.labels.items())
class ProcessMonitor:
    """Reports process metrics between the start and stop of the monitor.

    Example:
      monitor = ProcessMonitor(
          process_name='loki', metrics_addr='localhost:3100/metrics',
          watched_metrics={
              'object_storage_fetch_requests': WatchedMetric(
                  name='loki_gcs_request_duration_seconds_count',
                  labels={'operation': 'GET', 'status_code': '200'}),
          })
      monitor.start()
      # ...Perform a loki query...
      monitor.get_stats_since_start()
    will return a dict with the following stats since monitor.start() was called:
    - 'total_cpu_time_s' with the total CPU time (user+system) of the process
      with name 'loki'.
    - 'peak_memory_megabytes' with the peak resident memory of the
      process. Drawing conclusions from this metric should be done
      with care, e.g. the process's allocator might not have released
      previously allocated memory to the OS.
    - 'object_storage_fetch_requests' with the diff of the Prometheus metric
      'loki_gcs_request_duration_seconds_count' with labels
      {'operation': 'GET', 'status_code': '200'}.
    """
    def __init__(self, process_id=None, process_name=None, metrics_addr=None,
                 watched_metrics: dict[str, WatchedMetric] | None = None,
                 fine_grained_cpu_metrics=False):
        """Monitors the process identified by `process_id` or `process_name`.

        Exactly one of `process_id` / `process_name` must be given.

        Args:
          metrics_addr: Optional URL of a Prometheus /metrics endpoint to scrape.
          watched_metrics: Prometheus metrics to diff between start() and
            get_stats_since_start(), keyed by the stat name to report.
          fine_grained_cpu_metrics: Also report separate user/system/children
            CPU times in get_stats_since_start().
        """
        if bool(process_id) == bool(process_name):
            raise ValueError('Either process_id or process_name should be specified')
        if not process_id:
            process = find_process(process_name)
            if process is None:
                raise ValueError(
                    f"Can't monitor a process that was not found {process_name=}")
            process_id = process.pid
        self.process = psutil.Process(process_id)
        self.metrics_addr = metrics_addr
        self.watched_metrics = watched_metrics
        # Baseline values of the watched metrics, captured in start().
        self._metrics_values = {}
        # Baseline CPU times, captured in start(); also serves as the
        # "was start() called" flag.
        self._cpu_times = None
        self._reset_vm_hwm_success = True
        self._docker_client = docker.from_env()
        self._fine_grained_cpu_metrics = fine_grained_cpu_metrics

    def _read_metrics(self):
        """Scrapes `metrics_addr` and returns {stat name: scaled value}."""
        if not self.metrics_addr or not self.watched_metrics:
            return {}
        metrics = {}
        for family in prometheus_parser.text_string_to_metric_families(
                requests.get(self.metrics_addr).text):
            for sample in family.samples:
                for name, watched in self.watched_metrics.items():
                    if watched.sample_matches(sample):
                        metrics[name] = sample.value * watched.factor
        return metrics

    def _get_docker_container_id(self) -> str | None:
        """Return the container ID of the process."""
        # See man cgroups and
        # https://docs.docker.com/config/containers/runmetrics/#find-the-cgroup-for-a-given-container.
        with open(f'/proc/{self.process.pid}/cgroup', 'r') as process_cgroup:
            match = re.search(r"docker-(?P<containerid1>.*)\.scope|docker/(?P<containerid2>.*)",
                              process_cgroup.read().split(":")[-1])
            if not match:
                # Cgroup path does not look like docker's: not containerized.
                return None
            return match.group("containerid1") or match.group("containerid2")

    def _reset_vm_hwm(self) -> bool:
        """Reset VmHWM for the process in /proc. See `man proc` for details.

        Returns True on success; on failure the peak-memory stat is skipped.
        """
        # /proc/$PID/clear_refs is only writeable by the owner of the
        # process, which in case of an engine running in a docker
        # container is typically...root.
        # In that case, we run the commands inside the docker
        # container. This is annoying as we also need to translate
        # between between the PID namespace of the host and of the
        # docker container, and because available commands inside a
        # container are typically limited.
        container_id = self._get_docker_container_id()
        if container_id:
            container = self._docker_client.containers.get(container_id)
            # This will return a \n separated list of /proc/PID/status
            # files for the processes matching self.process.name()
            # inside the container.
            grep_result = container.exec_run(
                # We don't use pgrep or fancier tools, as they are
                # often not available in a docker image.
                ["sh", "-c", r"grep -l -s -e '^Name:\s*" + self.process.name() + r"$' /proc/*/status"])
            matching_processes_status = grep_result.output.decode("ascii").strip().split("\n")
            if grep_result.exit_code != 0 or not matching_processes_status:
                logging.error("Failed to get processes with name %s inside container %s",
                              self.process.name(), container_id)
                return False
            if len(matching_processes_status) > 1:
                # Typically java, as the memory usage is not very
                # representative because of xms, xmx, we don't
                # disambiguate using the command line.
                logging.error(
                    ("Found multiple processes with name name '%s' inside container '%s'."
                     "Cannot reset VmHWM and won't report peak memory usage"),
                    self.process.name(), container_id)
                return False
            # Finally, reset VmHWM inside the container.
            clear_refs_result = container.exec_run(
                ["sh", "-c", "echo 5 > " + matching_processes_status[0].replace("/status", "/clear_refs")])
            if clear_refs_result.exit_code != 0:
                logging.error("Failed to reset VmHWM of process with name name %s inside container %s",
                              self.process.name(), container_id)
                return False
            return True
        else:  # Not running in docker.
            # Writing "5" to clear_refs resets the peak RSS counter (VmHWM).
            with open(f'/proc/{self.process.pid}/clear_refs', 'w') as clear_refs:
                clear_refs.write("5\n")
            return True

    def _get_vm_hwm_megabytes(self) -> float | None:
        """Read /proc/pid/status to get the VmHWM (see man proc)."""
        with open(f'/proc/{self.process.pid}/status', 'r') as status:
            match = re.search(r"VmHWM:\s*(?P<size>\d*)\s*kB", status.read())
            if match:
                # /proc reports kB; convert to megabytes.
                return int(match.group("size")) / 1024
            return None

    def start(self):
        """Captures baselines (CPU, VmHWM, metrics). Returns self for chaining."""
        self._reset_vm_hwm_success = self._reset_vm_hwm()
        self._cpu_times = self.process.cpu_times()
        self._metrics_values = self._read_metrics()
        return self

    def get_stats_since_start(self) -> dict[str, float]:
        """Returns the stats accumulated since start() (see class docstring)."""
        if self._cpu_times is None:
            raise ValueError(f"{self} was not started.")
        cpu_times = self.process.cpu_times()
        def total_cpu_time(cpu_times):
            return (cpu_times.user + cpu_times.system +
                    cpu_times.children_user + cpu_times.children_system)
        stats = {
            # max(..., 0) guards against small negative diffs.
            'total_cpu_time_s': max(
                total_cpu_time(cpu_times) - total_cpu_time(self._cpu_times),
                0)
        }
        if self._fine_grained_cpu_metrics:
            stats.update({
                'user_cpu_time_s': max(cpu_times.user - self._cpu_times.user, 0),
                'system_cpu_time_s': max(cpu_times.system - self._cpu_times.system, 0),
                'children_user_cpu_time_s': max(cpu_times.children_user - self._cpu_times.children_user, 0),
                'children_system_cpu_time_s': max(cpu_times.children_system - self._cpu_times.children_system, 0),
            })
        vm_hwm = self._get_vm_hwm_megabytes()
        if self._reset_vm_hwm_success and vm_hwm is not None:
            stats['peak_memory_megabytes'] = vm_hwm
        # NOTE(review): this raises KeyError if a watched metric was absent
        # from the first scrape (e.g. a counter not yet initialized) but
        # present now — consider self._metrics_values.get(name, 0).
        for name, new_v in self._read_metrics().items():
            stats[name] = new_v - self._metrics_values[name]
        return stats

    def __repr__(self) -> str:
        return f"ProcessMonitor({self.process=}, {self.metrics_addr=}, {self.watched_metrics=})"
class Query(object):
    """A named benchmark query."""

    def __init__(self, name, query):
        # Human-readable identifier used in benchmark reports.
        self.name = name
        # Engine-specific query payload.
        self.query = query
@dataclass
class IndexInfo:
    """Index metadata as reported by an engine."""
    # Raw index description returned by the engine's API.
    engine_index_info: dict
    # Engine-assigned unique index ID, when the engine exposes one.
    index_uid: str | None
class SearchClient(ABC):
    """Interface implemented by every benchmarked search engine client."""
    @abstractmethod
    def query(self, index: str, query):
        """Runs `query` against `index` and returns a dict of result stats."""
        raise NotImplementedError
    @abstractmethod
    def engine_info(self) -> dict[str, Any]:
        """Returns engine version/build information."""
        raise NotImplementedError
    @abstractmethod
    def commit_hash(self) -> str | None:
        """Returns the engine build's commit hash, if it can be determined."""
        raise NotImplementedError
    @property
    @abstractmethod
    def docker_container_name(self) -> str:
        """The name of the docker container running this engine."""
        raise NotImplementedError
    @abstractmethod
    def create_started_monitor(self) -> ProcessMonitor:
        """Creates a ProcessMonitor and starts it."""
        raise NotImplementedError
    @abstractmethod
    def index_info(self, index_name: str) -> IndexInfo | None:
        """Returns info about `index_name`, or None if it does not exist."""
        raise NotImplementedError
def get_reference_run(
        bench_service_client: BenchmarkServiceClient,
        github_client: GithubClient,
        github_pr: str | int,
        current_run_info: schemas.RunInfo,
        reference_branch: str = "main",
        reference_tag: str = "push_main") -> schemas.SearchRun | None:
    """Find a reference benchmark run to compare `current_run` to.

    This will typically be the appropriate run with tag
    `reference_tag` that ran on an engine built at the reference commit
    which is the most recent common ancestor between
    `reference_branch` and the HEAD of the pull request `github_pr`.
    However, that run might not be available (e.g. the github workflow
    does not run on every commit, or the run might have failed). In
    that case, we get the most recent commits from github, and find
    the most recent runs on those commits.

    Args:
      bench_service_client:
      github_client:
      github_pr: Pull request ID of this benchmark run.
      current_run_info: Info of the current benchmark run.
      reference_branch: The branch to which we want a comparison.
      reference_tag: We'll look for this tag in the reference run.

    Returns:
      The reference run, or None when no suitable run could be found.
    """
    pr_head = github_client.get_pull_request_head(github_pr)
    logging.debug("Github PR HEAD: %s", pr_head)
    if not pr_head:
        return
    reference_commit = github_client.get_merge_base(pr_head, reference_branch)
    logging.debug("Reference commit: %s", reference_commit)
    if not reference_commit:
        return
    # The reference run must match the current run's parameters so the
    # comparison is apples-to-apples.
    base_run_filter = {
        "run_type": current_run_info.run_type,
        "track": current_run_info.track,
        "engine": current_run_info.engine,
        "storage": current_run_info.storage,
        "instance": current_run_info.instance,
        "tag": reference_tag,
        # Only use runs triggered by github workflow as reference.
        "source": "github_workflow",
    }
    # Simple case: we have a bench run on the exact base commit.
    previous_run_info = bench_service_client.list_runs(
        schemas.ListRunsRequest.model_validate(
            base_run_filter | {"commit_hash_list": [reference_commit]}))
    if previous_run_info:
        logging.debug("Found directly a ref run on ref commit: %s",
                      previous_run_info[0])
        return bench_service_client.get_run(previous_run_info[0].id)
    # We get recent commits and try to find the most recent runs on
    # those commits.
    previous_ref_commits = github_client.get_commits(reference_commit)
    if previous_ref_commits is None:
        return
    run_filter = base_run_filter | {
        "commit_hash_list": previous_ref_commits,
    }
    previous_run_infos = bench_service_client.list_runs(
        schemas.ListRunsRequest.model_validate(run_filter))
    logging.debug("Ref run candidates: %s, filter: %s",
                  previous_run_infos, run_filter)
    if not previous_run_infos:
        return
    # Prefer the run on the most recent commit: commits are assumed
    # newest-first, so lower enumeration index = higher priority.
    ref_commits_to_prio = {commit: prio for prio, commit in enumerate(previous_ref_commits)}
    reference_run_info = min(
        previous_run_infos,
        key=lambda run_info: ref_commits_to_prio.get(run_info.commit_hash, 1e9))
    return bench_service_client.get_run(reference_run_info.id)
@dataclass
class RunComparison:
    """Outcome of compare_runs(): either a perf ratio or an error message."""
    # Geometric mean of per-query latency ratios; None when comparison failed.
    search_latency_ratio: float | None = None
    # Human-readable reason why the comparison could not be made.
    error_msg: str | None = None
def compare_runs(reference_run: schemas.SearchRun | None,
                 current_run: schemas.SearchRun | None) -> RunComparison:
    """Compare two runs and returns a perf ratio.

    Returns:
      Geometric average of Search Latency ratios. This reproduces the
      formula in web/src/index.js (generateDataView()) to compute a
      ratio metric between the performance of two engines.
      Re-implementing this logic here is unfortunately needed to
      report this metric in a github workflow (and we are not going to
      execute js from py just for this function...).
    """
    if reference_run is None or current_run is None:
        return RunComparison(error_msg="No reference run found")
    def median_durations(run) -> dict[str, float]:
        # Maps query name -> median engine duration for one run.
        medians = {}
        for query in run.run_results.queries:
            # TODO: We could report additional metrics than just engine_duration.
            if query.engine_duration.values:
                medians[query.name] = statistics.median(query.engine_duration.values)
        return medians
    ref_medians = median_durations(reference_run)
    cur_medians = median_durations(current_run)
    ref_query_names = set(ref_medians)
    current_query_names = set(cur_medians)
    if ref_query_names != current_query_names:
        return RunComparison(error_msg=(
            f"Not the same queries, cannot compare, "
            f"difference: {ref_query_names ^ current_query_names}"))
    if not ref_query_names:
        return RunComparison(error_msg="No queries to compare, cannot compare runs")
    sum_log_ratios = 0
    for query_name in ref_query_names:
        # Microseconds -> milliseconds
        ref_ms = ref_medians[query_name] / 1000
        current_ms = cur_medians[query_name] / 1000
        if abs(ref_ms - current_ms) >= 3:
            ratio = (current_ms + 10) / (ref_ms + 10)
        else:
            # Differences below 3ms are treated as noise.
            ratio = 1
        sum_log_ratios += math.log(ratio)
    return RunComparison(
        search_latency_ratio=math.exp(sum_log_ratios / len(ref_query_names)))
def export_results(bench_service_client: BenchmarkServiceClient,
                   args: argparse.Namespace,
                   results: dict[str, Any],
                   results_type: str,
                   exporter_token: str | None,
                   url_file: str | None = None):
    """Exports bench results to the benchmark service.

    If this is a run triggered by a pull request (typically as part of
    a Github workflow), this also performs a comparison to an
    appropriate baseline, and stores the comparison link and message
    to `url_file` (typically set to $GITHUB_OUTPUT in a github workflow).
    """
    # Copy before .pop() below so the caller's dict is not mutated.
    results = results.copy()
    info_fields = {
        'track', 'engine', 'storage', 'instance', 'tag', 'unsafe_user',
        'source', 'commit_hash', 'index_uid',
        'github_pr',
        'github_workflow_user',
        'github_workflow_run_id',
    }
    # Split `results` into run info (metadata) and run results (measurements).
    run_info = {k: results.pop(k) for k in info_fields}
    run_results = results
    # See CreateIndexingRunRequest / CreateSearchRunResults in service/schemas.py.
    request = {
        'run': {
            'run_info': run_info,
            'run_results': run_results,
        }
    }
    # NOTE: `run_info` is rebound here from the request dict to the
    # service's response object.
    run_info = bench_service_client.export_run(request, results_type, exporter_token)
    if not run_info:
        logging.error("Failed to export run")
        return
    run_id = run_info.id
    run_url = bench_service_client.build_url_for_run_ids([run_id])
    # Honor the NO_COLOR convention; otherwise print the URL in green.
    color = '' if os.environ.get("NO_COLOR") else '\033[92m'
    logging.info(f'Exported results to {bench_service_client.endpoint}: {run_info}\n{color}Results can be seen at address: {run_url} \033[0m')
    ref_run = None
    comparison = None
    if (results_type == "search" and args.github_pr is not None and
            args.comparison_reference_tag is not None and
            args.comparison_reference_branch is not None):
        # Compare against a reference.
        ref_run = get_reference_run(
            bench_service_client,
            GithubClient(github_owner=args.github_owner,
                         github_repo=args.github_repo),
            github_pr=args.github_pr,
            current_run_info=run_info,
            reference_branch=args.comparison_reference_branch,
            reference_tag=args.comparison_reference_tag)
        comparison = compare_runs(ref_run, bench_service_client.get_run(run_id))
        # Link both runs when a reference exists, otherwise just this run.
        comparison_url = (bench_service_client.build_url_for_run_ids([run_id, ref_run.run_info.id])
                          if ref_run else run_url)
        comparison_text = None
        if not ref_run:
            comparison_text = "Reference bench run not found"
        elif comparison.error_msg:
            comparison_text = "ERROR: " + comparison.error_msg
        else:
            comparison_text = (
                f"Average search latency is {comparison.search_latency_ratio:.3}x that "
                f"of the reference (lower is better).<br/>Ref run id: {ref_run.run_info.id}, "
                f"ref commit: {ref_run.run_info.commit_hash}<br/>"
                f"[Link]({comparison_url})")
        export_to_url_file = {
            "url": run_url,
            "comparison_url": comparison_url,
            "comparison_text": comparison_text,
        }
        logging.info("Comparison results:\n%s", pprint.pformat(export_to_url_file))
        # This will typically be $GITHUB_OUTPUT for easily getting the URL
        # from a github workflow.
        if url_file:
            with open(url_file, 'a') as out:
                for k, v in export_to_url_file.items():
                    out.write(f"{k}={v}\n")
def get_common_debug_info(engine_client: SearchClient, index_name: str):
    """Collects run metadata shared by indexing and search benchmarks."""
    index_info: IndexInfo | None = engine_client.index_info(index_name)
    debug_info = {
        "command_line": ' '.join(sys.argv),
        "unsafe_user": getpass.getuser(),
        "engine_info": engine_client.engine_info(),
        "commit_hash": engine_client.commit_hash(),
        "docker_info": get_docker_info(engine_client.docker_container_name),
        "platform_uname": ' '.join(platform.uname()),
    }
    # The index may not exist (yet); report None fields in that case.
    if index_info is None:
        debug_info["index_info"] = None
        debug_info["index_uid"] = None
    else:
        debug_info["index_info"] = index_info.engine_index_info
        debug_info["index_uid"] = index_info.index_uid
    return debug_info
class ElasticClient(SearchClient):
    """Search client for the Elasticsearch (and OpenSearch) REST API."""
    def __init__(self, endpoint="http://127.0.0.1:9200", no_hits=False,
                 docker_container_name: str = "elasticsearch"):
        # When True, queries are sent with size=0: hits are counted but
        # their documents are not returned.
        self.no_hits = no_hits
        self.root_api = endpoint
        self._docker_container_name = docker_container_name
    def create_index(self, index, config_json: str):
        """Creates `index` with the given JSON settings/mappings."""
        response = requests.put(f"{self.root_api}/{index}", data=config_json, headers={"Content-Type": "application/json"})
        if response.status_code != 200:
            raise Exception("Error while creating index", response.text)
        return response.json()
    def delete_index(self, index: str):
        """Deletes `index`; raises on any non-200 response."""
        response = requests.delete(f"{self.root_api}/{index}")
        if response.status_code != 200:
            raise Exception("Error while deleting index", response.text)
        return response.json()
    def check_index_exists(self, index: str):
        """Returns whether `index` exists; raises on unexpected statuses."""
        response = requests.get(f"{self.root_api}/{index}")
        if response.status_code == 404:
            return False
        if response.status_code != 200:
            raise Exception("Error while checking index", response.text)
        return True
    def create_started_monitor(self) -> ProcessMonitor:
        """Creates a started ProcessMonitor on the ES/OpenSearch JVM process."""
        # Both engines run as a `java` process; disambiguate from other JVMs
        # by their bootstrap class on the command line.
        process = find_process(
            "java",
            cmdline_component=(
                "org.opensearch.bootstrap.OpenSearch"
                if self._docker_container_name == "opensearch-node"
                else "org.elasticsearch.server/org.elasticsearch.bootstrap.Elasticsearch"))
        if process is None:
            raise ValueError(
                f"Can't monitor a process that was not found for {self._docker_container_name=}")
        return ProcessMonitor(process_id=process.pid).start()
    def query(self, index: str, query, extra_url_component=None):
        """Runs a _search request and returns result stats + monitor stats.

        On request failure, returns a stats dict with elapsed_time_micros=-1
        and the error details instead of raising.
        """
        if self.no_hits:
            query["size"] = 0
        url = self.root_api
        if extra_url_component:
            url += '/' + extra_url_component
        monitor = self.create_started_monitor()
        try:
            response = requests.post(f"{url}/{index}/_search", json=query)
            response.raise_for_status()
        except requests.exceptions.RequestException as ex:
            print("Error while querying", query, ex)
            return {
                "num_hits": 0,
                "elapsed_time_micros": -1,
                # Bug fix: Response.__bool__ is `self.ok`, which is False for
                # the 4xx/5xx responses raise_for_status() raises on, so
                # `if ex.response` always reported -1. Compare against None
                # instead to report the real status code.
                "response_status_code": ex.response.status_code if ex.response is not None else -1,
                "response": str(ex),
            }
        monitor_stats = monitor.get_stats_since_start()
        data = response.json()
        return {
            # "total" is absent when track_total_hits is disabled.
            "num_hits": data["hits"]["total"]["value"] if "total" in data["hits"] else 0,
            # "took" is in milliseconds.
            "elapsed_time_micros": data["took"] * 1000
        } | monitor_stats
    def engine_info(self):
        """Returns the engine's root endpoint info (version, build, ...)."""
        response = requests.get(f"{self.root_api}/")
        if response.status_code != 200:
            raise Exception(
                f"Error while checking basic info {response.status_code=} {response.text=}")
        return response.json()
    def commit_hash(self) -> str | None:
        return self.engine_info().get("version", {}).get("build_hash")
    @property
    def docker_container_name(self) -> str:
        """The name of the docker container running this engine."""
        return self._docker_container_name
    def index_info(self, index_name: str) -> IndexInfo | None:
        """Returns the index settings/mappings and UUID, or None if missing."""
        response = requests.get(f"{self.root_api}/{index_name}")
        if response.status_code != 200:
            return None
        from_engine = response.json().get(index_name)
        return IndexInfo(engine_index_info=from_engine,
                         index_uid=from_engine.get("settings", {}).get("index", {}).get("uuid"))
class QuickwitClient(ElasticClient):
    """Search client for Quickwit, reusing the ES-compatible query path."""
    def __init__(self, endpoint="http://127.0.0.1:7280/api/v1", no_hits=False):
        super().__init__(endpoint=endpoint, no_hits=no_hits)
    def create_index(self, index: str, config_yaml: str):
        """Creates an index from a YAML index config."""
        response = requests.post(f"{self.root_api}/indexes", data=config_yaml, headers={"Content-Type": "application/yaml"})
        if response.status_code != 200:
            raise Exception("Error while creating index", response.text)
        return response.json()
    def delete_index(self, index: str):
        """Deletes `index`; raises on any non-200 response."""
        response = requests.delete(f"{self.root_api}/indexes/{index}")
        if response.status_code != 200:
            raise Exception("Error while deleting index", response.text)
        return response.json()
    def check_index_exists(self, index: str):
        """Returns whether `index` exists; raises on unexpected statuses."""
        response = requests.get(f"{self.root_api}/indexes/{index}")
        if response.status_code == 404:
            return False
        if response.status_code != 200:
            raise Exception("Error while checking index", response.text)
        return True
    def index_info(self, index_name: str) -> IndexInfo | None:
        """Returns the index metadata and index_uid, or None if missing."""
        response = requests.get(f"{self.root_api}/indexes/{index_name}")
        if response.status_code != 200:
            return None
        engine_index_info = response.json()
        if not engine_index_info:
            return None
        return IndexInfo(engine_index_info=engine_index_info,
                         index_uid=engine_index_info.get("index_uid"))
    def create_started_monitor(self) -> ProcessMonitor:
        """Starts a monitor on the quickwit process + object storage metrics."""
        # TODO: Improve hack.
        metrics_url = self.root_api.removesuffix('/api/v1') + '/metrics'
        return ProcessMonitor(
            process_name='quickwit',
            metrics_addr=metrics_url,
            watched_metrics={
                'object_storage_fetch_requests': WatchedMetric(
                    name='quickwit_storage_object_storage_gets_total',
                    labels={}),
                'object_storage_put_requests': WatchedMetric(
                    name='quickwit_storage_object_storage_puts_total',
                    labels={}),
                'object_storage_download_megabytes': WatchedMetric(
                    name='quickwit_storage_object_storage_download_num_bytes_total',
                    labels={},
                    # bytes to megabytes.
                    factor=1. / (2 ** 20)),
                'object_storage_upload_megabytes': WatchedMetric(
                    name='quickwit_storage_object_storage_upload_num_bytes_total',
                    labels={},
                    # bytes to megabytes.
                    factor=1. / (2 ** 20)),
            }).start()
    def query(self, index: str, query):
        # NOTE(review): super().query() also calls create_started_monitor()
        # (overridden above), so two monitors run per query and the outer
        # monitor's stats overwrite the inner ones on `|` merge. Presumably
        # intentional (outer spans slightly more work) — confirm.
        monitor = self.create_started_monitor()
        results = super().query(index, query, extra_url_component='_elastic')
        monitor_stats = monitor.get_stats_since_start()
        return results | monitor_stats
    def engine_info(self):
        """Returns quickwit version/build info from the /version endpoint."""
        response = requests.get(f"{self.root_api}/version")
        if response.status_code != 200:
            raise Exception(
                f"Error while checking basic info {response.status_code=} {response.text=}")
        return response.json()
    def commit_hash(self) -> str | None:
        return self.engine_info().get("build", {}).get("commit_hash")
    @property
    def docker_container_name(self) -> str:
        """The name of the docker container running this engine."""
        return "quickwit"
class LokiClient(SearchClient):
    """Search client for Loki's query_range API."""
    def __init__(self, endpoint="http://127.0.0.1:3100", no_hits=False) -> None:
        self.no_hits = no_hits
        self.root_api = endpoint
    def create_index(self, index, config_json: str):
        # Loki has no index-creation step; ingestion defines the streams.
        return
    def delete_index(self, index: str):
        """Requests deletion of all benchmark-labeled data."""
        response = requests.post(f'{self.root_api}/loki/api/v1/delete?query={{label="benchmark"}}&start=2000-01-08T22:15:32.000Z', data={}, headers={"Content-Type": "application/json"})
        if response.status_code != 200:
            raise Exception("Error while deleting index", response.text)
        return response.json()
    def check_index_exists(self, index: str):
        """Asks the operator to confirm manual data deletion; always False."""
        # Deletion through the API is not practical, so the operator must
        # wipe Loki's data directory by hand before an indexing run.
        answer = input(f"You have to delete the data in loki yourself (e.g. delete data folder) because loki has a unusable deletion API, confirm (y/n)")
        if answer.lower() in ["n","no"]:
            raise Exception("Need to confirm")
        return False
    def create_started_monitor(self) -> ProcessMonitor:
        """Starts a monitor on the loki process + GCS request metrics."""
        return ProcessMonitor(
            process_name='loki', metrics_addr=f'{self.root_api}/metrics',
            watched_metrics={
                'object_storage_fetch_requests': WatchedMetric(
                    name='loki_gcs_request_duration_seconds_count',
                    labels={'operation': 'GET', 'status_code': '200'}),
                'object_storage_put_requests': WatchedMetric(
                    name='loki_gcs_request_duration_seconds_count',
                    labels={'operation': 'PUT', 'status_code': '200'}),
            }).start()
    def query(self, index: str, query):
        """Runs a query_range request and returns result stats + monitor stats."""
        del index  # Loki does not have the concept of an index.
        # Sanity check.
        if 'query' not in query:
            raise ValueError(f'Expected the json query to have a "query" field. Got {query}')
        monitor = self.create_started_monitor()
        response = requests.get(f"{self.root_api}/loki/api/v1/query_range", params=query)
        monitor_stats = monitor.get_stats_since_start()
        if response.status_code != 200:
            print("Error while querying", query, response.text)
            return {
                "num_hits": 0,
                "elapsed_time_micros": -1,
            }
        data = response.json()
        # For reference, data["data"]["stats"]["summary"] contains:
        # "bytesProcessedPerSecond": 75177670,
        # "linesProcessedPerSecond": 91689,
        # "totalBytesProcessed": 909287,
        # "totalLinesProcessed": 1109,
        # "execTime": 0.012095,
        # "queueTime": 0.0008,
        # "subqueries": 0,
        # "totalEntriesReturned": 2,
        # "splits": 5,
        # "shards": 0,
        # "totalPostFilterLines": 1109,
        # "totalStructuredMetadataBytesProcessed": 0
        return {
            # Not really the number of hits, but the best we have.
            "num_hits": data["data"]["stats"]["summary"]["totalEntriesReturned"],
            # "execTime" is in seconds.
            "elapsed_time_micros": data["data"]["stats"]["summary"]["execTime"] * 1000_000,
        } | monitor_stats
    def engine_info(self):
        """Returns Loki build info from the buildinfo endpoint."""
        response = requests.get(f"{self.root_api}/loki/api/v1/status/buildinfo")
        if response.status_code != 200:
            raise Exception(
                f"Error while checking basic info {response.status_code=} {response.text=}")
        return response.json()
    def commit_hash(self) -> str | None:
        return self.engine_info().get("revision")
    @property
    def docker_container_name(self) -> str:
        """The name of the docker container running this engine."""
        return "loki"
    def index_info(self, index_name: str) -> IndexInfo | None:
        del index_name  # unused
        # Loki does not have the concept of an index.
        return None
class QuickwitDatafusionClient(SearchClient):
    """Client for the Datafusion SQL API of Quickwit.

    Index management is not supported through this API; only querying.
    """

    def __init__(self, endpoint="http://127.0.0.1:7289/api/v1",
                 no_hits=False) -> None:
        self.no_hits = no_hits
        self.endpoint = endpoint

    def create_index(self, index, config_json: str):
        del index
        del config_json
        raise Exception("Not supported")

    def delete_index(self, index: str):
        del index
        raise Exception("Not supported")

    def check_index_exists(self, index: str):
        del index
        raise Exception("Not supported")

    def create_started_monitor(self) -> ProcessMonitor:
        return ProcessMonitor(process_name='datafusion-quickwit').start()

    def query(self, index: str, query):
        """Run a SQL query and return hit count, duration and monitor stats.

        NOTE: mutates `query` in place (forces "only_count" and "row_limit").
        """
        monitor = self.create_started_monitor()
        start = time.monotonic()
        query["only_count"] = True
        query["row_limit"] = 1_000_000_000
        try:
            response = requests.post(f"{self.endpoint}/run_sql_query", json=query)
            response.raise_for_status()
        except requests.exceptions.RequestException as ex:
            print("Error while querying", query, ex)
            return {
                "num_hits": 0,
                "elapsed_time_micros": -1,
                # Compare against None explicitly: a `requests.Response` is
                # falsy for 4xx/5xx statuses, so the previous `if ex.response`
                # misreported -1 even when an error response was received.
                "response_status_code": ex.response.status_code if ex.response is not None else -1,
                "response": str(ex),
            }
        monitor_stats = monitor.get_stats_since_start()
        duration = int((time.monotonic() - start) * 1e6)
        data = response.json()
        return {
            # Not really the number of "hits", but the best we have.
            "num_hits": data["num_rows"],
            # For now, quickwit datafusion does not report the engine duration, use the best we can.
            "elapsed_time_micros": duration,
        } | monitor_stats

    def engine_info(self):
        # The Datafusion API exposes no version/build endpoint.
        return {}

    def commit_hash(self) -> str | None:
        return None

    @property
    def docker_container_name(self) -> str:
        """The name of the docker container running this engine."""
        return "datafusion-quickwit"

    def index_info(self, index_name: str) -> IndexInfo | None:
        del index_name  # unused
        return None
def drive(index: str, queries: list[Query], client: SearchClient) -> Generator[dict[str, Any], None, None]:
    """Run each query against `client`, retrying failures, and yield results.

    Each yielded dict is the engine's result augmented with the `Query`
    object and the client-side wall-clock `duration` in microseconds.
    Queries failing with a "not ready" response (per
    RETRY_ON_FAILED_RESPONSE_SUBSTR) are retried indefinitely; other
    failures are retried up to NUM_QUERY_RETRIES times.
    """
    for query in queries:
        tries = 0
        while True:
            start = time.monotonic()
            result = client.query(index, query.query)
            tries += 1
            stop = time.monotonic()
            if result.get("response_status_code", 200) == 200:
                # Success, no need to retry.
                break
            # Failure: decide whether to retry.
            # Use a generator (not a list) with any() — no throwaway list.
            if any(sub in result.get("response", "")
                   for sub in RETRY_ON_FAILED_RESPONSE_SUBSTR):
                logging.info(
                    "Retrying query %s because the engine does not "
                    "seem ready to take requests.",
                    query.name)
                continue
            if tries <= NUM_QUERY_RETRIES:
                logging.info("Retrying query %s (try %d of %d)",
                             query.name, tries, NUM_QUERY_RETRIES + 1)
                continue
            logging.info("Not retrying failed query %s", query.name)
            break
        # This could move under the ProcessMonitor and we could get rid of this drive() function.
        duration = int((stop - start) * 1e6)
        yield result | {'query': query, 'duration': duration}
def read_queries(queries_dir: str, query_filter: str) -> Generator[Query, None, None]:
    """Yield a `Query` for each "<name>.json" file in `queries_dir`.

    Files are visited in sorted filename order. `query_filter` is an
    fnmatch-style glob applied to the basename without its extension.

    Raises:
        ValueError: if a matching file cannot be read or parsed as JSON.
    """
    query_files = sorted(glob(f"{queries_dir}/*.json"))
    for q_filepath in query_files:
        query_name, _ = os.path.splitext(os.path.basename(q_filepath))
        if not fnmatch.fnmatch(query_name, query_filter):
            continue
        try:
            # `with` ensures the file handle is closed (the previous
            # `json.load(open(...))` leaked it).
            with open(q_filepath) as query_file:
                query_json = json.load(query_file)
        except Exception as ex:
            # Chain the original exception for a full traceback.
            raise ValueError(f'Error with query in path {q_filepath}: {ex}') from ex
        yield Query(query_name, query_json)
def get_engine_client(engine: str, no_hits: bool) -> SearchClient:
if engine == "quickwit":
return QuickwitClient(no_hits=no_hits)
elif engine == "quickwit-datafusion":
return QuickwitDatafusionClient(no_hits=no_hits)
elif engine == "loki":
return LokiClient(endpoint="http://127.0.0.1:3100", no_hits=no_hits)
elif engine == "opensearch":
return ElasticClient(endpoint="http://127.0.0.1:9301", no_hits=no_hits,
docker_container_name="opensearch-node")
elif engine == "elasticsearch":
return ElasticClient(endpoint="http://127.0.0.1:9200", no_hits=no_hits)
else:
raise ValueError(f"Unknown engine {engine}")