-
Notifications
You must be signed in to change notification settings - Fork 453
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Align all parquet BEIR yaml configs with jsonl versions (#2612)
{flat, hnsw} x {fp32, int8}: aligned scores, tolerances, etc.
- Loading branch information
Showing
232 changed files
with
11,977 additions
and
4,408 deletions.
There are no files selected for viewing
60 changes: 60 additions & 0 deletions
60
...n/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.flat-int8.cached.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
--- | ||
corpus: beir-v1.0.0-arguana.bge-base-en-v1.5 | ||
corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/arguana.parquet | ||
|
||
index_path: indexes/lucene-flat-int8.beir-v1.0.0-arguana.bge-base-en-v1.5/ | ||
index_type: flat | ||
collection_class: ParquetDenseVectorCollection | ||
generator_class: ParquetDenseVectorDocumentGenerator | ||
index_threads: 16 | ||
index_options: -quantize.int8 | ||
|
||
metrics: | ||
- metric: nDCG@10 | ||
command: bin/trec_eval | ||
params: -c -m ndcg_cut.10 | ||
separator: "\t" | ||
parse_index: 2 | ||
metric_precision: 4 | ||
can_combine: false | ||
- metric: R@100 | ||
command: bin/trec_eval | ||
params: -c -m recall.100 | ||
separator: "\t" | ||
parse_index: 2 | ||
metric_precision: 4 | ||
can_combine: false | ||
- metric: R@1000 | ||
command: bin/trec_eval | ||
params: -c -m recall.1000 | ||
separator: "\t" | ||
parse_index: 2 | ||
metric_precision: 4 | ||
can_combine: false | ||
|
||
topic_reader: JsonStringVector | ||
topics: | ||
- name: "BEIR (v1.0.0): ArguAna" | ||
id: test | ||
path: topics.beir-v1.0.0-arguana.test.bge-base-en-v1.5.jsonl.gz | ||
qrel: qrels.beir-v1.0.0-arguana.test.txt | ||
|
||
models: | ||
- name: bge-flat-int8-cached | ||
display: BGE-base-en-v1.5 | ||
type: flat | ||
params: -hits 1000 -removeQuery -threads 16 | ||
results: | ||
nDCG@10: | ||
- 0.6361 | ||
R@100: | ||
- 0.9915 | ||
R@1000: | ||
- 0.9964 | ||
tolerance: | ||
nDCG@10: | ||
- 0.001 | ||
R@100: | ||
- 0.001 | ||
R@1000: | ||
- 0.001 |
60 changes: 60 additions & 0 deletions
60
...ain/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.flat-int8.onnx.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
--- | ||
corpus: beir-v1.0.0-arguana.bge-base-en-v1.5 | ||
corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/arguana.parquet | ||
|
||
index_path: indexes/lucene-flat-int8.beir-v1.0.0-arguana.bge-base-en-v1.5/ | ||
index_type: flat | ||
collection_class: ParquetDenseVectorCollection | ||
generator_class: ParquetDenseVectorDocumentGenerator | ||
index_threads: 16 | ||
index_options: -quantize.int8 | ||
|
||
metrics: | ||
- metric: nDCG@10 | ||
command: bin/trec_eval | ||
params: -c -m ndcg_cut.10 | ||
separator: "\t" | ||
parse_index: 2 | ||
metric_precision: 4 | ||
can_combine: false | ||
- metric: R@100 | ||
command: bin/trec_eval | ||
params: -c -m recall.100 | ||
separator: "\t" | ||
parse_index: 2 | ||
metric_precision: 4 | ||
can_combine: false | ||
- metric: R@1000 | ||
command: bin/trec_eval | ||
params: -c -m recall.1000 | ||
separator: "\t" | ||
parse_index: 2 | ||
metric_precision: 4 | ||
can_combine: false | ||
|
||
topic_reader: TsvString | ||
topics: | ||
- name: "BEIR (v1.0.0): ArguAna" | ||
id: test | ||
path: topics.beir-v1.0.0-arguana.test.tsv.gz | ||
qrel: qrels.beir-v1.0.0-arguana.test.txt | ||
|
||
models: | ||
- name: bge-flat-int8-onnx | ||
display: BGE-base-en-v1.5 | ||
type: flat | ||
params: -encoder BgeBaseEn15 -hits 1000 -removeQuery -threads 16 | ||
results: | ||
nDCG@10: | ||
- 0.6361 | ||
R@100: | ||
- 0.9915 | ||
R@1000: | ||
- 0.9964 | ||
tolerance: | ||
nDCG@10: | ||
- 0.02 | ||
R@100: | ||
- 0.03 | ||
R@1000: | ||
- 0.004 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
83 changes: 45 additions & 38 deletions
83
src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.flat.onnx.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,53 +1,60 @@ | ||
--- | ||
corpus: beir-v1.0.0-arguana.bge-base-en-v1.5 | ||
corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/arguana.parquet | ||
|
||
index_path: indexes/parquet/arguana | ||
index_path: indexes/lucene-flat.beir-v1.0.0-arguana.bge-base-en-v1.5/ | ||
index_type: flat | ||
collection_class: ParquetDenseVectorCollection | ||
generator_class: ParquetDenseVectorDocumentGenerator | ||
index_threads: 16 | ||
index_options: "" | ||
|
||
metrics: | ||
- metric: nDCG@10 | ||
command: bin/trec_eval | ||
params: -c -m ndcg_cut.10 | ||
separator: "\t" | ||
parse_index: 2 | ||
metric_precision: 4 | ||
can_combine: false | ||
- metric: R@100 | ||
command: bin/trec_eval | ||
params: -c -m recall.100 | ||
separator: "\t" | ||
parse_index: 2 | ||
metric_precision: 4 | ||
can_combine: false | ||
- metric: R@1000 | ||
command: bin/trec_eval | ||
params: -c -m recall.1000 | ||
separator: "\t" | ||
parse_index: 2 | ||
metric_precision: 4 | ||
can_combine: false | ||
- metric: nDCG@10 | ||
command: bin/trec_eval | ||
params: -c -m ndcg_cut.10 | ||
separator: "\t" | ||
parse_index: 2 | ||
metric_precision: 4 | ||
can_combine: false | ||
- metric: R@100 | ||
command: bin/trec_eval | ||
params: -c -m recall.100 | ||
separator: "\t" | ||
parse_index: 2 | ||
metric_precision: 4 | ||
can_combine: false | ||
- metric: R@1000 | ||
command: bin/trec_eval | ||
params: -c -m recall.1000 | ||
separator: "\t" | ||
parse_index: 2 | ||
metric_precision: 4 | ||
can_combine: false | ||
|
||
topic_reader: TsvString | ||
topics: | ||
- name: "BEIR (v1.0.0): ArguAna" | ||
id: test | ||
path: topics.beir-v1.0.0-arguana.test.tsv.gz | ||
qrel: qrels.beir-v1.0.0-arguana.test.txt | ||
- name: "BEIR (v1.0.0): ArguAna" | ||
id: test | ||
path: topics.beir-v1.0.0-arguana.test.tsv.gz | ||
qrel: qrels.beir-v1.0.0-arguana.test.txt | ||
|
||
models: | ||
- name: bge-flat-onnx | ||
display: BGE-base-en-v1.5 | ||
type: flat | ||
params: -generator VectorQueryGenerator -topicField vector -removeQuery -threads | ||
16 -hits 1000 -encoder BgeBaseEn15 | ||
results: | ||
nDCG@10: | ||
- 0.6361 | ||
R@100: | ||
- 0.9915 | ||
R@1000: | ||
- 0.9964 | ||
- name: bge-flat-onnx | ||
display: BGE-base-en-v1.5 | ||
type: flat | ||
params: -encoder BgeBaseEn15 -hits 1000 -removeQuery -threads 16 | ||
results: | ||
nDCG@10: | ||
- 0.6361 | ||
R@100: | ||
- 0.9915 | ||
R@1000: | ||
- 0.9964 | ||
tolerance: | ||
nDCG@10: | ||
- 0.02 | ||
R@100: | ||
- 0.02 | ||
R@1000: | ||
- 0.004 |
60 changes: 60 additions & 0 deletions
60
...n/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
--- | ||
corpus: beir-v1.0.0-arguana.bge-base-en-v1.5 | ||
corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/arguana.parquet | ||
|
||
index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-arguana.bge-base-en-v1.5/ | ||
index_type: hnsw | ||
collection_class: ParquetDenseVectorCollection | ||
generator_class: ParquetDenseVectorDocumentGenerator | ||
index_threads: 16 | ||
index_options: -M 16 -efC 100 -quantize.int8 | ||
|
||
metrics: | ||
- metric: nDCG@10 | ||
command: bin/trec_eval | ||
params: -c -m ndcg_cut.10 | ||
separator: "\t" | ||
parse_index: 2 | ||
metric_precision: 4 | ||
can_combine: false | ||
- metric: R@100 | ||
command: bin/trec_eval | ||
params: -c -m recall.100 | ||
separator: "\t" | ||
parse_index: 2 | ||
metric_precision: 4 | ||
can_combine: false | ||
- metric: R@1000 | ||
command: bin/trec_eval | ||
params: -c -m recall.1000 | ||
separator: "\t" | ||
parse_index: 2 | ||
metric_precision: 4 | ||
can_combine: false | ||
|
||
topic_reader: JsonStringVector | ||
topics: | ||
- name: "BEIR (v1.0.0): ArguAna" | ||
id: test | ||
path: topics.beir-v1.0.0-arguana.test.bge-base-en-v1.5.jsonl.gz | ||
qrel: qrels.beir-v1.0.0-arguana.test.txt | ||
|
||
models: | ||
- name: bge-hnsw-int8-cached | ||
display: BGE-base-en-v1.5 | ||
type: hnsw | ||
params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 | ||
results: | ||
nDCG@10: | ||
- 0.6361 | ||
R@100: | ||
- 0.9915 | ||
R@1000: | ||
- 0.9964 | ||
tolerance: | ||
nDCG@10: | ||
- 0.001 | ||
R@100: | ||
- 0.001 | ||
R@1000: | ||
- 0.001 |
Oops, something went wrong.