From 4d087dbcb1dd7d9b1b4b93409ee2e512254e0668 Mon Sep 17 00:00:00 2001 From: jamesamcl Date: Wed, 8 Jan 2025 01:14:56 +0000 Subject: [PATCH] implement materialised queries in solr (#11) --- .github/workflows/docker.yml | 10 +- dataload/04_index/grebi_index/src/main.rs | 10 +- dataload/05_link/grebi_link/src/main.rs | 14 +- ...jsons.py => merge_graph_metadata_jsons.py} | 0 .../neo4j => 06_create_neo_db}/copy_to_ftp.sh | 0 .../cypher/create_indexes.cypher | 0 .../cypher/ic_scores_1.cypher | 0 .../cypher/ic_scores_2.cypher | 0 .../grebi_make_neo_csv/Cargo.lock | 0 .../grebi_make_neo_csv/Cargo.toml | 0 .../grebi_make_neo_csv/src/main.rs | 4 +- .../grebi_make_neo_ids_csv/Cargo.lock | 0 .../grebi_make_neo_ids_csv/Cargo.toml | 0 .../grebi_make_neo_ids_csv/src/main.rs | 0 .../neo4j_import.dockersh | 0 .../neo4j_import.slurm.py | 8 +- .../add_query_metadatas_to_graph_metadata.py | 21 + .../csvs_to_sqlite.py | 0 dataload/07_run_queries/jsonl_to_csv.py | 26 + .../run_queries.dockerpy | 29 +- .../run_queries.py | 21 +- .../solr/grebi_link_results/Cargo.toml | 14 + .../solr/grebi_link_results/src/main.rs | 103 ++ .../solr}/grebi_make_solr/Cargo.lock | 0 .../solr}/grebi_make_solr/Cargo.toml | 2 +- .../solr}/grebi_make_solr/src/main.rs | 0 .../solr}/make_solr_autocomplete_config.py | 0 .../solr}/make_solr_config.py | 4 +- .../solr/make_solr_results_config.py | 31 + .../solr}/solr_config_template/README.md | 0 .../conf/_rest_managed.json | 0 .../conf/lang/stopwords_en.txt | 0 .../grebi_autocomplete/conf/protwords.txt | 0 .../grebi_autocomplete/conf/schema.xml | 0 .../grebi_autocomplete/conf/solrconfig.xml | 0 .../grebi_autocomplete/conf/stopwords.txt | 0 .../grebi_autocomplete/conf/synonyms.txt | 0 .../grebi_autocomplete/core.properties | 0 .../grebi_edges/conf/lang/contractions_ca.txt | 0 .../grebi_edges/conf/lang/contractions_fr.txt | 0 .../grebi_edges/conf/lang/contractions_ga.txt | 0 .../grebi_edges/conf/lang/contractions_it.txt | 0 
.../grebi_edges/conf/lang/hyphenations_ga.txt | 0 .../grebi_edges/conf/lang/stemdict_nl.txt | 0 .../grebi_edges/conf/lang/stoptags_ja.txt | 0 .../grebi_edges/conf/lang/stopwords_ar.txt | 0 .../grebi_edges/conf/lang/stopwords_bg.txt | 0 .../grebi_edges/conf/lang/stopwords_ca.txt | 0 .../grebi_edges/conf/lang/stopwords_cz.txt | 0 .../grebi_edges/conf/lang/stopwords_da.txt | 0 .../grebi_edges/conf/lang/stopwords_de.txt | 0 .../grebi_edges/conf/lang/stopwords_el.txt | 0 .../grebi_edges/conf/lang/stopwords_en.txt | 0 .../grebi_edges/conf/lang/stopwords_es.txt | 0 .../grebi_edges/conf/lang/stopwords_et.txt | 0 .../grebi_edges/conf/lang/stopwords_eu.txt | 0 .../grebi_edges/conf/lang/stopwords_fa.txt | 0 .../grebi_edges/conf/lang/stopwords_fi.txt | 0 .../grebi_edges/conf/lang/stopwords_fr.txt | 0 .../grebi_edges/conf/lang/stopwords_ga.txt | 0 .../grebi_edges/conf/lang/stopwords_gl.txt | 0 .../grebi_edges/conf/lang/stopwords_hi.txt | 0 .../grebi_edges/conf/lang/stopwords_hu.txt | 0 .../grebi_edges/conf/lang/stopwords_hy.txt | 0 .../grebi_edges/conf/lang/stopwords_id.txt | 0 .../grebi_edges/conf/lang/stopwords_it.txt | 0 .../grebi_edges/conf/lang/stopwords_ja.txt | 0 .../grebi_edges/conf/lang/stopwords_lv.txt | 0 .../grebi_edges/conf/lang/stopwords_nl.txt | 0 .../grebi_edges/conf/lang/stopwords_no.txt | 0 .../grebi_edges/conf/lang/stopwords_pt.txt | 0 .../grebi_edges/conf/lang/stopwords_ro.txt | 0 .../grebi_edges/conf/lang/stopwords_ru.txt | 0 .../grebi_edges/conf/lang/stopwords_sv.txt | 0 .../grebi_edges/conf/lang/stopwords_th.txt | 0 .../grebi_edges/conf/lang/stopwords_tr.txt | 0 .../grebi_edges/conf/lang/userdict_ja.txt | 0 .../grebi_edges/conf/protwords.txt | 0 .../grebi_edges/conf/schema.xml | 0 .../grebi_edges/conf/solrconfig.xml | 0 .../grebi_edges/conf/stopwords.txt | 0 .../grebi_edges/conf/synonyms.txt | 0 .../grebi_edges/core.properties | 0 .../grebi_nodes/conf/lang/contractions_ca.txt | 0 .../grebi_nodes/conf/lang/contractions_fr.txt | 0 
.../grebi_nodes/conf/lang/contractions_ga.txt | 0 .../grebi_nodes/conf/lang/contractions_it.txt | 0 .../grebi_nodes/conf/lang/hyphenations_ga.txt | 0 .../grebi_nodes/conf/lang/stemdict_nl.txt | 0 .../grebi_nodes/conf/lang/stoptags_ja.txt | 0 .../grebi_nodes/conf/lang/stopwords_ar.txt | 0 .../grebi_nodes/conf/lang/stopwords_bg.txt | 0 .../grebi_nodes/conf/lang/stopwords_ca.txt | 0 .../grebi_nodes/conf/lang/stopwords_cz.txt | 0 .../grebi_nodes/conf/lang/stopwords_da.txt | 0 .../grebi_nodes/conf/lang/stopwords_de.txt | 0 .../grebi_nodes/conf/lang/stopwords_el.txt | 0 .../grebi_nodes/conf/lang/stopwords_en.txt | 0 .../grebi_nodes/conf/lang/stopwords_es.txt | 0 .../grebi_nodes/conf/lang/stopwords_et.txt | 0 .../grebi_nodes/conf/lang/stopwords_eu.txt | 0 .../grebi_nodes/conf/lang/stopwords_fa.txt | 0 .../grebi_nodes/conf/lang/stopwords_fi.txt | 0 .../grebi_nodes/conf/lang/stopwords_fr.txt | 0 .../grebi_nodes/conf/lang/stopwords_ga.txt | 0 .../grebi_nodes/conf/lang/stopwords_gl.txt | 0 .../grebi_nodes/conf/lang/stopwords_hi.txt | 0 .../grebi_nodes/conf/lang/stopwords_hu.txt | 0 .../grebi_nodes/conf/lang/stopwords_hy.txt | 0 .../grebi_nodes/conf/lang/stopwords_id.txt | 0 .../grebi_nodes/conf/lang/stopwords_it.txt | 0 .../grebi_nodes/conf/lang/stopwords_ja.txt | 0 .../grebi_nodes/conf/lang/stopwords_lv.txt | 0 .../grebi_nodes/conf/lang/stopwords_nl.txt | 0 .../grebi_nodes/conf/lang/stopwords_no.txt | 0 .../grebi_nodes/conf/lang/stopwords_pt.txt | 0 .../grebi_nodes/conf/lang/stopwords_ro.txt | 0 .../grebi_nodes/conf/lang/stopwords_ru.txt | 0 .../grebi_nodes/conf/lang/stopwords_sv.txt | 0 .../grebi_nodes/conf/lang/stopwords_th.txt | 0 .../grebi_nodes/conf/lang/stopwords_tr.txt | 0 .../grebi_nodes/conf/lang/userdict_ja.txt | 0 .../grebi_nodes/conf/protwords.txt | 0 .../grebi_nodes/conf/schema.xml | 0 .../grebi_nodes/conf/solrconfig.xml | 0 .../grebi_nodes/conf/stopwords.txt | 0 .../grebi_nodes/conf/synonyms.txt | 0 .../grebi_nodes/core.properties | 0 
.../conf/lang/contractions_ca.txt | 8 + .../conf/lang/contractions_fr.txt | 15 + .../conf/lang/contractions_ga.txt | 5 + .../conf/lang/contractions_it.txt | 23 + .../conf/lang/hyphenations_ga.txt | 5 + .../grebi_results/conf/lang/stemdict_nl.txt | 6 + .../grebi_results/conf/lang/stoptags_ja.txt | 420 +++++ .../grebi_results/conf/lang/stopwords_ar.txt | 125 ++ .../grebi_results/conf/lang/stopwords_bg.txt | 193 ++ .../grebi_results/conf/lang/stopwords_ca.txt | 220 +++ .../grebi_results/conf/lang/stopwords_cz.txt | 172 ++ .../grebi_results/conf/lang/stopwords_da.txt | 110 ++ .../grebi_results/conf/lang/stopwords_de.txt | 294 +++ .../grebi_results/conf/lang/stopwords_el.txt | 78 + .../grebi_results/conf/lang/stopwords_en.txt | 54 + .../grebi_results/conf/lang/stopwords_es.txt | 356 ++++ .../grebi_results/conf/lang/stopwords_et.txt | 1603 +++++++++++++++++ .../grebi_results/conf/lang/stopwords_eu.txt | 99 + .../grebi_results/conf/lang/stopwords_fa.txt | 313 ++++ .../grebi_results/conf/lang/stopwords_fi.txt | 97 + .../grebi_results/conf/lang/stopwords_fr.txt | 186 ++ .../grebi_results/conf/lang/stopwords_ga.txt | 110 ++ .../grebi_results/conf/lang/stopwords_gl.txt | 161 ++ .../grebi_results/conf/lang/stopwords_hi.txt | 235 +++ .../grebi_results/conf/lang/stopwords_hu.txt | 211 +++ .../grebi_results/conf/lang/stopwords_hy.txt | 46 + .../grebi_results/conf/lang/stopwords_id.txt | 359 ++++ .../grebi_results/conf/lang/stopwords_it.txt | 303 ++++ .../grebi_results/conf/lang/stopwords_ja.txt | 127 ++ .../grebi_results/conf/lang/stopwords_lv.txt | 172 ++ .../grebi_results/conf/lang/stopwords_nl.txt | 119 ++ .../grebi_results/conf/lang/stopwords_no.txt | 194 ++ .../grebi_results/conf/lang/stopwords_pt.txt | 253 +++ .../grebi_results/conf/lang/stopwords_ro.txt | 233 +++ .../grebi_results/conf/lang/stopwords_ru.txt | 243 +++ .../grebi_results/conf/lang/stopwords_sv.txt | 133 ++ .../grebi_results/conf/lang/stopwords_th.txt | 119 ++ .../grebi_results/conf/lang/stopwords_tr.txt | 212 
+++ .../grebi_results/conf/lang/userdict_ja.txt | 29 + .../grebi_results/conf/managed-schema.xml | 1080 +++++++++++ .../grebi_results/conf/protwords.txt | 21 + .../grebi_results/conf/solrconfig.xml | 1164 ++++++++++++ .../grebi_results/conf/stopwords.txt | 14 + .../grebi_results/conf/synonyms.txt | 29 + .../grebi_results/core.properties | 3 + .../solr}/solr_config_template/solr.xml | 0 .../solr}/solr_config_template/solrconfig.xml | 0 .../solr}/solr_config_template/zoo.cfg | 0 .../solr/solr_import.dockerpy | 0 .../solr/solr_import.slurm.py | 8 +- .../grebi_make_compressed_blob/Cargo.lock | 0 .../grebi_make_compressed_blob/Cargo.toml | 2 +- .../grebi_make_compressed_blob/src/main.rs | 0 .../sqlite/grebi_make_sqlite/Cargo.lock | 0 .../sqlite/grebi_make_sqlite/Cargo.toml | 0 .../sqlite/grebi_make_sqlite/src/main.rs | 0 dataload/Cargo.lock | 35 +- dataload/Cargo.toml | 13 +- .../datasource_configs/ols_efo_only.yaml | 9 + dataload/configs/pipeline_configs/ebi.json | 3 - .../configs/pipeline_configs/hett_only.json | 5 - .../configs/pipeline_configs/hra_only.json | 5 - dataload/configs/subgraph_configs/Makefile | 2 +- .../subgraph_configs/gwas_and_efo.json | 141 ++ .../subgraph_configs/src/gwas_and_efo.py | 14 + dataload/nextflow/codon_nextflow.config | 17 + dataload/nextflow/load_subgraph.nf | 448 ++--- dataload/nextflow/saturos_nextflow.config | 32 +- dataload/scripts/check_datarelease.sh | 37 + dataload/scripts/dataload.py | 23 - dataload/scripts/dataload_codon.sh | 23 +- dataload/scripts/dataload_saturos.sh | 11 +- dataload/scripts/ebi_datarelease_to_ftp.sh | 2 + .../scripts/ebi_datarelease_to_staging.sh | 66 + dataload/scripts/start_local_solr.py | 2 +- materialised_queries/hello_world.yaml | 11 + materialised_queries/impc_x_gwas.yaml | 4 +- notebooks/summaries.ipynb | 2 +- webapp/docker-compose.yml | 14 +- .../main/java/uk/ac/ebi/grebi/GrebiApi.java | 28 +- ...SummaryClient.java => MetadataClient.java} | 14 +- .../ac/ebi/grebi/repo/GrebiMetadataRepo.java | 34 + 
.../ac/ebi/grebi/repo/GrebiSummaryRepo.java | 34 - .../Dockerfile | 2 +- .../pom.xml | 0 .../GrebiMetadataSvc.java} | 10 +- .../frontends/ebi/pages/EbiDownloadsPage.tsx | 6 +- .../k8chart/templates/backend_deployment.yaml | 4 +- ....yaml => metadata_service_deployment.yaml} | 16 +- webapp/k8chart/templates/services.yaml | 6 +- webapp/up_saturos.fish | 2 +- 219 files changed, 10833 insertions(+), 501 deletions(-) rename dataload/05_link/{merge_summary_jsons.py => merge_graph_metadata_jsons.py} (100%) rename dataload/{07_create_db/neo4j => 06_create_neo_db}/copy_to_ftp.sh (100%) rename dataload/{07_create_db/neo4j => 06_create_neo_db}/cypher/create_indexes.cypher (100%) rename dataload/{07_create_db/neo4j => 06_create_neo_db}/cypher/ic_scores_1.cypher (100%) rename dataload/{07_create_db/neo4j => 06_create_neo_db}/cypher/ic_scores_2.cypher (100%) rename dataload/{06_prepare_db_import => 06_create_neo_db}/grebi_make_neo_csv/Cargo.lock (100%) rename dataload/{06_prepare_db_import => 06_create_neo_db}/grebi_make_neo_csv/Cargo.toml (100%) rename dataload/{06_prepare_db_import => 06_create_neo_db}/grebi_make_neo_csv/src/main.rs (99%) rename dataload/{06_prepare_db_import => 06_create_neo_db}/grebi_make_neo_ids_csv/Cargo.lock (100%) rename dataload/{06_prepare_db_import => 06_create_neo_db}/grebi_make_neo_ids_csv/Cargo.toml (100%) rename dataload/{06_prepare_db_import => 06_create_neo_db}/grebi_make_neo_ids_csv/src/main.rs (100%) rename dataload/{07_create_db/neo4j => 06_create_neo_db}/neo4j_import.dockersh (100%) rename dataload/{07_create_db/neo4j => 06_create_neo_db}/neo4j_import.slurm.py (88%) create mode 100644 dataload/07_run_queries/add_query_metadatas_to_graph_metadata.py rename dataload/{08_run_queries => 07_run_queries}/csvs_to_sqlite.py (100%) create mode 100644 dataload/07_run_queries/jsonl_to_csv.py rename dataload/{08_run_queries => 07_run_queries}/run_queries.dockerpy (50%) rename dataload/{08_run_queries => 07_run_queries}/run_queries.py (72%) create mode 
100644 dataload/08_create_other_dbs/solr/grebi_link_results/Cargo.toml create mode 100644 dataload/08_create_other_dbs/solr/grebi_link_results/src/main.rs rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/grebi_make_solr/Cargo.lock (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/grebi_make_solr/Cargo.toml (82%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/grebi_make_solr/src/main.rs (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/make_solr_autocomplete_config.py (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/make_solr_config.py (94%) create mode 100644 dataload/08_create_other_dbs/solr/make_solr_results_config.py rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/README.md (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_autocomplete/conf/_rest_managed.json (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_autocomplete/conf/lang/stopwords_en.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_autocomplete/conf/protwords.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_autocomplete/conf/schema.xml (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_autocomplete/conf/solrconfig.xml (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_autocomplete/conf/stopwords.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_autocomplete/conf/synonyms.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_autocomplete/core.properties (100%) rename dataload/{06_prepare_db_import => 
08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/contractions_ca.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/contractions_fr.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/contractions_ga.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/contractions_it.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/hyphenations_ga.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stemdict_nl.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stoptags_ja.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_ar.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_bg.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_ca.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_cz.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_da.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_de.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_el.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_en.txt (100%) rename 
dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_es.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_et.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_eu.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_fa.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_fi.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_fr.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_ga.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_gl.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_hi.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_hu.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_hy.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_id.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_it.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_ja.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_lv.txt 
(100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_nl.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_no.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_pt.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_ro.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_ru.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_sv.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_th.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/stopwords_tr.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/lang/userdict_ja.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/protwords.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/schema.xml (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/solrconfig.xml (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/stopwords.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/conf/synonyms.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_edges/core.properties (100%) rename dataload/{06_prepare_db_import 
=> 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/contractions_ca.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/contractions_fr.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/contractions_ga.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/contractions_it.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/hyphenations_ga.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stemdict_nl.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stoptags_ja.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_ar.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_bg.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_ca.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_cz.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_da.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_de.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_el.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_en.txt (100%) rename 
dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_es.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_et.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_eu.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_fa.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_fi.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_fr.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_ga.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_gl.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_hi.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_hu.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_hy.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_id.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_it.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_ja.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_lv.txt 
(100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_nl.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_no.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_pt.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_ro.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_ru.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_sv.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_th.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/stopwords_tr.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/lang/userdict_ja.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/protwords.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/schema.xml (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/solrconfig.xml (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/stopwords.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/conf/synonyms.txt (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/grebi_nodes/core.properties (100%) create mode 100644 
dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/contractions_ca.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/contractions_fr.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/contractions_ga.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/contractions_it.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/hyphenations_ga.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stemdict_nl.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stoptags_ja.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_ar.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_bg.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_ca.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_cz.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_da.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_de.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_el.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_en.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_es.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_et.txt create mode 100644 
dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_eu.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_fa.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_fi.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_fr.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_ga.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_gl.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_hi.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_hu.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_hy.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_id.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_it.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_ja.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_lv.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_nl.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_no.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_pt.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_ro.txt create mode 100644 
dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_ru.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_sv.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_th.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_tr.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/userdict_ja.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/managed-schema.xml create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/protwords.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/solrconfig.xml create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/stopwords.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/synonyms.txt create mode 100644 dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/core.properties rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/solr.xml (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/solrconfig.xml (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/solr}/solr_config_template/zoo.cfg (100%) rename dataload/{07_create_db => 08_create_other_dbs}/solr/solr_import.dockerpy (100%) rename dataload/{07_create_db => 08_create_other_dbs}/solr/solr_import.slurm.py (89%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/sqlite}/grebi_make_compressed_blob/Cargo.lock (100%) rename dataload/{06_prepare_db_import => 08_create_other_dbs/sqlite}/grebi_make_compressed_blob/Cargo.toml (85%) rename dataload/{06_prepare_db_import => 
08_create_other_dbs/sqlite}/grebi_make_compressed_blob/src/main.rs (100%) rename dataload/{07_create_db => 08_create_other_dbs}/sqlite/grebi_make_sqlite/Cargo.lock (100%) rename dataload/{07_create_db => 08_create_other_dbs}/sqlite/grebi_make_sqlite/Cargo.toml (100%) rename dataload/{07_create_db => 08_create_other_dbs}/sqlite/grebi_make_sqlite/src/main.rs (100%) create mode 100644 dataload/configs/datasource_configs/ols_efo_only.yaml delete mode 100644 dataload/configs/pipeline_configs/ebi.json delete mode 100644 dataload/configs/pipeline_configs/hett_only.json delete mode 100644 dataload/configs/pipeline_configs/hra_only.json create mode 100644 dataload/configs/subgraph_configs/gwas_and_efo.json create mode 100644 dataload/configs/subgraph_configs/src/gwas_and_efo.py create mode 100644 dataload/scripts/check_datarelease.sh delete mode 100644 dataload/scripts/dataload.py create mode 100755 dataload/scripts/ebi_datarelease_to_ftp.sh create mode 100755 dataload/scripts/ebi_datarelease_to_staging.sh create mode 100644 materialised_queries/hello_world.yaml rename webapp/grebi_api/src/main/java/uk/ac/ebi/grebi/db/{SummaryClient.java => MetadataClient.java} (80%) create mode 100644 webapp/grebi_api/src/main/java/uk/ac/ebi/grebi/repo/GrebiMetadataRepo.java delete mode 100644 webapp/grebi_api/src/main/java/uk/ac/ebi/grebi/repo/GrebiSummaryRepo.java rename webapp/{grebi_summary_service => grebi_metadata_service}/Dockerfile (63%) rename webapp/{grebi_summary_service => grebi_metadata_service}/pom.xml (100%) rename webapp/{grebi_summary_service/src/main/java/uk/ac/ebi/grebi_summary_service/GrebiSummarySvc.java => grebi_metadata_service/src/main/java/uk/ac/ebi/grebi_metadata_service/GrebiMetadataSvc.java} (70%) rename webapp/k8chart/templates/{summary_service_deployment.yaml => metadata_service_deployment.yaml} (66%) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 856e802..b3dfa0a 100644 --- a/.github/workflows/docker.yml +++ 
b/.github/workflows/docker.yml @@ -39,12 +39,12 @@ jobs: docker tag ghcr.io/ebispot/grebi_resolver_service:${{github.sha}} ghcr.io/ebispot/grebi_resolver_service:${{ github.ref_name }} docker push --all-tags ghcr.io/ebispot/grebi_resolver_service - - name: Build and push GrEBI summary service Docker image + - name: Build and push GrEBI metadata service Docker image run: | - cd webapp/grebi_summary_service - docker build -t ghcr.io/ebispot/grebi_summary_service:${{ github.sha }} . - docker tag ghcr.io/ebispot/grebi_summary_service:${{github.sha}} ghcr.io/ebispot/grebi_summary_service:${{ github.ref_name }} - docker push --all-tags ghcr.io/ebispot/grebi_summary_service + cd webapp/grebi_metadata_service + docker build -t ghcr.io/ebispot/grebi_metadata_service:${{ github.sha }} . + docker tag ghcr.io/ebispot/grebi_metadata_service:${{github.sha}} ghcr.io/ebispot/grebi_metadata_service:${{ github.ref_name }} + docker push --all-tags ghcr.io/ebispot/grebi_metadata_service - name: Build and push GrEBI UI Docker image run: | diff --git a/dataload/04_index/grebi_index/src/main.rs b/dataload/04_index/grebi_index/src/main.rs index 0fc00e9..b8a23d8 100644 --- a/dataload/04_index/grebi_index/src/main.rs +++ b/dataload/04_index/grebi_index/src/main.rs @@ -30,10 +30,10 @@ struct Args { subgraph_name: String, #[arg(long)] - out_summary_json_path: String, + out_graph_metadata_json_path: String, #[arg(long)] - out_metadata_jsonl_path: String, + out_entity_metadata_jsonl_path: String, #[arg(long)] out_names_txt: String, @@ -57,8 +57,8 @@ fn main() { let mut all_names:BTreeSet> = BTreeSet::new(); let mut all_ids:BTreeSet> = BTreeSet::new(); - let mut summary_writer = BufWriter::new(File::create(&args.out_summary_json_path).unwrap()); - let mut metadata_writer = BufWriter::new(File::create(&args.out_metadata_jsonl_path).unwrap()); + let mut graph_metadata_writer = BufWriter::new(File::create(&args.out_graph_metadata_json_path).unwrap()); + let mut metadata_writer = 
BufWriter::new(File::create(&args.out_entity_metadata_jsonl_path).unwrap()); let mut names_writer = BufWriter::new(File::create(&args.out_names_txt).unwrap()); let mut ids_writer = BufWriter::new(File::create(&args.out_ids_txt).unwrap()); @@ -223,7 +223,7 @@ fn main() { let start_time3 = std::time::Instant::now(); - summary_writer.write_all( + graph_metadata_writer.write_all( serde_json::to_string_pretty(&json!({ "subgraph_name": args.subgraph_name, "entity_props": entity_props_to_count.iter().map(|(k,v)| { diff --git a/dataload/05_link/grebi_link/src/main.rs b/dataload/05_link/grebi_link/src/main.rs index 0c4f524..c0f53b7 100644 --- a/dataload/05_link/grebi_link/src/main.rs +++ b/dataload/05_link/grebi_link/src/main.rs @@ -48,13 +48,13 @@ struct Args { in_metadata_jsonl: String, #[arg(long)] - in_summary_json: String, + in_graph_metadata_json: String, #[arg(long)] out_edges_jsonl: String, #[arg(long)] - out_summary_json: String, + out_graph_metadata_json: String, #[arg(long)] groups_txt: String, @@ -119,7 +119,7 @@ fn main() -> std::io::Result<()> { let mut types_to_count:HashMap,i64> = HashMap::new(); { - let summary_json:Map = serde_json::from_reader(File::open(&args.in_summary_json).unwrap()).unwrap(); + let summary_json:Map = serde_json::from_reader(File::open(&args.in_graph_metadata_json).unwrap()).unwrap(); for (k, v) in summary_json["types"].as_object().unwrap() { types_to_count.insert(k.as_bytes().to_vec(), v.as_object().unwrap()["count"].as_i64().unwrap()); } @@ -134,8 +134,8 @@ fn main() -> std::io::Result<()> { let stdout = io::stdout().lock(); let mut nodes_writer = BufWriter::new(stdout); - let summary_file = File::create(args.out_summary_json).unwrap(); - let mut summary_writer = BufWriter::new(summary_file); + let summary_file = File::create(args.out_graph_metadata_json).unwrap(); + let mut graph_metadata_writer = BufWriter::new(summary_file); let mut edge_summary:EdgeSummaryTable = HashMap::new(); @@ -263,7 +263,7 @@ fn main() -> 
std::io::Result<()> { } } - summary_writer.write_all(serde_json::to_string_pretty(&json!({ + graph_metadata_writer.write_all(serde_json::to_string_pretty(&json!({ "entity_prop_defs": entity_prop_defs, "edge_prop_defs": edge_prop_defs, "types": type_defs, @@ -275,7 +275,7 @@ fn main() -> std::io::Result<()> { "edges": edge_summary })).unwrap().as_bytes()).unwrap(); - summary_writer.flush().unwrap(); + graph_metadata_writer.flush().unwrap(); Ok(()) } diff --git a/dataload/05_link/merge_summary_jsons.py b/dataload/05_link/merge_graph_metadata_jsons.py similarity index 100% rename from dataload/05_link/merge_summary_jsons.py rename to dataload/05_link/merge_graph_metadata_jsons.py diff --git a/dataload/07_create_db/neo4j/copy_to_ftp.sh b/dataload/06_create_neo_db/copy_to_ftp.sh similarity index 100% rename from dataload/07_create_db/neo4j/copy_to_ftp.sh rename to dataload/06_create_neo_db/copy_to_ftp.sh diff --git a/dataload/07_create_db/neo4j/cypher/create_indexes.cypher b/dataload/06_create_neo_db/cypher/create_indexes.cypher similarity index 100% rename from dataload/07_create_db/neo4j/cypher/create_indexes.cypher rename to dataload/06_create_neo_db/cypher/create_indexes.cypher diff --git a/dataload/07_create_db/neo4j/cypher/ic_scores_1.cypher b/dataload/06_create_neo_db/cypher/ic_scores_1.cypher similarity index 100% rename from dataload/07_create_db/neo4j/cypher/ic_scores_1.cypher rename to dataload/06_create_neo_db/cypher/ic_scores_1.cypher diff --git a/dataload/07_create_db/neo4j/cypher/ic_scores_2.cypher b/dataload/06_create_neo_db/cypher/ic_scores_2.cypher similarity index 100% rename from dataload/07_create_db/neo4j/cypher/ic_scores_2.cypher rename to dataload/06_create_neo_db/cypher/ic_scores_2.cypher diff --git a/dataload/06_prepare_db_import/grebi_make_neo_csv/Cargo.lock b/dataload/06_create_neo_db/grebi_make_neo_csv/Cargo.lock similarity index 100% rename from dataload/06_prepare_db_import/grebi_make_neo_csv/Cargo.lock rename to 
dataload/06_create_neo_db/grebi_make_neo_csv/Cargo.lock diff --git a/dataload/06_prepare_db_import/grebi_make_neo_csv/Cargo.toml b/dataload/06_create_neo_db/grebi_make_neo_csv/Cargo.toml similarity index 100% rename from dataload/06_prepare_db_import/grebi_make_neo_csv/Cargo.toml rename to dataload/06_create_neo_db/grebi_make_neo_csv/Cargo.toml diff --git a/dataload/06_prepare_db_import/grebi_make_neo_csv/src/main.rs b/dataload/06_create_neo_db/grebi_make_neo_csv/src/main.rs similarity index 99% rename from dataload/06_prepare_db_import/grebi_make_neo_csv/src/main.rs rename to dataload/06_create_neo_db/grebi_make_neo_csv/src/main.rs index 86da142..c958766 100644 --- a/dataload/06_prepare_db_import/grebi_make_neo_csv/src/main.rs +++ b/dataload/06_create_neo_db/grebi_make_neo_csv/src/main.rs @@ -31,7 +31,7 @@ struct Args { in_edges_jsonl: String, #[arg(long)] - in_summary_jsons: String, + in_graph_metadata_jsons: String, #[arg(long)] out_nodes_csv_path: String, @@ -59,7 +59,7 @@ fn main() -> std::io::Result<()> { let mut all_edge_props: HashSet = HashSet::new(); - for f in args.in_summary_jsons.split(",") { + for f in args.in_graph_metadata_jsons.split(",") { let summary:Value = serde_json::from_reader(File::open(f).unwrap()).unwrap(); for prop in summary["edge_props"].as_object().unwrap().keys() { all_edge_props.insert(prop.to_string()); diff --git a/dataload/06_prepare_db_import/grebi_make_neo_ids_csv/Cargo.lock b/dataload/06_create_neo_db/grebi_make_neo_ids_csv/Cargo.lock similarity index 100% rename from dataload/06_prepare_db_import/grebi_make_neo_ids_csv/Cargo.lock rename to dataload/06_create_neo_db/grebi_make_neo_ids_csv/Cargo.lock diff --git a/dataload/06_prepare_db_import/grebi_make_neo_ids_csv/Cargo.toml b/dataload/06_create_neo_db/grebi_make_neo_ids_csv/Cargo.toml similarity index 100% rename from dataload/06_prepare_db_import/grebi_make_neo_ids_csv/Cargo.toml rename to dataload/06_create_neo_db/grebi_make_neo_ids_csv/Cargo.toml diff --git 
a/dataload/06_prepare_db_import/grebi_make_neo_ids_csv/src/main.rs b/dataload/06_create_neo_db/grebi_make_neo_ids_csv/src/main.rs similarity index 100% rename from dataload/06_prepare_db_import/grebi_make_neo_ids_csv/src/main.rs rename to dataload/06_create_neo_db/grebi_make_neo_ids_csv/src/main.rs diff --git a/dataload/07_create_db/neo4j/neo4j_import.dockersh b/dataload/06_create_neo_db/neo4j_import.dockersh similarity index 100% rename from dataload/07_create_db/neo4j/neo4j_import.dockersh rename to dataload/06_create_neo_db/neo4j_import.dockersh diff --git a/dataload/07_create_db/neo4j/neo4j_import.slurm.py b/dataload/06_create_neo_db/neo4j_import.slurm.py similarity index 88% rename from dataload/07_create_db/neo4j/neo4j_import.slurm.py rename to dataload/06_create_neo_db/neo4j_import.slurm.py index 656e3bd..25950a9 100644 --- a/dataload/07_create_db/neo4j/neo4j_import.slurm.py +++ b/dataload/06_create_neo_db/neo4j_import.slurm.py @@ -34,8 +34,8 @@ def main(): '--bind ' + os.path.abspath(".") + ':/mnt', '--bind ' + shlex.quote(neo_data_path) + ':/data', '--bind ' + shlex.quote(neo_logs_path) + ':/logs', - '--bind ' + os.path.abspath(os.path.join(os.environ['GREBI_DATALOAD_HOME'], '07_create_db/neo4j/neo4j_import.dockersh')) + ':/import.sh', - '--bind ' + os.path.abspath(os.path.join(os.environ['GREBI_DATALOAD_HOME'], '07_create_db/neo4j/cypher')) + ':/cypher', + '--bind ' + os.path.abspath(os.path.join(os.environ['GREBI_DATALOAD_HOME'], '06_create_neo_db/neo4j_import.dockersh')) + ':/import.sh', + '--bind ' + os.path.abspath(os.path.join(os.environ['GREBI_DATALOAD_HOME'], '06_create_neo_db/cypher')) + ':/cypher', '--writable-tmpfs', '--network=none', '--env NEO4J_AUTH=none', @@ -49,8 +49,8 @@ def main(): ] + list(map(lambda f: "-v " + os.path.abspath(f) + ":/mnt/" + os.path.basename(f), glob.glob(args.in_csv_path + "/neo_*"))) + [ '-v ' + shlex.quote(neo_data_path) + ':/data', '-v ' + shlex.quote(neo_logs_path) + ':/logs', - '-v ' + 
os.path.abspath(os.path.join(os.environ['GREBI_DATALOAD_HOME'], '07_create_db/neo4j/neo4j_import.dockersh')) + ':/import.sh', - '-v ' + os.path.abspath(os.path.join(os.environ['GREBI_DATALOAD_HOME'], '07_create_db/neo4j/cypher')) + ':/cypher', + '-v ' + os.path.abspath(os.path.join(os.environ['GREBI_DATALOAD_HOME'], '06_create_neo_db/neo4j_import.dockersh')) + ':/import.sh', + '-v ' + os.path.abspath(os.path.join(os.environ['GREBI_DATALOAD_HOME'], '06_create_neo_db/cypher')) + ':/cypher', '-e NEO4J_AUTH=none', 'neo4j:5.18.0', 'bash /import.sh' diff --git a/dataload/07_run_queries/add_query_metadatas_to_graph_metadata.py b/dataload/07_run_queries/add_query_metadatas_to_graph_metadata.py new file mode 100644 index 0000000..841397b --- /dev/null +++ b/dataload/07_run_queries/add_query_metadatas_to_graph_metadata.py @@ -0,0 +1,21 @@ + +import json +import sys + +def main(): + graph_metadata_filename = sys.argv[1] + query_metadata_filenames = sys.argv[2:] + + with open(graph_metadata_filename, 'r') as file: + graph_metadata = json.load(file) + graph_metadata['materialised_queries'] = [] + for query_metadata_filename in query_metadata_filenames: + with open(query_metadata_filename, 'r') as file: + query_metadata = json.load(file) + graph_metadata['materialised_queries'].append(query_metadata) + + print(json.dumps(graph_metadata, indent=2)) + +if __name__=="__main__": + main() + diff --git a/dataload/08_run_queries/csvs_to_sqlite.py b/dataload/07_run_queries/csvs_to_sqlite.py similarity index 100% rename from dataload/08_run_queries/csvs_to_sqlite.py rename to dataload/07_run_queries/csvs_to_sqlite.py diff --git a/dataload/07_run_queries/jsonl_to_csv.py b/dataload/07_run_queries/jsonl_to_csv.py new file mode 100644 index 0000000..7fed6bb --- /dev/null +++ b/dataload/07_run_queries/jsonl_to_csv.py @@ -0,0 +1,26 @@ + +import sys +import pandas as pd +import json + +def main(): + data = [] + for line in sys.stdin: + line = line.strip() + if line: + obj = json.loads(line) + 
for key, value in obj.items(): + if isinstance(value, list): + obj[key] = ';'.join(map(str, value)) + elif isinstance(value, dict): + obj[key] = json.dumps(value) + else: + obj[key] = str(value) + data.append(obj) + + df = pd.DataFrame(data) + df.to_csv(sys.stdout, index=False) + +if __name__ == "__main__": + main() + diff --git a/dataload/08_run_queries/run_queries.dockerpy b/dataload/07_run_queries/run_queries.dockerpy similarity index 50% rename from dataload/08_run_queries/run_queries.dockerpy rename to dataload/07_run_queries/run_queries.dockerpy index 03985b5..2a37760 100644 --- a/dataload/08_run_queries/run_queries.dockerpy +++ b/dataload/07_run_queries/run_queries.dockerpy @@ -4,7 +4,7 @@ import os from pathlib import Path from pandas import DataFrame import json -from timeit import default_timer as timer +from datetime import datetime os.system('echo "dbms.security.auth_enabled=false" >> /var/lib/neo4j/conf/neo4j.conf') @@ -15,6 +15,8 @@ from py2neo import Graph import yaml graph = Graph("bolt://localhost:7687") +metadatas = [] + for file in os.listdir("/materialised_queries"): if not file.endswith(".yaml"): continue @@ -23,22 +25,29 @@ for file in os.listdir("/materialised_queries"): query = yaml.safe_load(open(f"/materialised_queries/{file}")) - start_time = timer() + start_time = datetime.now() print(f"Running query {query_id}") - df = DataFrame(graph.run(query['cypher_query']).data()) - end_time = timer() + with open(f"/out/{query_id}.results.jsonl", "w") as f: + for row in graph.run(query['cypher_query']).data(): + json.dump(row, f, skipkeys=True) + f.write("\n") + + end_time = datetime.now() - query['start_time'] = start_time - query['end_time'] = end_time - query['time'] = end_time - start_time + query['id'] = query_id + query['start_time'] = start_time.strftime("%Y-%m-%d %H:%M:%S") + query['end_time'] = end_time.strftime("%Y-%m-%d %H:%M:%S") + query['time'] = (end_time - start_time).total_seconds() - print(f"Saving {len(df)} rows to 
{Path(f'/out/{query_id}.csv.gz')}") - df.to_csv(Path(f"/out/{query_id}.csv.gz"), index=False, compression="gzip") + metadatas.append(query) with open(f"/out/{query_id}.json", "w") as f: - json.dump(query, f) + json.dump(query, f, skipkeys=True) + +with open(f"/out/queries.json", "w") as f: + json.dump(metadatas, f, skipkeys=True) os.system("sleep 20") os.system("neo4j stop") diff --git a/dataload/08_run_queries/run_queries.py b/dataload/07_run_queries/run_queries.py similarity index 72% rename from dataload/08_run_queries/run_queries.py rename to dataload/07_run_queries/run_queries.py index 38b6ac5..56cc77a 100644 --- a/dataload/08_run_queries/run_queries.py +++ b/dataload/07_run_queries/run_queries.py @@ -12,7 +12,7 @@ def main(): parser = argparse.ArgumentParser(description='Materialise Cypher queries as CSV') parser.add_argument('--in-db-path', type=str, help='Path with the neo4j database to query', required=True) - parser.add_argument('--out-csvs-path', type=str, help='Path for the output csv files of materialised results', required=True) + parser.add_argument('--out-jsons-path', type=str, help='Path for the output json files of materialised results', required=True) args = parser.parse_args() has_singularity = os.system('which singularity') == 0 @@ -23,9 +23,9 @@ def main(): neo_data_path = os.path.abspath(os.path.join(neo_path, "data")) neo_logs_path = os.path.abspath(os.path.join(neo_path, "logs")) - csvs_path = args.out_csvs_path + jsons_path = args.out_jsons_path - os.makedirs(csvs_path) + os.makedirs(jsons_path) if has_singularity: cmd = ' '.join([ @@ -34,24 +34,29 @@ def main(): '--bind ' + os.path.abspath(".") + ':/mnt', '--bind ' + shlex.quote(neo_data_path) + ':/data', '--bind ' + shlex.quote(neo_logs_path) + ':/logs', - '--bind ' + os.path.abspath(os.path.join(os.environ['GREBI_DATALOAD_HOME'], '08_run_queries/run_queries.dockerpy')) + ':/run_queries.py', + '--bind ' + os.path.abspath(os.path.join(os.environ['GREBI_DATALOAD_HOME'], 
'07_run_queries/run_queries.dockerpy')) + ':/run_queries.py', '--bind ' + os.path.abspath(os.environ['GREBI_QUERY_YAMLS_PATH']) + ':/materialised_queries', - '--bind ' + os.path.abspath(args.out_csvs_path) + ':/out', + '--bind ' + os.path.abspath(args.out_jsons_path) + ':/out', '--writable-tmpfs', '--network=none', '--env NEO4J_AUTH=none', + '--env NEO4J_server_memory_heap_initial__size=300G', + '--env NEO4J_server_memory_heap_max__size=300G', + '--env NEO4J_server_memory_pagecache_size=150G', + '--env NEO4J_dbms_memory_transaction_total_max=150G', + '--env TINI_SUBREAPER=true', 'docker://ghcr.io/ebispot/grebi_neo4j_with_extras:5.18.0', 'python3 /run_queries.py' ]) else: cmd = ' '.join([ 'docker run', - '--user="$(id -u):$(id -g)"' + '--user="$(id -u):$(id -g)"', '-v ' + shlex.quote(neo_data_path) + ':/data', '-v ' + shlex.quote(neo_logs_path) + ':/logs', - '-v ' + os.path.abspath(os.path.join(os.environ['GREBI_DATALOAD_HOME'], '08_run_queries/run_queries.dockerpy')) + ':/run_queries.py', + '-v ' + os.path.abspath(os.path.join(os.environ['GREBI_DATALOAD_HOME'], '07_run_queries/run_queries.dockerpy')) + ':/run_queries.py', '-v ' + os.path.abspath(os.environ['GREBI_QUERY_YAMLS_PATH']) + ':/materialised_queries', - '-v ' + os.path.abspath(args.out_csvs_path) + ':/out', + '-v ' + os.path.abspath(args.out_jsons_path) + ':/out', '-e NEO4J_AUTH=none', 'ghcr.io/ebispot/grebi_neo4j_with_extras:5.18.0', 'python3 /run_queries.py' diff --git a/dataload/08_create_other_dbs/solr/grebi_link_results/Cargo.toml b/dataload/08_create_other_dbs/solr/grebi_link_results/Cargo.toml new file mode 100644 index 0000000..72ff6fb --- /dev/null +++ b/dataload/08_create_other_dbs/solr/grebi_link_results/Cargo.toml @@ -0,0 +1,14 @@ + +[package] +name = "grebi_link_results" +version = "0.1.0" +edition = "2021" + +[dependencies] +serde_json = { version = "1.0.108", features=["preserve_order"] } +grebi_shared = { path = "../../../grebi_shared" } +csv = "1.3.0" +lmdb-zero = "0.4.4" +bloomfilter = 
"1.0.13" +jemallocator = "0.5.4" +clap = { version = "4.4.11", features = ["derive"] } \ No newline at end of file diff --git a/dataload/08_create_other_dbs/solr/grebi_link_results/src/main.rs b/dataload/08_create_other_dbs/solr/grebi_link_results/src/main.rs new file mode 100644 index 0000000..013b338 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/grebi_link_results/src/main.rs @@ -0,0 +1,103 @@ + + + + +use std::collections::{HashMap, HashSet, BTreeSet}; +use std::fs::File; +use std::{env, io}; +use std::io::{BufRead, BufReader }; +use std::io::{Write, BufWriter}; +use grebi_shared::json_lexer::{lex, JsonTokenType}; +use grebi_shared::json_parser::JsonParser; +use clap::Parser; + +use serde_json::Value; + +use grebi_shared::load_metadata_mapping_table; +use grebi_shared::load_groups_txt::load_id_to_group_mapping; + +#[derive(clap::Parser, Debug)] +#[command(author, version, about, long_about = None)] +struct Args { + + #[arg(long)] + in_metadata_jsonl: String, + + #[arg(long)] + groups_txt: String, + + +} + +#[global_allocator] +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; + +fn main() { + + let args = Args::parse(); + + let id_to_group = load_id_to_group_mapping(&args.groups_txt); + + let node_metadata = load_metadata_mapping_table::load_metadata_mapping_table(&args.in_metadata_jsonl); + + let start_time = std::time::Instant::now(); + + let stdin = io::stdin(); + let handle = stdin.lock(); + let mut reader = BufReader::new(handle); + + let stdout = io::stdout().lock(); + let mut writer = BufWriter::new(stdout); + + loop { + let mut line: Vec = Vec::new(); + reader.read_until(b'\n', &mut line).unwrap(); + + if line.len() == 0 { + break; + } + + let mut json:serde_json::Map = serde_json::from_slice(&line).unwrap(); + let mut refs = serde_json::Map::new(); + let mut nodeids:HashSet = HashSet::new(); + + for (k,v) in json.iter() { + + if v.is_null() { + continue; + } + + let k_group = id_to_group.get(k.as_str().as_bytes()); + + if 
k_group.is_some() { + let metadata = node_metadata.get(k_group.unwrap()); + if metadata.is_some() { + refs.insert(k.as_str().to_string(), serde_json::from_slice( metadata.unwrap().json.as_slice() ).unwrap() ); + nodeids.insert(String::from_utf8(k_group.unwrap().to_vec()).unwrap().to_string()); + } + } + + let v_group = id_to_group.get(v.as_str().unwrap().as_bytes()); + + if v_group.is_some() { + let metadata = node_metadata.get(v_group.unwrap()); + if metadata.is_some() { + refs.insert(v.as_str().unwrap().to_string(), serde_json::from_slice( metadata.unwrap().json.as_slice() ).unwrap() ); + nodeids.insert(String::from_utf8(v_group.unwrap().to_vec()).unwrap().to_string()); + } + } + } + + json.insert("_refs".to_string(), Value::Object(refs)); + json.insert("_node_ids".to_string(), Value::Array( nodeids.iter().map(|id| Value::String(id.clone())).collect())); + + writer.write_all(Value::Object(json).to_string().as_bytes()).unwrap(); + writer.write_all("\n".as_bytes()).unwrap(); + + } + + eprintln!("completed id to group mapping in {}", start_time.elapsed().as_secs()); + +} + + diff --git a/dataload/06_prepare_db_import/grebi_make_solr/Cargo.lock b/dataload/08_create_other_dbs/solr/grebi_make_solr/Cargo.lock similarity index 100% rename from dataload/06_prepare_db_import/grebi_make_solr/Cargo.lock rename to dataload/08_create_other_dbs/solr/grebi_make_solr/Cargo.lock diff --git a/dataload/06_prepare_db_import/grebi_make_solr/Cargo.toml b/dataload/08_create_other_dbs/solr/grebi_make_solr/Cargo.toml similarity index 82% rename from dataload/06_prepare_db_import/grebi_make_solr/Cargo.toml rename to dataload/08_create_other_dbs/solr/grebi_make_solr/Cargo.toml index e9c0b79..61523e0 100644 --- a/dataload/06_prepare_db_import/grebi_make_solr/Cargo.toml +++ b/dataload/08_create_other_dbs/solr/grebi_make_solr/Cargo.toml @@ -6,5 +6,5 @@ edition = "2021" [dependencies] clap = { version = "4.4.11", features = ["derive"] } jemallocator = "0.5.4" -grebi_shared = { path = 
"../../grebi_shared" } +grebi_shared = { path = "../../../grebi_shared" } serde_json = { version = "1.0.108", features=["preserve_order"] } diff --git a/dataload/06_prepare_db_import/grebi_make_solr/src/main.rs b/dataload/08_create_other_dbs/solr/grebi_make_solr/src/main.rs similarity index 100% rename from dataload/06_prepare_db_import/grebi_make_solr/src/main.rs rename to dataload/08_create_other_dbs/solr/grebi_make_solr/src/main.rs diff --git a/dataload/06_prepare_db_import/make_solr_autocomplete_config.py b/dataload/08_create_other_dbs/solr/make_solr_autocomplete_config.py similarity index 100% rename from dataload/06_prepare_db_import/make_solr_autocomplete_config.py rename to dataload/08_create_other_dbs/solr/make_solr_autocomplete_config.py diff --git a/dataload/06_prepare_db_import/make_solr_config.py b/dataload/08_create_other_dbs/solr/make_solr_config.py similarity index 94% rename from dataload/06_prepare_db_import/make_solr_config.py rename to dataload/08_create_other_dbs/solr/make_solr_config.py index 7e155b0..5b0e3a1 100644 --- a/dataload/06_prepare_db_import/make_solr_config.py +++ b/dataload/08_create_other_dbs/solr/make_solr_config.py @@ -13,7 +13,7 @@ def main(): parser = argparse.ArgumentParser(description='Create Solr config') parser.add_argument('--subgraph-name', type=str, help='subgraph name', required=True) - parser.add_argument('--in-summary-json', type=str, help='summary.json', required=True) + parser.add_argument('--in-graph-metadata-json', type=str, help='summary.json', required=True) parser.add_argument('--in-template-config-dir', type=str, help='Path of config template', required=True) parser.add_argument('--out-config-dir', type=str, help='Path to write config', required=True) args = parser.parse_args() @@ -29,7 +29,7 @@ def main(): os.system('cp ' + shlex.quote(os.path.join(args.in_template_config_dir, "solrconfig.xml")) + ' ' + shlex.quote(args.out_config_dir)) os.system('cp ' + shlex.quote(os.path.join(args.in_template_config_dir, 
"zoo.cfg")) + ' ' + shlex.quote(args.out_config_dir)) - summary = json.load(open(args.in_summary_json)) + summary = json.load(open(args.in_graph_metadata_json)) node_props = map(lambda f: f.replace(':', '__').replace('&', '_'), summary['entity_props'].keys()) edge_props = map(lambda f: f.replace(':', '__').replace('&', '_'), summary['edge_props'].keys()) diff --git a/dataload/08_create_other_dbs/solr/make_solr_results_config.py b/dataload/08_create_other_dbs/solr/make_solr_results_config.py new file mode 100644 index 0000000..bc31d94 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/make_solr_results_config.py @@ -0,0 +1,31 @@ +import os +import shlex +import argparse +from pathlib import Path +from subprocess import Popen, PIPE, STDOUT + +def main(): + parser = argparse.ArgumentParser(description='Create Solr results config') + parser.add_argument('--subgraph-name', type=str, help='subgraph name', required=True) + parser.add_argument('--query-id', type=str, help='query id', required=True) + parser.add_argument('--in-template-config-dir', type=str, help='Path of config template', required=True) + parser.add_argument('--out-config-dir', type=str, help='Path to write config', required=True) + args = parser.parse_args() + + os.makedirs(args.out_config_dir) + + results_core_path = os.path.join(args.out_config_dir, f'grebi_results__{args.subgraph_name}__{args.query_id}') + os.system('cp -r ' + shlex.quote(os.path.join(args.in_template_config_dir, "grebi_results")) + ' ' + shlex.quote(results_core_path)) + + os.system('cp ' + shlex.quote(os.path.join(args.in_template_config_dir, "solr.xml")) + ' ' + shlex.quote(args.out_config_dir)) + os.system('cp ' + shlex.quote(os.path.join(args.in_template_config_dir, "solrconfig.xml")) + ' ' + shlex.quote(args.out_config_dir)) + os.system('cp ' + shlex.quote(os.path.join(args.in_template_config_dir, "zoo.cfg")) + ' ' + shlex.quote(args.out_config_dir)) + + 
Path(f'{results_core_path}/core.properties').write_text(f"name=grebi_results__{args.subgraph_name}__{args.query_id}\n") + +if __name__=="__main__": + main() + + + + diff --git a/dataload/06_prepare_db_import/solr_config_template/README.md b/dataload/08_create_other_dbs/solr/solr_config_template/README.md similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/README.md rename to dataload/08_create_other_dbs/solr/solr_config_template/README.md diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_autocomplete/conf/_rest_managed.json b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_autocomplete/conf/_rest_managed.json similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_autocomplete/conf/_rest_managed.json rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_autocomplete/conf/_rest_managed.json diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_autocomplete/conf/lang/stopwords_en.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_autocomplete/conf/lang/stopwords_en.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_autocomplete/conf/lang/stopwords_en.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_autocomplete/conf/lang/stopwords_en.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_autocomplete/conf/protwords.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_autocomplete/conf/protwords.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_autocomplete/conf/protwords.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_autocomplete/conf/protwords.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_autocomplete/conf/schema.xml b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_autocomplete/conf/schema.xml similarity 
index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_autocomplete/conf/schema.xml rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_autocomplete/conf/schema.xml diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_autocomplete/conf/solrconfig.xml b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_autocomplete/conf/solrconfig.xml similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_autocomplete/conf/solrconfig.xml rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_autocomplete/conf/solrconfig.xml diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_autocomplete/conf/stopwords.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_autocomplete/conf/stopwords.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_autocomplete/conf/stopwords.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_autocomplete/conf/stopwords.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_autocomplete/conf/synonyms.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_autocomplete/conf/synonyms.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_autocomplete/conf/synonyms.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_autocomplete/conf/synonyms.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_autocomplete/core.properties b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_autocomplete/core.properties similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_autocomplete/core.properties rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_autocomplete/core.properties diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/contractions_ca.txt 
b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/contractions_ca.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/contractions_ca.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/contractions_ca.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/contractions_fr.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/contractions_fr.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/contractions_fr.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/contractions_fr.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/contractions_ga.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/contractions_ga.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/contractions_ga.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/contractions_ga.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/contractions_it.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/contractions_it.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/contractions_it.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/contractions_it.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/hyphenations_ga.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/hyphenations_ga.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/hyphenations_ga.txt rename to 
dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/hyphenations_ga.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stemdict_nl.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stemdict_nl.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stemdict_nl.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stemdict_nl.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stoptags_ja.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stoptags_ja.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stoptags_ja.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stoptags_ja.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_ar.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_ar.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_ar.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_ar.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_bg.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_bg.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_bg.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_bg.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_ca.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_ca.txt 
similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_ca.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_ca.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_cz.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_cz.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_cz.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_cz.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_da.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_da.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_da.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_da.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_de.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_de.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_de.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_de.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_el.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_el.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_el.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_el.txt diff --git 
a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_en.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_en.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_en.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_en.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_es.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_es.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_es.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_es.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_et.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_et.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_et.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_et.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_eu.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_eu.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_eu.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_eu.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_fa.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_fa.txt similarity index 100% rename from 
dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_fa.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_fa.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_fi.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_fi.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_fi.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_fi.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_fr.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_fr.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_fr.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_fr.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_ga.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_ga.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_ga.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_ga.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_gl.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_gl.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_gl.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_gl.txt diff --git 
a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_hi.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_hi.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_hi.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_hi.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_hu.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_hu.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_hu.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_hu.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_hy.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_hy.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_hy.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_hy.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_id.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_id.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_id.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_id.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_it.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_it.txt similarity index 100% rename from 
dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_it.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_it.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_ja.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_ja.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_ja.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_ja.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_lv.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_lv.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_lv.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_lv.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_nl.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_nl.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_nl.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_nl.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_no.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_no.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_no.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_no.txt diff --git 
a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_pt.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_pt.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_pt.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_pt.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_ro.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_ro.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_ro.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_ro.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_ru.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_ru.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_ru.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_ru.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_sv.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_sv.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_sv.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_sv.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_th.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_th.txt similarity index 100% rename from 
dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_th.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_th.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_tr.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_tr.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/stopwords_tr.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/stopwords_tr.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/userdict_ja.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/userdict_ja.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/lang/userdict_ja.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/lang/userdict_ja.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/protwords.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/protwords.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/protwords.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/protwords.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/schema.xml b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/schema.xml similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/schema.xml rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/schema.xml diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/solrconfig.xml 
b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/solrconfig.xml similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/solrconfig.xml rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/solrconfig.xml diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/stopwords.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/stopwords.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/stopwords.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/stopwords.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/synonyms.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/synonyms.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/conf/synonyms.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/conf/synonyms.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_edges/core.properties b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/core.properties similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_edges/core.properties rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_edges/core.properties diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/contractions_ca.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/contractions_ca.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/contractions_ca.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/contractions_ca.txt diff --git 
a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/contractions_fr.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/contractions_fr.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/contractions_fr.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/contractions_fr.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/contractions_ga.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/contractions_ga.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/contractions_ga.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/contractions_ga.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/contractions_it.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/contractions_it.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/contractions_it.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/contractions_it.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/hyphenations_ga.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/hyphenations_ga.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/hyphenations_ga.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/hyphenations_ga.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stemdict_nl.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stemdict_nl.txt similarity index 100% rename from 
dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stemdict_nl.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stemdict_nl.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stoptags_ja.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stoptags_ja.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stoptags_ja.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stoptags_ja.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_ar.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_ar.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_ar.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_ar.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_bg.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_bg.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_bg.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_bg.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_ca.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_ca.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_ca.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_ca.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_cz.txt 
b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_cz.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_cz.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_cz.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_da.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_da.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_da.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_da.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_de.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_de.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_de.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_de.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_el.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_el.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_el.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_el.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_en.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_en.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_en.txt rename to 
dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_en.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_es.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_es.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_es.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_es.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_et.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_et.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_et.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_et.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_eu.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_eu.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_eu.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_eu.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_fa.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_fa.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_fa.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_fa.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_fi.txt 
b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_fi.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_fi.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_fi.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_fr.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_fr.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_fr.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_fr.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_ga.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_ga.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_ga.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_ga.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_gl.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_gl.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_gl.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_gl.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_hi.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_hi.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_hi.txt rename to 
dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_hi.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_hu.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_hu.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_hu.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_hu.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_hy.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_hy.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_hy.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_hy.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_id.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_id.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_id.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_id.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_it.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_it.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_it.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_it.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_ja.txt 
b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_ja.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_ja.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_ja.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_lv.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_lv.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_lv.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_lv.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_nl.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_nl.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_nl.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_nl.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_no.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_no.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_no.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_no.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_pt.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_pt.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_pt.txt rename to 
dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_pt.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_ro.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_ro.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_ro.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_ro.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_ru.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_ru.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_ru.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_ru.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_sv.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_sv.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_sv.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_sv.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_th.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_th.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_th.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_th.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_tr.txt 
b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_tr.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/stopwords_tr.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/stopwords_tr.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/userdict_ja.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/userdict_ja.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/lang/userdict_ja.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/lang/userdict_ja.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/protwords.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/protwords.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/protwords.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/protwords.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/schema.xml b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/schema.xml similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/schema.xml rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/schema.xml diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/solrconfig.xml b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/solrconfig.xml similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/solrconfig.xml rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/solrconfig.xml diff --git 
a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/stopwords.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/stopwords.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/stopwords.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/stopwords.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/synonyms.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/synonyms.txt similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/conf/synonyms.txt rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/conf/synonyms.txt diff --git a/dataload/06_prepare_db_import/solr_config_template/grebi_nodes/core.properties b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/core.properties similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/grebi_nodes/core.properties rename to dataload/08_create_other_dbs/solr/solr_config_template/grebi_nodes/core.properties diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/contractions_ca.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/contractions_ca.txt new file mode 100644 index 0000000..307a85f --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/contractions_ca.txt @@ -0,0 +1,8 @@ +# Set of Catalan contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +d +l +m +n +s +t diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/contractions_fr.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/contractions_fr.txt new file mode 100644 index 0000000..f1bba51 --- /dev/null +++ 
b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/contractions_fr.txt @@ -0,0 +1,15 @@ +# Set of French contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +l +m +t +qu +n +s +j +d +c +jusqu +quoiqu +lorsqu +puisqu diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/contractions_ga.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/contractions_ga.txt new file mode 100644 index 0000000..9ebe7fa --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/contractions_ga.txt @@ -0,0 +1,5 @@ +# Set of Irish contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +d +m +b diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/contractions_it.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/contractions_it.txt new file mode 100644 index 0000000..cac0409 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/contractions_it.txt @@ -0,0 +1,23 @@ +# Set of Italian contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +c +l +all +dall +dell +nell +sull +coll +pell +gl +agl +dagl +degl +negl +sugl +un +m +t +s +v +d diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/hyphenations_ga.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/hyphenations_ga.txt new file mode 100644 index 0000000..4d2642c --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/hyphenations_ga.txt @@ -0,0 +1,5 @@ +# Set of Irish hyphenations for StopFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +h +n +t diff --git 
a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stemdict_nl.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stemdict_nl.txt new file mode 100644 index 0000000..4410729 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stemdict_nl.txt @@ -0,0 +1,6 @@ +# Set of overrides for the dutch stemmer +# TODO: load this as a resource from the analyzer and sync it in build.xml +fiets fiets +bromfiets bromfiets +ei eier +kind kinder diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stoptags_ja.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stoptags_ja.txt new file mode 100644 index 0000000..71b7508 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stoptags_ja.txt @@ -0,0 +1,420 @@ +# +# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter. +# +# Any token with a part-of-speech tag that exactly matches those defined in this +# file are removed from the token stream. +# +# Set your own stoptags by uncommenting the lines below. Note that comments are +# not allowed on the same line as a stoptag. See LUCENE-3745 for frequency lists, +# etc. that can be useful for building you own stoptag set. +# +# The entire possible tagset is provided below for convenience. +# +##### +# noun: unclassified nouns +#名詞 +# +# noun-common: Common nouns or nouns where the sub-classification is undefined +#名詞-一般 +# +# noun-proper: Proper nouns where the sub-classification is undefined +#名詞-固有名詞 +# +# noun-proper-misc: miscellaneous proper nouns +#名詞-固有名詞-一般 +# +# noun-proper-person: Personal names where the sub-classification is undefined +#名詞-固有名詞-人名 +# +# noun-proper-person-misc: names that cannot be divided into surname and +# given name; foreign names; names where the surname or given name is unknown. +# e.g. 
お市の方 +#名詞-固有名詞-人名-一般 +# +# noun-proper-person-surname: Mainly Japanese surnames. +# e.g. 山田 +#名詞-固有名詞-人名-姓 +# +# noun-proper-person-given_name: Mainly Japanese given names. +# e.g. 太郎 +#名詞-固有名詞-人名-名 +# +# noun-proper-organization: Names representing organizations. +# e.g. 通産省, NHK +#名詞-固有名詞-組織 +# +# noun-proper-place: Place names where the sub-classification is undefined +#名詞-固有名詞-地域 +# +# noun-proper-place-misc: Place names excluding countries. +# e.g. アジア, バルセロナ, 京都 +#名詞-固有名詞-地域-一般 +# +# noun-proper-place-country: Country names. +# e.g. 日本, オーストラリア +#名詞-固有名詞-地域-国 +# +# noun-pronoun: Pronouns where the sub-classification is undefined +#名詞-代名詞 +# +# noun-pronoun-misc: miscellaneous pronouns: +# e.g. それ, ここ, あいつ, あなた, あちこち, いくつ, どこか, なに, みなさん, みんな, わたくし, われわれ +#名詞-代名詞-一般 +# +# noun-pronoun-contraction: Spoken language contraction made by combining a +# pronoun and the particle 'wa'. +# e.g. ありゃ, こりゃ, こりゃあ, そりゃ, そりゃあ +#名詞-代名詞-縮約 +# +# noun-adverbial: Temporal nouns such as names of days or months that behave +# like adverbs. Nouns that represent amount or ratios and can be used adverbially, +# e.g. 金曜, 一月, 午後, 少量 +#名詞-副詞可能 +# +# noun-verbal: Nouns that take arguments with case and can appear followed by +# 'suru' and related verbs (する, できる, なさる, くださる) +# e.g. インプット, 愛着, 悪化, 悪戦苦闘, 一安心, 下取り +#名詞-サ変接続 +# +# noun-adjective-base: The base form of adjectives, words that appear before な ("na") +# e.g. 健康, 安易, 駄目, だめ +#名詞-形容動詞語幹 +# +# noun-numeric: Arabic numbers, Chinese numerals, and counters like 何 (回), 数. +# e.g. 0, 1, 2, 何, 数, 幾 +#名詞-数 +# +# noun-affix: noun affixes where the sub-classification is undefined +#名詞-非自立 +# +# noun-affix-misc: Of adnominalizers, the case-marker の ("no"), and words that +# attach to the base form of inflectional words, words that cannot be classified +# into any of the other categories below. This category includes indefinite nouns. +# e.g. 
あかつき, 暁, かい, 甲斐, 気, きらい, 嫌い, くせ, 癖, こと, 事, ごと, 毎, しだい, 次第, +# 順, せい, 所為, ついで, 序で, つもり, 積もり, 点, どころ, の, はず, 筈, はずみ, 弾み, +# 拍子, ふう, ふり, 振り, ほう, 方, 旨, もの, 物, 者, ゆえ, 故, ゆえん, 所以, わけ, 訳, +# わり, 割り, 割, ん-口語/, もん-口語/ +#名詞-非自立-一般 +# +# noun-affix-adverbial: noun affixes that that can behave as adverbs. +# e.g. あいだ, 間, あげく, 挙げ句, あと, 後, 余り, 以外, 以降, 以後, 以上, 以前, 一方, うえ, +# 上, うち, 内, おり, 折り, かぎり, 限り, きり, っきり, 結果, ころ, 頃, さい, 際, 最中, さなか, +# 最中, じたい, 自体, たび, 度, ため, 為, つど, 都度, とおり, 通り, とき, 時, ところ, 所, +# とたん, 途端, なか, 中, のち, 後, ばあい, 場合, 日, ぶん, 分, ほか, 他, まえ, 前, まま, +# 儘, 侭, みぎり, 矢先 +#名詞-非自立-副詞可能 +# +# noun-affix-aux: noun affixes treated as 助動詞 ("auxiliary verb") in school grammars +# with the stem よう(だ) ("you(da)"). +# e.g. よう, やう, 様 (よう) +#名詞-非自立-助動詞語幹 +# +# noun-affix-adjective-base: noun affixes that can connect to the indeclinable +# connection form な (aux "da"). +# e.g. みたい, ふう +#名詞-非自立-形容動詞語幹 +# +# noun-special: special nouns where the sub-classification is undefined. +#名詞-特殊 +# +# noun-special-aux: The そうだ ("souda") stem form that is used for reporting news, is +# treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the base +# form of inflectional words. +# e.g. そう +#名詞-特殊-助動詞語幹 +# +# noun-suffix: noun suffixes where the sub-classification is undefined. +#名詞-接尾 +# +# noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect +# to ガル or タイ and can combine into compound nouns, words that cannot be classified into +# any of the other categories below. In general, this category is more inclusive than +# 接尾語 ("suffix") and is usually the last element in a compound noun. +# e.g. おき, かた, 方, 甲斐 (がい), がかり, ぎみ, 気味, ぐるみ, (~した) さ, 次第, 済 (ず) み, +# よう, (でき)っこ, 感, 観, 性, 学, 類, 面, 用 +#名詞-接尾-一般 +# +# noun-suffix-person: Suffixes that form nouns and attach to person names more often +# than other nouns. +# e.g. 君, 様, 著 +#名詞-接尾-人名 +# +# noun-suffix-place: Suffixes that form nouns and attach to place names more often +# than other nouns. +# e.g. 
町, 市, 県 +#名詞-接尾-地域 +# +# noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that +# can appear before スル ("suru"). +# e.g. 化, 視, 分け, 入り, 落ち, 買い +#名詞-接尾-サ変接続 +# +# noun-suffix-aux: The stem form of そうだ (様態) that is used to indicate conditions, +# is treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the +# conjunctive form of inflectional words. +# e.g. そう +#名詞-接尾-助動詞語幹 +# +# noun-suffix-adjective-base: Suffixes that attach to other nouns or the conjunctive +# form of inflectional words and appear before the copula だ ("da"). +# e.g. 的, げ, がち +#名詞-接尾-形容動詞語幹 +# +# noun-suffix-adverbial: Suffixes that attach to other nouns and can behave as adverbs. +# e.g. 後 (ご), 以後, 以降, 以前, 前後, 中, 末, 上, 時 (じ) +#名詞-接尾-副詞可能 +# +# noun-suffix-classifier: Suffixes that attach to numbers and form nouns. This category +# is more inclusive than 助数詞 ("classifier") and includes common nouns that attach +# to numbers. +# e.g. 個, つ, 本, 冊, パーセント, cm, kg, カ月, か国, 区画, 時間, 時半 +#名詞-接尾-助数詞 +# +# noun-suffix-special: Special suffixes that mainly attach to inflecting words. +# e.g. (楽し) さ, (考え) 方 +#名詞-接尾-特殊 +# +# noun-suffix-conjunctive: Nouns that behave like conjunctions and join two words +# together. +# e.g. (日本) 対 (アメリカ), 対 (アメリカ), (3) 対 (5), (女優) 兼 (主婦) +#名詞-接続詞的 +# +# noun-verbal_aux: Nouns that attach to the conjunctive particle て ("te") and are +# semantically verb-like. +# e.g. ごらん, ご覧, 御覧, 頂戴 +#名詞-動詞非自立的 +# +# noun-quotation: text that cannot be segmented into words, proverbs, Chinese poetry, +# dialects, English, etc. Currently, the only entry for 名詞 引用文字列 ("noun quotation") +# is いわく ("iwaku"). +#名詞-引用文字列 +# +# noun-nai_adjective: Words that appear before the auxiliary verb ない ("nai") and +# behave like an adjective. +# e.g. 申し訳, 仕方, とんでも, 違い +#名詞-ナイ形容詞語幹 +# +##### +# prefix: unclassified prefixes +#接頭詞 +# +# prefix-nominal: Prefixes that attach to nouns (including adjective stem forms) +# excluding numerical expressions. +# e.g. 
お (水), 某 (氏), 同 (社), 故 (~氏), 高 (品質), お (見事), ご (立派) +#接頭詞-名詞接続 +# +# prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb +# in conjunctive form followed by なる/なさる/くださる. +# e.g. お (読みなさい), お (座り) +#接頭詞-動詞接続 +# +# prefix-adjectival: Prefixes that attach to adjectives. +# e.g. お (寒いですねえ), バカ (でかい) +#接頭詞-形容詞接続 +# +# prefix-numerical: Prefixes that attach to numerical expressions. +# e.g. 約, およそ, 毎時 +#接頭詞-数接続 +# +##### +# verb: unclassified verbs +#動詞 +# +# verb-main: +#動詞-自立 +# +# verb-auxiliary: +#動詞-非自立 +# +# verb-suffix: +#動詞-接尾 +# +##### +# adjective: unclassified adjectives +#形容詞 +# +# adjective-main: +#形容詞-自立 +# +# adjective-auxiliary: +#形容詞-非自立 +# +# adjective-suffix: +#形容詞-接尾 +# +##### +# adverb: unclassified adverbs +#副詞 +# +# adverb-misc: Words that can be segmented into one unit and where adnominal +# modification is not possible. +# e.g. あいかわらず, 多分 +#副詞-一般 +# +# adverb-particle_conjunction: Adverbs that can be followed by の, は, に, +# な, する, だ, etc. +# e.g. こんなに, そんなに, あんなに, なにか, なんでも +#副詞-助詞類接続 +# +##### +# adnominal: Words that only have noun-modifying forms. +# e.g. この, その, あの, どの, いわゆる, なんらかの, 何らかの, いろんな, こういう, そういう, ああいう, +# どういう, こんな, そんな, あんな, どんな, 大きな, 小さな, おかしな, ほんの, たいした, +# 「(, も) さる (ことながら)」, 微々たる, 堂々たる, 単なる, いかなる, 我が」「同じ, 亡き +#連体詞 +# +##### +# conjunction: Conjunctions that can occur independently. +# e.g. が, けれども, そして, じゃあ, それどころか +接続詞 +# +##### +# particle: unclassified particles. +助詞 +# +# particle-case: case particles where the subclassification is undefined. +助詞-格助詞 +# +# particle-case-misc: Case particles. +# e.g. から, が, で, と, に, へ, より, を, の, にて +助詞-格助詞-一般 +# +# particle-case-quote: the "to" that appears after nouns, a person’s speech, +# quotation marks, expressions of decisions from a meeting, reasons, judgements, +# conjectures, etc. +# e.g. ( だ) と (述べた.), ( である) と (して執行猶予...) +助詞-格助詞-引用 +# +# particle-case-compound: Compounds of particles and verbs that mainly behave +# like case particles. +# e.g. 
という, といった, とかいう, として, とともに, と共に, でもって, にあたって, に当たって, に当って, +# にあたり, に当たり, に当り, に当たる, にあたる, において, に於いて,に於て, における, に於ける, +# にかけ, にかけて, にかんし, に関し, にかんして, に関して, にかんする, に関する, に際し, +# に際して, にしたがい, に従い, に従う, にしたがって, に従って, にたいし, に対し, にたいして, +# に対して, にたいする, に対する, について, につき, につけ, につけて, につれ, につれて, にとって, +# にとり, にまつわる, によって, に依って, に因って, により, に依り, に因り, による, に依る, に因る, +# にわたって, にわたる, をもって, を以って, を通じ, を通じて, を通して, をめぐって, をめぐり, をめぐる, +# って-口語/, ちゅう-関西弁「という」/, (何) ていう (人)-口語/, っていう-口語/, といふ, とかいふ +助詞-格助詞-連語 +# +# particle-conjunctive: +# e.g. から, からには, が, けれど, けれども, けど, し, つつ, て, で, と, ところが, どころか, とも, ども, +# ながら, なり, ので, のに, ば, ものの, や ( した), やいなや, (ころん) じゃ(いけない)-口語/, +# (行っ) ちゃ(いけない)-口語/, (言っ) たって (しかたがない)-口語/, (それがなく)ったって (平気)-口語/ +助詞-接続助詞 +# +# particle-dependency: +# e.g. こそ, さえ, しか, すら, は, も, ぞ +助詞-係助詞 +# +# particle-adverbial: +# e.g. がてら, かも, くらい, 位, ぐらい, しも, (学校) じゃ(これが流行っている)-口語/, +# (それ)じゃあ (よくない)-口語/, ずつ, (私) なぞ, など, (私) なり (に), (先生) なんか (大嫌い)-口語/, +# (私) なんぞ, (先生) なんて (大嫌い)-口語/, のみ, だけ, (私) だって-口語/, だに, +# (彼)ったら-口語/, (お茶) でも (いかが), 等 (とう), (今後) とも, ばかり, ばっか-口語/, ばっかり-口語/, +# ほど, 程, まで, 迄, (誰) も (が)([助詞-格助詞] および [助詞-係助詞] の前に位置する「も」) +助詞-副助詞 +# +# particle-interjective: particles with interjective grammatical roles. +# e.g. (松島) や +助詞-間投助詞 +# +# particle-coordinate: +# e.g. と, たり, だの, だり, とか, なり, や, やら +助詞-並立助詞 +# +# particle-final: +# e.g. かい, かしら, さ, ぜ, (だ)っけ-口語/, (とまってる) で-方言/, な, ナ, なあ-口語/, ぞ, ね, ネ, +# ねぇ-口語/, ねえ-口語/, ねん-方言/, の, のう-口語/, や, よ, ヨ, よぉ-口語/, わ, わい-口語/ +助詞-終助詞 +# +# particle-adverbial/conjunctive/final: The particle "ka" when unknown whether it is +# adverbial, conjunctive, or sentence final. For example: +# (a) 「A か B か」. Ex:「(国内で運用する) か,(海外で運用する) か (.)」 +# (b) Inside an adverb phrase. Ex:「(幸いという) か (, 死者はいなかった.)」 +# 「(祈りが届いたせい) か (, 試験に合格した.)」 +# (c) 「かのように」. Ex:「(何もなかった) か (のように振る舞った.)」 +# e.g. か +助詞-副助詞/並立助詞/終助詞 +# +# particle-adnominalizer: The "no" that attaches to nouns and modifies +# non-inflectional words. 
+助詞-連体化 +# +# particle-adnominalizer: The "ni" and "to" that appear following nouns and adverbs +# that are giongo, giseigo, or gitaigo. +# e.g. に, と +助詞-副詞化 +# +# particle-special: A particle that does not fit into one of the above classifications. +# This includes particles that are used in Tanka, Haiku, and other poetry. +# e.g. かな, けむ, ( しただろう) に, (あんた) にゃ(わからん), (俺) ん (家) +助詞-特殊 +# +##### +# auxiliary-verb: +助動詞 +# +##### +# interjection: Greetings and other exclamations. +# e.g. おはよう, おはようございます, こんにちは, こんばんは, ありがとう, どうもありがとう, ありがとうございます, +# いただきます, ごちそうさま, さよなら, さようなら, はい, いいえ, ごめん, ごめんなさい +#感動詞 +# +##### +# symbol: unclassified Symbols. +記号 +# +# symbol-misc: A general symbol not in one of the categories below. +# e.g. [○◎@$〒→+] +記号-一般 +# +# symbol-comma: Commas +# e.g. [,、] +記号-読点 +# +# symbol-period: Periods and full stops. +# e.g. [..。] +記号-句点 +# +# symbol-space: Full-width whitespace. +記号-空白 +# +# symbol-open_bracket: +# e.g. [({‘“『【] +記号-括弧開 +# +# symbol-close_bracket: +# e.g. [)}’”』」】] +記号-括弧閉 +# +# symbol-alphabetic: +#記号-アルファベット +# +##### +# other: unclassified other +#その他 +# +# other-interjection: Words that are hard to classify as noun-suffixes or +# sentence-final particles. +# e.g. (だ)ァ +その他-間投 +# +##### +# filler: Aizuchi that occurs during a conversation or sounds inserted as filler. +# e.g. あの, うんと, えと +フィラー +# +##### +# non-verbal: non-verbal sound. +非言語音 +# +##### +# fragment: +#語断片 +# +##### +# unknown: unknown part of speech. +#未知語 +# +##### End of file diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_ar.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_ar.txt new file mode 100644 index 0000000..046829d --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_ar.txt @@ -0,0 +1,125 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. 
+# See http://members.unine.ch/jacques.savoy/clef/index.html. +# Also see http://www.opensource.org/licenses/bsd-license.html +# Cleaned on October 11, 2009 (not normalized, so use before normalization) +# This means that when modifying this list, you might need to add some +# redundant entries, for example containing forms with both أ and ا +من +ومن +منها +منه +في +وفي +فيها +فيه +و +ف +ثم +او +أو +ب +بها +به +ا +أ +اى +اي +أي +أى +لا +ولا +الا +ألا +إلا +لكن +ما +وما +كما +فما +عن +مع +اذا +إذا +ان +أن +إن +انها +أنها +إنها +انه +أنه +إنه +بان +بأن +فان +فأن +وان +وأن +وإن +التى +التي +الذى +الذي +الذين +الى +الي +إلى +إلي +على +عليها +عليه +اما +أما +إما +ايضا +أيضا +كل +وكل +لم +ولم +لن +ولن +هى +هي +هو +وهى +وهي +وهو +فهى +فهي +فهو +انت +أنت +لك +لها +له +هذه +هذا +تلك +ذلك +هناك +كانت +كان +يكون +تكون +وكانت +وكان +غير +بعض +قد +نحو +بين +بينما +منذ +ضمن +حيث +الان +الآن +خلال +بعد +قبل +حتى +عند +عندما +لدى +جميع diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_bg.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_bg.txt new file mode 100644 index 0000000..1ae4ba2 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_bg.txt @@ -0,0 +1,193 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. 
+# Also see http://www.opensource.org/licenses/bsd-license.html +а +аз +ако +ала +бе +без +беше +би +бил +била +били +било +близо +бъдат +бъде +бяха +в +вас +ваш +ваша +вероятно +вече +взема +ви +вие +винаги +все +всеки +всички +всичко +всяка +във +въпреки +върху +г +ги +главно +го +д +да +дали +до +докато +докога +дори +досега +доста +е +едва +един +ето +за +зад +заедно +заради +засега +затова +защо +защото +и +из +или +им +има +имат +иска +й +каза +как +каква +какво +както +какъв +като +кога +когато +което +които +кой +който +колко +която +къде +където +към +ли +м +ме +между +мен +ми +мнозина +мога +могат +може +моля +момента +му +н +на +над +назад +най +направи +напред +например +нас +не +него +нея +ни +ние +никой +нито +но +някои +някой +няма +обаче +около +освен +особено +от +отгоре +отново +още +пак +по +повече +повечето +под +поне +поради +после +почти +прави +пред +преди +през +при +пък +първо +с +са +само +се +сега +си +скоро +след +сме +според +сред +срещу +сте +съм +със +също +т +тази +така +такива +такъв +там +твой +те +тези +ти +тн +то +това +тогава +този +той +толкова +точно +трябва +тук +тъй +тя +тях +у +харесва +ч +че +често +чрез +ще +щом +я diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_ca.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_ca.txt new file mode 100644 index 0000000..3da65de --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_ca.txt @@ -0,0 +1,220 @@ +# Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed) +a +abans +ací +ah +així +això +al +als +aleshores +algun +alguna +algunes +alguns +alhora +allà +allí +allò +altra +altre +altres +amb +ambdós +ambdues +apa +aquell +aquella +aquelles +aquells +aquest +aquesta +aquestes +aquests +aquí +baix +cada +cadascú +cadascuna +cadascunes +cadascuns +com +contra +d'un +d'una +d'unes +d'uns +dalt +de +del +dels 
+des +després +dins +dintre +donat +doncs +durant +e +eh +el +els +em +en +encara +ens +entre +érem +eren +éreu +es +és +esta +està +estàvem +estaven +estàveu +esteu +et +etc +ets +fins +fora +gairebé +ha +han +has +havia +he +hem +heu +hi +ho +i +igual +iguals +ja +l'hi +la +les +li +li'n +llavors +m'he +ma +mal +malgrat +mateix +mateixa +mateixes +mateixos +me +mentre +més +meu +meus +meva +meves +molt +molta +moltes +molts +mon +mons +n'he +n'hi +ne +ni +no +nogensmenys +només +nosaltres +nostra +nostre +nostres +o +oh +oi +on +pas +pel +pels +per +però +perquè +poc +poca +pocs +poques +potser +propi +qual +quals +quan +quant +que +què +quelcom +qui +quin +quina +quines +quins +s'ha +s'han +sa +semblant +semblants +ses +seu +seus +seva +seva +seves +si +sobre +sobretot +sóc +solament +sols +son +són +sons +sota +sou +t'ha +t'han +t'he +ta +tal +també +tampoc +tan +tant +tanta +tantes +teu +teus +teva +teves +ton +tons +tot +tota +totes +tots +un +una +unes +uns +us +va +vaig +vam +van +vas +veu +vosaltres +vostra +vostre +vostres diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_cz.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_cz.txt new file mode 100644 index 0000000..53c6097 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_cz.txt @@ -0,0 +1,172 @@ +a +s +k +o +i +u +v +z +dnes +cz +tímto +budeš +budem +byli +jseš +můj +svým +ta +tomto +tohle +tuto +tyto +jej +zda +proč +máte +tato +kam +tohoto +kdo +kteří +mi +nám +tom +tomuto +mít +nic +proto +kterou +byla +toho +protože +asi +ho +naši +napište +re +což +tím +takže +svých +její +svými +jste +aj +tu +tedy +teto +bylo +kde +ke +pravé +ji +nad +nejsou +či +pod +téma +mezi +přes +ty +pak +vám +ani +když +však +neg +jsem +tento +článku +články +aby +jsme +před +pta +jejich +byl +ještě +až +bez +také +pouze +první +vaše +která +nás +nový +tipy +pokud +může 
+strana +jeho +své +jiné +zprávy +nové +není +vás +jen +podle +zde +už +být +více +bude +již +než +který +by +které +co +nebo +ten +tak +má +při +od +po +jsou +jak +další +ale +si +se +ve +to +jako +za +zpět +ze +do +pro +je +na +atd +atp +jakmile +přičemž +já +on +ona +ono +oni +ony +my +vy +jí +ji +mě +mne +jemu +tomu +těm +těmu +němu +němuž +jehož +jíž +jelikož +jež +jakož +načež diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_da.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_da.txt new file mode 100644 index 0000000..42e6145 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_da.txt @@ -0,0 +1,110 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Danish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + +og | and +i | in +jeg | I +det | that (dem. pronoun)/it (pers. pronoun) +at | that (in front of a sentence)/to (with infinitive) +en | a/an +den | it (pers. pronoun)/that (dem. 
pronoun) +til | to/at/for/until/against/by/of/into, more +er | present tense of "to be" +som | who, as +på | on/upon/in/on/at/to/after/of/with/for, on +de | they +med | with/by/in, along +han | he +af | of/by/from/off/for/in/with/on, off +for | at/for/to/from/by/of/ago, in front/before, because +ikke | not +der | who/which, there/those +var | past tense of "to be" +mig | me/myself +sig | oneself/himself/herself/itself/themselves +men | but +et | a/an/one, one (number), someone/somebody/one +har | present tense of "to have" +om | round/about/for/in/a, about/around/down, if +vi | we +min | my +havde | past tense of "to have" +ham | him +hun | she +nu | now +over | over/above/across/by/beyond/past/on/about, over/past +da | then, when/as/since +fra | from/off/since, off, since +du | you +ud | out +sin | his/her/its/one's +dem | them +os | us/ourselves +op | up +man | you/one +hans | his +hvor | where +eller | or +hvad | what +skal | must/shall etc. +selv | myself/youself/herself/ourselves etc., even +her | here +alle | all/everyone/everybody etc. 
+vil | will (verb) +blev | past tense of "to stay/to remain/to get/to become" +kunne | could +ind | in +når | when +være | present tense of "to be" +dog | however/yet/after all +noget | something +ville | would +jo | you know/you see (adv), yes +deres | their/theirs +efter | after/behind/according to/for/by/from, later/afterwards +ned | down +skulle | should +denne | this +end | than +dette | this +mit | my/mine +også | also +under | under/beneath/below/during, below/underneath +have | have +dig | you +anden | other +hende | her +mine | my +alt | everything +meget | much/very, plenty of +sit | his, her, its, one's +sine | his, her, its, one's +vor | our +mod | against +disse | these +hvis | if +din | your/yours +nogle | some +hos | by/at +blive | be/become +mange | many +ad | by/through +bliver | present tense of "to be/to become" +hendes | her/hers +været | be +thi | for (conj) +jer | you +sådan | such, like this/like that diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_de.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_de.txt new file mode 100644 index 0000000..86525e7 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_de.txt @@ -0,0 +1,294 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A German stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | The number of forms in this list is reduced significantly by passing it + | through the German stemmer. 
+ + +aber | but + +alle | all +allem +allen +aller +alles + +als | than, as +also | so +am | an + dem +an | at + +ander | other +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders + +auch | also +auf | on +aus | out of +bei | by +bin | am +bis | until +bist | art +da | there +damit | with it +dann | then + +der | the +den +des +dem +die +das + +daß | that + +derselbe | the same +derselben +denselben +desselben +demselben +dieselbe +dieselben +dasselbe + +dazu | to that + +dein | thy +deine +deinem +deinen +deiner +deines + +denn | because + +derer | of those +dessen | of him + +dich | thee +dir | to thee +du | thou + +dies | this +diese +diesem +diesen +dieser +dieses + + +doch | (several meanings) +dort | (over) there + + +durch | through + +ein | a +eine +einem +einen +einer +eines + +einig | some +einige +einigem +einigen +einiger +einiges + +einmal | once + +er | he +ihn | him +ihm | to him + +es | it +etwas | something + +euer | your +eure +eurem +euren +eurer +eures + +für | for +gegen | towards +gewesen | p.p. 
of sein +hab | have +habe | have +haben | have +hat | has +hatte | had +hatten | had +hier | here +hin | there +hinter | behind + +ich | I +mich | me +mir | to me + + +ihr | you, to her +ihre +ihrem +ihren +ihrer +ihres +euch | to you + +im | in + dem +in | in +indem | while +ins | in + das +ist | is + +jede | each, every +jedem +jeden +jeder +jedes + +jene | that +jenem +jenen +jener +jenes + +jetzt | now +kann | can + +kein | no +keine +keinem +keinen +keiner +keines + +können | can +könnte | could +machen | do +man | one + +manche | some, many a +manchem +manchen +mancher +manches + +mein | my +meine +meinem +meinen +meiner +meines + +mit | with +muss | must +musste | had to +nach | to(wards) +nicht | not +nichts | nothing +noch | still, yet +nun | now +nur | only +ob | whether +oder | or +ohne | without +sehr | very + +sein | his +seine +seinem +seinen +seiner +seines + +selbst | self +sich | herself + +sie | they, she +ihnen | to them + +sind | are +so | so + +solche | such +solchem +solchen +solcher +solches + +soll | shall +sollte | should +sondern | but +sonst | else +über | over +um | about, around +und | and + +uns | us +unse +unsem +unsen +unser +unses + +unter | under +viel | much +vom | von + dem +von | from +vor | before +während | while +war | was +waren | were +warst | wast +was | what +weg | away, off +weil | because +weiter | further + +welche | which +welchem +welchen +welcher +welches + +wenn | when +werde | will +werden | will +wie | how +wieder | again +will | want +wir | we +wird | will +wirst | willst +wo | where +wollen | want +wollte | wanted +würde | would +würden | would +zu | to +zum | zu + dem +zur | zu + der +zwar | indeed +zwischen | between + diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_el.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_el.txt new file mode 100644 index 0000000..232681f --- /dev/null +++ 
b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_el.txt @@ -0,0 +1,78 @@ +# Lucene Greek Stopwords list +# Note: by default this file is used after GreekLowerCaseFilter, +# so when modifying this file use 'σ' instead of 'ς' +ο +η +το +οι +τα +του +τησ +των +τον +την +και +κι +κ +ειμαι +εισαι +ειναι +ειμαστε +ειστε +στο +στον +στη +στην +μα +αλλα +απο +για +προσ +με +σε +ωσ +παρα +αντι +κατα +μετα +θα +να +δε +δεν +μη +μην +επι +ενω +εαν +αν +τοτε +που +πωσ +ποιοσ +ποια +ποιο +ποιοι +ποιεσ +ποιων +ποιουσ +αυτοσ +αυτη +αυτο +αυτοι +αυτων +αυτουσ +αυτεσ +αυτα +εκεινοσ +εκεινη +εκεινο +εκεινοι +εκεινεσ +εκεινα +εκεινων +εκεινουσ +οπωσ +ομωσ +ισωσ +οσο +οτι diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_en.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_en.txt new file mode 100644 index 0000000..2c164c0 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_en.txt @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# a couple of test stopwords to test that the words are really being +# configured from this file: +stopworda +stopwordb + +# Standard english stop words taken from Lucene's StopAnalyzer +a +an +and +are +as +at +be +but +by +for +if +in +into +is +it +no +not +of +on +or +such +that +the +their +then +there +these +they +this +to +was +will +with diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_es.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_es.txt new file mode 100644 index 0000000..487d78c --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_es.txt @@ -0,0 +1,356 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Spanish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + + | The following is a ranked list (commonest to rarest) of stopwords + | deriving from a large sample of text. + + | Extra words have been added at the end. 
+ +de | from, of +la | the, her +que | who, that +el | the +en | in +y | and +a | to +los | the, them +del | de + el +se | himself, from him etc +las | the, them +por | for, by, etc +un | a +para | for +con | with +no | no +una | a +su | his, her +al | a + el + | es from SER +lo | him +como | how +más | more +pero | pero +sus | su plural +le | to him, her +ya | already +o | or + | fue from SER +este | this + | ha from HABER +sí | himself etc +porque | because +esta | this + | son from SER +entre | between + | está from ESTAR +cuando | when +muy | very +sin | without +sobre | on + | ser from SER + | tiene from TENER +también | also +me | me +hasta | until +hay | there is/are +donde | where + | han from HABER +quien | whom, that + | están from ESTAR + | estado from ESTAR +desde | from +todo | all +nos | us +durante | during + | estados from ESTAR +todos | all +uno | a +les | to them +ni | nor +contra | against +otros | other + | fueron from SER +ese | that +eso | that + | había from HABER +ante | before +ellos | they +e | and (variant of y) +esto | this +mí | me +antes | before +algunos | some +qué | what? 
+unos | a +yo | I +otro | other +otras | other +otra | other +él | he +tanto | so much, many +esa | that +estos | these +mucho | much, many +quienes | who +nada | nothing +muchos | many +cual | who + | sea from SER +poco | few +ella | she +estar | to be + | haber from HABER +estas | these + | estaba from ESTAR + | estamos from ESTAR +algunas | some +algo | something +nosotros | we + + | other forms + +mi | me +mis | mi plural +tú | thou +te | thee +ti | thee +tu | thy +tus | tu plural +ellas | they +nosotras | we +vosotros | you +vosotras | you +os | you +mío | mine +mía | +míos | +mías | +tuyo | thine +tuya | +tuyos | +tuyas | +suyo | his, hers, theirs +suya | +suyos | +suyas | +nuestro | ours +nuestra | +nuestros | +nuestras | +vuestro | yours +vuestra | +vuestros | +vuestras | +esos | those +esas | those + + | forms of estar, to be (not including the infinitive): +estoy +estás +está +estamos +estáis +están +esté +estés +estemos +estéis +estén +estaré +estarás +estará +estaremos +estaréis +estarán +estaría +estarías +estaríamos +estaríais +estarían +estaba +estabas +estábamos +estabais +estaban +estuve +estuviste +estuvo +estuvimos +estuvisteis +estuvieron +estuviera +estuvieras +estuviéramos +estuvierais +estuvieran +estuviese +estuvieses +estuviésemos +estuvieseis +estuviesen +estando +estado +estada +estados +estadas +estad + + | forms of haber, to have (not including the infinitive): +he +has +ha +hemos +habéis +han +haya +hayas +hayamos +hayáis +hayan +habré +habrás +habrá +habremos +habréis +habrán +habría +habrías +habríamos +habríais +habrían +había +habías +habíamos +habíais +habían +hube +hubiste +hubo +hubimos +hubisteis +hubieron +hubiera +hubieras +hubiéramos +hubierais +hubieran +hubiese +hubieses +hubiésemos +hubieseis +hubiesen +habiendo +habido +habida +habidos +habidas + + | forms of ser, to be (not including the infinitive): +soy +eres +es +somos +sois +son +sea +seas +seamos +seáis +sean +seré +serás +será +seremos +seréis +serán +sería 
+serías +seríamos +seríais +serían +era +eras +éramos +erais +eran +fui +fuiste +fue +fuimos +fuisteis +fueron +fuera +fueras +fuéramos +fuerais +fueran +fuese +fueses +fuésemos +fueseis +fuesen +siendo +sido + | sed also means 'thirst' + + | forms of tener, to have (not including the infinitive): +tengo +tienes +tiene +tenemos +tenéis +tienen +tenga +tengas +tengamos +tengáis +tengan +tendré +tendrás +tendrá +tendremos +tendréis +tendrán +tendría +tendrías +tendríamos +tendríais +tendrían +tenía +tenías +teníamos +teníais +tenían +tuve +tuviste +tuvo +tuvimos +tuvisteis +tuvieron +tuviera +tuvieras +tuviéramos +tuvierais +tuvieran +tuviese +tuvieses +tuviésemos +tuvieseis +tuviesen +teniendo +tenido +tenida +tenidos +tenidas +tened + diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_et.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_et.txt new file mode 100644 index 0000000..1b06a13 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_et.txt @@ -0,0 +1,1603 @@ +# Estonian stopwords list +all +alla +allapoole +allpool +alt +altpoolt +eel +eespool +enne +hommikupoole +hoolimata +ilma +kaudu +keset +kesk +kohe +koos +kuhupoole +kuni +kuspool +kustpoolt +kõige +käsikäes +lappi +ligi +läbi +mööda +paitsi +peale +pealepoole +pealpool +pealt +pealtpoolt +piki +pikku +piku +pikuti +põiki +pärast +päri +risti +sealpool +sealtpoolt +seespool +seltsis +siiapoole +siinpool +siitpoolt +sinnapoole +sissepoole +taga +tagantpoolt +tagapidi +tagapool +taha +tahapoole +teispool +teispoole +tänu +tükkis +vaatamata +vastu +väljapoole +väljaspool +väljastpoolt +õhtupoole +ühes +ühestükis +ühestükkis +ülalpool +ülaltpoolt +üle +ülespoole +ülevalpool +ülevaltpoolt +ümber +ümbert +aegu +aegus +alguks +algul +algule +algult +alguni +all +alla +alt +alul +alutsi +arvel +asemel +asemele +eel +eeli +ees +eesotsas +eest +eestotsast +esitsi 
+ette +etteotsa +haaval +heaks +hoolimata +hulgas +hulgast +hulka +jalgu +jalus +jalust +jaoks +jooksul +juurde +juures +juurest +jälil +jälile +järel +järele +järelt +järgi +kaasas +kallal +kallale +kallalt +kamul +kannul +kannule +kannult +kaudu +kaupa +keskel +keskele +keskelt +keskis +keskpaiku +kestel +kestes +kilda +killas +killast +kimpu +kimpus +kiuste +kohal +kohale +kohalt +kohaselt +kohe +kohta +koos +korral +kukil +kukile +kukilt +kulul +kõrva +kõrval +kõrvale +kõrvalt +kõrvas +kõrvast +käekõrval +käekõrvale +käekõrvalt +käes +käest +kätte +külge +küljes +küljest +küüsi +küüsis +küüsist +ligi +ligidal +ligidale +ligidalt +aegu +aegus +alguks +algul +algule +algult +alguni +all +alla +alt +alul +alutsi +arvel +asemel +asemele +eel +eeli +ees +eesotsas +eest +eestotsast +esitsi +ette +etteotsa +haaval +heaks +hoolimata +hulgas +hulgast +hulka +jalgu +jalus +jalust +jaoks +jooksul +juurde +juures +juurest +jälil +jälile +järel +järele +järelt +järgi +kaasas +kallal +kallale +kallalt +kamul +kannul +kannule +kannult +kaudu +kaupa +keskel +keskele +keskelt +keskis +keskpaiku +kestel +kestes +kilda +killas +killast +kimpu +kimpus +kiuste +kohal +kohale +kohalt +kohaselt +kohe +kohta +koos +korral +kukil +kukile +kukilt +kulul +kõrva +kõrval +kõrvale +kõrvalt +kõrvas +kõrvast +käekõrval +käekõrvale +käekõrvalt +käes +käest +kätte +külge +küljes +küljest +küüsi +küüsis +küüsist +ligi +ligidal +ligidale +ligidalt +lool +läbi +lähedal +lähedale +lähedalt +man +mant +manu +meelest +mööda +nahas +nahka +nahkas +najal +najale +najalt +nõjal +nõjale +otsa +otsas +otsast +paigale +paigu +paiku +peal +peale +pealt +perra +perrä +pidi +pihta +piki +pikku +pool +poole +poolest +poolt +puhul +puksiiris +pähe +päralt +päras +pärast +päri +ringi +ringis +risust +saadetusel +saadik +saatel +saati +seas +seast +sees +seest +sekka +seljataga +seltsi +seltsis +seltsist +sisse +slepis +suhtes +šlepis +taga +tagant +tagantotsast +tagaotsas +tagaselja +tagasi +tagast +tagutsi 
+taha +tahaotsa +takka +tarvis +tasa +tuuri +tuuris +tõttu +tükkis +uhal +vaatamata +vahel +vahele +vahelt +vahepeal +vahepeale +vahepealt +vahetsi +varal +varale +varul +vastas +vastast +vastu +veerde +veeres +viisi +võidu +võrd +võrdki +võrra +võrragi +väel +väele +vältel +väärt +väärtki +äärde +ääre +ääres +äärest +ühes +üle +ümber +ümbert +a +abil +aina +ainult +alalt +alates +alati +alles +b +c +d +e +eales +ealeski +edasi +edaspidi +eelkõige +eemal +ei +eks +end +enda +enese +ennem +esialgu +f +g +h +hoopis +i +iganes +igatahes +igati +iial +iialgi +ikka +ikkagi +ilmaski +iseenda +iseenese +iseenesest +isegi +j +jah +ju +juba +juhul +just +järelikult +k +ka +kah +kas +kasvõi +keda +kestahes +kogu +koguni +kohati +kokku +kuhu +kuhugi +kuidagi +kuidas +kunagi +kus +kusagil +kusjuures +kuskil +kust +kõigepealt +küll +l +liiga +lisaks +m +miks +mil +millal +millalgi +mispärast +mistahes +mistõttu +mitte +muide +muidu +muidugi +muist +mujal +mujale +mujalt +mõlemad +mõnda +mõne +mõnikord +n +nii +niikaua +niimoodi +niipaljuke +niisama +niisiis +niivõrd +nõnda +nüüd +o +omaette +omakorda +omavahel +ometi +p +palju +paljuke +palju-palju +peaaegu +peagi +peamiselt +pigem +pisut +praegu +päris +r +rohkem +s +samas +samuti +seal +sealt +sedakorda +sedapuhku +seega +seejuures +seejärel +seekord +seepärast +seetõttu +sellepärast +seni +sestap +siia +siiani +siin +siinkohal +siis +siiski +siit +sinna +suht +š +z +ž +t +teel +teineteise +tõesti +täiesti +u +umbes +v +w +veel +veelgi +vist +võibolla +võib-olla +väga +vähemalt +välja +väljas +väljast +õ +ä +ära +ö +ü +ühtlasi +üksi +ükskõik +ülal +ülale +ülalt +üles +ülesse +üleval +ülevalt +ülimalt +üsna +x +y +aga +ega +ehk +ehkki +elik +ellik +enge +ennegu +ent +et +ja +justkui +kui +kuid +kuigi +kuivõrd +kuna +kuni +kut +mistab +muudkui +nagu +nigu +ning +olgugi +otsekui +otsenagu +selmet +sest +sestab +vaid +või +aa +adaa +adjöö +ae +ah +ahaa +ahah +ah-ah-ah +ah-haa +ahoi +ai +aidaa +aidu-raidu +aih +aijeh +aituma 
+aitäh +aitüma +ammuu +amps +ampsti +aptsih +ass +at +ata +at-at-at +atsih +atsihh +auh +bai-bai +bingo +braavo +brr +ee +eeh +eh +ehee +eheh +eh-eh-hee +eh-eh-ee +ehei +ehh +ehhee +einoh +ena +ennäe +ennäh +fuh +fui +fuih +haa +hah +hahaa +hah-hah-hah +halleluuja +hallo +halloo +hass +hee +heh +he-he-hee +hei +heldeke(ne) +heureka +hihii +hip-hip-hurraa +hmh +hmjah +hoh-hoh-hoo +hohoo +hoi +hollallaa +hoo +hoplaa +hopp +hops +hopsassaa +hopsti +hosianna +huh +huidii +huist +hurjah +hurjeh +hurjoh +hurjuh +hurraa +huu +hõhõh +hõi +hõissa +hõissassa +hõk +hõkk +häh +hä-hä-hää +hüvasti +ih-ah-haa +ih-ih-hii +ii-ha-ha +issake +issakene +isver +jaa-ah +ja-ah +jaah +janäe +jeeh +jeerum +jeever +jessas +jestas +juhhei +jumalaga +jumalime +jumaluke +jumalukene +jutas +kaaps +kaapsti +kaasike +kae +kalps +kalpsti +kannäe +kanäe +kappadi +kaps +kapsti +karkõmm +karkäuh +karkääks +karkääksti +karmauh +karmauhti +karnaps +karnapsti +karniuhti +karpartsaki +karpauh +karpauhti +karplauh +karplauhti +karprauh +karprauhti +karsumdi +karsumm +kartsumdi +kartsumm +karviuh +karviuhti +kaske +kassa +kauh +kauhti +keh +keksti +kepsti +khe +khm +kih +kiiks +kiiksti +kiis +kiiss +kikerii +kikerikii +kili +kilk +kilk-kõlk +kilks +kilks-kolks +kilks-kõlks +kill +killadi +killadi|-kolladi +killadi-kõlladi +killa-kolla +killa-kõlla +kill-kõll +kimps-komps +kipp +kips-kõps +kiriküüt +kirra-kõrra +kirr-kõrr +kirts +klaps +klapsti +klirdi +klirr +klonks +klops +klopsti +kluk +klu-kluu +klõks +klõksti +klõmdi +klõmm +klõmpsti +klõnks +klõnksti +klõps +klõpsti +kläu +kohva-kohva +kok +koks +koksti +kolaki +kolk +kolks +kolksti +koll +kolladi +komp +komps +kompsti +kop +kopp +koppadi +kops +kopsti +kossu +kotsu +kraa +kraak +kraaks +kraaps +kraapsti +krahh +kraks +kraksti +kraps +krapsti +krauh +krauhti +kriiks +kriiksti +kriips +kriips-kraaps +kripa-krõpa +krips-kraps +kriuh +kriuks +kriuksti +kromps +kronk +kronks +krooks +kruu +krõks +krõksti +krõpa +krõps +krõpsti +krõuh +kräu +kräuh +kräuhti 
+kräuks +kss +kukeleegu +kukku +kuku +kulu +kurluu +kurnäu +kuss +kussu +kõks +kõksti +kõldi +kõlks +kõlksti +kõll +kõmaki +kõmdi +kõmm +kõmps +kõpp +kõps +kõpsadi +kõpsat +kõpsti +kõrr +kõrra-kõrra +kõss +kõtt +kõõksti +kärr +kärts +kärtsti +käuks +käuksti +kääga +kääks +kääksti +köh +köki-möki +köksti +laks +laksti +lampsti +larts +lartsti +lats +latsti +leelo +legoo +lehva +liiri-lõõri +lika-lõka +likat-lõkat +limpsti +lips +lipsti +lirts +lirtsaki +lirtsti +lonksti +lops +lopsti +lorts +lortsti +luks +lups +lupsti +lurts +lurtsti +lõks +lõksti +lõmps +lõmpsti +lõnks +lõnksti +lärts +lärtsti +läts +lätsti +lörts +lörtsti +lötsti +lööps +lööpsti +marss +mats +matsti +mauh +mauhti +mh +mhh +mhmh +miau +mjaa +mkm +m-mh +mnjaa +mnjah +moens +mulks +mulksti +mull-mull +mull-mull-mull +muu +muuh +mõh +mõmm +mäh +mäts +mäu +mää +möh +möh-öh-ää +möö +müh-müh +mühüh +müks +müksti +müraki +mürr +mürts +mürtsaki +mürtsti +mütaku +müta-mäta +müta-müta +müt-müt +müt-müt-müt +müts +mütsti +mütt +naa +naah +nah +naks +naksti +nanuu +naps +napsti +nilpsti +nipsti +nirr +niuh +niuh-näuh +niuhti +noh +noksti +nolpsti +nonoh +nonoo +nonäh +noo +nooh +nooks +norr +nurr +nuuts +nõh +nõhh +nõka-nõka +nõks +nõksat-nõksat +nõks-nõks +nõksti +nõõ +nõõh +näeh +näh +nälpsti +nämm-nämm +näpsti +näts +nätsti +näu +näuh +näuhti +näuks +näuksti +nääh +nääks +nühkat-nühkat +oeh +oh +ohh +ohhh +oh-hoi +oh-hoo +ohoh +oh-oh-oo +oh-oh-hoo +ohoi +ohoo +oi +oih +oijee +oijeh +oo +ooh +oo-oh +oo-ohh +oot +ossa +ot +paa +pah +pahh +pakaa +pamm +pantsti +pardon +pardonks +parlartsti +parts +partsti +partsumdi +partsumm +pastoi +pats +patst +patsti +pau +pauh +pauhti +pele +pfui +phuh +phuuh +phäh +phähh +piiks +piip +piiri-pääri +pimm +pimm-pamm +pimm-pomm +pimm-põmm +piraki +piuks +piu-pau +plaks +plaksti +plarts +plartsti +plats +platsti +plauh +plauhh +plauhti +pliks +pliks-plaks +plinn +pliraki +plirts +plirtsti +pliu +pliuh +ploks +plotsti +plumps +plumpsti +plõks +plõksti +plõmdi +plõmm +plõnn 
+plärr +plärts +plärtsat +plärtsti +pläu +pläuh +plää +plörtsat +pomm +popp +pops +popsti +ports +pot +pots +potsti +pott +praks +praksti +prants +prantsaki +prantsti +prassai +prauh +prauhh +prauhti +priks +priuh +priuhh +priuh-prauh +proosit +proost +prr +prrr +prõks +prõksti +prõmdi +prõmm +prõntsti +prääk +prääks +pst +psst +ptrr +ptruu +ptüi +puh +puhh +puksti +pumm +pumps +pup-pup-pup +purts +puuh +põks +põksti +põmdi +põmm +põmmadi +põnks +põnn +põnnadi +põnt +põnts +põntsti +põraki +põrr +põrra-põrra +päh +pähh +päntsti +pää +pöörd +püh +raks +raksti +raps +rapsti +ratataa +rauh +riips +riipsti +riks +riks-raks +rips-raps +rivitult +robaki +rops +ropsaki +ropsti +ruik +räntsti +räts +röh +röhh +sah +sahh +sahkat +saps +sapsti +sauh +sauhti +servus +sihkadi-sahkadi +sihka-sahka +sihkat-sahkat +silks +silk-solk +sips +sipsti +sirr +sirr-sorr +sirts +sirtsti +siu +siuh +siuh-sauh +siuh-säuh +siuhti +siuks +siuts +skool +so +soh +solks +solksti +solpsti +soo +sooh +so-oh +soo-oh +sopp +sops +sopsti +sorr +sorts +sortsti +so-soo +soss +soss-soss +ss +sss +sst +stopp +suhkat-sahkat +sulk +sulks +sulksti +sull +sulla-sulla +sulpa-sulpa +sulps +sulpsti +sumaki +sumdi +summ +summat-summat +sups +supsaku +supsti +surts +surtsti +suss +susti +suts +sutsti +säh +sähke +särts +särtsti +säu +säuh +säuhti +taevake +taevakene +takk +tere +terekest +tibi-tibi +tikk-takk +tiks +tilk +tilks +till +tilla-talla +till-tall +tilulii +tinn +tip +tip-tap +tirr +tirtsti +tiu +tjaa +tjah +tohhoh +tohhoo +tohoh +tohoo +tok +tokk +toks +toksti +tonks +tonksti +tota +totsti +tot-tot +tprr +tpruu +trah +trahh +trallallaa +trill +trillallaa +trr +trrr +tsah +tsahh +tsilk +tsilk-tsolk +tsirr +tsiuh +tskae +tsolk +tss +tst +tsst +tsuhh +tsuk +tsumm +tsurr +tsäuh +tšao +tšš +tššš +tuk +tuks +turts +turtsti +tutki +tutkit +tutu-lutu +tutulutu +tuut +tuutu-luutu +tõks +tötsti +tümps +uh +uhh +uh-huu +uhtsa +uhtsaa +uhuh +uhuu +ui +uih +uih-aih +uijah +uijeh +uist +uit +uka +upsti +uraa +urjah 
+urjeh +urjoh +urjuh +urr +urraa +ust +utu +uu +uuh +vaak +vaat +vae +vaeh +vai +vat +vau +vhüüt +vidiit +viiks +vilks +vilksti +vinki-vinki +virdi +virr +viu +viudi +viuh +viuhti +voeh +voh +vohh +volks +volksti +vooh +vops +vopsti +vot +vuh +vuhti +vuih +vulks +vulksti +vull +vulpsti +vups +vupsaki +vupsaku +vupsti +vurdi +vurr +vurra-vurra +vurts +vurtsti +vutt +võe +võeh +või +võih +võrr +võts +võtt +vääks +õe +õits +õk +õkk +õrr +õss +õuh +äh +ähh +ähhähhää +äh-hää +äh-äh-hää +äiu +äiu-ää +äss +ää +ääh +äähh +öh +öhh +ök +üh +eelmine +eikeegi +eimiski +emb-kumb +enam +enim +iga +igasugune +igaüks +ise +isesugune +järgmine +keegi +kes +kumb +kumbki +kõik +meiesugune +meietaoline +midagi +mihuke +mihukene +milletaoline +milline +mina +minake +mingi +mingisugune +minusugune +minutaoline +mis +miski +miskisugune +missugune +misuke +mitmes +mitmesugune +mitu +mitu-mitu +mitu-setu +muu +mõlema +mõnesugune +mõni +mõningane +mõningas +mäherdune +määrane +naasugune +need +nemad +nendesugune +nendetaoline +nihuke +nihukene +niimitu +niisamasugune +niisugune +nisuke +nisukene +oma +omaenese +omasugune +omataoline +pool +praegune +sama +samasugune +samataoline +see +seesama +seesamane +seesamune +seesinane +seesugune +selline +sihuke +sihukene +sina +sinusugune +sinutaoline +siuke +siukene +säherdune +säärane +taoline +teiesugune +teine +teistsugune +tema +temake +temakene +temasugune +temataoline +too +toosama +toosamane +üks +üksteise +hakkama +minema +olema +pidama +saama +tegema +tulema +võima diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_eu.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_eu.txt new file mode 100644 index 0000000..25f1db9 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_eu.txt @@ -0,0 +1,99 @@ +# example set of basque stopwords +al +anitz +arabera +asko +baina +bat +batean +batek +bati 
+batzuei +batzuek +batzuetan +batzuk +bera +beraiek +berau +berauek +bere +berori +beroriek +beste +bezala +da +dago +dira +ditu +du +dute +edo +egin +ere +eta +eurak +ez +gainera +gu +gutxi +guzti +haiei +haiek +haietan +hainbeste +hala +han +handik +hango +hara +hari +hark +hartan +hau +hauei +hauek +hauetan +hemen +hemendik +hemengo +hi +hona +honek +honela +honetan +honi +hor +hori +horiei +horiek +horietan +horko +horra +horrek +horrela +horretan +horri +hortik +hura +izan +ni +noiz +nola +non +nondik +nongo +nor +nora +ze +zein +zen +zenbait +zenbat +zer +zergatik +ziren +zituen +zu +zuek +zuen +zuten diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_fa.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_fa.txt new file mode 100644 index 0000000..723641c --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_fa.txt @@ -0,0 +1,313 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. 
+# Also see http://www.opensource.org/licenses/bsd-license.html +# Note: by default this file is used after normalization, so when adding entries +# to this file, use the arabic 'ي' instead of 'ی' +انان +نداشته +سراسر +خياه +ايشان +وي +تاكنون +بيشتري +دوم +پس +ناشي +وگو +يا +داشتند +سپس +هنگام +هرگز +پنج +نشان +امسال +ديگر +گروهي +شدند +چطور +ده +و +دو +نخستين +ولي +چرا +چه +وسط +ه +كدام +قابل +يك +رفت +هفت +همچنين +در +هزار +بله +بلي +شايد +اما +شناسي +گرفته +دهد +داشته +دانست +داشتن +خواهيم +ميليارد +وقتيكه +امد +خواهد +جز +اورده +شده +بلكه +خدمات +شدن +برخي +نبود +بسياري +جلوگيري +حق +كردند +نوعي +بعري +نكرده +نظير +نبايد +بوده +بودن +داد +اورد +هست +جايي +شود +دنبال +داده +بايد +سابق +هيچ +همان +انجا +كمتر +كجاست +گردد +كسي +تر +مردم +تان +دادن +بودند +سري +جدا +ندارند +مگر +يكديگر +دارد +دهند +بنابراين +هنگامي +سمت +جا +انچه +خود +دادند +زياد +دارند +اثر +بدون +بهترين +بيشتر +البته +به +براساس +بيرون +كرد +بعضي +گرفت +توي +اي +ميليون +او +جريان +تول +بر +مانند +برابر +باشيم +مدتي +گويند +اكنون +تا +تنها +جديد +چند +بي +نشده +كردن +كردم +گويد +كرده +كنيم +نمي +نزد +روي +قصد +فقط +بالاي +ديگران +اين +ديروز +توسط +سوم +ايم +دانند +سوي +استفاده +شما +كنار +داريم +ساخته +طور +امده +رفته +نخست +بيست +نزديك +طي +كنيد +از +انها +تمامي +داشت +يكي +طريق +اش +چيست +روب +نمايد +گفت +چندين +چيزي +تواند +ام +ايا +با +ان +ايد +ترين +اينكه +ديگري +راه +هايي +بروز +همچنان +پاعين +كس +حدود +مختلف +مقابل +چيز +گيرد +ندارد +ضد +همچون +سازي +شان +مورد +باره +مرسي +خويش +برخوردار +چون +خارج +شش +هنوز +تحت +ضمن +هستيم +گفته +فكر +بسيار +پيش +براي +روزهاي +انكه +نخواهد +بالا +كل +وقتي +كي +چنين +كه +گيري +نيست +است +كجا +كند +نيز +يابد +بندي +حتي +توانند +عقب +خواست +كنند +بين +تمام +همه +ما +باشند +مثل +شد +اري +باشد +اره +طبق +بعد +اگر +صورت +غير +جاي +بيش +ريزي +اند +زيرا +چگونه +بار +لطفا +مي +درباره +من +ديده +همين +گذاري +برداري +علت +گذاشته +هم +فوق +نه +ها +شوند +اباد +همواره +هر +اول +خواهند +چهار +نام +امروز +مان +هاي +قبل +كنم +سعي +تازه +را +هستند +زير +جلوي +عنوان +بود 
diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_fi.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_fi.txt new file mode 100644 index 0000000..4372c9a --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_fi.txt @@ -0,0 +1,97 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + +| forms of BE + +olla +olen +olet +on +olemme +olette +ovat +ole | negative form + +oli +olisi +olisit +olisin +olisimme +olisitte +olisivat +olit +olin +olimme +olitte +olivat +ollut +olleet + +en | negation +et +ei +emme +ette +eivät + +|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans +minä minun minut minua minussa minusta minuun minulla minulta minulle | I +sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you +hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she +me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we +te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you +he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they + +tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this +tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that +se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it +nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these +nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those +ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they + 
+kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who +ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl) +mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what +mitkä | (pl) + +joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which +jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl) + +| conjunctions + +että | that +ja | and +jos | if +koska | because +kuin | than +mutta | but +niin | so +sekä | and +sillä | for +tai | or +vaan | but +vai | or +vaikka | although + + +| prepositions + +kanssa | with +mukaan | according to +noin | about +poikki | across +yli | over, across + +| other + +kun | when +niin | so +nyt | now +itse | self + diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_fr.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_fr.txt new file mode 100644 index 0000000..749abae --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_fr.txt @@ -0,0 +1,186 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/french/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A French stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. 
+ +au | a + le +aux | a + les +avec | with +ce | this +ces | these +dans | with +de | of +des | de + les +du | de + le +elle | she +en | `of them' etc +et | and +eux | them +il | he +je | I +la | the +le | the +leur | their +lui | him +ma | my (fem) +mais | but +me | me +même | same; as in moi-même (myself) etc +mes | me (pl) +moi | me +mon | my (masc) +ne | not +nos | our (pl) +notre | our +nous | we +on | one +ou | where +par | by +pas | not +pour | for +qu | que before vowel +que | that +qui | who +sa | his, her (fem) +se | oneself +ses | his (pl) +son | his, her (masc) +sur | on +ta | thy (fem) +te | thee +tes | thy (pl) +toi | thee +ton | thy (masc) +tu | thou +un | a +une | a +vos | your (pl) +votre | your +vous | you + + | single letter forms + +c | c' +d | d' +j | j' +l | l' +à | to, at +m | m' +n | n' +s | s' +t | t' +y | there + + | forms of être (not including the infinitive): +été +étée +étées +étés +étant +suis +es +est +sommes +êtes +sont +serai +seras +sera +serons +serez +seront +serais +serait +serions +seriez +seraient +étais +était +étions +étiez +étaient +fus +fut +fûmes +fûtes +furent +sois +soit +soyons +soyez +soient +fusse +fusses +fût +fussions +fussiez +fussent + + | forms of avoir (not including the infinitive): +ayant +eu +eue +eues +eus +ai +as +avons +avez +ont +aurai +auras +aura +aurons +aurez +auront +aurais +aurait +aurions +auriez +auraient +avais +avait +avions +aviez +avaient +eut +eûmes +eûtes +eurent +aie +aies +ait +ayons +ayez +aient +eusse +eusses +eût +eussions +eussiez +eussent + + | Later additions (from Jean-Christophe Deschamps) +ceci | this +cela | that +celà | that +cet | this +cette | this +ici | here +ils | they +les | the (pl) +leurs | their (pl) +quel | which +quels | which +quelle | which +quelles | which +sans | without +soi | oneself + diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_ga.txt 
b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_ga.txt new file mode 100644 index 0000000..9ff88d7 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_ga.txt @@ -0,0 +1,110 @@ + +a +ach +ag +agus +an +aon +ar +arna +as +b' +ba +beirt +bhúr +caoga +ceathair +ceathrar +chomh +chtó +chuig +chun +cois +céad +cúig +cúigear +d' +daichead +dar +de +deich +deichniúr +den +dhá +do +don +dtí +dá +dár +dó +faoi +faoin +faoina +faoinár +fara +fiche +gach +gan +go +gur +haon +hocht +i +iad +idir +in +ina +ins +inár +is +le +leis +lena +lenár +m' +mar +mo +mé +na +nach +naoi +naonúr +ná +ní +níor +nó +nócha +ocht +ochtar +os +roimh +sa +seacht +seachtar +seachtó +seasca +seisear +siad +sibh +sinn +sna +sé +sí +tar +thar +thú +triúr +trí +trína +trínár +tríocha +tú +um +ár +é +éis +í +ó +ón +óna +ónár diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_gl.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_gl.txt new file mode 100644 index 0000000..d8760b1 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_gl.txt @@ -0,0 +1,161 @@ +# galican stopwords +a +aínda +alí +aquel +aquela +aquelas +aqueles +aquilo +aquí +ao +aos +as +así +á +ben +cando +che +co +coa +comigo +con +connosco +contigo +convosco +coas +cos +cun +cuns +cunha +cunhas +da +dalgunha +dalgunhas +dalgún +dalgúns +das +de +del +dela +delas +deles +desde +deste +do +dos +dun +duns +dunha +dunhas +e +el +ela +elas +eles +en +era +eran +esa +esas +ese +eses +esta +estar +estaba +está +están +este +estes +estiven +estou +eu +é +facer +foi +foron +fun +había +hai +iso +isto +la +las +lle +lles +lo +los +mais +me +meu +meus +min +miña +miñas +moi +na +nas +neste +nin +no +non +nos +nosa +nosas +noso +nosos +nós +nun +nunha +nuns +nunhas +o +os +ou +ó +ós +para +pero +pode 
+pois +pola +polas +polo +polos +por +que +se +senón +ser +seu +seus +sexa +sido +sobre +súa +súas +tamén +tan +te +ten +teñen +teño +ter +teu +teus +ti +tido +tiña +tiven +túa +túas +un +unha +unhas +uns +vos +vosa +vosas +voso +vosos +vós diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_hi.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_hi.txt new file mode 100644 index 0000000..86286bb --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_hi.txt @@ -0,0 +1,235 @@ +# Also see http://www.opensource.org/licenses/bsd-license.html +# See http://members.unine.ch/jacques.savoy/clef/index.html. +# This file was created by Jacques Savoy and is distributed under the BSD license. +# Note: by default this file also contains forms normalized by HindiNormalizer +# for spelling variation (see section below), such that it can be used whether or +# not you enable that feature. When adding additional entries to this list, +# please add the normalized form as well. 
+अंदर +अत +अपना +अपनी +अपने +अभी +आदि +आप +इत्यादि +इन +इनका +इन्हीं +इन्हें +इन्हों +इस +इसका +इसकी +इसके +इसमें +इसी +इसे +उन +उनका +उनकी +उनके +उनको +उन्हीं +उन्हें +उन्हों +उस +उसके +उसी +उसे +एक +एवं +एस +ऐसे +और +कई +कर +करता +करते +करना +करने +करें +कहते +कहा +का +काफ़ी +कि +कितना +किन्हें +किन्हों +किया +किर +किस +किसी +किसे +की +कुछ +कुल +के +को +कोई +कौन +कौनसा +गया +घर +जब +जहाँ +जा +जितना +जिन +जिन्हें +जिन्हों +जिस +जिसे +जीधर +जैसा +जैसे +जो +तक +तब +तरह +तिन +तिन्हें +तिन्हों +तिस +तिसे +तो +था +थी +थे +दबारा +दिया +दुसरा +दूसरे +दो +द्वारा +न +नहीं +ना +निहायत +नीचे +ने +पर +पर +पहले +पूरा +पे +फिर +बनी +बही +बहुत +बाद +बाला +बिलकुल +भी +भीतर +मगर +मानो +मे +में +यदि +यह +यहाँ +यही +या +यिह +ये +रखें +रहा +रहे +ऱ्वासा +लिए +लिये +लेकिन +व +वर्ग +वह +वह +वहाँ +वहीं +वाले +वुह +वे +वग़ैरह +संग +सकता +सकते +सबसे +सभी +साथ +साबुत +साभ +सारा +से +सो +ही +हुआ +हुई +हुए +है +हैं +हो +होता +होती +होते +होना +होने +# additional normalized forms of the above +अपनि +जेसे +होति +सभि +तिंहों +इंहों +दवारा +इसि +किंहें +थि +उंहों +ओर +जिंहें +वहिं +अभि +बनि +हि +उंहिं +उंहें +हें +वगेरह +एसे +रवासा +कोन +निचे +काफि +उसि +पुरा +भितर +हे +बहि +वहां +कोइ +यहां +जिंहों +तिंहें +किसि +कइ +यहि +इंहिं +जिधर +इंहें +अदि +इतयादि +हुइ +कोनसा +इसकि +दुसरे +जहां +अप +किंहों +उनकि +भि +वरग +हुअ +जेसा +नहिं diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_hu.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_hu.txt new file mode 100644 index 0000000..37526da --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_hu.txt @@ -0,0 +1,211 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. 
+ | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + +| Hungarian stop word list +| prepared by Anna Tordai + +a +ahogy +ahol +aki +akik +akkor +alatt +által +általában +amely +amelyek +amelyekben +amelyeket +amelyet +amelynek +ami +amit +amolyan +amíg +amikor +át +abban +ahhoz +annak +arra +arról +az +azok +azon +azt +azzal +azért +aztán +azután +azonban +bár +be +belül +benne +cikk +cikkek +cikkeket +csak +de +e +eddig +egész +egy +egyes +egyetlen +egyéb +egyik +egyre +ekkor +el +elég +ellen +elő +először +előtt +első +én +éppen +ebben +ehhez +emilyen +ennek +erre +ez +ezt +ezek +ezen +ezzel +ezért +és +fel +felé +hanem +hiszen +hogy +hogyan +igen +így +illetve +ill. +ill +ilyen +ilyenkor +ison +ismét +itt +jó +jól +jobban +kell +kellett +keresztül +keressünk +ki +kívül +között +közül +legalább +lehet +lehetett +legyen +lenne +lenni +lesz +lett +maga +magát +majd +majd +már +más +másik +meg +még +mellett +mert +mely +melyek +mi +mit +míg +miért +milyen +mikor +minden +mindent +mindenki +mindig +mint +mintha +mivel +most +nagy +nagyobb +nagyon +ne +néha +nekem +neki +nem +néhány +nélkül +nincs +olyan +ott +össze +ő +ők +őket +pedig +persze +rá +s +saját +sem +semmi +sok +sokat +sokkal +számára +szemben +szerint +szinte +talán +tehát +teljes +tovább +továbbá +több +úgy +ugyanis +új +újabb +újra +után +utána +utolsó +vagy +vagyis +valaki +valami +valamint +való +vagyok +van +vannak +volt +voltam +voltak +voltunk +vissza +vele +viszont +volna diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_hy.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_hy.txt new file mode 100644 index 0000000..60c1c50 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_hy.txt @@ -0,0 +1,46 @@ +# example set of Armenian stopwords. 
+այդ +այլ +այն +այս +դու +դուք +եմ +են +ենք +ես +եք +է +էի +էին +էինք +էիր +էիք +էր +ըստ +թ +ի +ին +իսկ +իր +կամ +համար +հետ +հետո +մենք +մեջ +մի +ն +նա +նաև +նրա +նրանք +որ +որը +որոնք +որպես +ու +ում +պիտի +վրա +և diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_id.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_id.txt new file mode 100644 index 0000000..4617f83 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_id.txt @@ -0,0 +1,359 @@ +# from appendix D of: A Study of Stemming Effects on Information +# Retrieval in Bahasa Indonesia +ada +adanya +adalah +adapun +agak +agaknya +agar +akan +akankah +akhirnya +aku +akulah +amat +amatlah +anda +andalah +antar +diantaranya +antara +antaranya +diantara +apa +apaan +mengapa +apabila +apakah +apalagi +apatah +atau +ataukah +ataupun +bagai +bagaikan +sebagai +sebagainya +bagaimana +bagaimanapun +sebagaimana +bagaimanakah +bagi +bahkan +bahwa +bahwasanya +sebaliknya +banyak +sebanyak +beberapa +seberapa +begini +beginian +beginikah +beginilah +sebegini +begitu +begitukah +begitulah +begitupun +sebegitu +belum +belumlah +sebelum +sebelumnya +sebenarnya +berapa +berapakah +berapalah +berapapun +betulkah +sebetulnya +biasa +biasanya +bila +bilakah +bisa +bisakah +sebisanya +boleh +bolehkah +bolehlah +buat +bukan +bukankah +bukanlah +bukannya +cuma +percuma +dahulu +dalam +dan +dapat +dari +daripada +dekat +demi +demikian +demikianlah +sedemikian +dengan +depan +di +dia +dialah +dini +diri +dirinya +terdiri +dong +dulu +enggak +enggaknya +entah +entahlah +terhadap +terhadapnya +hal +hampir +hanya +hanyalah +harus +haruslah +harusnya +seharusnya +hendak +hendaklah +hendaknya +hingga +sehingga +ia +ialah +ibarat +ingin +inginkah +inginkan +ini +inikah +inilah +itu +itukah +itulah +jangan +jangankan +janganlah +jika +jikalau +juga +justru +kala +kalau +kalaulah +kalaupun 
+kalian +kami +kamilah +kamu +kamulah +kan +kapan +kapankah +kapanpun +dikarenakan +karena +karenanya +ke +kecil +kemudian +kenapa +kepada +kepadanya +ketika +seketika +khususnya +kini +kinilah +kiranya +sekiranya +kita +kitalah +kok +lagi +lagian +selagi +lah +lain +lainnya +melainkan +selaku +lalu +melalui +terlalu +lama +lamanya +selama +selama +selamanya +lebih +terlebih +bermacam +macam +semacam +maka +makanya +makin +malah +malahan +mampu +mampukah +mana +manakala +manalagi +masih +masihkah +semasih +masing +mau +maupun +semaunya +memang +mereka +merekalah +meski +meskipun +semula +mungkin +mungkinkah +nah +namun +nanti +nantinya +nyaris +oleh +olehnya +seorang +seseorang +pada +padanya +padahal +paling +sepanjang +pantas +sepantasnya +sepantasnyalah +para +pasti +pastilah +per +pernah +pula +pun +merupakan +rupanya +serupa +saat +saatnya +sesaat +saja +sajalah +saling +bersama +sama +sesama +sambil +sampai +sana +sangat +sangatlah +saya +sayalah +se +sebab +sebabnya +sebuah +tersebut +tersebutlah +sedang +sedangkan +sedikit +sedikitnya +segala +segalanya +segera +sesegera +sejak +sejenak +sekali +sekalian +sekalipun +sesekali +sekaligus +sekarang +sekarang +sekitar +sekitarnya +sela +selain +selalu +seluruh +seluruhnya +semakin +sementara +sempat +semua +semuanya +sendiri +sendirinya +seolah +seperti +sepertinya +sering +seringnya +serta +siapa +siapakah +siapapun +disini +disinilah +sini +sinilah +sesuatu +sesuatunya +suatu +sesudah +sesudahnya +sudah +sudahkah +sudahlah +supaya +tadi +tadinya +tak +tanpa +setelah +telah +tentang +tentu +tentulah +tentunya +tertentu +seterusnya +tapi +tetapi +setiap +tiap +setidaknya +tidak +tidakkah +tidaklah +toh +waduh +wah +wahai +sewaktu +walau +walaupun +wong +yaitu +yakni +yang diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_it.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_it.txt new file mode 100644 index 
0000000..1219cc7 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_it.txt @@ -0,0 +1,303 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/italian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | An Italian stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + +ad | a (to) before vowel +al | a + il +allo | a + lo +ai | a + i +agli | a + gli +all | a + l' +agl | a + gl' +alla | a + la +alle | a + le +con | with +col | con + il +coi | con + i (forms collo, cogli etc are now very rare) +da | from +dal | da + il +dallo | da + lo +dai | da + i +dagli | da + gli +dall | da + l' +dagl | da + gll' +dalla | da + la +dalle | da + le +di | of +del | di + il +dello | di + lo +dei | di + i +degli | di + gli +dell | di + l' +degl | di + gl' +della | di + la +delle | di + le +in | in +nel | in + el +nello | in + lo +nei | in + i +negli | in + gli +nell | in + l' +negl | in + gl' +nella | in + la +nelle | in + le +su | on +sul | su + il +sullo | su + lo +sui | su + i +sugli | su + gli +sull | su + l' +sugl | su + gl' +sulla | su + la +sulle | su + le +per | through, by +tra | among +contro | against +io | I +tu | thou +lui | he +lei | she +noi | we +voi | you +loro | they +mio | my +mia | +miei | +mie | +tuo | +tua | +tuoi | thy +tue | +suo | +sua | +suoi | his, her +sue | +nostro | our +nostra | +nostri | +nostre | +vostro | your +vostra | +vostri | +vostre | +mi | me +ti | thee +ci | us, there +vi | you, there +lo | him, the +la | her, the +li | them +le | them, the +gli | to him, the +ne | from there etc +il | the +un | a +uno | a +una | a +ma | but +ed | and +se | if +perché | 
why, because +anche | also +come | how +dov | where (as dov') +dove | where +che | who, that +chi | who +cui | whom +non | not +più | more +quale | who, that +quanto | how much +quanti | +quanta | +quante | +quello | that +quelli | +quella | +quelle | +questo | this +questi | +questa | +queste | +si | yes +tutto | all +tutti | all + + | single letter forms: + +a | at +c | as c' for ce or ci +e | and +i | the +l | as l' +o | or + + | forms of avere, to have (not including the infinitive): + +ho +hai +ha +abbiamo +avete +hanno +abbia +abbiate +abbiano +avrò +avrai +avrà +avremo +avrete +avranno +avrei +avresti +avrebbe +avremmo +avreste +avrebbero +avevo +avevi +aveva +avevamo +avevate +avevano +ebbi +avesti +ebbe +avemmo +aveste +ebbero +avessi +avesse +avessimo +avessero +avendo +avuto +avuta +avuti +avute + + | forms of essere, to be (not including the infinitive): +sono +sei +è +siamo +siete +sia +siate +siano +sarò +sarai +sarà +saremo +sarete +saranno +sarei +saresti +sarebbe +saremmo +sareste +sarebbero +ero +eri +era +eravamo +eravate +erano +fui +fosti +fu +fummo +foste +furono +fossi +fosse +fossimo +fossero +essendo + + | forms of fare, to do (not including the infinitive, fa, fat-): +faccio +fai +facciamo +fanno +faccia +facciate +facciano +farò +farai +farà +faremo +farete +faranno +farei +faresti +farebbe +faremmo +fareste +farebbero +facevo +facevi +faceva +facevamo +facevate +facevano +feci +facesti +fece +facemmo +faceste +fecero +facessi +facesse +facessimo +facessero +facendo + + | forms of stare, to be (not including the infinitive): +sto +stai +sta +stiamo +stanno +stia +stiate +stiano +starò +starai +starà +staremo +starete +staranno +starei +staresti +starebbe +staremmo +stareste +starebbero +stavo +stavi +stava +stavamo +stavate +stavano +stetti +stesti +stette +stemmo +steste +stettero +stessi +stesse +stessimo +stessero +stando diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_ja.txt 
b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_ja.txt new file mode 100644 index 0000000..d4321be --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_ja.txt @@ -0,0 +1,127 @@ +# +# This file defines a stopword set for Japanese. +# +# This set is made up of hand-picked frequent terms from segmented Japanese Wikipedia. +# Punctuation characters and frequent kanji have mostly been left out. See LUCENE-3745 +# for frequency lists, etc. that can be useful for making your own set (if desired) +# +# Note that there is an overlap between these stopwords and the terms stopped when used +# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note +# that comments are not allowed on the same line as stopwords. +# +# Also note that stopping is done in a case-insensitive manner. Change your StopFilter +# configuration if you need case-sensitive stopping. Lastly, note that stopping is done +# using the same character width as the entries in this file. Since this StopFilter is +# normally done after a CJKWidthFilter in your chain, you would usually want your romaji +# entries to be in half-width and your kana entries to be in full-width. 
+# +の +に +は +を +た +が +で +て +と +し +れ +さ +ある +いる +も +する +から +な +こと +として +い +や +れる +など +なっ +ない +この +ため +その +あっ +よう +また +もの +という +あり +まで +られ +なる +へ +か +だ +これ +によって +により +おり +より +による +ず +なり +られる +において +ば +なかっ +なく +しかし +について +せ +だっ +その後 +できる +それ +う +ので +なお +のみ +でき +き +つ +における +および +いう +さらに +でも +ら +たり +その他 +に関する +たち +ます +ん +なら +に対して +特に +せる +及び +これら +とき +では +にて +ほか +ながら +うち +そして +とともに +ただし +かつて +それぞれ +または +お +ほど +ものの +に対する +ほとんど +と共に +といった +です +とも +ところ +ここ +##### End of file diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_lv.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_lv.txt new file mode 100644 index 0000000..e21a23c --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_lv.txt @@ -0,0 +1,172 @@ +# Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins +# the original list of over 800 forms was refined: +# pronouns, adverbs, interjections were removed +# +# prepositions +aiz +ap +ar +apakš +ārpus +augšpus +bez +caur +dēļ +gar +iekš +iz +kopš +labad +lejpus +līdz +no +otrpus +pa +par +pār +pēc +pie +pirms +pret +priekš +starp +šaipus +uz +viņpus +virs +virspus +zem +apakšpus +# Conjunctions +un +bet +jo +ja +ka +lai +tomēr +tikko +turpretī +arī +kaut +gan +tādēļ +tā +ne +tikvien +vien +kā +ir +te +vai +kamēr +# Particles +ar +diezin +droši +diemžēl +nebūt +ik +it +taču +nu +pat +tiklab +iekšpus +nedz +tik +nevis +turpretim +jeb +iekam +iekām +iekāms +kolīdz +līdzko +tiklīdz +jebšu +tālab +tāpēc +nekā +itin +jā +jau +jel +nē +nezin +tad +tikai +vis +tak +iekams +vien +# modal verbs +būt +biju +biji +bija +bijām +bijāt +esmu +esi +esam +esat +būšu +būsi +būs +būsim +būsiet +tikt +tiku +tiki +tika +tikām +tikāt +tieku +tiec +tiek +tiekam +tiekat +tikšu +tiks +tiksim +tiksiet +tapt +tapi +tapāt +topat +tapšu +tapsi +taps +tapsim +tapsiet +kļūt +kļuvu +kļuvi +kļuva +kļuvām +kļuvāt +kļūstu +kļūsti 
+kļūst +kļūstam +kļūstat +kļūšu +kļūsi +kļūs +kļūsim +kļūsiet +# verbs +varēt +varēju +varējām +varēšu +varēsim +var +varēji +varējāt +varēsi +varēsiet +varat +varēja +varēs diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_nl.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_nl.txt new file mode 100644 index 0000000..47a2aea --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_nl.txt @@ -0,0 +1,119 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/dutch/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Dutch stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large sample of Dutch text. + + | Dutch stop words frequently exhibit homonym clashes. These are indicated + | clearly below. + +de | the +en | and +van | of, from +ik | I, the ego +te | (1) chez, at etc, (2) to, (3) too +dat | that, which +die | that, those, who, which +in | in, inside +een | a, an, one +hij | he +het | the, it +niet | not, nothing, naught +zijn | (1) to be, being, (2) his, one's, its +is | is +was | (1) was, past tense of all persons sing. of 'zijn' (to be) (2) wax, (3) the washing, (4) rise of river +op | on, upon, at, in, up, used up +aan | on, upon, to (as dative) +met | with, by +als | like, such as, when +voor | (1) before, in front of, (2) furrow +had | had, past tense all persons sing. 
of 'hebben' (have) +er | there +maar | but, only +om | round, about, for etc +hem | him +dan | then +zou | should/would, past tense all persons sing. of 'zullen' +of | or, whether, if +wat | what, something, anything +mijn | possessive and noun 'mine' +men | people, 'one' +dit | this +zo | so, thus, in this way +door | through by +over | over, across +ze | she, her, they, them +zich | oneself +bij | (1) a bee, (2) by, near, at +ook | also, too +tot | till, until +je | you +mij | me +uit | out of, from +der | Old Dutch form of 'van der' still found in surnames +daar | (1) there, (2) because +haar | (1) her, their, them, (2) hair +naar | (1) unpleasant, unwell etc, (2) towards, (3) as +heb | present first person sing. of 'to have' +hoe | how, why +heeft | present third person sing. of 'to have' +hebben | 'to have' and various parts thereof +deze | this +u | you +want | (1) for, (2) mitten, (3) rigging +nog | yet, still +zal | 'shall', first and third person sing. of verb 'zullen' (will) +me | me +zij | she, they +nu | now +ge | 'thou', still used in Belgium and south Netherlands +geen | none +omdat | because +iets | something, somewhat +worden | to become, grow, get +toch | yet, still +al | all, every, each +waren | (1) 'were' (2) to wander, (3) wares, (3) +veel | much, many +meer | (1) more, (2) lake +doen | to do, to make +toen | then, when +moet | noun 'spot/mote' and present form of 'to must' +ben | (1) am, (2) 'are' in interrogative second person singular of 'to be' +zonder | without +kan | noun 'can' and present form of 'to be able' +hun | their, them +dus | so, consequently +alles | all, everything, anything +onder | under, beneath +ja | yes, of course +eens | once, one day +hier | here +wie | who +werd | imperfect third person sing. of 'become' +altijd | always +doch | yet, but etc +wordt | present third person sing. 
of 'become' +wezen | (1) to be, (2) 'been' as in 'been fishing', (3) orphans +kunnen | to be able +ons | us/our +zelf | self +tegen | against, towards, at +na | after, near +reeds | already +wil | (1) present tense of 'want', (2) 'will', noun, (3) fender +kon | could; past tense of 'to be able' +niets | nothing +uw | your +iemand | somebody +geweest | been; past participle of 'be' +andere | other diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_no.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_no.txt new file mode 100644 index 0000000..a7a2c28 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_no.txt @@ -0,0 +1,194 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/norwegian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Norwegian stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This stop word list is for the dominant bokmål dialect. Words unique + | to nynorsk are marked *. + + | Revised by Jan Bruusgaard , Jan 2005 + +og | and +i | in +jeg | I +det | it/this/that +at | to (w. inf.) 
+en | a/an +et | a/an +den | it/this/that +til | to +er | is/am/are +som | who/that +på | on +de | they / you(formal) +med | with +han | he +av | of +ikke | not +ikkje | not * +der | there +så | so +var | was/were +meg | me +seg | you +men | but +ett | one +har | have +om | about +vi | we +min | my +mitt | my +ha | have +hadde | had +hun | she +nå | now +over | over +da | when/as +ved | by/know +fra | from +du | you +ut | out +sin | your +dem | them +oss | us +opp | up +man | you/one +kan | can +hans | his +hvor | where +eller | or +hva | what +skal | shall/must +selv | self (reflective) +sjøl | self (reflective) +her | here +alle | all +vil | will +bli | become +ble | became +blei | became * +blitt | have become +kunne | could +inn | in +når | when +være | be +kom | come +noen | some +noe | some +ville | would +dere | you +som | who/which/that +deres | their/theirs +kun | only/just +ja | yes +etter | after +ned | down +skulle | should +denne | this +for | for/because +deg | you +si | hers/his +sine | hers/his +sitt | hers/his +mot | against +å | to +meget | much +hvorfor | why +dette | this +disse | these/those +uten | without +hvordan | how +ingen | none +din | your +ditt | your +blir | become +samme | same +hvilken | which +hvilke | which (plural) +sånn | such a +inni | inside/within +mellom | between +vår | our +hver | each +hvem | who +vors | us/ours +hvis | whose +både | both +bare | only/just +enn | than +fordi | as/because +før | before +mange | many +også | also +slik | just +vært | been +være | to be +båe | both * +begge | both +siden | since +dykk | your * +dykkar | yours * +dei | they * +deira | them * +deires | theirs * +deim | them * +di | your (fem.) 
* +då | as/when * +eg | I * +ein | a/an * +eit | a/an * +eitt | a/an * +elles | or * +honom | he * +hjå | at * +ho | she * +hoe | she * +henne | her +hennar | her/hers +hennes | hers +hoss | how * +hossen | how * +ikkje | not * +ingi | noone * +inkje | noone * +korleis | how * +korso | how * +kva | what/which * +kvar | where * +kvarhelst | where * +kven | who/whom * +kvi | why * +kvifor | why * +me | we * +medan | while * +mi | my * +mine | my * +mykje | much * +no | now * +nokon | some (masc./neut.) * +noka | some (fem.) * +nokor | some * +noko | some * +nokre | some * +si | his/hers * +sia | since * +sidan | since * +so | so * +somt | some * +somme | some * +um | about* +upp | up * +vere | be * +vore | was * +verte | become * +vort | become * +varte | became * +vart | became * + diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_pt.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_pt.txt new file mode 100644 index 0000000..acfeb01 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_pt.txt @@ -0,0 +1,253 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/portuguese/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Portuguese stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + + | The following is a ranked list (commonest to rarest) of stopwords + | deriving from a large sample of text. + + | Extra words have been added at the end. 
+ +de | of, from +a | the; to, at; her +o | the; him +que | who, that +e | and +do | de + o +da | de + a +em | in +um | a +para | for + | é from SER +com | with +não | not, no +uma | a +os | the; them +no | em + o +se | himself etc +na | em + a +por | for +mais | more +as | the; them +dos | de + os +como | as, like +mas | but + | foi from SER +ao | a + o +ele | he +das | de + as + | tem from TER +à | a + a +seu | his +sua | her +ou | or + | ser from SER +quando | when +muito | much + | há from HAV +nos | em + os; us +já | already, now + | está from EST +eu | I +também | also +só | only, just +pelo | per + o +pela | per + a +até | up to +isso | that +ela | he +entre | between + | era from SER +depois | after +sem | without +mesmo | same +aos | a + os + | ter from TER +seus | his +quem | whom +nas | em + as +me | me +esse | that +eles | they + | estão from EST +você | you + | tinha from TER + | foram from SER +essa | that +num | em + um +nem | nor +suas | her +meu | my +às | a + as +minha | my + | têm from TER +numa | em + uma +pelos | per + os +elas | they + | havia from HAV + | seja from SER +qual | which + | será from SER +nós | we + | tenho from TER +lhe | to him, her +deles | of them +essas | those +esses | those +pelas | per + as +este | this + | fosse from SER +dele | of him + + | other words. There are many contractions such as naquele = em+aquele, + | mo = me+o, but they are rare. + | Indefinite article plural forms are also rare. 
+ +tu | thou +te | thee +vocês | you (plural) +vos | you +lhes | to them +meus | my +minhas +teu | thy +tua +teus +tuas +nosso | our +nossa +nossos +nossas + +dela | of her +delas | of them + +esta | this +estes | these +estas | these +aquele | that +aquela | that +aqueles | those +aquelas | those +isto | this +aquilo | that + + | forms of estar, to be (not including the infinitive): +estou +está +estamos +estão +estive +esteve +estivemos +estiveram +estava +estávamos +estavam +estivera +estivéramos +esteja +estejamos +estejam +estivesse +estivéssemos +estivessem +estiver +estivermos +estiverem + + | forms of haver, to have (not including the infinitive): +hei +há +havemos +hão +houve +houvemos +houveram +houvera +houvéramos +haja +hajamos +hajam +houvesse +houvéssemos +houvessem +houver +houvermos +houverem +houverei +houverá +houveremos +houverão +houveria +houveríamos +houveriam + + | forms of ser, to be (not including the infinitive): +sou +somos +são +era +éramos +eram +fui +foi +fomos +foram +fora +fôramos +seja +sejamos +sejam +fosse +fôssemos +fossem +for +formos +forem +serei +será +seremos +serão +seria +seríamos +seriam + + | forms of ter, to have (not including the infinitive): +tenho +tem +temos +tém +tinha +tínhamos +tinham +tive +teve +tivemos +tiveram +tivera +tivéramos +tenha +tenhamos +tenham +tivesse +tivéssemos +tivessem +tiver +tivermos +tiverem +terei +terá +teremos +terão +teria +teríamos +teriam diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_ro.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_ro.txt new file mode 100644 index 0000000..4fdee90 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_ro.txt @@ -0,0 +1,233 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. 
+# Also see http://www.opensource.org/licenses/bsd-license.html +acea +aceasta +această +aceea +acei +aceia +acel +acela +acele +acelea +acest +acesta +aceste +acestea +aceşti +aceştia +acolo +acum +ai +aia +aibă +aici +al +ăla +ale +alea +ălea +altceva +altcineva +am +ar +are +aş +aşadar +asemenea +asta +ăsta +astăzi +astea +ăstea +ăştia +asupra +aţi +au +avea +avem +aveţi +azi +bine +bucur +bună +ca +că +căci +când +care +cărei +căror +cărui +cât +câte +câţi +către +câtva +ce +cel +ceva +chiar +cînd +cine +cineva +cît +cîte +cîţi +cîtva +contra +cu +cum +cumva +curând +curînd +da +dă +dacă +dar +datorită +de +deci +deja +deoarece +departe +deşi +din +dinaintea +dintr +dintre +drept +după +ea +ei +el +ele +eram +este +eşti +eu +face +fără +fi +fie +fiecare +fii +fim +fiţi +iar +ieri +îi +îl +îmi +împotriva +în +înainte +înaintea +încât +încît +încotro +între +întrucât +întrucît +îţi +la +lângă +le +li +lîngă +lor +lui +mă +mâine +mea +mei +mele +mereu +meu +mi +mine +mult +multă +mulţi +ne +nicăieri +nici +nimeni +nişte +noastră +noastre +noi +noştri +nostru +nu +ori +oricând +oricare +oricât +orice +oricînd +oricine +oricît +oricum +oriunde +până +pe +pentru +peste +pînă +poate +pot +prea +prima +primul +prin +printr +sa +să +săi +sale +sau +său +se +şi +sînt +sîntem +sînteţi +spre +sub +sunt +suntem +sunteţi +ta +tăi +tale +tău +te +ţi +ţie +tine +toată +toate +tot +toţi +totuşi +tu +un +una +unde +undeva +unei +unele +uneori +unor +vă +vi +voastră +voastre +voi +voştri +vostru +vouă +vreo +vreun diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_ru.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_ru.txt new file mode 100644 index 0000000..5527140 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_ru.txt @@ -0,0 +1,243 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/russian/stop.txt + | This 
file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | a russian stop word list. comments begin with vertical bar. each stop + | word is at the start of a line. + + | this is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + | letter `ё' is translated to `е'. + +и | and +в | in/into +во | alternative form +не | not +что | what/that +он | he +на | on/onto +я | i +с | from +со | alternative form +как | how +а | milder form of `no' (but) +то | conjunction and form of `that' +все | all +она | she +так | so, thus +его | him +но | but +да | yes/and +ты | thou +к | towards, by +у | around, chez +же | intensifier particle +вы | you +за | beyond, behind +бы | conditional/subj. particle +по | up to, along +только | only +ее | her +мне | to me +было | it was +вот | here is/are, particle +от | away from +меня | me +еще | still, yet, more +нет | no, there isnt/arent +о | about +из | out of +ему | to him +теперь | now +когда | when +даже | even +ну | so, well +вдруг | suddenly +ли | interrogative particle +если | if +уже | already, but homonym of `narrower' +или | or +ни | neither +быть | to be +был | he was +него | prepositional form of его +до | up to +вас | you accusative +нибудь | indef. 
suffix preceded by hyphen +опять | again +уж | already, but homonym of `adder' +вам | to you +сказал | he said +ведь | particle `after all' +там | there +потом | then +себя | oneself +ничего | nothing +ей | to her +может | usually with `быть' as `maybe' +они | they +тут | here +где | where +есть | there is/are +надо | got to, must +ней | prepositional form of ей +для | for +мы | we +тебя | thee +их | them, their +чем | than +была | she was +сам | self +чтоб | in order to +без | without +будто | as if +человек | man, person, one +чего | genitive form of `what' +раз | once +тоже | also +себе | to oneself +под | beneath +жизнь | life +будет | will be +ж | short form of intensifer particle `же' +тогда | then +кто | who +этот | this +говорил | was saying +того | genitive form of `that' +потому | for that reason +этого | genitive form of `this' +какой | which +совсем | altogether +ним | prepositional form of `его', `они' +здесь | here +этом | prepositional form of `этот' +один | one +почти | almost +мой | my +тем | instrumental/dative plural of `тот', `то' +чтобы | full form of `in order that' +нее | her (acc.) +кажется | it seems +сейчас | now +были | they were +куда | where to +зачем | why +сказать | to say +всех | all (acc., gen. preposn. plural) +никогда | never +сегодня | today +можно | possible, one can +при | by +наконец | finally +два | two +об | alternative form of `о', about +другой | another +хоть | even +после | after +над | above +больше | more +тот | that one (masc.) +через | across, in +эти | these +нас | us +про | about +всего | in all, only, of all +них | prepositional form of `они' (they) +какая | which, feminine +много | lots +разве | interrogative particle +сказала | she said +три | three +эту | this, acc. fem. sing. +моя | my, feminine +впрочем | moreover, besides +хорошо | good +свою | ones own, acc. fem. sing. +этой | oblique form of `эта', fem. `this' +перед | in front of +иногда | sometimes +лучше | better +чуть | a little +том | preposn. 
form of `that one' +нельзя | one must not +такой | such a one +им | to them +более | more +всегда | always +конечно | of course +всю | acc. fem. sing of `all' +между | between + + + | b: some paradigms + | + | personal pronouns + | + | я меня мне мной [мною] + | ты тебя тебе тобой [тобою] + | он его ему им [него, нему, ним] + | она ее эи ею [нее, нэи, нею] + | оно его ему им [него, нему, ним] + | + | мы нас нам нами + | вы вас вам вами + | они их им ими [них, ним, ними] + | + | себя себе собой [собою] + | + | demonstrative pronouns: этот (this), тот (that) + | + | этот эта это эти + | этого эты это эти + | этого этой этого этих + | этому этой этому этим + | этим этой этим [этою] этими + | этом этой этом этих + | + | тот та то те + | того ту то те + | того той того тех + | тому той тому тем + | тем той тем [тою] теми + | том той том тех + | + | determinative pronouns + | + | (a) весь (all) + | + | весь вся все все + | всего всю все все + | всего всей всего всех + | всему всей всему всем + | всем всей всем [всею] всеми + | всем всей всем всех + | + | (b) сам (himself etc) + | + | сам сама само сами + | самого саму само самих + | самого самой самого самих + | самому самой самому самим + | самим самой самим [самою] самими + | самом самой самом самих + | + | stems of verbs `to be', `to have', `to do' and modal + | + | быть бы буд быв есть суть + | име + | дел + | мог мож мочь + | уме + | хоч хот + | долж + | можн + | нужн + | нельзя + diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_sv.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_sv.txt new file mode 100644 index 0000000..096f87f --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_sv.txt @@ -0,0 +1,133 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/swedish/stop.txt + | This file is distributed under the BSD License. 
+ | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Swedish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + | Swedish stop words occasionally exhibit homonym clashes. For example + | så = so, but also seed. These are indicated clearly below. + +och | and +det | it, this/that +att | to (with infinitive) +i | in, at +en | a +jag | I +hon | she +som | who, that +han | he +på | on +den | it, this/that +med | with +var | where, each +sig | him(self) etc +för | for +så | so (also: seed) +till | to +är | is +men | but +ett | a +om | if; around, about +hade | had +de | they, these/those +av | of +icke | not, no +mig | me +du | you +henne | her +då | then, when +sin | his +nu | now +har | have +inte | inte någon = no one +hans | his +honom | him +skulle | 'sake' +hennes | her +där | there +min | my +man | one (pronoun) +ej | nor +vid | at, by, on (also: vast) +kunde | could +något | some etc +från | from, off +ut | out +när | when +efter | after, behind +upp | up +vi | we +dem | them +vara | be +vad | what +över | over +än | than +dig | you +kan | can +sina | his +här | here +ha | have +mot | towards +alla | all +under | under (also: wonder) +någon | some etc +eller | or (else) +allt | all +mycket | much +sedan | since +ju | why +denna | this/that +själv | myself, yourself etc +detta | this/that +åt | to +utan | without +varit | was +hur | how +ingen | no +mitt | my +ni | you +bli | to be, become +blev | from bli +oss | us +din | thy +dessa | these/those +några | some etc +deras | their +blir | from bli +mina | my +samma | (the) same +vilken | who, that +er | you, your +sådan | such a +vår | our 
+blivit | from bli +dess | its +inom | within +mellan | between +sådant | such a +varför | why +varje | each +vilka | who, that +ditt | thy +vem | who +vilket | who, that +sitta | his +sådana | such a +vart | each +dina | thy +vars | whose +vårt | our +våra | our +ert | your +era | your +vilkas | whose + diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_th.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_th.txt new file mode 100644 index 0000000..07f0fab --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_th.txt @@ -0,0 +1,119 @@ +# Thai stopwords from: +# "Opinion Detection in Thai Political News Columns +# Based on Subjectivity Analysis" +# Khampol Sukhum, Supot Nitsuwat, and Choochart Haruechaiyasak +ไว้ +ไม่ +ไป +ได้ +ให้ +ใน +โดย +แห่ง +แล้ว +และ +แรก +แบบ +แต่ +เอง +เห็น +เลย +เริ่ม +เรา +เมื่อ +เพื่อ +เพราะ +เป็นการ +เป็น +เปิดเผย +เปิด +เนื่องจาก +เดียวกัน +เดียว +เช่น +เฉพาะ +เคย +เข้า +เขา +อีก +อาจ +อะไร +ออก +อย่าง +อยู่ +อยาก +หาก +หลาย +หลังจาก +หลัง +หรือ +หนึ่ง +ส่วน +ส่ง +สุด +สําหรับ +ว่า +วัน +ลง +ร่วม +ราย +รับ +ระหว่าง +รวม +ยัง +มี +มาก +มา +พร้อม +พบ +ผ่าน +ผล +บาง +น่า +นี้ +นํา +นั้น +นัก +นอกจาก +ทุก +ที่สุด +ที่ +ทําให้ +ทํา +ทาง +ทั้งนี้ +ทั้ง +ถ้า +ถูก +ถึง +ต้อง +ต่างๆ +ต่าง +ต่อ +ตาม +ตั้งแต่ +ตั้ง +ด้าน +ด้วย +ดัง +ซึ่ง +ช่วง +จึง +จาก +จัด +จะ +คือ +ความ +ครั้ง +คง +ขึ้น +ของ +ขอ +ขณะ +ก่อน +ก็ +การ +กับ +กัน +กว่า +กล่าว diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_tr.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_tr.txt new file mode 100644 index 0000000..84d9408 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/stopwords_tr.txt @@ -0,0 +1,212 @@ +# Turkish stopwords from LUCENE-559 +# merged with the list from "Information 
Retrieval on Turkish Texts" +# (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf) +acaba +altmış +altı +ama +ancak +arada +aslında +ayrıca +bana +bazı +belki +ben +benden +beni +benim +beri +beş +bile +bin +bir +birçok +biri +birkaç +birkez +birşey +birşeyi +biz +bize +bizden +bizi +bizim +böyle +böylece +bu +buna +bunda +bundan +bunlar +bunları +bunların +bunu +bunun +burada +çok +çünkü +da +daha +dahi +de +defa +değil +diğer +diye +doksan +dokuz +dolayı +dolayısıyla +dört +edecek +eden +ederek +edilecek +ediliyor +edilmesi +ediyor +eğer +elli +en +etmesi +etti +ettiği +ettiğini +gibi +göre +halen +hangi +hatta +hem +henüz +hep +hepsi +her +herhangi +herkesin +hiç +hiçbir +için +iki +ile +ilgili +ise +işte +itibaren +itibariyle +kadar +karşın +katrilyon +kendi +kendilerine +kendini +kendisi +kendisine +kendisini +kez +ki +kim +kimden +kime +kimi +kimse +kırk +milyar +milyon +mu +mü +mı +nasıl +ne +neden +nedenle +nerde +nerede +nereye +niye +niçin +o +olan +olarak +oldu +olduğu +olduğunu +olduklarını +olmadı +olmadığı +olmak +olması +olmayan +olmaz +olsa +olsun +olup +olur +olursa +oluyor +on +ona +ondan +onlar +onlardan +onları +onların +onu +onun +otuz +oysa +öyle +pek +rağmen +sadece +sanki +sekiz +seksen +sen +senden +seni +senin +siz +sizden +sizi +sizin +şey +şeyden +şeyi +şeyler +şöyle +şu +şuna +şunda +şundan +şunları +şunu +tarafından +trilyon +tüm +üç +üzere +var +vardı +ve +veya +ya +yani +yapacak +yapılan +yapılması +yapıyor +yapmak +yaptı +yaptığı +yaptığını +yaptıkları +yedi +yerine +yetmiş +yine +yirmi +yoksa +yüz +zaten diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/userdict_ja.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/userdict_ja.txt new file mode 100644 index 0000000..6f0368e --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/lang/userdict_ja.txt @@ -0,0 +1,29 @@ +# +# This is a sample user dictionary 
for Kuromoji (JapaneseTokenizer) +# +# Add entries to this file in order to override the statistical model in terms +# of segmentation, readings and part-of-speech tags. Notice that entries do +# not have weights since they are always used when found. This is by-design +# in order to maximize ease-of-use. +# +# Entries are defined using the following CSV format: +# , ... , ... , +# +# Notice that a single half-width space separates tokens and readings, and +# that the number tokens and readings must match exactly. +# +# Also notice that multiple entries with the same is undefined. +# +# Whitespace only lines are ignored. Comments are not allowed on entry lines. +# + +# Custom segmentation for kanji compounds +日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞 +関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞 + +# Custom segmentation for compound katakana +トートバッグ,トート バッグ,トート バッグ,かずカナ名詞 +ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞 + +# Custom reading for former sumo wrestler +朝青龍,朝青龍,アサショウリュウ,カスタム人名 diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/managed-schema.xml b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/managed-schema.xml new file mode 100644 index 0000000..81030b3 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/managed-schema.xml @@ -0,0 +1,1080 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + id + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/protwords.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/protwords.txt new file mode 100644 index 0000000..1dfc0ab --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/protwords.txt @@ -0,0 +1,21 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#----------------------------------------------------------------------- +# Use a protected word file to protect against the stemmer reducing two +# unrelated words to the same base word. + +# Some non-words that normally won't be encountered, +# just to test that they won't be stemmed. +dontstems +zwhacky + diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/solrconfig.xml b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/solrconfig.xml new file mode 100644 index 0000000..7956716 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/solrconfig.xml @@ -0,0 +1,1164 @@ + + + + + + + + + 9.0 + + + + + + + + + + + ${solr.data.dir:} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ${solr.lock.type:native} + + + + + + + + + + + + + + + + + + + + + ${solr.ulog.dir:} + ${solr.ulog.numVersionBuckets:65536} + + + + + ${solr.autoCommit.maxTime:15000} + false + + + + + + ${solr.autoSoftCommit.maxTime:-1} + + + + + + + + + + + + + + ${solr.max.booleanClauses:1024} + + + + + + + + + + + + + + + + + + + + + + + + true + + + + + + 20 + + + 200 + + + + + + + + + + + + + + + + + + + false + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + explicit + 10 + + + + + + + explicit + json + true + + + + + + + _text_ + + + + + + + text_general + + + + + + default + _text_ + solr.DirectSolrSpellChecker + + internal + + 0.5 + + 2 + + 1 + + 5 + + 4 + + 0.01 + + + + + + + + + + + + default + on + true + 10 + 5 + 5 + true + true + 10 + 5 + + + spellcheck + + + + + + + + + + + + 100 + + + + + + + + 70 + + 0.5 + + [-\w ,/\n\"']{20,200} + + + + + + + ]]> + ]]> + + + + + + + + + + + + + + + + + + + + + + + + ,, + ,, + ,, + ,, + ,]]> + ]]> + + + + + + 10 + .,!? 
+ + + + + + + WORD + + + en + US + + + + + + + + + + + + + + ^[^_].* + + java.lang.String + text_general + + str_* + 8192 + + + lowercase_* + 8192 + + + edge_* + 8192 + + + whitespace_* + 8192 + + + whitespace_edge_* + 8192 + + + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/stopwords.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/stopwords.txt new file mode 100644 index 0000000..ae1e83e --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/stopwords.txt @@ -0,0 +1,14 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/synonyms.txt b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/synonyms.txt new file mode 100644 index 0000000..eab4ee8 --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/conf/synonyms.txt @@ -0,0 +1,29 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#----------------------------------------------------------------------- +#some test synonym mappings unlikely to appear in real input text +aaafoo => aaabar +bbbfoo => bbbfoo bbbbar +cccfoo => cccbar cccbaz +fooaaa,baraaa,bazaaa + +# Some synonym groups specific to this example +GB,gib,gigabyte,gigabytes +MB,mib,megabyte,megabytes +Television, Televisions, TV, TVs +#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming +#after us won't split it into two words. + +# Synonym mappings can be used for spelling correction too +pixima => pixma + diff --git a/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/core.properties b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/core.properties new file mode 100644 index 0000000..6527d4a --- /dev/null +++ b/dataload/08_create_other_dbs/solr/solr_config_template/grebi_results/core.properties @@ -0,0 +1,3 @@ +#Written by CorePropertiesLocator +#Wed Jun 29 18:23:24 UTC 2022 +name=grebi_results diff --git a/dataload/06_prepare_db_import/solr_config_template/solr.xml b/dataload/08_create_other_dbs/solr/solr_config_template/solr.xml similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/solr.xml rename to dataload/08_create_other_dbs/solr/solr_config_template/solr.xml diff --git a/dataload/06_prepare_db_import/solr_config_template/solrconfig.xml b/dataload/08_create_other_dbs/solr/solr_config_template/solrconfig.xml similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/solrconfig.xml rename to 
dataload/08_create_other_dbs/solr/solr_config_template/solrconfig.xml diff --git a/dataload/06_prepare_db_import/solr_config_template/zoo.cfg b/dataload/08_create_other_dbs/solr/solr_config_template/zoo.cfg similarity index 100% rename from dataload/06_prepare_db_import/solr_config_template/zoo.cfg rename to dataload/08_create_other_dbs/solr/solr_config_template/zoo.cfg diff --git a/dataload/07_create_db/solr/solr_import.dockerpy b/dataload/08_create_other_dbs/solr/solr_import.dockerpy similarity index 100% rename from dataload/07_create_db/solr/solr_import.dockerpy rename to dataload/08_create_other_dbs/solr/solr_import.dockerpy diff --git a/dataload/07_create_db/solr/solr_import.slurm.py b/dataload/08_create_other_dbs/solr/solr_import.slurm.py similarity index 89% rename from dataload/07_create_db/solr/solr_import.slurm.py rename to dataload/08_create_other_dbs/solr/solr_import.slurm.py index 8fba3a8..d8d62d4 100644 --- a/dataload/07_create_db/solr/solr_import.slurm.py +++ b/dataload/08_create_other_dbs/solr/solr_import.slurm.py @@ -32,11 +32,11 @@ def main(): 'singularity run', '--env PYTHONUNBUFFERED=TRUE', '--env NO_PROXY=localhost', - ] + list(map(lambda f: "--bind " + os.path.abspath(f) + ":/mnt/" + os.path.basename(f), glob.glob(args.in_data + "/solr_*"))) + [ + ] + list(map(lambda f: "--bind " + os.path.abspath(f) + ":/mnt/" + os.path.basename(f), glob.glob(args.in_data + "/*.jsonl"))) + [ ('--bind ' + os.path.abspath(args.in_names_txt) + ':/names.txt') if args.in_names_txt != None else '', '--bind ' + os.path.abspath(args.solr_config) + ':/config', '--bind ' + os.path.abspath(args.out_path) + ':/var/solr', - '--bind ' + os.path.abspath(os.path.join(os.environ['GREBI_DATALOAD_HOME'], '07_create_db/solr/solr_import.dockerpy')) + ':/import.py', + '--bind ' + os.path.abspath(os.path.join(os.environ['GREBI_DATALOAD_HOME'], '08_create_other_dbs/solr/solr_import.dockerpy')) + ':/import.py', #'--writable-tmpfs', '--net --network=none', 
'docker://ghcr.io/ebispot/grebi_solr_with_extras:9.5.0', @@ -49,11 +49,11 @@ def main(): '--user="$(id -u):$(id -g)" ' '-e PYTHONUNBUFFERED=TRUE', '-e NO_PROXY=localhost', - ] + list(map(lambda f: "-v " + os.path.abspath(f) + ":/mnt/" + os.path.basename(f), glob.glob(args.in_data + "/solr_*"))) + [ + ] + list(map(lambda f: "-v " + os.path.abspath(f) + ":/mnt/" + os.path.basename(f), glob.glob(args.in_data + "/*.jsonl"))) + [ ('-v ' + os.path.abspath(args.in_names_txt) + ':/names.txt') if args.in_names_txt != None else '', '-v ' + os.path.abspath(args.solr_config) + ':/config', '-v ' + os.path.abspath(args.out_path) + ':/var/solr', - '-v ' + os.path.abspath(os.path.join(os.environ['GREBI_DATALOAD_HOME'], '07_create_db/solr/solr_import.dockerpy')) + ':/import.py', + '-v ' + os.path.abspath(os.path.join(os.environ['GREBI_DATALOAD_HOME'], '08_create_other_dbs/solr/solr_import.dockerpy')) + ':/import.py', 'ghcr.io/ebispot/grebi_solr_with_extras:9.5.0', 'python3 /import.py', args.core, args.port, args.mem ]) diff --git a/dataload/06_prepare_db_import/grebi_make_compressed_blob/Cargo.lock b/dataload/08_create_other_dbs/sqlite/grebi_make_compressed_blob/Cargo.lock similarity index 100% rename from dataload/06_prepare_db_import/grebi_make_compressed_blob/Cargo.lock rename to dataload/08_create_other_dbs/sqlite/grebi_make_compressed_blob/Cargo.lock diff --git a/dataload/06_prepare_db_import/grebi_make_compressed_blob/Cargo.toml b/dataload/08_create_other_dbs/sqlite/grebi_make_compressed_blob/Cargo.toml similarity index 85% rename from dataload/06_prepare_db_import/grebi_make_compressed_blob/Cargo.toml rename to dataload/08_create_other_dbs/sqlite/grebi_make_compressed_blob/Cargo.toml index a00056f..acd9de3 100644 --- a/dataload/06_prepare_db_import/grebi_make_compressed_blob/Cargo.toml +++ b/dataload/08_create_other_dbs/sqlite/grebi_make_compressed_blob/Cargo.toml @@ -5,7 +5,7 @@ edition = "2021" [dependencies] clap = { version = "4.4.11", features = ["derive"] } 
-grebi_shared = { path = "../../grebi_shared" } +grebi_shared = { path = "../../../grebi_shared" } flate2 = {version="1.0.28", features=["zlib-ng"]} serde_json = { version = "1.0.108", features=["preserve_order"] } jemallocator = "0.5.4" diff --git a/dataload/06_prepare_db_import/grebi_make_compressed_blob/src/main.rs b/dataload/08_create_other_dbs/sqlite/grebi_make_compressed_blob/src/main.rs similarity index 100% rename from dataload/06_prepare_db_import/grebi_make_compressed_blob/src/main.rs rename to dataload/08_create_other_dbs/sqlite/grebi_make_compressed_blob/src/main.rs diff --git a/dataload/07_create_db/sqlite/grebi_make_sqlite/Cargo.lock b/dataload/08_create_other_dbs/sqlite/grebi_make_sqlite/Cargo.lock similarity index 100% rename from dataload/07_create_db/sqlite/grebi_make_sqlite/Cargo.lock rename to dataload/08_create_other_dbs/sqlite/grebi_make_sqlite/Cargo.lock diff --git a/dataload/07_create_db/sqlite/grebi_make_sqlite/Cargo.toml b/dataload/08_create_other_dbs/sqlite/grebi_make_sqlite/Cargo.toml similarity index 100% rename from dataload/07_create_db/sqlite/grebi_make_sqlite/Cargo.toml rename to dataload/08_create_other_dbs/sqlite/grebi_make_sqlite/Cargo.toml diff --git a/dataload/07_create_db/sqlite/grebi_make_sqlite/src/main.rs b/dataload/08_create_other_dbs/sqlite/grebi_make_sqlite/src/main.rs similarity index 100% rename from dataload/07_create_db/sqlite/grebi_make_sqlite/src/main.rs rename to dataload/08_create_other_dbs/sqlite/grebi_make_sqlite/src/main.rs diff --git a/dataload/Cargo.lock b/dataload/Cargo.lock index 708725a..c36c9ed 100644 --- a/dataload/Cargo.lock +++ b/dataload/Cargo.lock @@ -482,67 +482,80 @@ dependencies = [ ] [[package]] -name = "grebi_make_compressed_blob" +name = "grebi_link" version = "0.1.0" dependencies = [ "clap", "flate2", "grebi_shared", + "hex", "jemallocator", "serde_json", + "sha1", ] [[package]] -name = "grebi_make_neo_csv" +name = "grebi_link_results" version = "0.1.0" dependencies = [ + "bloomfilter", 
"clap", - "flate2", + "csv", "grebi_shared", "jemallocator", + "lmdb-zero", "serde_json", ] [[package]] -name = "grebi_make_neo_ids_csv" +name = "grebi_make_compressed_blob" version = "0.1.0" dependencies = [ + "clap", + "flate2", "grebi_shared", "jemallocator", + "serde_json", ] [[package]] -name = "grebi_make_solr" +name = "grebi_make_neo_csv" version = "0.1.0" dependencies = [ "clap", + "flate2", "grebi_shared", "jemallocator", "serde_json", ] [[package]] -name = "grebi_make_sqlite" +name = "grebi_make_neo_ids_csv" +version = "0.1.0" +dependencies = [ + "grebi_shared", + "jemallocator", +] + +[[package]] +name = "grebi_make_solr" version = "0.1.0" dependencies = [ "clap", "grebi_shared", "jemallocator", - "rusqlite", "serde_json", ] [[package]] -name = "grebi_materialise" +name = "grebi_make_sqlite" version = "0.1.0" dependencies = [ "clap", - "flate2", "grebi_shared", - "hex", "jemallocator", + "rusqlite", "serde_json", - "sha1", ] [[package]] diff --git a/dataload/Cargo.toml b/dataload/Cargo.toml index 8658907..be0ef35 100644 --- a/dataload/Cargo.toml +++ b/dataload/Cargo.toml @@ -20,12 +20,13 @@ members = [ "02_assign_ids/grebi_superclasses2types", "03_merge/grebi_merge", "04_index/grebi_index", - "05_materialise/grebi_materialise", - "06_prepare_db_import/grebi_make_neo_csv", - "06_prepare_db_import/grebi_make_neo_ids_csv", - "06_prepare_db_import/grebi_make_solr", - "06_prepare_db_import/grebi_make_compressed_blob", - "07_create_db/sqlite/grebi_make_sqlite", + "05_link/grebi_link", + "06_create_neo_db/grebi_make_neo_csv", + "06_create_neo_db/grebi_make_neo_ids_csv", + "08_create_other_dbs/solr/grebi_make_solr", + "08_create_other_dbs/solr/grebi_link_results", + "08_create_other_dbs/sqlite/grebi_make_compressed_blob", + "08_create_other_dbs/sqlite/grebi_make_sqlite", "grebi_shared" ] diff --git a/dataload/configs/datasource_configs/ols_efo_only.yaml b/dataload/configs/datasource_configs/ols_efo_only.yaml new file mode 100644 index 0000000..5093fc6 --- 
/dev/null +++ b/dataload/configs/datasource_configs/ols_efo_only.yaml @@ -0,0 +1,9 @@ +name: OLS +enabled: true +ingests: + - globs: ["/data/ontologies/ontologies.json"] + command: ' + grebi_ingest_ols + --datasource-name $GREBI_INGEST_DATASOURCE_NAME + --ontologies efo + --skip-obsolete' diff --git a/dataload/configs/pipeline_configs/ebi.json b/dataload/configs/pipeline_configs/ebi.json deleted file mode 100644 index 020fca2..0000000 --- a/dataload/configs/pipeline_configs/ebi.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "subgraphs": [ - "impc_x_gwas" ]} diff --git a/dataload/configs/pipeline_configs/hett_only.json b/dataload/configs/pipeline_configs/hett_only.json deleted file mode 100644 index ce5df8e..0000000 --- a/dataload/configs/pipeline_configs/hett_only.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "subgraphs": [ - "hett" - ] -} \ No newline at end of file diff --git a/dataload/configs/pipeline_configs/hra_only.json b/dataload/configs/pipeline_configs/hra_only.json deleted file mode 100644 index 92f8fd3..0000000 --- a/dataload/configs/pipeline_configs/hra_only.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "subgraphs": [ - "hra_kg" - ] -} \ No newline at end of file diff --git a/dataload/configs/subgraph_configs/Makefile b/dataload/configs/subgraph_configs/Makefile index 90e799d..9251d30 100644 --- a/dataload/configs/subgraph_configs/Makefile +++ b/dataload/configs/subgraph_configs/Makefile @@ -1,6 +1,6 @@ -all: ebi_monarch_xspecies.json ebi_monarch.json hett.json hra_kg.json impc.json monarch.json impc_x_gwas.json +all: ebi_monarch_xspecies.json ebi_monarch.json hett.json hra_kg.json impc.json monarch.json impc_x_gwas.json gwas_and_efo.json %.json: src/%.py python3 $< > $@ diff --git a/dataload/configs/subgraph_configs/gwas_and_efo.json b/dataload/configs/subgraph_configs/gwas_and_efo.json new file mode 100644 index 0000000..ade001c --- /dev/null +++ b/dataload/configs/subgraph_configs/gwas_and_efo.json @@ -0,0 +1,141 @@ +{ + "id": "EBI_MONARCH", + "name": "EBI Resources 
and MONARCH Initiative KG", + "bytes_per_merged_file": 1073741824, + "identifier_props": [ + "id", + "owl:equivalentClass", + "owl:equivalentProperty", + "owl:sameAs", + "grebi:hashId", + "grebi:equivalentTo", + "ols:iri", + "ols:shortForm", + "hgnc:ensembl_gene_id", + "obo:chebi/inchi", + "obo:chebi/inchikey", + "obo:chebi/smiles", + "impc:pmId", + "impc:humanGeneAccId", + "monarch:iri", + "skos:exactMatch", + "ncit:P368", + "ncit:C98965", + "dcterms:identifier", + "oboinowl:hasAlternativeId", + "robokop:equivalent_identifiers", + "mesh.vocab:identifier" + ], + "type_superclasses": [ + "mondo:0000001", + "efo:0000408", + "chebi:36080", + "chebi:24431", + "biolink:ChemicalEntity" + ], + "additional_equivalence_groups": [ + [ + "grebi:name", + "ols:label", + "rdfs:label", + "monarch:name", + "impc:name", + "reactome:displayName", + "dcterms:title", + "ncit:Preferred_Name", + "robokop:name" + ], + [ + "grebi:description", + "iao:definition", + "monarch:description", + "ols:definition", + "robokop:description" + ], + [ + "grebi:synonym", + "monarch:synonym", + "iao:alternative_label", + "ols:synonym", + "oboinowl:hasExactSynonym", + "dcterms:alternative" + ], + [ + "mondo:0000001", + "ogms:0000031" + ], + [ + "biolink:broad_match", + "skos:broader", + "skos:broadMatch", + "ols:directAncestor" + ], + [ + "biolink:subclass_of", + "ols:directParent", + "rdfs:subClassOf", + "rdfs:subPropertyOf" + ], + [ + "rdfs:isDefinedBy", + "ols:ontologyIri", + "ols:ontologyId" + ] + ], + "exclude_props": [ + "ols:hierarchicalProperty", + "ols:synonymProperty", + "ols:curie", + "ols:shortForm", + "ols:ontologyPreferredPrefix", + "ols:iri", + "ols:uri", + "ols:imported", + "ols:hasHierarchicalParents", + "ols:hasHierarchicalChildren", + "ols:hasDirectParents", + "ols:hasDirectChildren", + "ols:numDescendants", + "ols:numHierarchicalDescendants", + "oboinowl:id", + "oboinowl:url", + "monarch:iri", + "cco:hasDocument", + "cco:hasMolecule" + ], + "exclude_edges": [], + 
"exclude_self_referential_edges": [ + "foaf:page", + "rdfs:seeAlso", + "oboinowl:hasDbXref", + "biolink:subclass_of", + "biolink:broad_match", + "ols:ontology_purl", + "ols:ontologyId", + "cheminf:000407", + "cheminf:InChIKey", + "biolink:interacts_with", + "reactome:url", + "dc:Identifier", + "hgnc:agr", + "hgnc:gencc", + "monarch:xref", + "hgnc:uniprot_ids", + "hgnc:omim_id", + "hgnc:entrez_id", + "reactome:referenceGene", + "reactome:identifier", + "reactome:crossReference", + "edam:Ensembl_gene_id", + "obo:pr#has_gene_template", + "ols:relatedTo", + "ols:relatedFrom", + "robokop:smiles", + "ctd:ChemicalURL", + "ctd:UniProtIDs" + ], + "datasource_configs": [ + "./configs/datasource_configs/gwas.yaml", + "./configs/datasource_configs/ols_efo_only.yaml" + ] +} diff --git a/dataload/configs/subgraph_configs/src/gwas_and_efo.py b/dataload/configs/subgraph_configs/src/gwas_and_efo.py new file mode 100644 index 0000000..350f617 --- /dev/null +++ b/dataload/configs/subgraph_configs/src/gwas_and_efo.py @@ -0,0 +1,14 @@ + +from shared import config + +config['id'] = 'EBI_MONARCH' +config['name'] = 'EBI Resources and MONARCH Initiative KG' +config['datasource_configs'] = [ + "./configs/datasource_configs/gwas.yaml", + "./configs/datasource_configs/ols_efo_only.yaml" +] + +if __name__ == '__main__': + import json + print(json.dumps(config, indent=2)) + diff --git a/dataload/nextflow/codon_nextflow.config b/dataload/nextflow/codon_nextflow.config index 480e15d..79b0ed6 100644 --- a/dataload/nextflow/codon_nextflow.config +++ b/dataload/nextflow/codon_nextflow.config @@ -42,6 +42,12 @@ process { cpus = 32 } } +process { + withName: create_solr_results_cores { + memory = 150.GB + cpus = 32 + } +} @@ -96,6 +102,17 @@ process { } } +process { + withName: results_to_csv { + memory = 150.GB + } +} +process { + withName: link_results { + memory = 150.GB + } +} + diff --git a/dataload/nextflow/load_subgraph.nf b/dataload/nextflow/load_subgraph.nf index c689f25..a8d795a 100644 --- 
a/dataload/nextflow/load_subgraph.nf +++ b/dataload/nextflow/load_subgraph.nf @@ -4,12 +4,9 @@ nextflow.enable.dsl=2 import groovy.json.JsonSlurper jsonSlurper = new JsonSlurper() -params.tmp = "$GREBI_TMP" +params.out = "$GREBI_OUT_DIR" params.home = "$GREBI_DATALOAD_HOME" -params.config = "$GREBI_CONFIG" params.subgraph = "$GREBI_SUBGRAPH" -params.timestamp = "$GREBI_TIMESTAMP" -params.is_ebi = "$GREBI_IS_EBI" params.solr_mem = "140g" params.neo_tmp_path = "/dev/shm" @@ -30,13 +27,13 @@ workflow { indexed = index(merged.collect()) - link(merged.flatten(), indexed.metadata_jsonl, indexed.summary_json, Channel.value(config.exclude_edges + config.identifier_props), Channel.value(config.exclude_self_referential_edges + config.identifier_props), groups_txt) - merge_summary_jsons(indexed.summary_json.collect() + link.out.linked_summary.collect()) + link(merged.flatten(), indexed.entity_metadata_jsonl, indexed.graph_metadata_json, Channel.value(config.exclude_edges + config.identifier_props), Channel.value(config.exclude_self_referential_edges + config.identifier_props), groups_txt) + merge_graph_metadata_jsons(indexed.graph_metadata_json.collect() + link.out.linked_summary.collect()) compressed_blobs = create_compressed_blobs(link.out.nodes.mix(link.out.edges)) sqlite = create_sqlite(compressed_blobs.collect()) - neo_input_dir = prepare_neo(indexed.summary_json, link.out.nodes, link.out.edges) + neo_input_dir = prepare_neo(indexed.graph_metadata_json, link.out.nodes, link.out.edges) ids_csv = create_neo_ids_csv(indexed.ids_txt) neo_db = create_neo( @@ -46,31 +43,21 @@ workflow { ids_csv.collect() ) - mat_queries_csvs = run_materialised_queries(neo_db) - mat_queries_sqlite = csvs_to_sqlite(mat_queries_csvs.collect()) + run_materialised_queries(neo_db) + + csv_results = results_to_csv(run_materialised_queries.out.results.flatten()) + linked_results = link_results(run_materialised_queries.out.results.flatten(), indexed.entity_metadata_jsonl, groups_txt) + + 
add_query_metadatas_to_graph_metadata(run_materialised_queries.out.metadata.flatten().collect(), merge_graph_metadata_jsons.out) solr_inputs = prepare_solr(link.out.nodes, link.out.edges) - solr_nodes_core = create_solr_nodes_core(prepare_solr.out.nodes.collect(), indexed.names_txt) - solr_edges_core = create_solr_edges_core(prepare_solr.out.edges.collect(), indexed.names_txt) + solr_nodes_core = create_solr_nodes_core(prepare_solr.out.nodes.collect(), indexed.names_txt, merge_graph_metadata_jsons.out) + solr_edges_core = create_solr_edges_core(prepare_solr.out.edges.collect(), indexed.names_txt, merge_graph_metadata_jsons.out) solr_autocomplete_core = create_solr_autocomplete_core(indexed.names_txt) + solr_results_cores = create_solr_results_cores(linked_results) - solr_tgz = package_solr(solr_nodes_core, solr_edges_core, solr_autocomplete_core) + solr_tgz = package_solr(solr_nodes_core.concat(solr_edges_core).concat(solr_autocomplete_core).concat(solr_results_cores).collect()) neo_tgz = package_neo(neo_db) - - if(params.is_ebi == "true") { - copy_summary_to_ftp(merge_summary_jsons.out) - copy_solr_to_ftp(solr_tgz) - copy_neo_to_ftp(neo_tgz) - copy_sqlite_to_ftp(sqlite) - copy_mat_queries_to_ftp(mat_queries_csvs, mat_queries_sqlite) - - copy_summary_to_staging(merge_summary_jsons.out) - copy_solr_config_to_staging() - copy_solr_cores_to_staging(solr_nodes_core.concat(solr_edges_core).concat(solr_autocomplete_core)) - copy_sqlite_to_staging(sqlite) - copy_neo_to_staging(neo_db) - copy_mat_queries_to_staging(mat_queries_sqlite) - } } process prepare { @@ -206,14 +193,12 @@ process index { memory "4 GB" time "8h" - publishDir "${params.tmp}/${params.config}/${params.subgraph}", overwrite: true - input: val(merged_filenames) output: - path("metadata.jsonl"), emit: metadata_jsonl - path("summary.json"), emit: summary_json + path("entity_metadata.jsonl"), emit: entity_metadata_jsonl + path("graph_metadata.json"), emit: graph_metadata_json path("names.txt"), emit: 
names_txt path("ids_${params.subgraph}.txt"), emit: ids_txt @@ -224,8 +209,8 @@ process index { cat ${merged_filenames.iterator().join(" ")} \ | ${params.home}/target/release/grebi_index \ --subgraph-name ${params.subgraph} \ - --out-metadata-jsonl-path metadata.jsonl \ - --out-summary-json-path summary.json \ + --out-entity-metadata-jsonl-path entity_metadata.jsonl \ + --out-graph-metadata-json-path graph_metadata.json \ --out-names-txt names.txt \ --out-ids-txt ids_${params.subgraph}.txt """ @@ -241,8 +226,8 @@ process link { input: path(merged_filename) - path(metadata_jsonl) - path(index_summary_json) + path(entity_metadata_jsonl) + path(index_graph_metadata_json) val(exclude) val(exclude_self_referential) path(groups_txt) @@ -250,7 +235,7 @@ process link { output: path("linked_nodes_${task.index}.jsonl"), emit: nodes path("linked_edges_${task.index}.jsonl"), emit: edges - path("linked_summary_${task.index}.json"), emit: linked_summary + path("linked_graph_metadata_${task.index}.json"), emit: linked_summary script: """ @@ -258,35 +243,33 @@ process link { set -Eeuo pipefail cat ${merged_filename} \ | ${params.home}/target/release/grebi_link \ - --in-metadata-jsonl ${metadata_jsonl} \ - --in-summary-json ${index_summary_json} \ + --in-metadata-jsonl ${entity_metadata_jsonl} \ + --in-graph-metadata-json ${index_graph_metadata_json} \ --groups-txt ${groups_txt} \ --out-edges-jsonl linked_edges_${task.index}.jsonl \ - --out-summary-json linked_summary_${task.index}.json \ + --out-graph-metadata-json linked_graph_metadata_${task.index}.json \ --exclude ${exclude.iterator().join(",")} \ --exclude-self-referential ${exclude_self_referential.iterator().join(",")} \ > linked_nodes_${task.index}.jsonl """ } -process merge_summary_jsons { +process merge_graph_metadata_jsons { cache "lenient" memory "4 GB" time "1h" - publishDir "${params.tmp}/${params.config}/${params.subgraph}", overwrite: true - input: - path(summary_jsons) + path(graph_metadata_jsons) output: - 
path("${params.subgraph}_summary.json") + path("${params.subgraph}_metadata_merged.json") script: """ #!/usr/bin/env bash set -Eeuo pipefail - python3 ${params.home}/05_materialise/merge_summary_jsons.py ${summary_jsons} > ${params.subgraph}_summary.json + python3 ${params.home}/05_link/merge_graph_metadata_jsons.py ${graph_metadata_jsons} > ${params.subgraph}_metadata_merged.json """ } @@ -317,7 +300,7 @@ process create_sqlite { errorStrategy 'retry' maxRetries 10 - publishDir "${params.tmp}/${params.config}/${params.subgraph}", overwrite: true + publishDir "${params.out}/${params.subgraph}", overwrite: true input: val(compressed_blobs) @@ -343,10 +326,8 @@ process prepare_neo { memory "4 GB" time "1h" - publishDir "${params.tmp}/${params.config}/${params.subgraph}/neo4j_csv", overwrite: true - input: - path(summary_json) + path(graph_metadata_json) path(nodes_jsonl) path(edges_jsonl) @@ -360,7 +341,7 @@ process prepare_neo { #!/usr/bin/env bash set -Eeuo pipefail ${params.home}/target/release/grebi_make_neo_csv \ - --in-summary-jsons ${summary_json} \ + --in-graph-metadata-jsons ${graph_metadata_json} \ --in-nodes-jsonl ${nodes_jsonl} \ --in-edges-jsonl ${edges_jsonl} \ --out-nodes-csv-path neo_nodes_${params.subgraph}_${task.index}.csv \ @@ -421,8 +402,6 @@ process create_neo { time "8h" cpus "8" - publishDir "${params.tmp}/${params.config}/${params.subgraph}", overwrite: true - input: path(neo_inputs) @@ -433,7 +412,7 @@ process create_neo { """ #!/usr/bin/env bash set -Eeuo pipefail - PYTHONUNBUFFERED=true python3 ${params.home}/07_create_db/neo4j/neo4j_import.slurm.py \ + PYTHONUNBUFFERED=true python3 ${params.home}/06_create_neo_db/neo4j_import.slurm.py \ --in-csv-path . 
\ --out-db-path ${params.subgraph}_neo4j """ @@ -445,22 +424,99 @@ process run_materialised_queries { time "8h" cpus "8" - publishDir "${params.tmp}/${params.config}/${params.subgraph}", overwrite: true + publishDir "${params.out}/${params.subgraph}", overwrite: true input: path(neo_db) output: - path("materialised_queries/*") + path("query_results/queries.json"), emit: metadata + path("query_results/*.results.jsonl"), emit: results + path("query_results/*.json"), emit: metadatas script: """ #!/usr/bin/env bash set -Eeuo pipefail cp -r ${neo_db}/* ${params.neo_tmp_path} - PYTHONUNBUFFERED=true python3 ${params.home}/08_run_queries/run_queries.py \ + PYTHONUNBUFFERED=true python3 ${params.home}/07_run_queries/run_queries.py \ --in-db-path ${params.neo_tmp_path} \ - --out-csvs-path materialised_queries + --out-jsons-path query_results + """ +} + +process results_to_csv { + cache "lenient" + memory "8 GB" + time "8h" + cpus "8" + + input: + path(results_jsonl) + + output: + path("${results_jsonl.simpleName}.csv.gz") + + script: + """ + #!/usr/bin/env bash + set -Eeuo pipefail + cat ${results_jsonl} | \ + python3 ${params.home}/07_run_queries/jsonl_to_csv.py \ + | pigz --best > ${results_jsonl.simpleName}.csv.gz + """ +} + +process link_results { + cache "lenient" + memory "8 GB" + time "8h" + cpus "8" + + input: + path(results_jsonl) + path(entity_metadata_jsonl) + path(groups_txt) + + output: + path("${results_jsonl.simpleName}.linked_results.jsonl") + + script: + """ + #!/usr/bin/env bash + set -Eeuo pipefail + cat ${results_jsonl} | \ + ${params.home}/target/release/grebi_link_results \ + --in-metadata-jsonl ${entity_metadata_jsonl} \ + --groups-txt ${groups_txt} \ + > ${results_jsonl.simpleName}.linked_results.jsonl + """ +} + +process add_query_metadatas_to_graph_metadata { + + cache "lenient" + memory "8 GB" + time "8h" + cpus "8" + + publishDir "${params.out}/${params.subgraph}", overwrite: true + + input: + path(metadata_jsons) + path(graph_metadata_json) + + 
output: + path("${params.subgraph}_metadata.json") + + script: + """ + #!/usr/bin/env bash + set -Eeuo pipefail + python3 ${params.home}/07_run_queries/add_query_metadatas_to_graph_metadata.py \ + ${graph_metadata_json} \ + ${metadata_jsons} \ + > ${params.subgraph}_metadata.json """ } @@ -470,7 +526,7 @@ process csvs_to_sqlite { time "12h" cpus "8" - publishDir "${params.tmp}/${params.config}/${params.subgraph}", overwrite: true + publishDir "${params.out}/${params.subgraph}", overwrite: true input: path(csvs) @@ -483,7 +539,7 @@ process csvs_to_sqlite { #!/usr/bin/env bash set -Eeuo pipefail cp -r ${neo_db}/* ${params.neo_tmp_path} - PYTHONUNBUFFERED=true python3 ${params.home}/08_run_queries/csvs_to_sqlite.py --out-sqlite-path materialised_queries.sqlite3 + PYTHONUNBUFFERED=true python3 ${params.home}/07_run_queries/csvs_to_sqlite.py --out-sqlite-path materialised_queries.sqlite3 pigz --best materialised_queries.sqlite3 """ } @@ -494,12 +550,11 @@ process create_solr_nodes_core { memory "4 GB" time "23h" cpus "8" - - publishDir "${params.tmp}/${params.config}/${params.subgraph}/solr_cores", overwrite: true, saveAs: { filename -> filename.replace("solr/data/", "") } input: path(solr_inputs) path(names_txt) + path(graph_metadata_json) output: path("solr/data/grebi_nodes_${params.subgraph}") @@ -508,12 +563,12 @@ process create_solr_nodes_core { """ #!/usr/bin/env bash set -Eeuo pipefail - python3 ${params.home}/06_prepare_db_import/make_solr_config.py \ + python3 ${params.home}/08_create_other_dbs/solr/make_solr_config.py \ --subgraph-name ${params.subgraph} \ - --in-summary-json ${params.tmp}/${params.config}/${params.subgraph}/summary.json \ - --in-template-config-dir ${params.home}/06_prepare_db_import/solr_config_template \ + --in-graph-metadata-json ${graph_metadata_json} \ + --in-template-config-dir ${params.home}/08_create_other_dbs/solr/solr_config_template \ --out-config-dir solr_config - python3 ${params.home}/07_create_db/solr/solr_import.slurm.py \ + 
python3 ${params.home}/08_create_other_dbs/solr/solr_import.slurm.py \ --solr-config solr_config --core grebi_nodes_${params.subgraph} --in-data . --out-path solr --port 8985 --mem ${params.solr_mem} """ } @@ -524,11 +579,10 @@ process create_solr_edges_core { time "23h" cpus "8" - publishDir "${params.tmp}/${params.config}/${params.subgraph}/solr_cores", overwrite: true, saveAs: { filename -> filename.replace("solr/data/", "") } - input: path(solr_inputs) path(names_txt) + path(graph_metadata_json) output: path("solr/data/grebi_edges_${params.subgraph}") @@ -537,12 +591,12 @@ process create_solr_edges_core { """ #!/usr/bin/env bash set -Eeuo pipefail - python3 ${params.home}/06_prepare_db_import/make_solr_config.py \ + python3 ${params.home}/08_create_other_dbs/solr/make_solr_config.py \ --subgraph-name ${params.subgraph} \ - --in-summary-json ${params.tmp}/${params.config}/${params.subgraph}/summary.json \ - --in-template-config-dir ${params.home}/06_prepare_db_import/solr_config_template \ + --in-graph-metadata-json ${graph_metadata_json} \ + --in-template-config-dir ${params.home}/08_create_other_dbs/solr/solr_config_template \ --out-config-dir solr_config - python3 ${params.home}/07_create_db/solr/solr_import.slurm.py \ + python3 ${params.home}/08_create_other_dbs/solr/solr_import.slurm.py \ --solr-config solr_config --core grebi_edges_${params.subgraph} --in-data . 
--out-path /dev/shm/solr --port 8986 --mem ${params.solr_mem} mv /dev/shm/solr solr """ @@ -554,8 +608,6 @@ process create_solr_autocomplete_core { time "4h" cpus "4" - publishDir "${params.tmp}/${params.config}/${params.subgraph}/solr_cores", overwrite: true, saveAs: { filename -> filename.replace("solr/data/", "") } - input: path(names_txt) @@ -566,265 +618,83 @@ process create_solr_autocomplete_core { """ #!/usr/bin/env bash set -Eeuo pipefail - python3 ${params.home}/06_prepare_db_import/make_solr_autocomplete_config.py \ + python3 ${params.home}/08_create_other_dbs/solr/make_solr_autocomplete_config.py \ --subgraph-name ${params.subgraph} \ - --in-template-config-dir ${params.home}/06_prepare_db_import/solr_config_template \ + --in-template-config-dir ${params.home}/08_create_other_dbs/solr/solr_config_template \ --out-config-dir solr_config - python3 ${params.home}/07_create_db/solr/solr_import.slurm.py \ + python3 ${params.home}/08_create_other_dbs/solr/solr_import.slurm.py \ --solr-config solr_config --core grebi_autocomplete_${params.subgraph} --in-data . 
--in-names-txt ${names_txt} --out-path solr --port 8987 --mem ${params.solr_mem} """ } -process package_neo { +process create_solr_results_cores { cache "lenient" memory "4 GB" - time "8h" - cpus "8" - - publishDir "${params.tmp}/${params.config}/${params.subgraph}", overwrite: true + time "4h" + cpus "4" - input: - path("${params.subgraph}_neo4j") + input: + path(results_jsonl) output: - path("${params.subgraph}_neo4j.tgz") + path("solr/data/grebi_results__${params.subgraph}__${results_jsonl.simpleName}") script: """ - tar -chf ${params.subgraph}_neo4j.tgz --use-compress-program="pigz --fast" ${params.subgraph}_neo4j + #!/usr/bin/env bash + set -Eeuo pipefail + python3 ${params.home}/08_create_other_dbs/solr/make_solr_results_config.py \ + --subgraph-name ${params.subgraph} \ + --query-id ${results_jsonl.simpleName} \ + --in-template-config-dir ${params.home}/08_create_other_dbs/solr/solr_config_template \ + --out-config-dir solr_config + python3 ${params.home}/08_create_other_dbs/solr/solr_import.slurm.py \ + --solr-config solr_config --core grebi_results__${params.subgraph}__${results_jsonl.simpleName} --in-data . --out-path solr --port 8987 --mem ${params.solr_mem} """ } -process package_solr { +process package_neo { cache "lenient" memory "4 GB" time "8h" cpus "8" - publishDir "${params.tmp}/${params.config}/${params.subgraph}", overwrite: true + publishDir "${params.out}/${params.subgraph}", overwrite: true input: - path(solr_nodes_core) - path(solr_edges_core) - path(solr_autocomplete_core) + path("${params.subgraph}_neo4j") output: - path("${params.subgraph}_solr.tgz") - - script: - """ - #!/usr/bin/env bash - set -Eeuo pipefail - cp -f ${params.home}/06_prepare_db_import/solr_config_template/*.xml . - cp -f ${params.home}/06_prepare_db_import/solr_config_template/*.cfg . 
- tar -chf ${params.subgraph}_solr.tgz --transform 's,^,solr/,' --use-compress-program="pigz --fast" \ - *.xml *.cfg ${solr_nodes_core} ${solr_edges_core} ${solr_autocomplete_core} - """ -} - -process copy_neo_to_ftp { - - cache "lenient" - memory "32 GB" - time "8h" - queue "datamover" - - input: - path("neo4j.tgz") - - script: - """ - #!/usr/bin/env bash - set -Eeuo pipefail - mkdir -p /nfs/ftp/public/databases/spot/kg/${params.subgraph}/${params.timestamp.trim()} - cp -f neo4j.tgz /nfs/ftp/public/databases/spot/kg/${params.subgraph}/${params.timestamp.trim()}/${params.subgraph}_neo4j.tgz - """ -} - -process copy_summary_to_ftp { - - cache "lenient" - memory "32 GB" - time "8h" - queue "datamover" - - input: - path(summary_json) - - script: - """ - #!/usr/bin/env bash - set -Eeuo pipefail - mkdir -p /nfs/ftp/public/databases/spot/kg/${params.subgraph}/${params.timestamp.trim()} - cp -f ${summary_json} /nfs/ftp/public/databases/spot/kg/${params.subgraph}/${params.timestamp.trim()}/ - """ -} - -process copy_solr_to_ftp { - - cache "lenient" - memory "32 GB" - time "8h" - queue "datamover" - - input: - path("solr.tgz") - - script: - """ - #!/usr/bin/env bash - set -Eeuo pipefail - mkdir -p /nfs/ftp/public/databases/spot/kg/${params.subgraph}/${params.timestamp.trim()} - cp -f solr.tgz /nfs/ftp/public/databases/spot/kg/${params.subgraph}/${params.timestamp.trim()}/${params.subgraph}_solr.tgz - """ -} - -process copy_sqlite_to_ftp { - - cache "lenient" - memory "32 GB" - time "8h" - queue "datamover" - - input: - path("${params.subgraph}.sqlite3") - - script: - """ - #!/usr/bin/env bash - set -Eeuo pipefail - mkdir -p /nfs/ftp/public/databases/spot/kg/${params.subgraph}/${params.timestamp.trim()} - cp -f ${params.subgraph}.sqlite3 /nfs/ftp/public/databases/spot/kg/${params.subgraph}/${params.timestamp.trim()}/${params.subgraph}.sqlite3 - """ -} - -process copy_mat_queries_to_ftp { - - cache "lenient" - memory "32 GB" - time "8h" - queue "datamover" - - input: - 
path(csvs) - path(sqlite) - - script: - """ - #!/usr/bin/env bash - set -Eeuo pipefail - mkdir -p /nfs/ftp/public/databases/spot/kg/${params.subgraph}/${params.timestamp.trim()}/query_results - cp -f ${csvs} /nfs/ftp/public/databases/spot/kg/${params.subgraph}/${params.timestamp.trim()}/query_results/ - cp -f ${sqlite} /nfs/ftp/public/databases/spot/kg/${params.subgraph}/${params.timestamp.trim()}/query_results/all_query_results.sqlite3.gz - """ -} - -process copy_summary_to_staging { - - cache "lenient" - memory "32 GB" - time "8h" - queue "datamover" - - input: - path(summary_json) - - script: - """ - #!/usr/bin/env bash - set -Eeuo pipefail - mkdir -p /nfs/public/rw/ontoapps/grebi/staging/summaries - cp -f ${summary_json} /nfs/public/rw/ontoapps/grebi/staging/summaries/ - """ -} - -process copy_solr_config_to_staging { - cache "lenient" - memory "32 GB" - time "8h" - queue "datamover" - - script: - """ - #!/usr/bin/env bash - set -Eeuo pipefail - cp -f ${params.home}/06_prepare_db_import/solr_config_template/*.xml . - cp -f ${params.home}/06_prepare_db_import/solr_config_template/*.cfg . 
- mkdir -p /nfs/public/rw/ontoapps/grebi/staging/solr - cp -LR * /nfs/public/rw/ontoapps/grebi/staging/solr/ - """ - -} - -process copy_solr_cores_to_staging { - cache "lenient" - memory "32 GB" - time "8h" - queue "datamover" - - input: - path(solr_core) + path("${params.subgraph}_neo4j.tgz") script: """ - #!/usr/bin/env bash - set -Eeuo pipefail - mkdir -p /nfs/public/rw/ontoapps/grebi/staging/solr - cp -LR * /nfs/public/rw/ontoapps/grebi/staging/solr/ + tar -chf ${params.subgraph}_neo4j.tgz --use-compress-program="pigz --fast" ${params.subgraph}_neo4j """ } -process copy_sqlite_to_staging { +process package_solr { cache "lenient" - memory "32 GB" + memory "4 GB" time "8h" - queue "datamover" - - input: - path(sqlite) - - script: - """ - #!/usr/bin/env bash - set -Eeuo pipefail - mkdir -p /nfs/public/rw/ontoapps/grebi/staging/sqlite - cp -LR * /nfs/public/rw/ontoapps/grebi/staging/sqlite/ - """ -} + cpus "8" -process copy_mat_queries_to_staging { - cache "lenient" - memory "32 GB" - time "8h" - queue "datamover" + publishDir "${params.out}/${params.subgraph}", overwrite: true input: - path(sqlite) + path(cores) - script: - """ - #!/usr/bin/env bash - set -Eeuo pipefail - mkdir -p /nfs/public/rw/ontoapps/grebi/staging/materialised_queries - cp -LR * /nfs/public/rw/ontoapps/grebi/staging/materialised_queries/ - """ -} - -process copy_neo_to_staging { - cache "lenient" - memory "32 GB" - time "8h" - queue "datamover" - - input: - path(neodb) + output: + path("${params.subgraph}_solr.tgz") script: """ #!/usr/bin/env bash set -Eeuo pipefail - mkdir -p /nfs/public/rw/ontoapps/grebi/staging/neo4j - cp -LR * /nfs/public/rw/ontoapps/grebi/staging/neo4j/ + cp -f ${params.home}/08_create_other_dbs/solr/solr_config_template/*.xml . + cp -f ${params.home}/08_create_other_dbs/solr/solr_config_template/*.cfg . 
+ tar -chf ${params.subgraph}_solr.tgz --transform 's,^,solr/,' --use-compress-program="pigz --fast" \ + *.xml *.cfg ${cores} """ } diff --git a/dataload/nextflow/saturos_nextflow.config b/dataload/nextflow/saturos_nextflow.config index d6d1431..054a5ee 100644 --- a/dataload/nextflow/saturos_nextflow.config +++ b/dataload/nextflow/saturos_nextflow.config @@ -1,5 +1,6 @@ -process.executor = 'slurm' +params.solr_mem = "64g" +params.neo_tmp_path = "." process { withName: build_equiv_groups { @@ -8,7 +9,7 @@ process { } process { - withName: materialise { + withName: link { memory = 64.GB } } @@ -22,19 +23,25 @@ process { process { withName: create_solr_nodes_core { memory = 64.GB - cpus = 32 + cpus = 16 } } process { withName: create_solr_edges_core { memory = 64.GB - cpus = 32 + cpus = 16 } } process { withName: create_solr_autocomplete_core { memory = 64.GB - cpus = 32 + cpus = 16 + } +} +process { + withName: create_solr_results_cores { + memory = 64.GB + cpus = 16 } } @@ -43,31 +50,36 @@ process { process { withName: package_neo { memory = 32.GB - cpus = 32 + cpus = 16 } } process { withName: package_solr { memory = 32.GB - cpus = 32 + cpus = 16 } } +process { + withName: create_sqlite { + cpus = 16 + memory = 72.GB + } +} process { withName: create_neo { - cpus = 32 + cpus = 16 memory = 72.GB } } process { withName: run_materialised_queries { - memory = 1500.GB + memory = 64.GB cpus = 8 - neo_tmp_path = "/dev/shm" } } diff --git a/dataload/scripts/check_datarelease.sh b/dataload/scripts/check_datarelease.sh new file mode 100644 index 0000000..ea81e31 --- /dev/null +++ b/dataload/scripts/check_datarelease.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 <subgraph> <datarelease_path>" + exit 1 +fi + +SUBGRAPH=$1 +DATARELEASE_PATH=$2 + +if [ ! -d "$DATARELEASE_PATH" ]; then + echo "Data release path $DATARELEASE_PATH does not exist" + exit 1 +fi + +echo "Checking the data release for subgraph $SUBGRAPH at $DATARELEASE_PATH looks sane" + +if [ ! 
-f "$DATARELEASE_PATH/${SUBGRAPH}_neo4j.tgz" ]; then + echo "neo4j archive $DATARELEASE_PATH/${SUBGRAPH}_neo4j.tgz not found" + exit 1 +fi +if [ ! -f "$DATARELEASE_PATH/${SUBGRAPH}_solr.tgz" ]; then + echo "solr archive $DATARELEASE_PATH/${SUBGRAPH}_solr.tgz not found" + exit 1 +fi +if [ ! -f "$DATARELEASE_PATH/${SUBGRAPH}.sqlite3" ]; then + echo "sqlite3 database $DATARELEASE_PATH/${SUBGRAPH}.sqlite3 not found" + exit 1 +fi +if [ ! -f "$DATARELEASE_PATH/${SUBGRAPH}_metadata.json" ]; then + echo "summary json $DATARELEASE_PATH/${SUBGRAPH}_metadata.json not found" + exit 1 +fi +if [ ! -d "$DATARELEASE_PATH/query_results" ]; then + echo "query_results folder $DATARELEASE_PATH/query_results not found" + exit 1 +fi diff --git a/dataload/scripts/dataload.py b/dataload/scripts/dataload.py deleted file mode 100644 index 1e54155..0000000 --- a/dataload/scripts/dataload.py +++ /dev/null @@ -1,23 +0,0 @@ - -import json -import sys -import os -import subprocess -from pathlib import Path - -GREBI_DATALOAD_HOME = os.environ['GREBI_DATALOAD_HOME'] -GREBI_CONFIG = os.environ['GREBI_CONFIG'] -GREBI_NEXTFLOW_CONFIG = os.environ['GREBI_NEXTFLOW_CONFIG'] - -config = json.load(open(f'{GREBI_DATALOAD_HOME}/configs/pipeline_configs/{GREBI_CONFIG}.json')) - -for subgraph in config['subgraphs']: - print(f"===== LOADING SUBGRAPH: {subgraph} =====") - os.environ['GREBI_SUBGRAPH'] = subgraph - nextflow_dir_path = "nextflow_" + subgraph - Path(nextflow_dir_path).mkdir(parents=True, exist_ok=True) - res = os.system(f'cd {nextflow_dir_path} && nextflow {GREBI_DATALOAD_HOME}/nextflow/load_subgraph.nf -c {GREBI_NEXTFLOW_CONFIG} -resume') - if res != 0: - exit(res) - print(f"===== FINISHED LOADING SUBGRAPH: {subgraph} =====") - diff --git a/dataload/scripts/dataload_codon.sh b/dataload/scripts/dataload_codon.sh index 202c687..8569828 100755 --- a/dataload/scripts/dataload_codon.sh +++ b/dataload/scripts/dataload_codon.sh @@ -1,7 +1,13 @@ #!/bin/bash + +if [ -z "$GREBI_SUBGRAPH" ]; then + echo "Set 
GREBI_SUBGRAPH to run this script" + exit 1 +fi + export GREBI_DATALOAD_HOME=/nfs/production/parkinso/spot/grebi/dataload export GREBI_QUERY_YAMLS_PATH=/nfs/production/parkinso/spot/grebi/materialised_queries -export GREBI_TMP=/hps/nobackup/parkinso/spot/grebi/tmp +export GREBI_OUT_DIR=/hps/nobackup/parkinso/spot/grebi/$GREBI_SUBGRAPH/out export GREBI_CONFIG=ebi export GREBI_IS_EBI=true export GREBI_TIMESTAMP=$(date +"%Y-%b-%d") @@ -10,10 +16,17 @@ export GREBI_NEXTFLOW_CONFIG=$GREBI_DATALOAD_HOME/nextflow/codon_nextflow.config module load nextflow-22.10.1-gcc-11.2.0-ju5saqw module load python-3.10.2-gcc-9.3.0-gswnsij source /nfs/production/parkinso/spot/grebi/.venv/bin/activate -cd /hps/nobackup/parkinso/spot/grebi/ export PYTHONUNBUFFERED=true -srun -p datamover --time 1:0:0 --mem 8g bash -c "rm -rf /nfs/public/rw/ontoapps/grebi/staging && mkdir /nfs/public/rw/ontoapps/grebi/staging" -srun --time 3-0:0:0 --mem 8g bash -c "rm -rf nextflow* work* tmp" -srun --time 3-0:0:0 --mem 8g bash -c "python3 ${GREBI_DATALOAD_HOME}/scripts/dataload.py" + +srun --time 1:0:0 --mem 4g mkdir -p /hps/nobackup/parkinso/spot/grebi/$GREBI_SUBGRAPH +srun --time 1:0:0 --mem 4g mkdir -p $GREBI_OUT_DIR + +cd /hps/nobackup/parkinso/spot/grebi/$GREBI_SUBGRAPH + +echo "Loading subgraph $GREBI_SUBGRAPH" +echo "pwd is $(pwd)" +echo "user is $(whoami)" + +srun --time 6-0:0:0 --mem 32g nextflow $GREBI_DATALOAD_HOME/nextflow/load_subgraph.nf -c $GREBI_NEXTFLOW_CONFIG -resume diff --git a/dataload/scripts/dataload_saturos.sh b/dataload/scripts/dataload_saturos.sh index 3f2f966..9f4d5e1 100755 --- a/dataload/scripts/dataload_saturos.sh +++ b/dataload/scripts/dataload_saturos.sh @@ -1,17 +1,18 @@ #!/bin/bash export GREBI_DATALOAD_HOME=/home/james/grebi/dataload export GREBI_QUERY_YAMLS_PATH=/home/james/grebi/materialised_queries -export GREBI_TMP=/data/grebi_tmp -export GREBI_CONFIG=ebi +export GREBI_OUT_DIR=/data/grebi_tmp +export GREBI_CONFIG=gwas_efo_only export GREBI_IS_EBI=false export 
GREBI_TIMESTAMP=$(date +%Y_%m_%d__%H_%M) export RUST_BACKTRACE=full export GREBI_NEXTFLOW_CONFIG=$GREBI_DATALOAD_HOME/nextflow/saturos_nextflow.config -cd $GREBI_TMP +export GREBI_SUBGRAPH=gwas_and_efo +cd $GREBI_OUT_DIR export PYTHONUNBUFFERED=true source ~/grebi/.venv/bin/activate -rm -rf work tmp -python3 ${GREBI_DATALOAD_HOME}/scripts/dataload.py + +nextflow $GREBI_DATALOAD_HOME/nextflow/load_subgraph.nf -c $GREBI_NEXTFLOW_CONFIG -resume diff --git a/dataload/scripts/ebi_datarelease_to_ftp.sh b/dataload/scripts/ebi_datarelease_to_ftp.sh new file mode 100755 index 0000000..05a7907 --- /dev/null +++ b/dataload/scripts/ebi_datarelease_to_ftp.sh @@ -0,0 +1,2 @@ +#!/bin/bash + diff --git a/dataload/scripts/ebi_datarelease_to_staging.sh b/dataload/scripts/ebi_datarelease_to_staging.sh new file mode 100755 index 0000000..52384d9 --- /dev/null +++ b/dataload/scripts/ebi_datarelease_to_staging.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +if [ "$SLURM_JOB_PARTITION" != "datamover" ]; then + echo "Must run on a datamover node" + exit 1 +fi + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 <subgraph> <datarelease_path>" + exit 1 +fi + +SUBGRAPH=$1 +DATARELEASE_PATH=$2 + +"$(dirname "$0")/check_datarelease.sh" "$SUBGRAPH" "$DATARELEASE_PATH" || exit 1 + +STAGING_PATH=/nfs/public/rw/ontoapps/grebi/staging + +if [ ! 
-d "$STAGING_PATH" ]; then + echo "Staging path $STAGING_PATH does not exist" + exit 1 +fi + +mkdir -p $STAGING_PATH/neo4j +mkdir -p $STAGING_PATH/solr +mkdir -p $STAGING_PATH/metadata +mkdir -p $STAGING_PATH/sqlite + +echo Removing old files from staging + +rm -rf $STAGING_PATH/neo4j/${SUBGRAPH}_neo4j +rm -rf $STAGING_PATH/solr/grebi_nodes_${SUBGRAPH} +rm -rf $STAGING_PATH/solr/grebi_edges_${SUBGRAPH} +rm -rf $STAGING_PATH/solr/grebi_results__${SUBGRAPH}__* +rm -rf $STAGING_PATH/metadata/${SUBGRAPH}_metadata.json +rm -rf $STAGING_PATH/sqlite/${SUBGRAPH}.sqlite3 + +echo Extracting new data release + +tar --use-compress-program=pigz -xf $DATARELEASE_PATH/${SUBGRAPH}_neo4j.tgz -C $STAGING_PATH/neo4j +tar --use-compress-program=pigz -xf $DATARELEASE_PATH/${SUBGRAPH}_solr.tgz -C $STAGING_PATH/solr +cp -f $DATARELEASE_PATH/${SUBGRAPH}_metadata.json $STAGING_PATH/metadata +cp -f $DATARELEASE_PATH/${SUBGRAPH}.sqlite3 $STAGING_PATH/sqlite + + + + + + + + + + + + + + + + + + + + + + diff --git a/dataload/scripts/start_local_solr.py b/dataload/scripts/start_local_solr.py index 36487c6..48b92bb 100644 --- a/dataload/scripts/start_local_solr.py +++ b/dataload/scripts/start_local_solr.py @@ -9,7 +9,7 @@ from subprocess import Popen, PIPE, STDOUT def main(): - solr_data_path = os.path.join(os.environ['GREBI_HPS_TMP'], os.environ['GREBI_CONFIG'], "07_create_db", "solr") + solr_data_path = os.path.join(os.environ['GREBI_HPS_TMP'], os.environ['GREBI_CONFIG'], "08_create_other_dbs", "solr") cmd = ' '.join([ 'docker run', diff --git a/materialised_queries/hello_world.yaml b/materialised_queries/hello_world.yaml new file mode 100644 index 0000000..1fab8a3 --- /dev/null +++ b/materialised_queries/hello_world.yaml @@ -0,0 +1,11 @@ +title: Hello world +description: Tester query to see if the pipeline is working +run_for_subgraphs: + - ebi_monarch_xspecies + - impc_x_gwas +uses_datasources: + - IMPC + - GWAS +cypher_query: |- + RETURN "Hello, World!" 
AS message + diff --git a/materialised_queries/impc_x_gwas.yaml b/materialised_queries/impc_x_gwas.yaml index 409dd75..8648e39 100644 --- a/materialised_queries/impc_x_gwas.yaml +++ b/materialised_queries/impc_x_gwas.yaml @@ -1,6 +1,6 @@ title: Human GWAS variants to mouse models in IMPC description: Connects human GWAS variants to mouse models in IMPC using multiple different graph paths through phenotype and disease ontologies -subgraphs: +run_for_subgraphs: - ebi_monarch_xspecies - impc_x_gwas uses_datasources: @@ -19,7 +19,7 @@ cypher_query: |- MATCH (speciesNeutralPhenotype)<-[:`biolink:broad_match`]-(descendantPhenotype)-[:sourceId]->(descendantSourceId) WHERE "OLS.mp" IN descendantPhenotype.`grebi:datasources` MATCH (descendantPhenotype)<-[:`impc:phenotype`]-(mouseGene) - RETURN "gwas->oba->upheno->mp->impc" AS graph_path, + RETURN DISTINCT "gwas->oba->upheno->mp->impc" AS graph_path, [id in snp.id WHERE id =~ "rs[0-9]*" | id][0] AS gwas_variant, [id in trait.id WHERE id =~ "oba:.*" | id][0] AS trait_id, trait.`grebi:name`[0] as trait_name, diff --git a/notebooks/summaries.ipynb b/notebooks/summaries.ipynb index de58999..5069a4a 100644 --- a/notebooks/summaries.ipynb +++ b/notebooks/summaries.ipynb @@ -22,7 +22,7 @@ "import json\n", "import matplotlib.pyplot as plt\n", "\n", - "with open(\"/home/james/ebi_monarch_xspecies_summary.json\", \"r\") as f:\n", + "with open(\"/home/james/ebi_monarch_xspecies_metadata.json\", \"r\") as f:\n", " sums = json.load(f)\n", "\n", "type_to_count = sums['displaytypes']\n", diff --git a/webapp/docker-compose.yml b/webapp/docker-compose.yml index 6be03af..b579a56 100644 --- a/webapp/docker-compose.yml +++ b/webapp/docker-compose.yml @@ -14,17 +14,17 @@ services: - grebi-neo4j - grebi-solr - grebi-resolver-service - - grebi-summary-service + - grebi-metadata-service links: - grebi-neo4j - grebi-solr - grebi-resolver-service - - grebi-summary-service + - grebi-metadata-service environment: - 
GREBI_NEO4J_HOST=bolt://grebi-neo4j:7687/ - GREBI_SOLR_HOST=http://grebi-solr:8983/ - GREBI_RESOLVER_HOST=http://grebi-resolver-service:8080/ - - GREBI_SUMMARY_HOST=http://grebi-summary-service:8081/ + - GREBI_METADATA_HOST=http://grebi-metadata-service:8081/ grebi-neo4j: image: neo4j:5.18.0 ports: @@ -49,14 +49,14 @@ services: - ${GREBI_SQLITE_SEARCH_PATH:?Need path to search for sqlite databases}:/sqlite environment: - GREBI_SQLITE_SEARCH_PATH=/sqlite - grebi-summary-service: - image: ghcr.io/ebispot/grebi_summary_service:dev + grebi-metadata-service: + image: ghcr.io/ebispot/grebi_metadata_service:dev ports: - 8081:8081 volumes: - - ${GREBI_SUMMARY_JSON_SEARCH_PATH:?Need path to search for summary json files}:/summaryjsons + - ${GREBI_METADATA_JSON_SEARCH_PATH:?Need path to search for metadata json files}:/metadata environment: - - GREBI_SUMMARY_JSON_SEARCH_PATH=/summaryjsons + - GREBI_METADATA_JSON_SEARCH_PATH=/metadata volumes: grebi-neo4j-data: diff --git a/webapp/grebi_api/src/main/java/uk/ac/ebi/grebi/GrebiApi.java b/webapp/grebi_api/src/main/java/uk/ac/ebi/grebi/GrebiApi.java index b808f23..930c524 100644 --- a/webapp/grebi_api/src/main/java/uk/ac/ebi/grebi/GrebiApi.java +++ b/webapp/grebi_api/src/main/java/uk/ac/ebi/grebi/GrebiApi.java @@ -19,9 +19,9 @@ import uk.ac.ebi.grebi.repo.GrebiNeoRepo; import uk.ac.ebi.grebi.db.GrebiSolrQuery; import uk.ac.ebi.grebi.db.ResolverClient; -import uk.ac.ebi.grebi.db.SummaryClient; +import uk.ac.ebi.grebi.db.MetadataClient; import uk.ac.ebi.grebi.repo.GrebiSolrRepo; -import uk.ac.ebi.grebi.repo.GrebiSummaryRepo; +import uk.ac.ebi.grebi.repo.GrebiMetadataRepo; public class GrebiApi { @@ -30,25 +30,25 @@ public static void main(String[] args) throws ParseException, org.apache.commons GrebiNeoRepo neo = null; GrebiSolrRepo solr = null; - GrebiSummaryRepo summary= null; + GrebiMetadataRepo metadata= null; Set sqliteSubgraphs = null; Set solrSubgraphs = null; - Set summarySubgraphs = null; + Set metadataServiceSubgraphs = 
null; Set neoSubgraphs = null; while(true) { try { solr = new GrebiSolrRepo(); - summary = new GrebiSummaryRepo(); + metadata = new GrebiMetadataRepo(); sqliteSubgraphs = (new ResolverClient()).getSubgraphs(); solrSubgraphs = solr.getSubgraphs(); - summarySubgraphs = summary.getSubgraphs(); - if(new HashSet<>(List.of(sqliteSubgraphs, solrSubgraphs, summarySubgraphs)).size() != 1) { - throw new RuntimeException("SQLite/Solr/the summary jsons do not seem to contain the same subgraphs. Found: " + metadataServiceSubgraphs = metadata.getSubgraphs(); + if(new HashSet<>(List.of(sqliteSubgraphs, solrSubgraphs, metadataServiceSubgraphs)).size() != 1) { + throw new RuntimeException("SQLite/Solr/the metadata jsons do not seem to contain the same subgraphs. Found: " + String.join(",", sqliteSubgraphs) + " for SQLite (from resolver service) and " + String.join(",", solrSubgraphs) + " for Solr (from list of solr cores) and " - + String.join(",", summarySubgraphs) + " for the summary jsons (from summary server)" + + String.join(",", metadataServiceSubgraphs) + " for the summary jsons (from metadata server)" ); } break; @@ -67,12 +67,12 @@ public static void main(String[] args) throws ParseException, org.apache.commons try { neo = new GrebiNeoRepo(); neoSubgraphs = neo.getSubgraphs(); - if(new HashSet<>(List.of(sqliteSubgraphs, solrSubgraphs, summarySubgraphs)).size() != 1) { + if(new HashSet<>(List.of(sqliteSubgraphs, solrSubgraphs, metadataServiceSubgraphs)).size() != 1) { neo = null; throw new RuntimeException("SQLite/Solr/the summary jsons/neo4j do not seem to contain the same subgraphs. 
Found: " + String.join(",", sqliteSubgraphs) + " for SQLite (from resolver service) and " + String.join(",", solrSubgraphs) + " for Solr (from list of solr cores) and " - + String.join(",", summarySubgraphs) + " for the summary jsons (from summary server) and " + + String.join(",", metadataServiceSubgraphs) + " for the summary jsons (from summary server) and " + String.join(",", neoSubgraphs) + " for neo4j" ); } @@ -93,13 +93,13 @@ public static void main(String[] args) throws ParseException, org.apache.commons System.out.println("Found subgraphs: " + String.join(",", solrSubgraphs)); - run(neo, solr, summary, solrSubgraphs); + run(neo, solr, metadata, solrSubgraphs); } static void run( final GrebiNeoRepo neo, final GrebiSolrRepo solr, - final GrebiSummaryRepo summary, + final GrebiMetadataRepo metadata, final Set subgraphs ) { @@ -130,7 +130,7 @@ static void run( }) .get("/api/v1/subgraphs/{subgraph}", ctx -> { ctx.contentType("application/json"); - ctx.result(gson.toJson(summary.getSummary(ctx.pathParam("subgraph")))); + ctx.result(gson.toJson(metadata.getMetadata(ctx.pathParam("subgraph")))); }) .get("/api/v1/subgraphs/{subgraph}/nodes/{nodeId}", ctx -> { ctx.contentType("application/json"); diff --git a/webapp/grebi_api/src/main/java/uk/ac/ebi/grebi/db/SummaryClient.java b/webapp/grebi_api/src/main/java/uk/ac/ebi/grebi/db/MetadataClient.java similarity index 80% rename from webapp/grebi_api/src/main/java/uk/ac/ebi/grebi/db/SummaryClient.java rename to webapp/grebi_api/src/main/java/uk/ac/ebi/grebi/db/MetadataClient.java index 7ddc564..80fab20 100644 --- a/webapp/grebi_api/src/main/java/uk/ac/ebi/grebi/db/SummaryClient.java +++ b/webapp/grebi_api/src/main/java/uk/ac/ebi/grebi/db/MetadataClient.java @@ -27,20 +27,20 @@ import org.apache.http.entity.ContentType; import com.google.common.base.Stopwatch; -public class SummaryClient { +public class MetadataClient { - static final String SUMMARY_HOST = System.getenv("GREBI_SUMMARY_HOST"); + static final String 
METADATA_HOST = System.getenv("GREBI_METADATA_HOST"); - public static String getSummaryHost() { - if (SUMMARY_HOST != null) - return SUMMARY_HOST; + public static String getMetadataHost() { + if (METADATA_HOST != null) + return METADATA_HOST; return "http://localhost:8081/"; } - public Map getSummaries() { + public Map getMetadatas() { HttpClient client = HttpClientBuilder.create().build(); - HttpGet request = new HttpGet(getSummaryHost()); + HttpGet request = new HttpGet(getMetadataHost()); HttpResponse response; try { response = client.execute(request); diff --git a/webapp/grebi_api/src/main/java/uk/ac/ebi/grebi/repo/GrebiMetadataRepo.java b/webapp/grebi_api/src/main/java/uk/ac/ebi/grebi/repo/GrebiMetadataRepo.java new file mode 100644 index 0000000..3ba0ece --- /dev/null +++ b/webapp/grebi_api/src/main/java/uk/ac/ebi/grebi/repo/GrebiMetadataRepo.java @@ -0,0 +1,34 @@ +package uk.ac.ebi.grebi.repo; + +import java.util.Map; +import java.util.Set; + +import com.google.gson.JsonElement; + +import uk.ac.ebi.grebi.db.MetadataClient; + +public class GrebiMetadataRepo { + + Map subgraph2metadata; + + public GrebiMetadataRepo() { + + MetadataClient MetadataClient = new MetadataClient(); + subgraph2metadata = MetadataClient.getMetadatas(); + + } + + public Set getSubgraphs() { + return subgraph2metadata.keySet(); + } + + public Map getMetadata(String subgraph) { + return subgraph2metadata.get(subgraph).getAsJsonObject().asMap(); + } + + public Set getAllEdgeProps(String subgraph) { + return getMetadata(subgraph).get("edge_props").getAsJsonObject().keySet(); + } + + +} diff --git a/webapp/grebi_api/src/main/java/uk/ac/ebi/grebi/repo/GrebiSummaryRepo.java b/webapp/grebi_api/src/main/java/uk/ac/ebi/grebi/repo/GrebiSummaryRepo.java deleted file mode 100644 index 6ddf584..0000000 --- a/webapp/grebi_api/src/main/java/uk/ac/ebi/grebi/repo/GrebiSummaryRepo.java +++ /dev/null @@ -1,34 +0,0 @@ -package uk.ac.ebi.grebi.repo; - -import java.util.Map; -import java.util.Set; - -import 
com.google.gson.JsonElement; - -import uk.ac.ebi.grebi.db.SummaryClient; - -public class GrebiSummaryRepo { - - Map subgraph2summary; - - public GrebiSummaryRepo() { - - SummaryClient summaryClient = new SummaryClient(); - subgraph2summary = summaryClient.getSummaries(); - - } - - public Set getSubgraphs() { - return subgraph2summary.keySet(); - } - - public Map getSummary(String subgraph) { - return subgraph2summary.get(subgraph).getAsJsonObject().asMap(); - } - - public Set getAllEdgeProps(String subgraph) { - return getSummary(subgraph).get("edge_props").getAsJsonObject().keySet(); - } - - -} diff --git a/webapp/grebi_summary_service/Dockerfile b/webapp/grebi_metadata_service/Dockerfile similarity index 63% rename from webapp/grebi_summary_service/Dockerfile rename to webapp/grebi_metadata_service/Dockerfile index 06b6e7d..76d9622 100644 --- a/webapp/grebi_summary_service/Dockerfile +++ b/webapp/grebi_metadata_service/Dockerfile @@ -7,6 +7,6 @@ COPY . /opt/ RUN cd /opt/ && ls && mvn clean package assembly:single -DskipTests EXPOSE 8080 -ENTRYPOINT ["java", "-jar", "/opt/target/grebi_summary_service-1.0-SNAPSHOT-jar-with-dependencies.jar"] +ENTRYPOINT ["java", "-jar", "/opt/target/grebi_metadata_service-1.0-SNAPSHOT-jar-with-dependencies.jar"] diff --git a/webapp/grebi_summary_service/pom.xml b/webapp/grebi_metadata_service/pom.xml similarity index 100% rename from webapp/grebi_summary_service/pom.xml rename to webapp/grebi_metadata_service/pom.xml diff --git a/webapp/grebi_summary_service/src/main/java/uk/ac/ebi/grebi_summary_service/GrebiSummarySvc.java b/webapp/grebi_metadata_service/src/main/java/uk/ac/ebi/grebi_metadata_service/GrebiMetadataSvc.java similarity index 70% rename from webapp/grebi_summary_service/src/main/java/uk/ac/ebi/grebi_summary_service/GrebiSummarySvc.java rename to webapp/grebi_metadata_service/src/main/java/uk/ac/ebi/grebi_metadata_service/GrebiMetadataSvc.java index f940aaf..a97d89c 100644 --- 
a/webapp/grebi_summary_service/src/main/java/uk/ac/ebi/grebi_summary_service/GrebiSummarySvc.java +++ b/webapp/grebi_metadata_service/src/main/java/uk/ac/ebi/grebi_metadata_service/GrebiMetadataSvc.java @@ -1,4 +1,4 @@ -package uk.ac.ebi.grebi_summary_service; +package uk.ac.ebi.grebi_metadata_service; import com.google.gson.Gson; import com.google.gson.JsonElement; @@ -13,19 +13,19 @@ import java.util.List; import java.util.Map; -public class GrebiSummarySvc { +public class GrebiMetadataSvc { private static Map jsons = new HashMap<>(); public static void main(String[] args) throws FileNotFoundException { Gson gson = new Gson(); - var files = Arrays.stream(new File(System.getenv("GREBI_SUMMARY_JSON_SEARCH_PATH")).listFiles()).filter(File::isFile).filter(f -> f.getName().endsWith("_summary.json")).toArray(File[]::new); + var files = Arrays.stream(new File(System.getenv("GREBI_METADATA_JSON_SEARCH_PATH")).listFiles()).filter(File::isFile).filter(f -> f.getName().endsWith("_metadata.json")).toArray(File[]::new); for (File f : files) { - var subgraph = f.getName().split("_summary.json")[0]; + var subgraph = f.getName().split("_metadata.json")[0]; jsons.put(subgraph, gson.fromJson(new InputStreamReader(new FileInputStream(f)), JsonElement.class)); - System.out.println("Loaded summary JSON for subgraph " + subgraph + " from " + f.getAbsolutePath()); + System.out.println("Loaded metadata JSON for subgraph " + subgraph + " from " + f.getAbsolutePath()); } Javalin app = Javalin.create(config -> { diff --git a/webapp/grebi_ui/src/frontends/ebi/pages/EbiDownloadsPage.tsx b/webapp/grebi_ui/src/frontends/ebi/pages/EbiDownloadsPage.tsx index 18c40be..2ec84f4 100644 --- a/webapp/grebi_ui/src/frontends/ebi/pages/EbiDownloadsPage.tsx +++ b/webapp/grebi_ui/src/frontends/ebi/pages/EbiDownloadsPage.tsx @@ -90,10 +90,10 @@ const data: any[] = [ }, { description: - "JSON summary of the KG contents (< 1 MB)", - downloadLabel: "summary.json", + "JSON metadata of the KG contents (< 1 MB)", 
+ downloadLabel: "metadata.json", downloadLink: - "https://ftp.ebi.ac.uk/pub/databases/spot/kg/ebi_full_monarch/latest/summary.json", + "https://ftp.ebi.ac.uk/pub/databases/spot/kg/ebi_full_monarch/latest/metadata.json", format: "JSON", }, ]; diff --git a/webapp/k8chart/templates/backend_deployment.yaml b/webapp/k8chart/templates/backend_deployment.yaml index 7b1db9d..1c25399 100644 --- a/webapp/k8chart/templates/backend_deployment.yaml +++ b/webapp/k8chart/templates/backend_deployment.yaml @@ -34,8 +34,8 @@ spec: value: bolt://{{ .Release.Name }}-{{.Values.subgraph1 | replace "_" "-" }}-neo4j:7687;bolt://{{ .Release.Name }}-{{.Values.subgraph2 | replace "_" "-"}}-neo4j:7687 - name: GREBI_RESOLVER_HOST value: http://{{ .Release.Name }}-resolver:8080 - - name: GREBI_SUMMARY_HOST - value: http://{{ .Release.Name }}-summary-service:8081 + - name: GREBI_METADATA_HOST + value: http://{{ .Release.Name }}-metadata-service:8081 - name: GREBI_CONTEXT_PATH value: /kg - name: JDK_JAVA_OPTIONS diff --git a/webapp/k8chart/templates/summary_service_deployment.yaml b/webapp/k8chart/templates/metadata_service_deployment.yaml similarity index 66% rename from webapp/k8chart/templates/summary_service_deployment.yaml rename to webapp/k8chart/templates/metadata_service_deployment.yaml index cdc723c..1e43a0a 100644 --- a/webapp/k8chart/templates/summary_service_deployment.yaml +++ b/webapp/k8chart/templates/metadata_service_deployment.yaml @@ -1,26 +1,26 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: {{ .Release.Name }}-summary-service + name: {{ .Release.Name }}-metadata-service labels: - app: {{ .Release.Name }}-summary-service + app: {{ .Release.Name }}-metadata-service spec: replicas: 1 selector: matchLabels: - app: {{ .Release.Name }}-summary-service + app: {{ .Release.Name }}-metadata-service template: metadata: labels: - app: {{ .Release.Name }}-summary-service + app: {{ .Release.Name }}-metadata-service spec: volumes: - name: {{ .Release.Name }}-import 
persistentVolumeClaim: claimName: {{ .Release.Name }}-import-pvc containers: - - name: summary-service - image: ghcr.io/ebispot/grebi_summary_service:{{.Values.imageTag}} + - name: metadata-service + image: ghcr.io/ebispot/grebi_metadata_service:{{.Values.imageTag}} imagePullPolicy: Always volumeMounts: - mountPath: "/data_import" @@ -34,8 +34,8 @@ spec: memory: 3Gi cpu: 0.5 env: - - name: GREBI_SUMMARY_JSON_SEARCH_PATH - value: /data_import/summaries + - name: GREBI_METADATA_JSON_SEARCH_PATH + value: /data_import/metadata - name: JAVA_TOOL_OPTIONS value: -Xmx2048m ports: diff --git a/webapp/k8chart/templates/services.yaml b/webapp/k8chart/templates/services.yaml index f6d8fba..979884e 100644 --- a/webapp/k8chart/templates/services.yaml +++ b/webapp/k8chart/templates/services.yaml @@ -97,9 +97,9 @@ spec: apiVersion: v1 kind: Service metadata: - name: {{ .Release.Name }}-summary-service + name: {{ .Release.Name }}-metadata-service labels: - app: {{ .Release.Name }}-summary-service + app: {{ .Release.Name }}-metadata-service spec: ports: - port: 8081 @@ -107,4 +107,4 @@ spec: name: http protocol: TCP selector: - app: {{ .Release.Name }}-summary-service + app: {{ .Release.Name }}-metadata-service diff --git a/webapp/up_saturos.fish b/webapp/up_saturos.fish index f103e72..ef59433 100755 --- a/webapp/up_saturos.fish +++ b/webapp/up_saturos.fish @@ -1,6 +1,6 @@ export GREBI_NEO_DATA_PATH=/data/grebi_from_codon/(ls /data/grebi_from_codon/ | grep neo | grep -v tgz)/data export GREBI_SOLR_PATH=/data/grebi_from_codon/(ls /data/grebi_from_codon/ | grep solr | grep -v tgz) export GREBI_SQLITE_SEARCH_PATH=/data/grebi_from_codon/ -export GREBI_SUMMARY_JSON_SEARCH_PATH=/data/grebi_from_codon/ +export GREBI_METADATA_JSON_SEARCH_PATH=/data/grebi_from_codon/