diff --git a/README.md b/README.md index 98e1155..5ebff8f 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # GrEBI (Graphs@EBI) -HPC pipeline to aggregate knowledge graphs from [EMBL-EBI resources](https://www.ebi.ac.uk/services/data-resources-and-tools), the [MONARCH Initiative KG](https://monarch-initiative.github.io/monarch-ingest/Sources/), [ROBOKOP](https://robokop.renci.org/), [Ubergraph](https://github.com/INCATools/ubergraph), and other sources into giant (multi-terabyte) transient Neo4j+Solr+RocksDB databases for querying. +HPC pipeline to aggregate knowledge graphs from [EMBL-EBI resources](https://www.ebi.ac.uk/services/data-resources-and-tools), the [MONARCH Initiative KG](https://monarch-initiative.github.io/monarch-ingest/Sources/), [ROBOKOP](https://robokop.renci.org/), [Ubergraph](https://github.com/INCATools/ubergraph), and other sources into giant (multi-terabyte) transient Neo4j+Solr databases for querying. ## Outputs @@ -88,7 +88,7 @@ The pipeline is implemented as [Rust](https://www.rust-lang.org/) programs with * Cliques of equivalent nodes are merged into single nodes * Cliques of equivalent properties are merged into single properties (and for ontology-defined properties, the [qualified safe labels](https://github.com/VirtualFlyBrain/neo4j2owl/blob/master/README.md) are used) -The primary output of the pipeline is a [property graph](https://docs.oracle.com/en/database/oracle/property-graph/22.2/spgdg/what-are-property-graphs.html) for [Neo4j](https://github.com/neo4j/neo4j). The nodes and edges are also loaded into [Solr](https://solr.apache.org/) for full-text search and [RocksDB](https://rocksdb.org/) for id->object resolution. +The primary output of the pipeline is a [property graph](https://docs.oracle.com/en/database/oracle/property-graph/22.2/spgdg/what-are-property-graphs.html) for [Neo4j](https://github.com/neo4j/neo4j). The nodes and edges are also loaded into [Solr](https://solr.apache.org/) for full-text search and sqlite for id->compressed object resolution. diff --git a/dataload/01_ingest/grebi_ingest_sqlite/Cargo.toml b/dataload/01_ingest/grebi_ingest_sqlite/Cargo.toml index eed8874..5e7c7fe 100644 --- a/dataload/01_ingest/grebi_ingest_sqlite/Cargo.toml +++ b/dataload/01_ingest/grebi_ingest_sqlite/Cargo.toml @@ -7,7 +7,7 @@ edition = "2021" Inflector = "0.11.4" clap = { version = "4.4.11", features = ["derive"] } hex = "0.4.3" -rusqlite = "0.31.0" +rusqlite = "0.32.1" serde_json = { version = "1.0.108", features=["preserve_order"] } jemallocator = "0.5.4" diff --git a/dataload/07_create_db/rocksdb/grebi_make_rocks/Cargo.lock b/dataload/06_prepare_db_import/grebi_make_compressed_blob/Cargo.lock similarity index 100% rename from dataload/07_create_db/rocksdb/grebi_make_rocks/Cargo.lock rename to dataload/06_prepare_db_import/grebi_make_compressed_blob/Cargo.lock diff --git a/dataload/06_prepare_db_import/grebi_make_compressed_blob/Cargo.toml b/dataload/06_prepare_db_import/grebi_make_compressed_blob/Cargo.toml new file mode 100644 index 0000000..a00056f --- /dev/null +++ b/dataload/06_prepare_db_import/grebi_make_compressed_blob/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "grebi_make_compressed_blob" +version = "0.1.0" +edition = "2021" + +[dependencies] +clap = { version = "4.4.11", features = ["derive"] } +grebi_shared = { path = "../../grebi_shared" } +flate2 = {version="1.0.28", features=["zlib-ng"]} +serde_json = { version = "1.0.108", features=["preserve_order"] } +jemallocator = "0.5.4" + + diff --git a/dataload/06_prepare_db_import/grebi_make_compressed_blob/src/main.rs b/dataload/06_prepare_db_import/grebi_make_compressed_blob/src/main.rs new file mode 100644 index 0000000..03946dd --- /dev/null +++ b/dataload/06_prepare_db_import/grebi_make_compressed_blob/src/main.rs @@ -0,0 +1,52 @@ + +use flate2::write::ZlibEncoder; +use flate2::Compression; +use grebi_shared::get_id; +use std::io::BufReader; +use std::io::BufRead; +use std::io::BufWriter; +use std::io; +use std::io::Write; + +#[global_allocator] +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; + +fn main() { + + let stdin = io::stdin().lock(); + let mut reader = BufReader::new(stdin); + + let stdout = io::stdout().lock(); + let mut writer = BufWriter::new(stdout); + + let mut n:i64 = 0; + + let mut line:Vec = Vec::new(); + + loop { + + line.clear(); + reader.read_until(b'\n', &mut line).unwrap(); + + if line.len() == 0 { + eprintln!("saw {} lines", n); + break; + } + + n = n + 1; + + let id = get_id(&line); + + writer.write_all(&(id.len() as u32).to_le_bytes()).unwrap(); + writer.write_all(id).unwrap(); + + let mut enc = ZlibEncoder::new(Vec::new(), Compression::new(9)); + + enc.write_all(&line).unwrap(); + let compressed = enc.finish().unwrap(); + + writer.write_all(&(compressed.len() as u32).to_le_bytes()).unwrap(); + writer.write_all(&compressed).unwrap(); + } + +} diff --git a/dataload/07_create_db/rocksdb/grebi_make_rocks/Cargo.toml b/dataload/07_create_db/rocksdb/grebi_make_rocks/Cargo.toml deleted file mode 100644 index 6a9d112..0000000 --- a/dataload/07_create_db/rocksdb/grebi_make_rocks/Cargo.toml +++ /dev/null @@ -1,14 +0,0 @@ -[package] -name = "grebi_make_rocks" -version = "0.1.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -clap = { version = "4.4.11", features = ["derive"] } -grebi_shared = { path = "../../../grebi_shared" } -rocksdb = { git = "https://github.com/rust-rocksdb/rust-rocksdb", branch = "master" } -flate2 = {version="1.0.28", features=["zlib-ng"]} -serde_json = { version = "1.0.108", features=["preserve_order"] } - diff --git a/dataload/07_create_db/rocksdb/grebi_make_rocks/src/main.rs b/dataload/07_create_db/rocksdb/grebi_make_rocks/src/main.rs deleted file mode 100644 index 3188afc..0000000 --- a/dataload/07_create_db/rocksdb/grebi_make_rocks/src/main.rs +++ /dev/null @@ -1,80 +0,0 @@ - -use grebi_shared::get_id; -use grebi_shared::json_lexer::JsonToken; -use rocksdb::WaitForCompactOptions; -use std::collections::HashMap; -use std::collections::HashSet; -use std::fs::File; -use std::io::BufReader; -use std::io::BufRead; -use std::io::Write; -use std::io; -use std::iter::Map; -use grebi_shared::get_subjects; -use clap::Parser; -use rocksdb::DB; -use rocksdb::Options; - -use grebi_shared::slice_merged_entity::SlicedEntity; -use grebi_shared::slice_merged_entity::SlicedReified; -use grebi_shared::slice_merged_entity::SlicedProperty; -use grebi_shared::json_lexer::{JsonTokenType}; -use serde_json::json; - - -#[derive(clap::Parser, Debug)] -#[command(author, version, about, long_about = None)] -struct Args { - - #[arg(long)] - rocksdb_path: String -} - -fn main() { - - let args = Args::parse(); - - let start_time = std::time::Instant::now(); - - let stdin = io::stdin().lock(); - let mut reader = BufReader::new(stdin); - - let mut options = Options::default(); - options.create_if_missing(true); - options.create_missing_column_families(true); - options.prepare_for_bulk_load(); - options.set_compression_type(rocksdb::DBCompressionType::Lz4); - options.set_max_open_files(900); // codon limit is 1024 per process - - let db = DB::open(&options, args.rocksdb_path).unwrap(); - - let mut line:Vec = Vec::new(); - let mut n:i64 = 0; - - loop { - - line.clear(); - reader.read_until(b'\n', &mut line).unwrap(); - - if line.len() == 0 { - eprintln!("saw {} subjects", n); - break; - } - - let id = get_id(&line); - db.put(&id, &line).unwrap(); - - n = n + 1; - - if n % 1000000 == 0 { - eprintln!("{}", n); - } - } - eprintln!("Building took {} seconds", start_time.elapsed().as_secs()); - - - let start_time2 = std::time::Instant::now(); - db.compact_range(None::<&[u8]>, None::<&[u8]>); - db.wait_for_compact(&WaitForCompactOptions::default()).unwrap(); - eprintln!("Compacting took {} seconds", start_time2.elapsed().as_secs()); -} diff --git a/dataload/07_create_db/sqlite/grebi_make_sqlite/Cargo.lock b/dataload/07_create_db/sqlite/grebi_make_sqlite/Cargo.lock new file mode 100644 index 0000000..777acac --- /dev/null +++ b/dataload/07_create_db/sqlite/grebi_make_sqlite/Cargo.lock @@ -0,0 +1,580 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "aho-corasick" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +dependencies = [ + "memchr", +] + +[[package]] +name = "anstream" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d664a92ecae85fd0a7392615844904654d1d5f5514837f471ddef4a057aba1b6" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87" + +[[package]] +name = "anstyle-parse" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" +dependencies = [ + "anstyle", + "windows-sys", +] + +[[package]] +name = "bindgen" +version = "0.65.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "lazy_static", + "lazycell", + "peeking_take_while", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bzip2-sys" +version = "0.1.11+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + +[[package]] +name = "cc" +version = "1.0.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +dependencies = [ + "jobserver", + "libc", +] + +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clang-sys" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f" +dependencies = [ + "glob", + "libc", + "libloading", +] + +[[package]] +name = "clap" +version = "4.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfaff671f6b22ca62406885ece523383b9b64022e341e53e009a62ebc47a45f2" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a216b506622bb1d316cd51328dce24e07bdff4a6128a47c7e7fad11878d5adbb" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf9804afaaf59a91e75b022a30fb7229a7901f60c755489cc61c9b423b836442" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" + +[[package]] +name = "cmake" +version = "0.1.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31c789563b815f77f4250caee12365734369f942439b7defd71e18a48197130" +dependencies = [ + "cc", +] + +[[package]] +name = "colorchoice" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" + +[[package]] +name = "crc32fast" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "flate2" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" +dependencies = [ + "crc32fast", + "libz-ng-sys", + "miniz_oxide", +] + +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + +[[package]] +name = "grebi_json2rocks" +version = "0.1.0" +dependencies = [ + "clap", + "flate2", + "grebi_shared", + "rocksdb", +] + +[[package]] +name = "grebi_shared" +version = "0.1.0" + +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + +[[package]] +name = "jobserver" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c37f63953c4c63420ed5fd3d6d398c719489b9f872b9fa683262f8edd363c7d" +dependencies = [ + "libc", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + +[[package]] +name = "libc" +version = "0.2.151" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "302d7ab3130588088d277783b1e2d2e10c9e9e4a16dd9050e6ec93fb3e7048f4" + +[[package]] +name = "libloading" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" +dependencies = [ + "cfg-if", + "winapi", +] + +[[package]] +name = "librocksdb-sys" +version = "0.11.0+8.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3386f101bcb4bd252d8e9d2fb41ec3b0862a15a62b478c355b2982efa469e3e" +dependencies = [ + "bindgen", + "bzip2-sys", + "cc", + "glob", + "libc", + "libz-sys", + "lz4-sys", + "zstd-sys", +] + +[[package]] +name = "libz-ng-sys" +version = "1.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3dd9f43e75536a46ee0f92b758f6b63846e594e86638c61a9251338a65baea63" +dependencies = [ + "cmake", + "libc", +] + +[[package]] +name = "libz-sys" +version = "1.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d97137b25e321a73eef1418d1d5d2eda4d77e12813f8e6dead84bc52c5870a7b" +dependencies = [ + "cc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "lz4-sys" +version = "1.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57d27b317e207b10f69f5e75494119e391a96f48861ae870d1da6edac98ca900" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "memchr" +version = "2.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "miniz_oxide" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +dependencies = [ + "adler", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "peeking_take_while" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" + +[[package]] +name = "pkg-config" +version = "0.3.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" + +[[package]] +name = "prettyplease" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae005bd773ab59b4725093fd7df83fd7892f7d8eafb48dbd7de6e024e4215f9d" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "proc-macro2" +version = "1.0.70" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39278fbbf5fb4f646ce651690877f89d1c5811a3d4acb27700c1cb3cdb78fd3b" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regex" +version = "1.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" + +[[package]] +name = "rocksdb" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb6f170a4041d50a0ce04b0d2e14916d6ca863ea2e422689a5b694395d299ffe" +dependencies = [ + "libc", + "librocksdb-sys", +] + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "shlex" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7cee0529a6d40f580e7a5e6c495c8fbfe21b7b52795ed4bb5e62cdf92bc6380" + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "syn" +version = "2.0.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c8b28c477cc3bf0e7966561e3460130e1255f7a1cf71931075f1c5e7a7e269" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "utf8parse" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" + +[[package]] +name = "zstd-sys" +version = "2.0.9+zstd.1.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e16efa8a874a0481a574084d34cc26fdb3b99627480f785888deb6386506656" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/dataload/07_create_db/sqlite/grebi_make_sqlite/Cargo.toml b/dataload/07_create_db/sqlite/grebi_make_sqlite/Cargo.toml new file mode 100644 index 0000000..377edf5 --- /dev/null +++ b/dataload/07_create_db/sqlite/grebi_make_sqlite/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "grebi_make_sqlite" +version = "0.1.0" +edition = "2021" + +[dependencies] +clap = { version = "4.4.11", features = ["derive"] } +grebi_shared = { path = "../../../grebi_shared" } +serde_json = { version = "1.0.108", features=["preserve_order"] } +rusqlite = {version="0.32.1", features=["backup"]} +jemallocator = "0.5.4" + + diff --git a/dataload/07_create_db/sqlite/grebi_make_sqlite/src/main.rs b/dataload/07_create_db/sqlite/grebi_make_sqlite/src/main.rs new file mode 100644 index 0000000..5786e72 --- /dev/null +++ b/dataload/07_create_db/sqlite/grebi_make_sqlite/src/main.rs @@ -0,0 +1,171 @@ +use rusqlite::Statement; +use core::slice; +use std::io::BufReader; +use std::io::StdinLock; +use std::io; +use std::io::Read; +use clap::Parser; + +use rusqlite::{params, Connection, Transaction}; + +#[global_allocator] +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; + +#[derive(clap::Parser, Debug)] +#[command(author, version, about, long_about = None)] +struct Args { + + #[arg(long)] + db_path: String, + + #[arg(long)] + batch_size: usize, + + #[arg(long)] + page_size: usize, + + #[arg(long)] + cache_size: usize +} + +fn insert( + stmt_batch:&mut Statement, + stmt_single:&mut Statement, + reader:&mut BufReader>, + batch_size:usize) { + + let start_time = std::time::Instant::now(); + + + let mut n:i64 = 0; + + + let mut start_time3 = std::time::Instant::now(); + + let mut buf:Vec = Vec::new(); + let mut param_locs: Vec<(usize, usize)> = Vec::new(); + + loop { + + let mut size_buf = [0u8; 4]; + + if let Err(e) = reader.read_exact(&mut size_buf) { + if e.kind() == io::ErrorKind::UnexpectedEof { + break; + } else { + panic!("read error: {}", e); + } + } + + let id_size = u32::from_le_bytes(size_buf); + + let id_start = buf.len(); + buf.reserve(id_size as usize); + + unsafe { + reader.read_exact( + slice::from_raw_parts_mut( buf.as_mut_ptr().add(buf.len()), id_size as usize) + ).unwrap(); + buf.set_len(buf.len() + id_size as usize); + } + + let blob_size = { + let mut blob_size_buf = [0u8; 4]; + reader.read_exact(&mut blob_size_buf).unwrap(); + u32::from_le_bytes(blob_size_buf) + }; + + let blob_start = buf.len(); + buf.reserve(blob_size as usize); + + unsafe { + reader.read_exact( + slice::from_raw_parts_mut( buf.as_mut_ptr().add(buf.len()), blob_size as usize) + ).unwrap(); + buf.set_len(buf.len() + blob_size as usize); + } + + param_locs.push((id_start, id_start+(id_size as usize))); + param_locs.push((blob_start, blob_start+(blob_size as usize))); + + if param_locs.len() == 2*batch_size { + stmt_batch.execute( + rusqlite::params_from_iter( + param_locs.iter().map(|(start, end)| &buf[*start..*end]) + ) + ).unwrap(); + param_locs.clear(); + buf.clear(); + } + + n = n + 1; + + if n % 1000000 == 0 { + eprintln!("inserted {} [last 1m took {} seconds]", n, start_time3.elapsed().as_secs()); + start_time3 = std::time::Instant::now(); + } + } + + // insert the last ones if we didn't reach batch size + if param_locs.len() > 0 { + let mut i = 0; + while i < param_locs.len() { + let id_loc = param_locs[i]; + let val_loc = param_locs[i+1]; + let id = &buf[id_loc.0..id_loc.1]; + let val = &buf[val_loc.0..val_loc.1]; + stmt_single.execute(params![id, val]).unwrap(); + i += 2; + } + } + + eprintln!("Inserting took {} seconds", start_time.elapsed().as_secs()); + +} + +fn main() { + + let args = Args::parse(); + + let stdin = io::stdin().lock(); + let mut reader = BufReader::new(stdin); + + let mut conn = Connection::open(args.db_path).unwrap(); + + let cache_size = args.cache_size; + let page_size = args.page_size; + + conn.execute_batch( + format!("PRAGMA journal_mode = OFF; + PRAGMA synchronous = 0; + PRAGMA cache_size = {cache_size}; + PRAGMA page_size = {page_size}; + PRAGMA locking_mode = EXCLUSIVE; + PRAGMA temp_store = MEMORY;").as_str() + ) + .expect("PRAGMA"); + conn.execute( + "CREATE TABLE IF NOT EXISTS id_to_json ( + id BLOB PRIMARY KEY, + json BLOB not null)", + [] + ) + .unwrap(); + + let tx = conn.transaction().unwrap(); + + { + let mut stmt_batch = tx + .prepare_cached(("INSERT INTO id_to_json VALUES (?, ?)".to_owned() + + &", (?, ?)".repeat(args.batch_size - 1)).as_str()).unwrap(); + + let mut stmt_single = tx.prepare_cached("INSERT INTO id_to_json VALUES (?, ?)").unwrap(); + + insert(&mut stmt_batch, &mut stmt_single, &mut reader, args.batch_size); + } + + let start_time2 = std::time::Instant::now(); + tx.commit().unwrap(); + eprintln!("Committing took {} seconds", start_time2.elapsed().as_secs()); + +} diff --git a/dataload/Cargo.lock b/dataload/Cargo.lock index b86eb5f..708725a 100644 --- a/dataload/Cargo.lock +++ b/dataload/Cargo.lock @@ -93,26 +93,6 @@ dependencies = [ "windows-sys", ] -[[package]] -name = "bindgen" -version = "0.69.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0" -dependencies = [ - "bitflags 2.6.0", - "cexpr", - "clang-sys", - "itertools", - "lazy_static", - "lazycell", - "proc-macro2", - "quote", - "regex", - "rustc-hash", - "shlex", - "syn 2.0.48", -] - [[package]] name = "bit-vec" version = "0.6.3" @@ -163,35 +143,13 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" -[[package]] -name = "bzip2-sys" -version = "0.1.11+1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - [[package]] name = "cc" -version = "1.0.97" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "099a5357d84c4c61eb35fc8eafa9a79a902c2f76911e5747ced4e032edd8d9b4" -dependencies = [ - "jobserver", - "libc", - "once_cell", -] - -[[package]] -name = "cexpr" -version = "0.6.0" +version = "1.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +checksum = "9157bbaa6b165880c27a4293a474c91cdcf265cc68cc829bf10be0964a391caf" dependencies = [ - "nom", + "shlex", ] [[package]] @@ -200,17 +158,6 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" -[[package]] -name = "clang-sys" -version = "1.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67523a3b4be3ce1989d607a828d036249522dd9c1c8de7f4dd2dae43a37369d1" -dependencies = [ - "glob", - "libc", - "libloading", -] - [[package]] name = "clap" version = "4.4.11" @@ -344,12 +291,6 @@ dependencies = [ "proc-macro-error", ] -[[package]] -name = "either" -version = "1.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a47c1c47d2f5964e29c61246e81db715514cd532db6b5116a25ea3c03d6780a2" - [[package]] name = "equivalent" version = "1.0.1" @@ -426,12 +367,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "glob" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" - [[package]] name = "grebi_assign_ids" version = "0.1.0" @@ -546,6 +481,17 @@ dependencies = [ "serde_yaml", ] +[[package]] +name = "grebi_make_compressed_blob" +version = "0.1.0" +dependencies = [ + "clap", + "flate2", + "grebi_shared", + "jemallocator", + "serde_json", +] + [[package]] name = "grebi_make_neo_csv" version = "0.1.0" @@ -566,23 +512,23 @@ dependencies = [ ] [[package]] -name = "grebi_make_rocks" +name = "grebi_make_solr" version = "0.1.0" dependencies = [ "clap", - "flate2", "grebi_shared", - "rocksdb", + "jemallocator", "serde_json", ] [[package]] -name = "grebi_make_solr" +name = "grebi_make_sqlite" version = "0.1.0" dependencies = [ "clap", "grebi_shared", "jemallocator", + "rusqlite", "serde_json", ] @@ -742,15 +688,6 @@ dependencies = [ "hashbrown", ] -[[package]] -name = "itertools" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" -dependencies = [ - "either", -] - [[package]] name = "itoa" version = "1.0.10" @@ -777,15 +714,6 @@ dependencies = [ "libc", ] -[[package]] -name = "jobserver" -version = "0.1.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e" -dependencies = [ - "libc", -] - [[package]] name = "js-sys" version = "0.3.66" @@ -801,12 +729,6 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" -[[package]] -name = "lazycell" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" - [[package]] name = "libc" version = "0.2.151" @@ -823,36 +745,11 @@ dependencies = [ "libc", ] -[[package]] -name = "libloading" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19" -dependencies = [ - "cfg-if", - "windows-targets", -] - -[[package]] -name = "librocksdb-sys" -version = "0.17.0+9.0.0" -source = "git+https://github.com/rust-rocksdb/rust-rocksdb?branch=master#daaaf85fffb1c981aa93ca418b380ea2ea91aac3" -dependencies = [ - "bindgen", - "bzip2-sys", - "cc", - "glob", - "libc", - "libz-sys", - "lz4-sys", - "zstd-sys", -] - [[package]] name = "libsqlite3-sys" -version = "0.28.0" +version = "0.30.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c10584274047cb335c23d3e61bcef8e323adae7c5c8c760540f73610177fc3f" +checksum = "2e99fb7a497b1e3339bc746195567ed8d3e24945ecd636e3619d20b9de9e9149" dependencies = [ "pkg-config", "vcpkg", @@ -868,17 +765,6 @@ dependencies = [ "libc", ] -[[package]] -name = "libz-sys" -version = "1.1.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e143b5e666b2695d28f6bca6497720813f699c9602dd7f5cac91008b8ada7f9" -dependencies = [ - "cc", - "pkg-config", - "vcpkg", -] - [[package]] name = "lmdb-zero" version = "0.4.4" @@ -897,28 +783,12 @@ version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" -[[package]] -name = "lz4-sys" -version = "1.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57d27b317e207b10f69f5e75494119e391a96f48861ae870d1da6edac98ca900" -dependencies = [ - "cc", - "libc", -] - [[package]] name = "memchr" version = "2.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" -[[package]] -name = "minimal-lexical" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" - [[package]] name = "miniz_oxide" version = "0.7.1" @@ -934,16 +804,6 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76aacdf8f9850a9db34e33e35abfc17a29b9577d0eb2cbfaeb734662cacca5b3" -[[package]] -name = "nom" -version = "7.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" -dependencies = [ - "memchr", - "minimal-lexical", -] - [[package]] name = "once_cell" version = "1.19.0" @@ -1089,20 +949,11 @@ dependencies = [ "rio_api", ] -[[package]] -name = "rocksdb" -version = "0.22.0" -source = "git+https://github.com/rust-rocksdb/rust-rocksdb?branch=master#daaaf85fffb1c981aa93ca418b380ea2ea91aac3" -dependencies = [ - "libc", - "librocksdb-sys", -] - [[package]] name = "rusqlite" -version = "0.31.0" +version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b838eba278d213a8beaf485bd313fd580ca4505a00d5871caeb1457c55322cae" +checksum = "7753b721174eb8ff87a9a0e799e2d7bc3749323e773db92e0984debb00019d6e" dependencies = [ "bitflags 2.6.0", "fallible-iterator", @@ -1112,12 +963,6 @@ dependencies = [ "smallvec", ] -[[package]] -name = "rustc-hash" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" - [[package]] name = "rustversion" version = "1.0.14" @@ -1646,13 +1491,3 @@ dependencies = [ "quote", "syn 2.0.48", ] - -[[package]] -name = "zstd-sys" -version = "2.0.10+zstd.1.5.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c253a4914af5bafc8fa8c86ee400827e83cf6ec01195ec1f1ed8441bf00d65aa" -dependencies = [ - "cc", - "pkg-config", -] diff --git a/dataload/Cargo.toml b/dataload/Cargo.toml index b6d958d..8658907 100644 --- a/dataload/Cargo.toml +++ b/dataload/Cargo.toml @@ -24,7 +24,8 @@ members = [ "06_prepare_db_import/grebi_make_neo_csv", "06_prepare_db_import/grebi_make_neo_ids_csv", "06_prepare_db_import/grebi_make_solr", - "07_create_db/rocksdb/grebi_make_rocks", + "06_prepare_db_import/grebi_make_compressed_blob", + "07_create_db/sqlite/grebi_make_sqlite", "grebi_shared" ] diff --git a/dataload/docker_envs/Dockerfile.rust_for_codon b/dataload/docker_envs/Dockerfile.rust_for_codon index fcfb936..b3f8caa 100644 --- a/dataload/docker_envs/Dockerfile.rust_for_codon +++ b/dataload/docker_envs/Dockerfile.rust_for_codon @@ -1,5 +1,5 @@ -FROM rust:1.74-buster as builder +FROM rust:1.79-buster as builder RUN apt-get update && apt-get install -y cmake clang diff --git a/dataload/docker_envs/build_and_push.sh b/dataload/docker_envs/build_and_push.sh index aba1a0b..c35a06e 100755 --- a/dataload/docker_envs/build_and_push.sh +++ b/dataload/docker_envs/build_and_push.sh @@ -5,11 +5,12 @@ set -e docker build -t ghcr.io/ebispot/grebi_solr_with_extras:9.5.0 -f Dockerfile.solr_with_extras . docker build -t ghcr.io/ebispot/grebi_neo4j_with_extras:5.18.0 -f Dockerfile.neo4j_with_extras . docker build -t ghcr.io/ebispot/grebi_python:3.11 -f Dockerfile.python . -docker build -t ghcr.io/ebispot/rust_for_codon:1.74 -f Dockerfile.rust_for_codon . +docker build -t ghcr.io/ebispot/rust_for_codon:1.79 -f Dockerfile.rust_for_codon . docker push ghcr.io/ebispot/grebi_solr_with_extras:9.5.0 docker push ghcr.io/ebispot/grebi_neo4j_with_extras:5.18.0 docker push ghcr.io/ebispot/grebi_python:3.11 -docker push ghcr.io/ebispot/rust_for_codon:1.74 +docker push ghcr.io/ebispot/rust_for_codon:1.79 + diff --git a/dataload/nextflow/codon_nextflow.config b/dataload/nextflow/codon_nextflow.config index 1a89c15..4ee99da 100644 --- a/dataload/nextflow/codon_nextflow.config +++ b/dataload/nextflow/codon_nextflow.config @@ -24,13 +24,6 @@ process { } } - -process { - withName: create_rocks { - memory = 1200.GB - } -} - process { withName: create_solr_nodes_core { memory = 150.GB @@ -79,12 +72,6 @@ process { cpus = 32 } } -process { - withName: package_rocks { - memory = 32.GB - cpus = 32 - } -} process { withName: package_solr { memory = 32.GB diff --git a/dataload/nextflow/load_subgraph.nf b/dataload/nextflow/load_subgraph.nf index 3701b52..3ed44ab 100644 --- a/dataload/nextflow/load_subgraph.nf +++ b/dataload/nextflow/load_subgraph.nf @@ -5,7 +5,6 @@ import groovy.json.JsonSlurper jsonSlurper = new JsonSlurper() params.tmp = "$GREBI_TMP" -params.fast_tmp = "$GREBI_FAST_TMP" params.home = "$GREBI_DATALOAD_HOME" params.config = "$GREBI_CONFIG" params.subgraph = "$GREBI_SUBGRAPH" @@ -33,9 +32,8 @@ workflow { materialise(merged.flatten(), indexed.metadata_jsonl, indexed.summary_json, Channel.value(config.exclude_edges + config.identifier_props), Channel.value(config.exclude_self_referential_edges + config.identifier_props), groups_txt) merge_summary_jsons(indexed.summary_json.collect() + materialise.out.mat_summary.collect()) - materialised_nodes_and_edges = materialise.out.nodes.collect() + materialise.out.edges.collect() - - rocks_db = create_rocks(materialised_nodes_and_edges) + compressed_blobs = create_compressed_blobs(materialise.out.nodes.mix(materialise.out.edges)) + sqlite = create_sqlite(compressed_blobs.collect()) neo_input_dir = prepare_neo(indexed.summary_json, materialise.out.nodes, materialise.out.edges) @@ -54,18 +52,17 @@ workflow { solr_tgz = package_solr(solr_nodes_core, solr_edges_core, solr_autocomplete_core) neo_tgz = package_neo(neo_db) - rocks_tgz = package_rocks(rocks_db) if(params.is_ebi == "true") { copy_summary_to_ftp(merge_summary_jsons.out) copy_solr_to_ftp(solr_tgz) copy_neo_to_ftp(neo_tgz) - copy_rocks_to_ftp(rocks_tgz) + copy_sqlite_to_ftp(sqlite) copy_summary_to_staging(merge_summary_jsons.out) copy_solr_config_to_staging() copy_solr_cores_to_staging(solr_nodes_core.concat(solr_edges_core).concat(solr_autocomplete_core)) - copy_rocksdb_to_staging(rocks_db) + copy_sqlite_to_staging(sqlite) copy_neo_to_staging(neo_db) } } @@ -287,9 +284,28 @@ process merge_summary_jsons { """ } -process create_rocks { +process create_compressed_blobs { cache "lenient" - memory "4 GB" + memory "16 GB" + time "1h" + + input: + path(mat_jsonl) + + output: + path("${params.subgraph}_${task.index}_compressed.blob"), emit: compressed_blob + + script: + """ + #!/usr/bin/env bash + set -Eeuo pipefail + cat ${mat_jsonl} | ${params.home}/target/release/grebi_make_compressed_blob > ${params.subgraph}_${task.index}_compressed.blob + """ +} + +process create_sqlite { + cache "lenient" + memory "128 GB" time "23h" cpus "8" errorStrategy 'retry' @@ -298,19 +314,21 @@ process create_rocks { publishDir "${params.tmp}/${params.config}/${params.subgraph}", overwrite: true input: - val(materialised) + val(compressed_blobs) output: - path("${params.subgraph}_rocksdb") + path("${params.subgraph}.sqlite3") script: """ #!/usr/bin/env bash set -Eeuo pipefail - cat ${materialised.iterator().join(" ")} \ - | ${params.home}/target/release/grebi_make_rocks \ - --rocksdb-path ${params.fast_tmp}/rocksdb && \ - mv ${params.fast_tmp}/rocksdb ${params.subgraph}_rocksdb + cat ${compressed_blobs.iterator().join(" ")} \ + | ${params.home}/target/release/grebi_make_sqlite \ + --db-path ${params.subgraph}.sqlite3 \ + --batch-size 450 \ + --page-size 16384 \ + --cache-size 1000000 """ } @@ -521,26 +539,6 @@ process package_neo { """ } -process package_rocks { - cache "lenient" - memory "4 GB" - time "8h" - cpus "8" - - publishDir "${params.tmp}/${params.config}/${params.subgraph}", overwrite: true - - input: - path("${params.subgraph}_rocksdb") - - output: - path("${params.subgraph}_rocksdb.tgz") - - script: - """ - tar -chf ${params.subgraph}_rocksdb.tgz --use-compress-program="pigz --fast" ${params.subgraph}_rocksdb - """ -} - process package_solr { cache "lenient" memory "4 GB" @@ -571,7 +569,7 @@ process package_solr { process copy_neo_to_ftp { cache "lenient" - memory "4 GB" + memory "32 GB" time "8h" queue "datamover" @@ -590,7 +588,7 @@ process copy_neo_to_ftp { process copy_summary_to_ftp { cache "lenient" - memory "4 GB" + memory "32 GB" time "8h" queue "datamover" @@ -609,7 +607,7 @@ process copy_summary_to_ftp { process copy_solr_to_ftp { cache "lenient" - memory "4 GB" + memory "32 GB" time "8h" queue "datamover" @@ -625,29 +623,29 @@ process copy_solr_to_ftp { """ } -process copy_rocks_to_ftp { +process copy_sqlite_to_ftp { cache "lenient" - memory "4 GB" + memory "32 GB" time "8h" queue "datamover" input: - path("rocksdb.tgz") + path("${params.subgraph}.sqlite3") script: """ #!/usr/bin/env bash set -Eeuo pipefail mkdir -p /nfs/ftp/public/databases/spot/kg/${params.config}/${params.timestamp.trim()} - cp -f rocksdb.tgz /nfs/ftp/public/databases/spot/kg/${params.config}/${params.timestamp.trim()}/${params.subgraph}_rocksdb.tgz + cp -f ${params.subgraph}.sqlite3 /nfs/ftp/public/databases/spot/kg/${params.config}/${params.timestamp.trim()}/${params.subgraph}.sqlite3 """ } process copy_summary_to_staging { cache "lenient" - memory "4 GB" + memory "32 GB" time "8h" queue "datamover" @@ -665,7 +663,7 @@ process copy_summary_to_staging { process copy_solr_config_to_staging { cache "lenient" - memory "4 GB" + memory "32 GB" time "8h" queue "datamover" @@ -683,7 +681,7 @@ process copy_solr_config_to_staging { process copy_solr_cores_to_staging { cache "lenient" - memory "4 GB" + memory "32 GB" time "8h" queue "datamover" @@ -699,27 +697,27 @@ process copy_solr_cores_to_staging { """ } -process copy_rocksdb_to_staging { +process copy_sqlite_to_staging { cache "lenient" - memory "4 GB" + memory "32 GB" time "8h" queue "datamover" input: - path(rocksdb) + path(sqlite) script: """ #!/usr/bin/env bash set -Eeuo pipefail - mkdir -p /nfs/public/rw/ontoapps/grebi/staging/rocksdb - cp -LR * /nfs/public/rw/ontoapps/grebi/staging/rocksdb/ + mkdir -p /nfs/public/rw/ontoapps/grebi/staging/sqlite + cp -LR * /nfs/public/rw/ontoapps/grebi/staging/sqlite/ """ } process copy_neo_to_staging { cache "lenient" - memory "4 GB" + memory "32 GB" time "8h" queue "datamover" diff --git a/dataload/nextflow/saturos_nextflow.config b/dataload/nextflow/saturos_nextflow.config index 4edb0dc..0fec219 100644 --- a/dataload/nextflow/saturos_nextflow.config +++ b/dataload/nextflow/saturos_nextflow.config @@ -19,12 +19,6 @@ process { } } -process { - withName: create_rocks { - memory = 64.GB - } -} - process { withName: create_solr_nodes_core { memory = 64.GB @@ -52,12 +46,6 @@ process { cpus = 32 } } -process { - withName: package_rocks { - memory = 32.GB - cpus = 32 - } -} process { withName: package_solr { memory = 32.GB diff --git a/dataload/scripts/build_release_on_codon.sh b/dataload/scripts/build_release_on_codon.sh index 9503069..f676022 100755 --- a/dataload/scripts/build_release_on_codon.sh +++ b/dataload/scripts/build_release_on_codon.sh @@ -1,3 +1,3 @@ #!/bin/bash -srun -t 1:00:00 --mem 64G -c 16 singularity run docker://ghcr.io/ebispot/rust_for_codon:1.74 bash -c "export CARGO_HOME=./cargo_home && cargo build --release" +srun -t 1:00:00 --mem 64G -c 16 singularity run docker://ghcr.io/ebispot/rust_for_codon:1.79 bash -c "export CARGO_HOME=./cargo_home && cargo build --release" diff --git a/dataload/scripts/dataload_codon.sh b/dataload/scripts/dataload_codon.sh index 44a20f1..f45fce4 100755 --- a/dataload/scripts/dataload_codon.sh +++ b/dataload/scripts/dataload_codon.sh @@ -1,7 +1,6 @@ #!/bin/bash export GREBI_DATALOAD_HOME=/nfs/production/parkinso/spot/grebi/dataload export GREBI_TMP=/hps/nobackup/parkinso/spot/grebi/tmp -export GREBI_FAST_TMP=/dev/shm export GREBI_CONFIG=ebi export GREBI_IS_EBI=true export GREBI_TIMESTAMP=$(date +%Y_%m_%d__%H_%M) diff --git a/dataload/scripts/dataload_local.sh b/dataload/scripts/dataload_local.sh index 295b1f9..4a3b52a 100755 --- a/dataload/scripts/dataload_local.sh +++ b/dataload/scripts/dataload_local.sh @@ -1,7 +1,6 @@ #!/bin/bash export GREBI_DATALOAD_HOME=~/grebi/dataload export GREBI_TMP=$(pwd) -export GREBI_FAST_TMP=/tmp export GREBI_CONFIG=ebi export GREBI_IS_EBI=false export GREBI_TIMESTAMP=$(date +%Y_%m_%d__%H_%M) diff --git a/dataload/scripts/dataload_saturos.sh b/dataload/scripts/dataload_saturos.sh index 01034bc..34654aa 100755 --- a/dataload/scripts/dataload_saturos.sh +++ b/dataload/scripts/dataload_saturos.sh @@ -1,7 +1,6 @@ #!/bin/bash export GREBI_DATALOAD_HOME=/home/james/grebi/dataload export GREBI_TMP=/data/grebi_tmp -export GREBI_FAST_TMP=/tmp export GREBI_CONFIG=ebi export GREBI_IS_EBI=false export GREBI_TIMESTAMP=$(date +%Y_%m_%d__%H_%M) diff --git a/webapp/docker-compose.yml b/webapp/docker-compose.yml index f086dd9..6be03af 100644 --- a/webapp/docker-compose.yml +++ b/webapp/docker-compose.yml @@ -46,9 +46,9 @@ services: ports: - 8080:8080 volumes: - - ${GREBI_ROCKSDB_SEARCH_PATH:?Need path to search for RocksDB databases}:/rocksdbs + - ${GREBI_SQLITE_SEARCH_PATH:?Need path to search for sqlite databases}:/sqlite environment: - - GREBI_ROCKSDB_SEARCH_PATH=/rocksdbs + - GREBI_SQLITE_SEARCH_PATH=/sqlite grebi-summary-service: image: ghcr.io/ebispot/grebi_summary_service:dev ports: diff --git a/webapp/grebi_api/src/main/java/uk/ac/ebi/grebi/GrebiApi.java b/webapp/grebi_api/src/main/java/uk/ac/ebi/grebi/GrebiApi.java index 45a9f42..b808f23 100644 --- a/webapp/grebi_api/src/main/java/uk/ac/ebi/grebi/GrebiApi.java +++ b/webapp/grebi_api/src/main/java/uk/ac/ebi/grebi/GrebiApi.java @@ -32,7 +32,7 @@ public static void main(String[] args) throws ParseException, org.apache.commons GrebiSolrRepo solr = null; GrebiSummaryRepo summary= null; - Set rocksDbSubgraphs = null; + Set sqliteSubgraphs = null; Set solrSubgraphs = null; Set summarySubgraphs = null; Set neoSubgraphs = null; @@ -41,12 +41,12 @@ public static void main(String[] args) throws ParseException, org.apache.commons try { solr = new GrebiSolrRepo(); summary = new GrebiSummaryRepo(); - rocksDbSubgraphs = (new ResolverClient()).getSubgraphs(); + sqliteSubgraphs = (new ResolverClient()).getSubgraphs(); solrSubgraphs = solr.getSubgraphs(); summarySubgraphs = summary.getSubgraphs(); - if(new HashSet<>(List.of(rocksDbSubgraphs, solrSubgraphs, summarySubgraphs)).size() != 1) { - throw new RuntimeException("RocksDB/Solr/the summary jsons do not seem to contain the same subgraphs. Found: " - + String.join(",", rocksDbSubgraphs) + " for RocksDB (from resolver service) and " + if(new HashSet<>(List.of(sqliteSubgraphs, solrSubgraphs, summarySubgraphs)).size() != 1) { + throw new RuntimeException("SQLite/Solr/the summary jsons do not seem to contain the same subgraphs. Found: " + + String.join(",", sqliteSubgraphs) + " for SQLite (from resolver service) and " + String.join(",", solrSubgraphs) + " for Solr (from list of solr cores) and " + String.join(",", summarySubgraphs) + " for the summary jsons (from summary server)" ); @@ -67,10 +67,10 @@ public static void main(String[] args) throws ParseException, org.apache.commons try { neo = new GrebiNeoRepo(); neoSubgraphs = neo.getSubgraphs(); - if(new HashSet<>(List.of(rocksDbSubgraphs, solrSubgraphs, summarySubgraphs)).size() != 1) { + if(new HashSet<>(List.of(sqliteSubgraphs, solrSubgraphs, summarySubgraphs)).size() != 1) { neo = null; - throw new RuntimeException("RocksDB/Solr/the summary jsons/neo4j do not seem to contain the same subgraphs. Found: " - + String.join(",", rocksDbSubgraphs) + " for RocksDB (from resolver service) and " + throw new RuntimeException("SQLite/Solr/the summary jsons/neo4j do not seem to contain the same subgraphs. Found: " + + String.join(",", sqliteSubgraphs) + " for SQLite (from resolver service) and " + String.join(",", solrSubgraphs) + " for Solr (from list of solr cores) and " + String.join(",", summarySubgraphs) + " for the summary jsons (from summary server) and " + String.join(",", neoSubgraphs) + " for neo4j" diff --git a/webapp/grebi_resolver_service/pom.xml b/webapp/grebi_resolver_service/pom.xml index e64bd38..80a83eb 100644 --- a/webapp/grebi_resolver_service/pom.xml +++ b/webapp/grebi_resolver_service/pom.xml @@ -40,21 +40,16 @@ slf4j-simple 2.0.10 - - org.springframework.data - spring-data-commons - 3.2.5 - com.google.guava guava 33.2.0-jre - - org.rocksdb - rocksdbjni - 9.1.1 - + + org.xerial + sqlite-jdbc + 3.47.1.0 + diff --git a/webapp/grebi_resolver_service/src/main/java/uk/ac/ebi/grebi_resolver_service/GrebiResolverSvc.java b/webapp/grebi_resolver_service/src/main/java/uk/ac/ebi/grebi_resolver_service/GrebiResolverSvc.java index 0798dd7..76e3af5 100644 --- a/webapp/grebi_resolver_service/src/main/java/uk/ac/ebi/grebi_resolver_service/GrebiResolverSvc.java +++ b/webapp/grebi_resolver_service/src/main/java/uk/ac/ebi/grebi_resolver_service/GrebiResolverSvc.java @@ -3,45 +3,57 @@ import com.google.gson.Gson; import com.google.gson.JsonElement; import com.google.gson.JsonParser; +import com.google.gson.stream.JsonReader; import io.javalin.Javalin; -import io.javalin.http.Context; -import org.rocksdb.Options; -import org.rocksdb.RocksDB; -import org.rocksdb.RocksDBException; +import org.sqlite.SQLiteConfig; +import org.sqlite.SQLiteOpenMode; import java.io.InputStreamReader; import java.io.File; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.sql.*; +import java.util.*; +import java.util.stream.Collectors; +import java.util.zip.Inflater; +import java.util.zip.InflaterInputStream; public class GrebiResolverSvc { - private static Map rocksDBs = new HashMap<>(); + + public static class Db { + public Connection connection; + } + + private static Map sqliteDBs = new HashMap<>(); public static void main(String[] args) { Gson gson = new Gson(); - RocksDB.loadLibrary(); + var dbfiles = Arrays.stream(new File(System.getenv("GREBI_SQLITE_SEARCH_PATH")).listFiles()) + .filter(File::isFile) + .filter(f -> f.getName().endsWith(".sqlite") || f.getName().endsWith(".sqlite3")) + .collect(Collectors.toList()); + + System.out.println("Found sqlite files: " + dbfiles); - Options options = new Options(); - options.setCreateIfMissing(false); + for (var dbfile : dbfiles) { + Db db = new Db(); - var dirs = Arrays.stream(new File(System.getenv("GREBI_ROCKSDB_SEARCH_PATH")).listFiles()).filter(File::isDirectory).filter(f -> f.getName().endsWith("_rocksdb")).toArray(File[]::new); + var subgraph = dbfile.getName().split("\\.")[0]; + + System.out.println("Loading SQLite DB for subgraph " + subgraph + " from " + dbfile.getAbsolutePath()); - for (File dir : dirs) { - RocksDB rocksDB = null; try { - rocksDB = RocksDB.openReadOnly(options, dir.getAbsolutePath()); - } catch (RocksDBException e) { + SQLiteConfig config = new SQLiteConfig(); + config.setReadOnly(true); + config.setOpenMode(SQLiteOpenMode.READONLY); + db.connection = DriverManager.getConnection("jdbc:sqlite:" + dbfile.getAbsolutePath(), config.toProperties()); + } catch (SQLException e) { e.printStackTrace(); return; } - var subgraph = dir.getName().split("_rocksdb")[0]; - rocksDBs.put(subgraph, rocksDB); - System.out.println("Loaded RocksDB for subgraph " + subgraph + " from " + dir.getAbsolutePath()); + + sqliteDBs.put(subgraph, db); + System.out.println("Loaded SQLite DB for subgraph " + subgraph + " from " + dbfile.getAbsolutePath()); } Javalin app = Javalin.create(config -> { @@ -49,51 +61,56 @@ public static void main(String[] args) { app.get("/subgraphs", ctx -> { ctx.contentType("application/json"); - ctx.result(gson.toJson(rocksDBs.keySet())); + ctx.result(gson.toJson(sqliteDBs.keySet())); }); app.post("/{subgraph}/resolve", ctx -> { var subgraph = ctx.pathParam("subgraph"); - var rocksdb = rocksDBs.get(subgraph); - if(rocksdb == null) { + var sqliteDb = sqliteDBs.get(subgraph); + if (sqliteDb == null) { ctx.status(404).result("Subgraph not found"); return; } List paramArray = gson.fromJson(new InputStreamReader(ctx.bodyInputStream()), List.class); - List keys = new ArrayList<>(); - for (String id : paramArray) { - keys.add(id.getBytes()); - } - Map results = new HashMap<>(); - try { - List values = rocksdb.multiGetAsList(keys); - int n = 0; - for (byte[] value : values) { - byte[] key = keys.get(n++); - if (value != null) { - JsonElement jsonElement = JsonParser.parseString(new String(value)); - results.put(new String(key), jsonElement); - } else { - results.put(new String(key), null); + + try (PreparedStatement stmt = sqliteDb.connection.prepareStatement( + "SELECT json FROM id_to_json WHERE id = ?")) { + + for (String id : paramArray) { + stmt.setBytes(1, id.getBytes()); + try (ResultSet rs = stmt.executeQuery()) { + if (rs.next()) { + var is = new InflaterInputStream(rs.getBinaryStream("json")); + JsonElement jsonElement = JsonParser.parseReader(new JsonReader(new InputStreamReader(is))); + results.put(id, jsonElement); + } else { + results.put(id, null); + } } } + ctx.contentType("application/json"); ctx.result(gson.toJson(results)); - } catch (RocksDBException e) { + + } catch (SQLException e) { ctx.status(500).result(e.getMessage()); } }); Runtime.getRuntime().addShutdownHook(new Thread(() -> { - for (RocksDB rocksDB : rocksDBs.values()) { - rocksDB.close(); + for (Db db : sqliteDBs.values()) { + try { + if (db.connection != null && !db.connection.isClosed()) { + db.connection.close(); + } + } catch (SQLException e) { + e.printStackTrace(); + } } })); } - } - diff --git a/webapp/grebi_ui/src/components/node_graph_view/GraphViewCtx.tsx b/webapp/grebi_ui/src/components/node_graph_view/GraphViewCtx.tsx index 38a51da..7999123 100644 --- a/webapp/grebi_ui/src/components/node_graph_view/GraphViewCtx.tsx +++ b/webapp/grebi_ui/src/components/node_graph_view/GraphViewCtx.tsx @@ -336,8 +336,8 @@ color:'gray', this.cy.destroy() this.cy = new CyWrapper(this.graphDiv, elements, style, layout) - this.cy.onClickElement = (id:string) => { - console.log(id) + this.cy.onClickElement = (elem:any) => { + elem.data('action') && this.doAction(elem.data('action')) } //this.dsSelectorDiv.innerHTML = '' @@ -461,5 +461,20 @@ color:'gray', } } + doAction(action:any) { + + let { type, nodeId } = action + + if(type === 'expandEdge') { + + let { direction, edgeType } = action + + if(direction === 'incoming') { + this.incoming_expandedEdgeIds.add(edgeType) + } else { + this.outgoing_expandedEdgeIds.add(edgeType) + } + } + } } diff --git a/webapp/grebi_ui/src/frontends/ebi/pages/EbiDownloadsPage.tsx b/webapp/grebi_ui/src/frontends/ebi/pages/EbiDownloadsPage.tsx index 7c95d13..18c40be 100644 --- a/webapp/grebi_ui/src/frontends/ebi/pages/EbiDownloadsPage.tsx +++ b/webapp/grebi_ui/src/frontends/ebi/pages/EbiDownloadsPage.tsx @@ -12,11 +12,11 @@ export default function EbiDownloadsPage() {
- Downloading Knowledge Graph Releases + Downloading Knowledge Graph Exports

- Neo4j, Solr, and RocksDB databases of the KG can be downloaded from  + Neo4j and Solr databases exports of the KG can be downloaded from