From bf2c347ec4b095be6e5640ef5c163e5bf8fed18f Mon Sep 17 00:00:00 2001 From: Runji Wang Date: Fri, 19 Apr 2024 19:43:42 +0800 Subject: [PATCH 1/5] replace Vec by boxed slice in array Signed-off-by: Runji Wang --- src/array/bytes_array.rs | 12 ++++++------ src/array/primitive_array.rs | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/array/bytes_array.rs b/src/array/bytes_array.rs index 72df61968..6d3a57c84 100644 --- a/src/array/bytes_array.rs +++ b/src/array/bytes_array.rs @@ -14,9 +14,9 @@ use crate::types::BlobRef; /// A collection of variable-length values. #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] pub struct BytesArray { - offset: Vec, + offset: Box<[usize]>, valid: BitVec, - data: Vec, + data: Box<[u8]>, _type: PhantomData, } @@ -108,8 +108,8 @@ impl ArrayFromDataExt for BytesArray { } Self { valid, - data, - offset, + data: data.into(), + offset: offset.into(), _type: PhantomData, } } @@ -197,8 +197,8 @@ impl ArrayBuilder for BytesArrayBuilder { fn take(&mut self) -> BytesArray { BytesArray { valid: mem::take(&mut self.valid), - data: mem::take(&mut self.data), - offset: mem::replace(&mut self.offset, vec![0]), + data: mem::take(&mut self.data).into(), + offset: mem::replace(&mut self.offset, vec![0]).into(), _type: PhantomData, } } diff --git a/src/array/primitive_array.rs b/src/array/primitive_array.rs index 60865b9f8..de48a00a9 100644 --- a/src/array/primitive_array.rs +++ b/src/array/primitive_array.rs @@ -16,7 +16,7 @@ use crate::types::{NativeType, F32, F64}; #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] pub struct PrimitiveArray { valid: BitVec, - data: Vec, + data: Box<[T]>, } // Enable `collect()` an array from iterator of `Option`. @@ -34,7 +34,7 @@ impl FromIterator> for PrimitiveArray { // Enable `collect()` an array from iterator of `T`. impl FromIterator for PrimitiveArray { fn from_iter>(iter: I) -> Self { - let data: Vec = iter.into_iter().collect(); + let data: Box<[T]> = iter.into_iter().collect(); let size = data.len(); Self { data, @@ -45,7 +45,7 @@ impl FromIterator for PrimitiveArray { impl FromIterator for PrimitiveArray { fn from_iter>(iter: I) -> Self { - let data: Vec = iter.into_iter().map(F32::from).collect(); + let data: Box<[F32]> = iter.into_iter().map(F32::from).collect(); let size = data.len(); Self { data, @@ -56,7 +56,7 @@ impl FromIterator for PrimitiveArray { impl FromIterator for PrimitiveArray { fn from_iter>(iter: I) -> Self { - let data: Vec = iter.into_iter().map(F64::from).collect(); + let data: Box<[F64]> = iter.into_iter().map(F64::from).collect(); let size = data.len(); Self { data, @@ -172,7 +172,7 @@ impl ArrayBuilder for PrimitiveArrayBuilder { fn take(&mut self) -> PrimitiveArray { PrimitiveArray { valid: mem::take(&mut self.valid), - data: mem::take(&mut self.data), + data: mem::take(&mut self.data).into(), } } } @@ -192,7 +192,7 @@ impl PrimitiveArray { impl PrimitiveArray { /// Rescale the decimals. pub fn rescale(&mut self, scale: u8) { - for v in &mut self.data { + for v in self.data.iter_mut() { v.rescale(scale as u32); } } From b03568c08e868e984ed084cc2da254d099a41f02 Mon Sep 17 00:00:00 2001 From: Runji Wang Date: Fri, 19 Apr 2024 22:48:35 +0800 Subject: [PATCH 2/5] decrease data chunk capacity to 8 in hash semi join Signed-off-by: Runji Wang --- src/executor/hash_join.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/executor/hash_join.rs b/src/executor/hash_join.rs index b5a4bb3d0..eefe36a3b 100644 --- a/src/executor/hash_join.rs +++ b/src/executor/hash_join.rs @@ -161,9 +161,12 @@ impl HashSemiJoinExecutor2 { for (key, row) in keys_chunk.rows().zip(chunk.rows()) { let chunk = key_set .entry(key.values().collect()) - .or_insert_with(|| DataChunkBuilder::new(&self.right_types, 1024)) + .or_insert_with(|| DataChunkBuilder::new(&self.right_types, 8)) .push_row(row.values()); - assert!(chunk.is_none()); + assert!( + chunk.is_none(), + "FIXME: more than 8 rows with the same key is not supported" + ); } tokio::task::consume_budget().await; } From 45f9a7ae84245608404f0bb23ddf3cefadb1f036 Mon Sep 17 00:00:00 2001 From: Runji Wang Date: Fri, 19 Apr 2024 22:48:52 +0800 Subject: [PATCH 3/5] remove cargo config for bench Signed-off-by: Runji Wang --- Cargo.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f759e7eed..5534e9926 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -108,10 +108,6 @@ name = "array" harness = false name = "tpch" -[profile.bench] -codegen-units = 1 -lto = 'thin' - [workspace] members = ["proto"] From 34a36130c07c32935959e22e2a31793adce183b4 Mon Sep 17 00:00:00 2001 From: Runji Wang Date: Fri, 19 Apr 2024 22:54:12 +0800 Subject: [PATCH 4/5] enable tpch q21 in ci Signed-off-by: Runji Wang --- .github/workflows/bench.yml | 3 +-- .github/workflows/ci.yml | 25 +------------------------ 2 files changed, 2 insertions(+), 26 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index d04e04ff9..ec482fd1b 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -39,8 +39,7 @@ jobs: - name: Generate TPC-H 1GB dataset run: make tpch - name: Run benchmark - # FIXME: skip q21 as it will run out of memory - run: cargo bench --bench tpch -- --output-format bencher "q(1?\d|2[02])$" | tee output.txt + run: cargo bench --bench tpch -- --output-format bencher | tee output.txt - name: Store benchmark result if: github.event_name != 'pull_request' uses: benchmark-action/github-action-benchmark@v1 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 186ee3df7..d46b281f8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -92,27 +92,4 @@ jobs: run: | ./target/release/risinglight -f tests/sql/tpch/create.sql ./target/release/risinglight -f tests/sql/tpch/import.sql - ./target/release/risinglight -f tests/sql/tpch-full/_q1.slt - ./target/release/risinglight -f tests/sql/tpch-full/_q2.slt - ./target/release/risinglight -f tests/sql/tpch-full/_q3.slt - ./target/release/risinglight -f tests/sql/tpch-full/_q4.slt - ./target/release/risinglight -f tests/sql/tpch-full/_q5.slt - ./target/release/risinglight -f tests/sql/tpch-full/_q6.slt - ./target/release/risinglight -f tests/sql/tpch-full/_q7.slt - ./target/release/risinglight -f tests/sql/tpch-full/_q8.slt - ./target/release/risinglight -f tests/sql/tpch-full/_q9.slt - ./target/release/risinglight -f tests/sql/tpch-full/_q10.slt - ./target/release/risinglight -f tests/sql/tpch-full/_q11.slt - ./target/release/risinglight -f tests/sql/tpch-full/_q12.slt - ./target/release/risinglight -f tests/sql/tpch-full/_q13.slt - ./target/release/risinglight -f tests/sql/tpch-full/_q14.slt - ./target/release/risinglight -f tests/sql/tpch-full/_q15.slt - ./target/release/risinglight -f tests/sql/tpch-full/_q16.slt - ./target/release/risinglight -f tests/sql/tpch-full/_q17.slt - ./target/release/risinglight -f tests/sql/tpch-full/_q18.slt - ./target/release/risinglight -f tests/sql/tpch-full/_q19.slt - # FIXME: sqllogictest says the query result is mismatch, but it is actually correct - # ./target/release/risinglight -f tests/sql/tpch-full/_q20.slt - # FIXME: q21 runs out of memory - # ./target/release/risinglight -f tests/sql/tpch-full/_q21.slt - ./target/release/risinglight -f tests/sql/tpch-full/_q22.slt + ./target/release/risinglight -f tests/sql/tpch-full/_tpch_full.slt From 403c56e34349b7581aafd8632d055a66ae9d7dec Mon Sep 17 00:00:00 2001 From: Runji Wang Date: Fri, 19 Apr 2024 23:04:47 +0800 Subject: [PATCH 5/5] use unbounded data chunk builder Signed-off-by: Runji Wang --- src/array/data_chunk_builder.rs | 14 +++++++++++++- src/executor/hash_join.rs | 7 ++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/array/data_chunk_builder.rs b/src/array/data_chunk_builder.rs index d52bf0a59..35190f29e 100644 --- a/src/array/data_chunk_builder.rs +++ b/src/array/data_chunk_builder.rs @@ -26,6 +26,16 @@ impl DataChunkBuilder { } } + /// Create a [`DataChunkBuilder`] with unbounded capacity. + pub fn unbounded<'a>(data_types: impl IntoIterator) -> Self { + let array_builders = data_types.into_iter().map(ArrayBuilderImpl::new).collect(); + DataChunkBuilder { + array_builders, + size: 0, + capacity: usize::MAX, + } + } + /// Push a row in the Iterator. /// /// The row is accepted as an iterator of [`DataValue`], and it's required that the size of row @@ -86,7 +96,9 @@ impl DataChunkBuilder { .iter_mut() .map(|builder| { let chunk = builder.take(); - builder.reserve(capacity); + if capacity != usize::MAX { + builder.reserve(capacity); + } chunk }) .collect(), diff --git a/src/executor/hash_join.rs b/src/executor/hash_join.rs index eefe36a3b..313828c34 100644 --- a/src/executor/hash_join.rs +++ b/src/executor/hash_join.rs @@ -161,12 +161,9 @@ impl HashSemiJoinExecutor2 { for (key, row) in keys_chunk.rows().zip(chunk.rows()) { let chunk = key_set .entry(key.values().collect()) - .or_insert_with(|| DataChunkBuilder::new(&self.right_types, 8)) + .or_insert_with(|| DataChunkBuilder::unbounded(&self.right_types)) .push_row(row.values()); - assert!( - chunk.is_none(), - "FIXME: more than 8 rows with the same key is not supported" - ); + assert!(chunk.is_none()); } tokio::task::consume_budget().await; }