From 36ee587401eda65c7d8dd8085d44fceb65f84b00 Mon Sep 17 00:00:00 2001 From: Phoebe Goldman Date: Wed, 8 Jan 2025 12:02:51 -0500 Subject: [PATCH 01/11] Page: track number of allocated var-len granules Definition of `Page::bytes_used_by_rows` to follow. This change seemed to stand on its own enough to deserve a separate commit. --- crates/table/src/page.rs | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/crates/table/src/page.rs b/crates/table/src/page.rs index c1084f8fcc9..082c582a31e 100644 --- a/crates/table/src/page.rs +++ b/crates/table/src/page.rs @@ -248,6 +248,12 @@ struct VarHeader { /// pre-decrement this index. // TODO(perf,future-work): determine how to "lower" the high water mark when freeing the "top"-most granule. first: PageOffset, + + /// The number of granules currently used by rows within this page. + /// + /// [`Page::bytes_used_by_rows`] needs this information. + /// Stored here because otherwise counting it would require traversing all the present rows. + num_granules: u16, } impl MemoryUsage for VarHeader { @@ -256,12 +262,13 @@ impl MemoryUsage for VarHeader { next_free, freelist_len, first, + num_granules, } = self; - next_free.heap_usage() + freelist_len.heap_usage() + first.heap_usage() + next_free.heap_usage() + freelist_len.heap_usage() + first.heap_usage() + num_granules.heap_usage() } } -static_assert_size!(VarHeader, 6); +static_assert_size!(VarHeader, 8); impl Default for VarHeader { fn default() -> Self { @@ -269,6 +276,7 @@ impl Default for VarHeader { next_free: FreeCellRef::NIL, freelist_len: 0, first: PageOffset::PAGE_END, + num_granules: 0, } } } @@ -771,6 +779,8 @@ impl<'page> VarView<'page> { granule, ); + self.header.num_granules += 1; + Ok(granule) } @@ -812,6 +822,7 @@ impl<'page> VarView<'page> { // but we want to return a whole "run" of sequential freed chunks, // which requries some bookkeeping (or an O(> n) linked list traversal). 
self.header.freelist_len += 1; + self.header.num_granules -= 1; let adjuster = self.adjuster(); // SAFETY: Per caller contract, `offset` is a valid `VarLenGranule`, @@ -1112,10 +1123,19 @@ impl Page { } /// Returns the number of rows stored in this page. + /// + /// This method runs in constant time. pub fn num_rows(&self) -> usize { self.header.fixed.num_rows as usize } + /// Returns the number of var-len granules allocated in this page. + /// + /// This method runs in constant time. + pub fn num_var_len_granules(&self) -> usize { + self.header.var.num_granules as usize + } + /// Returns the range of row data starting at `offset` and lasting `size` bytes. pub fn get_row_data(&self, row: PageOffset, size: Size) -> &Bytes { &self.row_data[row.range(size)] From ef8d40e28c28a038202895fa59e76caf97909f6a Mon Sep 17 00:00:00 2001 From: Phoebe Goldman Date: Wed, 8 Jan 2025 12:05:04 -0500 Subject: [PATCH 02/11] `Table::num_rows` and `Table::bytes_used_by_rows` We intend to bill based on these predictable metrics, rather than the somewhat-unpredictable actual heap memory usage of the system. As such, we need a way to compute them (duh). This commit adds `Table` methods for computing the number of resident rows, and the number of bytes stored by those rows. --- crates/table/src/page.rs | 19 +++++++++++++++++++ crates/table/src/table.rs | 29 +++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/crates/table/src/page.rs b/crates/table/src/page.rs index 082c582a31e..17dd24c97b5 100644 --- a/crates/table/src/page.rs +++ b/crates/table/src/page.rs @@ -1136,6 +1136,25 @@ impl Page { self.header.var.num_granules as usize } + /// Returns the number of bytes used by rows stored in this page. + /// + /// This is necessarily an overestimate of live data bytes, as it includes: + /// - Padding bytes within the fixed-length portion of the rows. + /// - [`VarLenRef`] pointer-like portions of rows. + /// - Unused trailing parts of partially-filled [`VarLenGranule`]s. 
+ /// - [`VarLenGranule`]s used to store [`BlobHash`]es. + /// + /// Note that large blobs themselves are not counted. + /// The caller should obtain a count of the bytes used by large blobs + /// from the [`super::blob_store::BlobStore`]. + /// + /// This method runs in constant time. + pub fn bytes_used_by_rows(&self, fixed_row_size: Size) -> usize { + let fixed_row_bytes = self.num_rows() * fixed_row_size.len(); + let var_len_bytes = self.num_var_len_granules() * VarLenGranule::SIZE.len(); + fixed_row_bytes + var_len_bytes + } + /// Returns the range of row data starting at `offset` and lasting `size` bytes. pub fn get_row_data(&self, row: PageOffset, size: Size) -> &Bytes { &self.row_data[row.range(size)] diff --git a/crates/table/src/table.rs b/crates/table/src/table.rs index 1ba60f3c8a4..f8e42dc8466 100644 --- a/crates/table/src/table.rs +++ b/crates/table/src/table.rs @@ -915,6 +915,35 @@ impl Table { self.compute_row_count(blob_store); self.rebuild_pointer_map(blob_store); } + + /// Returns the number of rows resident in this table. + /// + /// This scales in runtime with the number of pages in the table. + pub fn num_rows(&self) -> u64 { + self.pages().iter().map(|page| page.num_rows() as u64).sum() + } + + /// Returns the number of bytes used by rows resident in this table. + /// + /// This includes data bytes, padding bytes and some overhead bytes, + /// as described in the docs for [`Page::bytes_used_by_rows`], + /// but *does not* include: + /// + /// - Unallocated space within pages. + /// - Per-page overhead (e.g. page headers). + /// - Table overhead (e.g. the [`RowTypeLayout`], [`PointerMap`], [`Schema`] &c). + /// - Indices. + // TODO(energy): count memory usage by indices. + /// - Large blobs in the [`BlobStore`]. + /// + /// Of these, the caller should inspect the blob store in order to account for memory usage by large blobs, + /// but we intend to eat all the other overheads when billing. 
+ pub fn bytes_used_by_rows(&self) -> u64 { + self.pages() + .iter() + .map(|page| page.bytes_used_by_rows(self.inner.row_layout.size()) as u64) + .sum() + } } /// A reference to a single row within a table. From 66a4270c3c559612e7de2113990c2cf159ee918d Mon Sep 17 00:00:00 2001 From: Phoebe Goldman Date: Tue, 14 Jan 2025 12:40:34 -0500 Subject: [PATCH 03/11] Operator to compute index data size Per out-of-band discussion, I am not sure this computation will actually be useful to us, but it is the thing I can compute at this time. See comment on `BTreeIndex::num_key_bytes` in btree_index.rs for the specific counting implemented here. --- crates/table/src/btree_index.rs | 280 ++++++++++++++++++++++++++++---- 1 file changed, 249 insertions(+), 31 deletions(-) diff --git a/crates/table/src/btree_index.rs b/crates/table/src/btree_index.rs index 6635370aa61..9df558358ae 100644 --- a/crates/table/src/btree_index.rs +++ b/crates/table/src/btree_index.rs @@ -25,9 +25,11 @@ use super::indexes::RowPointer; use super::table::RowRef; use crate::{read_column::ReadColumn, static_assert_size, MemoryUsage}; use core::ops::RangeBounds; +use spacetimedb_lib::ProductValue; use spacetimedb_primitives::{ColList, IndexId}; use spacetimedb_sats::{ - algebraic_value::Packed, i256, product_value::InvalidFieldError, u256, AlgebraicType, AlgebraicValue, ProductType, + algebraic_value::Packed, i256, product_value::InvalidFieldError, u256, AlgebraicType, AlgebraicValue, ArrayValue, + ProductType, SumValue, }; mod multimap; @@ -321,29 +323,44 @@ impl TypedIndex { /// or may insert a nonsense value into the index. /// Note, however, that it will not invoke undefined behavior. /// - /// Returns `Ok(Some(existing_row))` if this index was a unique index that was violated. - /// The index is not inserted to in that case. - fn insert(&mut self, cols: &ColList, row_ref: RowRef<'_>) -> Result, InvalidFieldError> { - fn mm_insert_at_type( + /// The returned `usize` is the number of bytes used by the key. 
+ /// [`BTreeIndex::check_and_insert`] will use this + /// to update the counter for [`BTreeIndex::num_key_bytes`]. + /// We want to store said counter outside of the [`TypedIndex`] enum, + /// but we can only compute the size using type info within the [`TypedIndex`], + /// so we have to return the size across this boundary. + /// + /// Returns `Ok((Some(existing_row), key_size))` if this index was a unique index that was violated. + /// The new entry is not inserted to in that case. + /// + /// Returns `Ok((None, key_size))` if the new entry was successfully inserted into the index. + fn insert( + &mut self, + cols: &ColList, + row_ref: RowRef<'_>, + ) -> Result<(Option, usize), InvalidFieldError> { + fn mm_insert_at_type( this: &mut Index, cols: &ColList, row_ref: RowRef<'_>, - ) -> Result, InvalidFieldError> { + ) -> Result<(Option, usize), InvalidFieldError> { let col_pos = cols.as_singleton().unwrap(); - let key = row_ref.read_col(col_pos).map_err(|_| col_pos)?; + let key: T = row_ref.read_col(col_pos).map_err(|_| col_pos)?; + let key_size = key.key_size_in_bytes(); this.insert(key, row_ref.pointer()); - Ok(None) + Ok((None, key_size)) } - fn um_insert_at_type( + fn um_insert_at_type( this: &mut UniqueIndex, cols: &ColList, row_ref: RowRef<'_>, - ) -> Result, InvalidFieldError> { + ) -> Result<(Option, usize), InvalidFieldError> { let col_pos = cols.as_singleton().unwrap(); - let key = row_ref.read_col(col_pos).map_err(|_| col_pos)?; - Ok(this.insert(key, row_ref.pointer()).copied()) + let key: T = row_ref.read_col(col_pos).map_err(|_| col_pos)?; + let key_size = key.key_size_in_bytes(); + Ok((this.insert(key, row_ref.pointer()).copied(), key_size)) } - let unique_violation = match self { + let (unique_violation, key_size) = match self { Self::Bool(idx) => mm_insert_at_type(idx, cols, row_ref), Self::U8(idx) => mm_insert_at_type(idx, cols, row_ref), Self::I8(idx) => mm_insert_at_type(idx, cols, row_ref), @@ -360,8 +377,9 @@ impl TypedIndex { Self::String(idx) 
=> mm_insert_at_type(idx, cols, row_ref), Self::AV(this) => { let key = row_ref.project(cols)?; + let key_size = key.key_size_in_bytes(); this.insert(key, row_ref.pointer()); - Ok(None) + Ok((None, key_size)) } Self::UniqueBool(idx) => um_insert_at_type(idx, cols, row_ref), Self::UniqueU8(idx) => um_insert_at_type(idx, cols, row_ref), @@ -379,10 +397,11 @@ impl TypedIndex { Self::UniqueString(idx) => um_insert_at_type(idx, cols, row_ref), Self::UniqueAV(this) => { let key = row_ref.project(cols)?; - Ok(this.insert(key, row_ref.pointer()).copied()) + let key_size = key.key_size_in_bytes(); + Ok((this.insert(key, row_ref.pointer()).copied(), key_size)) } }?; - Ok(unique_violation) + Ok((unique_violation, key_size)) } /// Remove the row referred to by `row_ref` from the index `self`, @@ -393,24 +412,34 @@ impl TypedIndex { /// this will behave oddly; it may return an error, do nothing, /// or remove the wrong value from the index. /// Note, however, that it will not invoke undefined behavior. - fn delete(&mut self, cols: &ColList, row_ref: RowRef<'_>) -> Result { - fn mm_delete_at_type( + /// + /// If the row was present and has been deleted, returns `Ok(Some(key_size_in_bytes))`, + /// where `key_size_in_bytes` is the size of the key. + /// [`BTreeIndex::delete`] will use this + /// to update the counter for [`BTreeIndex::num_key_bytes`]. + /// We want to store said counter outside of the [`TypedIndex`] enum, + /// but we can only compute the size using type info within the [`TypedIndex`], + /// so we have to return the size across this boundary. 
+ fn delete(&mut self, cols: &ColList, row_ref: RowRef<'_>) -> Result, InvalidFieldError> { + fn mm_delete_at_type( this: &mut Index, cols: &ColList, row_ref: RowRef<'_>, - ) -> Result { + ) -> Result, InvalidFieldError> { let col_pos = cols.as_singleton().unwrap(); - let key = row_ref.read_col(col_pos).map_err(|_| col_pos)?; - Ok(this.delete(&key, &row_ref.pointer())) + let key: T = row_ref.read_col(col_pos).map_err(|_| col_pos)?; + let key_size = key.key_size_in_bytes(); + Ok(this.delete(&key, &row_ref.pointer()).then_some(key_size)) } - fn um_delete_at_type( + fn um_delete_at_type( this: &mut UniqueIndex, cols: &ColList, row_ref: RowRef<'_>, - ) -> Result { + ) -> Result, InvalidFieldError> { let col_pos = cols.as_singleton().unwrap(); - let key = row_ref.read_col(col_pos).map_err(|_| col_pos)?; - Ok(this.delete(&key)) + let key: T = row_ref.read_col(col_pos).map_err(|_| col_pos)?; + let key_size = key.key_size_in_bytes(); + Ok(this.delete(&key).then_some(key_size)) } match self { @@ -430,7 +459,8 @@ impl TypedIndex { Self::String(this) => mm_delete_at_type(this, cols, row_ref), Self::AV(this) => { let key = row_ref.project(cols)?; - Ok(this.delete(&key, &row_ref.pointer())) + let key_size = key.key_size_in_bytes(); + Ok(this.delete(&key, &row_ref.pointer()).then_some(key_size)) } Self::UniqueBool(this) => um_delete_at_type(this, cols, row_ref), Self::UniqueU8(this) => um_delete_at_type(this, cols, row_ref), @@ -448,7 +478,8 @@ impl TypedIndex { Self::UniqueString(this) => um_delete_at_type(this, cols, row_ref), Self::UniqueAV(this) => { let key = row_ref.project(cols)?; - Ok(this.delete(&key)) + let key_size = key.key_size_in_bytes(); + Ok(this.delete(&key).then_some(key_size)) } } } @@ -626,6 +657,124 @@ impl TypedIndex { } } +trait KeySize { + fn key_size_in_bytes(&self) -> usize; +} + +macro_rules! 
impl_key_size_primitive { + ($prim:ty) => { + impl KeySize for $prim { + fn key_size_in_bytes(&self) -> usize { std::mem::size_of::() } + } + }; + ($($prim:ty,)*) => { + $(impl_key_size_primitive!($prim);)* + }; +} + +impl_key_size_primitive!( + bool, + u8, + i8, + u16, + i16, + u32, + i32, + u64, + i64, + u128, + i128, + spacetimedb_sats::algebraic_value::Packed, + spacetimedb_sats::algebraic_value::Packed, + u256, + i256, + spacetimedb_sats::F32, + spacetimedb_sats::F64, +); + +impl KeySize for Box { + fn key_size_in_bytes(&self) -> usize { + self.len() + std::mem::size_of::() + } +} + +impl KeySize for AlgebraicValue { + fn key_size_in_bytes(&self) -> usize { + match self { + AlgebraicValue::Bool(x) => x.key_size_in_bytes(), + AlgebraicValue::U8(x) => x.key_size_in_bytes(), + AlgebraicValue::I8(x) => x.key_size_in_bytes(), + AlgebraicValue::U16(x) => x.key_size_in_bytes(), + AlgebraicValue::I16(x) => x.key_size_in_bytes(), + AlgebraicValue::U32(x) => x.key_size_in_bytes(), + AlgebraicValue::I32(x) => x.key_size_in_bytes(), + AlgebraicValue::U64(x) => x.key_size_in_bytes(), + AlgebraicValue::I64(x) => x.key_size_in_bytes(), + AlgebraicValue::U128(x) => x.key_size_in_bytes(), + AlgebraicValue::I128(x) => x.key_size_in_bytes(), + AlgebraicValue::U256(x) => x.key_size_in_bytes(), + AlgebraicValue::I256(x) => x.key_size_in_bytes(), + AlgebraicValue::F32(x) => x.key_size_in_bytes(), + AlgebraicValue::F64(x) => x.key_size_in_bytes(), + AlgebraicValue::String(x) => x.key_size_in_bytes(), + AlgebraicValue::Sum(x) => x.key_size_in_bytes(), + AlgebraicValue::Product(x) => x.key_size_in_bytes(), + AlgebraicValue::Array(x) => x.key_size_in_bytes(), + + AlgebraicValue::Min | AlgebraicValue::Max => unreachable!(), + } + } +} + +impl KeySize for SumValue { + fn key_size_in_bytes(&self) -> usize { + 1 + self.value.key_size_in_bytes() + } +} + +impl KeySize for ProductValue { + fn key_size_in_bytes(&self) -> usize { + self.elements.key_size_in_bytes() + } +} + +impl KeySize for 
[K] +where + K: KeySize, +{ + // TODO(perf, bikeshedding): check that this optimized to `size_of::() * self.len()` + // when `K` is a primitive. + fn key_size_in_bytes(&self) -> usize { + self.iter().map(|elt| elt.key_size_in_bytes()).sum() + } +} + +impl KeySize for ArrayValue { + fn key_size_in_bytes(&self) -> usize { + match self { + ArrayValue::Sum(elts) => elts.key_size_in_bytes(), + ArrayValue::Product(elts) => elts.key_size_in_bytes(), + ArrayValue::Bool(elts) => elts.key_size_in_bytes(), + ArrayValue::I8(elts) => elts.key_size_in_bytes(), + ArrayValue::U8(elts) => elts.key_size_in_bytes(), + ArrayValue::I16(elts) => elts.key_size_in_bytes(), + ArrayValue::U16(elts) => elts.key_size_in_bytes(), + ArrayValue::I32(elts) => elts.key_size_in_bytes(), + ArrayValue::U32(elts) => elts.key_size_in_bytes(), + ArrayValue::I64(elts) => elts.key_size_in_bytes(), + ArrayValue::U64(elts) => elts.key_size_in_bytes(), + ArrayValue::I128(elts) => elts.key_size_in_bytes(), + ArrayValue::U128(elts) => elts.key_size_in_bytes(), + ArrayValue::I256(elts) => elts.key_size_in_bytes(), + ArrayValue::U256(elts) => elts.key_size_in_bytes(), + ArrayValue::F32(elts) => elts.key_size_in_bytes(), + ArrayValue::F64(elts) => elts.key_size_in_bytes(), + ArrayValue::String(elts) => elts.key_size_in_bytes(), + ArrayValue::Array(elts) => elts.key_size_in_bytes(), + } + } +} + /// A B-Tree based index on a set of [`ColId`]s of a table. #[derive(Debug, PartialEq, Eq)] pub struct BTreeIndex { @@ -635,7 +784,19 @@ pub struct BTreeIndex { idx: TypedIndex, /// The key type of this index. /// This is the projection of the row type to the types of the columns indexed. + // TODO(perf, bikeshedding): Could trim `sizeof(BTreeIndex)` to 64 if this was `Box`. pub key_type: AlgebraicType, + + /// The number of rows in this index. + /// + /// Memoized counter for [`Self::num_rows`]. + num_rows: u64, + + /// The number of key bytes in this index. + /// + /// Memoized counter for [`Self::num_key_bytes`]. 
+ /// See that method for more detailed documentation. + num_key_bytes: u64, } impl MemoryUsage for BTreeIndex { @@ -644,12 +805,18 @@ impl MemoryUsage for BTreeIndex { index_id, idx, key_type, + num_rows, + num_key_bytes, } = self; - index_id.heap_usage() + idx.heap_usage() + key_type.heap_usage() + index_id.heap_usage() + + idx.heap_usage() + + key_type.heap_usage() + + num_rows.heap_usage() + + num_key_bytes.heap_usage() } } -static_assert_size!(BTreeIndex, 64); +static_assert_size!(BTreeIndex, 80); impl BTreeIndex { /// Returns a new possibly unique index, with `index_id` for a set of columns. @@ -665,6 +832,8 @@ impl BTreeIndex { index_id, idx: typed_index, key_type, + num_rows: 0, + num_key_bytes: 0, }) } @@ -678,6 +847,8 @@ impl BTreeIndex { index_id, idx, key_type, + num_rows: 0, + num_key_bytes: 0, } } @@ -695,14 +866,30 @@ impl BTreeIndex { cols: &ColList, row_ref: RowRef<'_>, ) -> Result, InvalidFieldError> { - self.idx.insert(cols, row_ref) + let (res, size_in_bytes) = self.idx.insert(cols, row_ref)?; + if res.is_none() { + // No existing row; the new row was inserted. + // Update the `num_rows` and `num_key_bytes` counters + // to account for the new insertion. + self.num_rows += 1; + self.num_key_bytes += size_in_bytes as u64; + } + Ok(res) } /// Deletes `ptr` with its indexed value `col_value` from this index. /// /// Returns whether `ptr` was present. pub fn delete(&mut self, cols: &ColList, row_ref: RowRef<'_>) -> Result { - self.idx.delete(cols, row_ref) + if let Some(size_in_bytes) = self.idx.delete(cols, row_ref)? { + // Was present, and deleted: update the `num_rows` and `num_key_bytes` counters. + self.num_rows -= 1; + self.num_key_bytes -= size_in_bytes as u64; + Ok(true) + } else { + // Was not present: don't update counters. + Ok(false) + } } /// Returns whether `value` is in this index. @@ -741,12 +928,43 @@ impl BTreeIndex { /// rather than constructing a new `BTreeIndex`. 
pub fn clear(&mut self) { self.idx.clear(); + self.num_key_bytes = 0; + self.num_rows = 0; } /// The number of unique keys in this index. pub fn num_keys(&self) -> usize { self.idx.num_keys() } + + /// The number of rows stored in this index. + /// + /// Note that, for non-unique indexes, this may be larger than [`Self::num_keys`]. + /// + /// This method runs in constant time. + pub fn num_rows(&self) -> u64 { + self.num_rows + } + + /// The number of bytes stored in keys in this index. + /// + /// For non-unique indexes, duplicate keys are counted once for each row that refers to them, + /// even though the internal storage may deduplicate them as an optimization. + /// + /// This method runs in constant time. + /// + /// The key bytes of a value are defined depending on that value's type: + /// - Integer, float and boolean values take key bytes according to their [`std::mem::size_of`]. + /// - Strings take key bytes equal to their length in bytes. + /// No overhead is counted, unlike in the BFLATN or BSATN size. + /// - Sum values take 1 key byte for the tag, plus the key bytes of their active payload. + /// Inactive variants and padding are not counted, unlike in the BFLATN size. + /// - Product values take key bytes equal to the sum of their elements' key bytes. + /// Padding is not counted, unlike in the BFLATN size. + /// - Array values take key bytes equal to the sum of their elements' key bytes. 
+ pub fn num_key_bytes(&self) -> u64 { + self.num_key_bytes + } } #[cfg(test)] From d49dd8084232224bbb31fcfcab4091427b0bd42f Mon Sep 17 00:00:00 2001 From: Phoebe Goldman Date: Wed, 15 Jan 2025 09:34:11 -0500 Subject: [PATCH 04/11] Move `KeySize` to its own file; export and document it --- crates/table/src/btree_index.rs | 135 +-------------------- crates/table/src/btree_index/key_size.rs | 143 +++++++++++++++++++++++ 2 files changed, 148 insertions(+), 130 deletions(-) create mode 100644 crates/table/src/btree_index/key_size.rs diff --git a/crates/table/src/btree_index.rs b/crates/table/src/btree_index.rs index 9df558358ae..76ad37de312 100644 --- a/crates/table/src/btree_index.rs +++ b/crates/table/src/btree_index.rs @@ -25,16 +25,17 @@ use super::indexes::RowPointer; use super::table::RowRef; use crate::{read_column::ReadColumn, static_assert_size, MemoryUsage}; use core::ops::RangeBounds; -use spacetimedb_lib::ProductValue; use spacetimedb_primitives::{ColList, IndexId}; use spacetimedb_sats::{ - algebraic_value::Packed, i256, product_value::InvalidFieldError, u256, AlgebraicType, AlgebraicValue, ArrayValue, - ProductType, SumValue, + algebraic_value::Packed, i256, product_value::InvalidFieldError, u256, AlgebraicType, AlgebraicValue, ProductType, }; +mod key_size; mod multimap; mod uniquemap; +pub use key_size::KeySize; + type Index = multimap::MultiMap; type IndexIter<'a, K> = multimap::MultiMapRangeIter<'a, K, RowPointer>; type UniqueIndex = uniquemap::UniqueMap; @@ -657,124 +658,6 @@ impl TypedIndex { } } -trait KeySize { - fn key_size_in_bytes(&self) -> usize; -} - -macro_rules! 
impl_key_size_primitive { - ($prim:ty) => { - impl KeySize for $prim { - fn key_size_in_bytes(&self) -> usize { std::mem::size_of::() } - } - }; - ($($prim:ty,)*) => { - $(impl_key_size_primitive!($prim);)* - }; -} - -impl_key_size_primitive!( - bool, - u8, - i8, - u16, - i16, - u32, - i32, - u64, - i64, - u128, - i128, - spacetimedb_sats::algebraic_value::Packed, - spacetimedb_sats::algebraic_value::Packed, - u256, - i256, - spacetimedb_sats::F32, - spacetimedb_sats::F64, -); - -impl KeySize for Box { - fn key_size_in_bytes(&self) -> usize { - self.len() + std::mem::size_of::() - } -} - -impl KeySize for AlgebraicValue { - fn key_size_in_bytes(&self) -> usize { - match self { - AlgebraicValue::Bool(x) => x.key_size_in_bytes(), - AlgebraicValue::U8(x) => x.key_size_in_bytes(), - AlgebraicValue::I8(x) => x.key_size_in_bytes(), - AlgebraicValue::U16(x) => x.key_size_in_bytes(), - AlgebraicValue::I16(x) => x.key_size_in_bytes(), - AlgebraicValue::U32(x) => x.key_size_in_bytes(), - AlgebraicValue::I32(x) => x.key_size_in_bytes(), - AlgebraicValue::U64(x) => x.key_size_in_bytes(), - AlgebraicValue::I64(x) => x.key_size_in_bytes(), - AlgebraicValue::U128(x) => x.key_size_in_bytes(), - AlgebraicValue::I128(x) => x.key_size_in_bytes(), - AlgebraicValue::U256(x) => x.key_size_in_bytes(), - AlgebraicValue::I256(x) => x.key_size_in_bytes(), - AlgebraicValue::F32(x) => x.key_size_in_bytes(), - AlgebraicValue::F64(x) => x.key_size_in_bytes(), - AlgebraicValue::String(x) => x.key_size_in_bytes(), - AlgebraicValue::Sum(x) => x.key_size_in_bytes(), - AlgebraicValue::Product(x) => x.key_size_in_bytes(), - AlgebraicValue::Array(x) => x.key_size_in_bytes(), - - AlgebraicValue::Min | AlgebraicValue::Max => unreachable!(), - } - } -} - -impl KeySize for SumValue { - fn key_size_in_bytes(&self) -> usize { - 1 + self.value.key_size_in_bytes() - } -} - -impl KeySize for ProductValue { - fn key_size_in_bytes(&self) -> usize { - self.elements.key_size_in_bytes() - } -} - -impl KeySize for 
[K] -where - K: KeySize, -{ - // TODO(perf, bikeshedding): check that this optimized to `size_of::() * self.len()` - // when `K` is a primitive. - fn key_size_in_bytes(&self) -> usize { - self.iter().map(|elt| elt.key_size_in_bytes()).sum() - } -} - -impl KeySize for ArrayValue { - fn key_size_in_bytes(&self) -> usize { - match self { - ArrayValue::Sum(elts) => elts.key_size_in_bytes(), - ArrayValue::Product(elts) => elts.key_size_in_bytes(), - ArrayValue::Bool(elts) => elts.key_size_in_bytes(), - ArrayValue::I8(elts) => elts.key_size_in_bytes(), - ArrayValue::U8(elts) => elts.key_size_in_bytes(), - ArrayValue::I16(elts) => elts.key_size_in_bytes(), - ArrayValue::U16(elts) => elts.key_size_in_bytes(), - ArrayValue::I32(elts) => elts.key_size_in_bytes(), - ArrayValue::U32(elts) => elts.key_size_in_bytes(), - ArrayValue::I64(elts) => elts.key_size_in_bytes(), - ArrayValue::U64(elts) => elts.key_size_in_bytes(), - ArrayValue::I128(elts) => elts.key_size_in_bytes(), - ArrayValue::U128(elts) => elts.key_size_in_bytes(), - ArrayValue::I256(elts) => elts.key_size_in_bytes(), - ArrayValue::U256(elts) => elts.key_size_in_bytes(), - ArrayValue::F32(elts) => elts.key_size_in_bytes(), - ArrayValue::F64(elts) => elts.key_size_in_bytes(), - ArrayValue::String(elts) => elts.key_size_in_bytes(), - ArrayValue::Array(elts) => elts.key_size_in_bytes(), - } - } -} - /// A B-Tree based index on a set of [`ColId`]s of a table. #[derive(Debug, PartialEq, Eq)] pub struct BTreeIndex { @@ -953,15 +836,7 @@ impl BTreeIndex { /// /// This method runs in constant time. /// - /// The key bytes of a value are defined depending on that value's type: - /// - Integer, float and boolean values take key bytes according to their [`std::mem::size_of`]. - /// - Strings take key bytes equal to their length in bytes. - /// No overhead is counted, unlike in the BFLATN or BSATN size. - /// - Sum values take 1 key byte for the tag, plus the key bytes of their active payload. 
- /// Inactive variants and padding are not counted, unlike in the BFLATN size. - /// - Product values take key bytes equal to the sum of their elements' key bytes. - /// Padding is not counted, unlike in the BFLATN size. - /// - Array values take key bytes equal to the sum of their elements' key bytes. + /// See the [`KeySize`] trait for more details on how this method computes its result. pub fn num_key_bytes(&self) -> u64 { self.num_key_bytes } diff --git a/crates/table/src/btree_index/key_size.rs b/crates/table/src/btree_index/key_size.rs new file mode 100644 index 00000000000..2128341f8e6 --- /dev/null +++ b/crates/table/src/btree_index/key_size.rs @@ -0,0 +1,143 @@ +use spacetimedb_sats::{ + algebraic_value::Packed, i256, u256, AlgebraicValue, ArrayValue, ProductValue, SumValue, F32, F64, +}; + +/// Index keys whose memory usage we can measure and report. +/// +/// The reported memory usage of an index is based on: +/// +/// - the number of entries in that index, i.e. the number of `RowPointer`s it stores, +/// - the total size of the keys for every entry in that index. +/// +/// This trait is used to measure the latter. +/// The metric we measure, sometimes called "data size," +/// is the number of live user-supplied bytes in the key. +/// This excludes padding and lengths, though it does include sum tags. +/// +/// The key size of a value is defined depending on that value's type: +/// - Integer, float and boolean values take bytes according to their [`std::mem::size_of`]. +/// - Strings take bytes equal to their length in bytes. +/// No overhead is counted, unlike in the BFLATN or BSATN size. +/// - Sum values take 1 byte for the tag, plus the bytes of their active payload. +/// Inactive variants and padding are not counted, unlike in the BFLATN size. +/// - Product values take bytes equal to the sum of their elements' bytes. +/// Padding is not counted, unlike in the BFLATN size. +/// - Array values take bytes equal to the sum of their elements' bytes. 
+/// As with strings, no overhead is counted. +pub trait KeySize { + fn key_size_in_bytes(&self) -> usize; +} + +macro_rules! impl_key_size_primitive { + ($prim:ty) => { + impl KeySize for $prim { + fn key_size_in_bytes(&self) -> usize { std::mem::size_of::() } + } + }; + ($($prim:ty,)*) => { + $(impl_key_size_primitive!($prim);)* + }; +} + +impl_key_size_primitive!( + bool, + u8, + i8, + u16, + i16, + u32, + i32, + u64, + i64, + u128, + i128, + Packed, + Packed, + u256, + i256, + F32, + F64, +); + +impl KeySize for Box { + fn key_size_in_bytes(&self) -> usize { + self.len() + } +} + +impl KeySize for AlgebraicValue { + fn key_size_in_bytes(&self) -> usize { + match self { + AlgebraicValue::Bool(x) => x.key_size_in_bytes(), + AlgebraicValue::U8(x) => x.key_size_in_bytes(), + AlgebraicValue::I8(x) => x.key_size_in_bytes(), + AlgebraicValue::U16(x) => x.key_size_in_bytes(), + AlgebraicValue::I16(x) => x.key_size_in_bytes(), + AlgebraicValue::U32(x) => x.key_size_in_bytes(), + AlgebraicValue::I32(x) => x.key_size_in_bytes(), + AlgebraicValue::U64(x) => x.key_size_in_bytes(), + AlgebraicValue::I64(x) => x.key_size_in_bytes(), + AlgebraicValue::U128(x) => x.key_size_in_bytes(), + AlgebraicValue::I128(x) => x.key_size_in_bytes(), + AlgebraicValue::U256(x) => x.key_size_in_bytes(), + AlgebraicValue::I256(x) => x.key_size_in_bytes(), + AlgebraicValue::F32(x) => x.key_size_in_bytes(), + AlgebraicValue::F64(x) => x.key_size_in_bytes(), + AlgebraicValue::String(x) => x.key_size_in_bytes(), + AlgebraicValue::Sum(x) => x.key_size_in_bytes(), + AlgebraicValue::Product(x) => x.key_size_in_bytes(), + AlgebraicValue::Array(x) => x.key_size_in_bytes(), + + AlgebraicValue::Min | AlgebraicValue::Max => unreachable!(), + } + } +} + +impl KeySize for SumValue { + fn key_size_in_bytes(&self) -> usize { + 1 + self.value.key_size_in_bytes() + } +} + +impl KeySize for ProductValue { + fn key_size_in_bytes(&self) -> usize { + self.elements.key_size_in_bytes() + } +} + +impl KeySize for [K] 
+where + K: KeySize, +{ + // TODO(perf, bikeshedding): check that this optimized to `size_of::() * self.len()` + // when `K` is a primitive. + fn key_size_in_bytes(&self) -> usize { + self.iter().map(|elt| elt.key_size_in_bytes()).sum() + } +} + +impl KeySize for ArrayValue { + fn key_size_in_bytes(&self) -> usize { + match self { + ArrayValue::Sum(elts) => elts.key_size_in_bytes(), + ArrayValue::Product(elts) => elts.key_size_in_bytes(), + ArrayValue::Bool(elts) => elts.key_size_in_bytes(), + ArrayValue::I8(elts) => elts.key_size_in_bytes(), + ArrayValue::U8(elts) => elts.key_size_in_bytes(), + ArrayValue::I16(elts) => elts.key_size_in_bytes(), + ArrayValue::U16(elts) => elts.key_size_in_bytes(), + ArrayValue::I32(elts) => elts.key_size_in_bytes(), + ArrayValue::U32(elts) => elts.key_size_in_bytes(), + ArrayValue::I64(elts) => elts.key_size_in_bytes(), + ArrayValue::U64(elts) => elts.key_size_in_bytes(), + ArrayValue::I128(elts) => elts.key_size_in_bytes(), + ArrayValue::U128(elts) => elts.key_size_in_bytes(), + ArrayValue::I256(elts) => elts.key_size_in_bytes(), + ArrayValue::U256(elts) => elts.key_size_in_bytes(), + ArrayValue::F32(elts) => elts.key_size_in_bytes(), + ArrayValue::F64(elts) => elts.key_size_in_bytes(), + ArrayValue::String(elts) => elts.key_size_in_bytes(), + ArrayValue::Array(elts) => elts.key_size_in_bytes(), + } + } +} From c7130ee6144a6ecdf5235181387dd1c63b7ed023 Mon Sep 17 00:00:00 2001 From: Phoebe Goldman Date: Fri, 17 Jan 2025 09:35:40 -0500 Subject: [PATCH 05/11] Blob store usages; hook up index usages --- crates/table/src/blob_store.rs | 21 +++++++++++++++++++++ crates/table/src/table.rs | 27 +++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/crates/table/src/blob_store.rs b/crates/table/src/blob_store.rs index 3431b5b35cf..52078072ba6 100644 --- a/crates/table/src/blob_store.rs +++ b/crates/table/src/blob_store.rs @@ -104,6 +104,27 @@ pub trait BlobStore: Sync { /// /// Used when capturing a 
snapshot. fn iter_blobs(&self) -> BlobsIter<'_>; + + /// Returns the amount of memory in bytes used by blobs in this `BlobStore`. + /// + /// Duplicate blobs are counted a number of times equal to their refcount. + /// This is in order to preserve the property that inserting a large blob + /// causes this quantity to increase by that blob's size, + /// and deleting a large blob causes it to decrease the same amount. + fn bytes_used_by_blobs(&self) -> u64 { + self.iter_blobs() + .map(|(_, uses, data)| data.len() as u64 * uses as u64) + .sum() + } + + /// Returns the number of blobs, or more precisely, blob-usages, recorded in this `BlobStore`. + /// + /// Duplicate blobs are counted a number of times equal to their refcount. + /// This is in order to preserve the property that inserting a large blob + /// causes this quantity to increase by 1, and deleting a large blob causes it to decrease by 1. + fn num_blobs(&self) -> u64 { + self.iter_blobs().map(|(_, uses, _)| uses as u64).sum() + } } /// A blob store that panics on all operations. diff --git a/crates/table/src/table.rs b/crates/table/src/table.rs index f8e42dc8466..bfa11b29abb 100644 --- a/crates/table/src/table.rs +++ b/crates/table/src/table.rs @@ -932,11 +932,11 @@ impl Table { /// - Unallocated space within pages. /// - Per-page overhead (e.g. page headers). /// - Table overhead (e.g. the [`RowTypeLayout`], [`PointerMap`], [`Schema`] &c). - /// - Indices. - // TODO(energy): count memory usage by indices. + /// - Indexes. /// - Large blobs in the [`BlobStore`]. /// /// Of these, the caller should inspect the blob store in order to account for memory usage by large blobs, + /// and call [`Self::bytes_used_by_index_keys`] to account for indexes, /// but we intend to eat all the other overheads when billing. 
pub fn bytes_used_by_rows(&self) -> u64 { self.pages() @@ -944,6 +944,29 @@ impl Table { .map(|page| page.bytes_used_by_rows(self.inner.row_layout.size()) as u64) .sum() } + + /// Returns the number of rows (or [`RowPointer`]s, more accurately) + /// stored in indexes by this table. + /// + /// This method runs in constant time. + pub fn num_rows_in_indexes(&self) -> u64 { + // Assume that each index contains all rows in the table. + self.num_rows() * self.indexes.len() as u64 + } + + /// Returns the number of bytes used by keys stored in indexes by this table. + /// + /// This method scales in runtime with the number of indexes in the table, + /// but not with the number of pages or rows. + /// + /// Key size is measured using a metric called "key size" or "data size," + /// which is intended to capture the number of live user-supplied bytes, + /// not including representational overhead. + /// This is distinct from the BFLATN size measured by [`Self::bytes_used_by_rows`]. + /// See the trait [`crate::btree_index::KeySize`] for specifics on the metric measured. + pub fn bytes_used_by_index_keys(&self) -> u64 { + self.indexes.iter().map(|(_, idx)| idx.num_key_bytes()).sum() + } } /// A reference to a single row within a table. From c4e35cbf89741ceb39723b57abfb10f35e2fdcfe Mon Sep 17 00:00:00 2001 From: Phoebe Goldman Date: Fri, 17 Jan 2025 10:05:36 -0500 Subject: [PATCH 06/11] clippy --- crates/table/src/table.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/table/src/table.rs b/crates/table/src/table.rs index bfa11b29abb..4519b1259d1 100644 --- a/crates/table/src/table.rs +++ b/crates/table/src/table.rs @@ -965,7 +965,7 @@ impl Table { /// This is distinct from the BFLATN size measured by [`Self::bytes_used_by_rows`]. /// See the trait [`crate::btree_index::KeySize`] for specifics on the metric measured. 
pub fn bytes_used_by_index_keys(&self) -> u64 { - self.indexes.iter().map(|(_, idx)| idx.num_key_bytes()).sum() + self.indexes.values().map(|idx| idx.num_key_bytes()).sum() } } From 4eb083c825b06d786785ad907279250afecc7529 Mon Sep 17 00:00:00 2001 From: Phoebe Goldman Date: Fri, 17 Jan 2025 10:31:44 -0500 Subject: [PATCH 07/11] Add and report data size metrics for `CommittedState` --- .../locking_tx_datastore/committed_state.rs | 33 +++++++++++++++ .../locking_tx_datastore/datastore.rs | 4 ++ crates/core/src/db/db_metrics/data_size.rs | 42 +++++++++++++++++++ crates/core/src/db/db_metrics/mod.rs | 2 + 4 files changed, 81 insertions(+) create mode 100644 crates/core/src/db/db_metrics/data_size.rs diff --git a/crates/core/src/db/datastore/locking_tx_datastore/committed_state.rs b/crates/core/src/db/datastore/locking_tx_datastore/committed_state.rs index 765d158e06d..df6bf334447 100644 --- a/crates/core/src/db/datastore/locking_tx_datastore/committed_state.rs +++ b/crates/core/src/db/datastore/locking_tx_datastore/committed_state.rs @@ -644,6 +644,39 @@ impl CommittedState { let index = table.indexes.get(col_list)?; Some(&index.key_type) } + + pub(super) fn report_data_size(&self, database_identity: Identity) { + use crate::db::db_metrics::data_size::DATA_SIZE_METRICS; + + for (table_id, table) in &self.tables { + let table_name = &table.schema.table_name; + DATA_SIZE_METRICS + .data_size_table_num_rows + .with_label_values(&database_identity, &table_id.0, table_name) + .set(table.num_rows() as _); + DATA_SIZE_METRICS + .data_size_table_bytes_used_by_rows + .with_label_values(&database_identity, &table_id.0, table_name) + .set(table.bytes_used_by_rows() as _); + DATA_SIZE_METRICS + .data_size_table_num_rows_in_indexes + .with_label_values(&database_identity, &table_id.0, table_name) + .set(table.num_rows_in_indexes() as _); + DATA_SIZE_METRICS + .data_size_table_bytes_used_by_index_keys + .with_label_values(&database_identity, &table_id.0, table_name) + 
.set(table.bytes_used_by_index_keys() as _); + } + + DATA_SIZE_METRICS + .data_size_blob_store_num_blobs + .with_label_values(&database_identity) + .set(self.blob_store.num_blobs() as _); + DATA_SIZE_METRICS + .data_size_blob_store_bytes_used_by_blobs + .with_label_values(&database_identity) + .set(self.blob_store.bytes_used_by_blobs() as _); + } } pub struct CommittedIndexIterWithDeletedMutTx<'a> { diff --git a/crates/core/src/db/datastore/locking_tx_datastore/datastore.rs b/crates/core/src/db/datastore/locking_tx_datastore/datastore.rs index 901dee59ea7..94aabb171a4 100644 --- a/crates/core/src/db/datastore/locking_tx_datastore/datastore.rs +++ b/crates/core/src/db/datastore/locking_tx_datastore/datastore.rs @@ -681,6 +681,10 @@ pub(super) fn record_metrics( .inc_by(deletes.len() as u64); } } + + if let Some(committed_state) = committed_state { + committed_state.report_data_size(*db); + } } impl MutTx for Locking { diff --git a/crates/core/src/db/db_metrics/data_size.rs b/crates/core/src/db/db_metrics/data_size.rs new file mode 100644 index 00000000000..09430010a32 --- /dev/null +++ b/crates/core/src/db/db_metrics/data_size.rs @@ -0,0 +1,42 @@ +use once_cell::sync::Lazy; +use prometheus::IntGaugeVec; +use spacetimedb_lib::Identity; +use spacetimedb_metrics::metrics_group; + +metrics_group!( + #[non_exhaustive] + pub struct DbDataSize { + #[name = spacetime_data_size_table_num_rows] + #[help = "The number of rows in a table"] + #[labels(db: Identity, table_id: u32, table_name: str)] + pub data_size_table_num_rows: IntGaugeVec, + + #[name = spacetime_data_size_bytes_used_by_rows] + #[help = "The number of bytes used by rows in pages in a table"] + #[labels(db: Identity, table_id: u32, table_name: str)] + pub data_size_table_bytes_used_by_rows: IntGaugeVec, + + #[name = spacetime_data_size_table_num_rows_in_indexes] + #[help = "The number of rows stored in indexes in a table"] + // TODO: Consider partitioning by index ID or index name. 
+        #[labels(db: Identity, table_id: u32, table_name: str)]
+        pub data_size_table_num_rows_in_indexes: IntGaugeVec,
+
+        #[name = spacetime_data_size_table_bytes_used_by_index_keys]
+        #[help = "The number of bytes used by keys stored in indexes in a table"]
+        #[labels(db: Identity, table_id: u32, table_name: str)]
+        pub data_size_table_bytes_used_by_index_keys: IntGaugeVec,
+
+        #[name = spacetime_data_size_blob_store_num_blobs]
+        #[help = "The number of large blobs stored in a database's blob store"]
+        #[labels(db: Identity)]
+        pub data_size_blob_store_num_blobs: IntGaugeVec,
+
+        #[name = spacetime_data_size_blob_store_bytes_used_by_blobs]
+        #[help = "The number of bytes used by large blobs stored in a database's blob store"]
+        #[labels(db: Identity)]
+        pub data_size_blob_store_bytes_used_by_blobs: IntGaugeVec,
+    }
+);
+
+pub static DATA_SIZE_METRICS: Lazy<DbDataSize> = Lazy::new(DbDataSize::new);
diff --git a/crates/core/src/db/db_metrics/mod.rs b/crates/core/src/db/db_metrics/mod.rs
index 8dd965fa200..98d1717e924 100644
--- a/crates/core/src/db/db_metrics/mod.rs
+++ b/crates/core/src/db/db_metrics/mod.rs
@@ -5,6 +5,8 @@ use spacetimedb_lib::Identity;
 use spacetimedb_metrics::metrics_group;
 use spacetimedb_primitives::TableId;
 
+pub mod data_size;
+
 metrics_group!(
     #[non_exhaustive]
     pub struct DbMetrics {

From f938df21efa4cb915c563403e5767338efd0fdb3 Mon Sep 17 00:00:00 2001
From: Phoebe Goldman
Date: Mon, 27 Jan 2025 11:49:00 -0500
Subject: [PATCH 08/11] First pass at testing

Slow reconstructions of `num_rows` and `bytes_used_by_rows`.

Still to follow: index usage reporting.
---
 crates/sats/src/proptest.rs | 12 ++++++++
 crates/table/src/page.rs    | 56 +++++++++++++++++++++++++++++++++++++
 crates/table/src/table.rs   | 38 ++++++++++++++++++++++++-
 3 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/crates/sats/src/proptest.rs b/crates/sats/src/proptest.rs
index 2b0c883b92b..11f09aca2bc 100644
--- a/crates/sats/src/proptest.rs
+++ b/crates/sats/src/proptest.rs
@@ -207,6 +207,18 @@ pub fn generate_typed_row() -> impl Strategy<Value = (ProductType, ProductValue)
+/// Generates a row type `ty` and a `Vec` of product values of that type.
+pub fn generate_typed_row_vec(
+    num_rows_min: usize,
+    num_rows_max: usize,
+) -> impl Strategy<Value = (ProductType, Vec<ProductValue>)> {
+    generate_row_type(0..=SIZE).prop_flat_map(move |ty| {
+        (
+            Just(ty.clone()),
+            vec(generate_product_value(ty), num_rows_min..num_rows_max),
+        )
+    })
+}
+
 /// Generates a type `ty` and a value typed at `ty`.
 pub fn generate_typed_value() -> impl Strategy<Value = (AlgebraicType, AlgebraicValue)> {
     generate_algebraic_type().prop_flat_map(|ty| (Just(ty.clone()), generate_algebraic_value(ty)))
diff --git a/crates/table/src/page.rs b/crates/table/src/page.rs
index 17dd24c97b5..b7069cd47f3 100644
--- a/crates/table/src/page.rs
+++ b/crates/table/src/page.rs
@@ -1129,6 +1129,16 @@ impl Page {
         self.header.fixed.num_rows as usize
     }
 
+    #[cfg(test)]
+    /// Use this page's present rows bitvec to compute the number of present rows.
+    ///
+    /// This can be compared with [`Self::num_rows`] as a consistency check during tests.
+    pub fn reconstruct_num_rows(&self) -> usize {
+        // If we cared, we could rewrite this to `u64::count_ones` on each block of the bitset.
+        // We do not care. This method is slow.
+        self.header.fixed.present_rows.iter_set().count()
+    }
+
     /// Returns the number of var-len granules allocated in this page.
     ///
     /// This method runs in constant time.
@@ -1136,6 +1146,34 @@
         self.header.var.num_granules as usize
     }
 
+    #[cfg(test)]
+    /// # Safety
+    ///
+    /// - `var_len_visitor` must be a valid [`VarLenMembers`] visitor
+    ///   specialized to the type and layout of rows within this [`Page`].
+ /// - `fixed_row_size` must be exactly the length in bytes of fixed rows in this page, + /// which must further be the length of rows expected by the `var_len_visitor`. + pub unsafe fn reconstruct_num_var_len_granules( + &self, + fixed_row_size: Size, + var_len_visitor: &impl VarLenMembers, + ) -> usize { + self.iter_fixed_len(fixed_row_size) + .flat_map(|row| unsafe { + // Safety: `row` came out of `iter_fixed_len`, + // which, due to caller requirements on `fixed_row_size`, + // is giving us valid, aligned, initialized rows of the row type. + var_len_visitor.visit_var_len(self.get_row_data(row, fixed_row_size)) + }) + .flat_map(|var_len_obj| unsafe { + // Safety: We believe `row` to be valid + // and `var_len_visitor` to be correctly visiting its var-len members. + // Therefore, `var_len_obj` is a valid var-len object. + self.iter_var_len_object(var_len_obj.first_granule) + }) + .count() + } + /// Returns the number of bytes used by rows stored in this page. /// /// This is necessarily an overestimate of live data bytes, as it includes: @@ -1155,6 +1193,24 @@ impl Page { fixed_row_bytes + var_len_bytes } + #[cfg(test)] + /// # Safety + /// + /// - `var_len_visitor` must be a valid [`VarLenMembers`] visitor + /// specialized to the type and layout of rows within this [`Page`]. + /// - `fixed_row_size` must be exactly the length in bytes of fixed rows in this page, + /// which must further be the length of rows expected by the `var_len_visitor`. + pub unsafe fn reconstruct_bytes_used_by_rows( + &self, + fixed_row_size: Size, + var_len_visitor: &impl VarLenMembers, + ) -> usize { + let fixed_row_bytes = self.reconstruct_num_rows() * fixed_row_size.len(); + let var_len_bytes = unsafe { self.reconstruct_num_var_len_granules(fixed_row_size, var_len_visitor) } + * VarLenGranule::SIZE.len(); + fixed_row_bytes + var_len_bytes + } + /// Returns the range of row data starting at `offset` and lasting `size` bytes. 
    pub fn get_row_data(&self, row: PageOffset, size: Size) -> &Bytes {
        &self.row_data[row.range(size)]
diff --git a/crates/table/src/table.rs b/crates/table/src/table.rs
index ef109d16cb2..f5811cbaf32 100644
--- a/crates/table/src/table.rs
+++ b/crates/table/src/table.rs
@@ -1088,6 +1088,11 @@ impl Table {
         self.pages().iter().map(|page| page.num_rows() as u64).sum()
     }
 
+    #[cfg(test)]
+    fn reconstruct_num_rows(&self) -> u64 {
+        self.pages().iter().map(|page| page.reconstruct_num_rows() as u64).sum()
+    }
+
     /// Returns the number of bytes used by rows resident in this table.
     ///
     /// This includes data bytes, padding bytes and some overhead bytes,
@@ -1110,6 +1115,18 @@
             .sum()
     }
 
+    #[cfg(test)]
+    fn reconstruct_bytes_used_by_rows(&self) -> u64 {
+        self.pages()
+            .iter()
+            .map(|page| unsafe {
+                // Safety: `page` is in `self`, and was constructed using `self.inner.row_layout` and `self.inner.visitor_prog`,
+                // so the three are mutually consistent.
+                page.reconstruct_bytes_used_by_rows(self.inner.row_layout.size(), &self.inner.visitor_prog)
+            } as u64)
+            .sum()
+    }
+
     /// Returns the number of rows (or [`RowPointer`]s, more accurately)
     /// stored in indexes by this table.
/// @@ -1701,7 +1718,7 @@ pub(crate) mod test { use spacetimedb_lib::db::raw_def::v9::{RawIndexAlgorithm, RawModuleDefV9Builder}; use spacetimedb_primitives::{col_list, TableId}; use spacetimedb_sats::bsatn::to_vec; - use spacetimedb_sats::proptest::generate_typed_row; + use spacetimedb_sats::proptest::{generate_typed_row, generate_typed_row_vec}; use spacetimedb_sats::{product, AlgebraicType, ArrayValue}; use spacetimedb_schema::def::ModuleDef; use spacetimedb_schema::schema::Schema as _; @@ -1907,6 +1924,25 @@ pub(crate) mod test { prop_assert_eq!(bs_pv, bs_bsatn); prop_assert_eq!(table_pv, table_bsatn); } + + #[test] + fn row_size_reporting_matches_slow_implementations((ty, vals) in generate_typed_row_vec(128, 2048)) { + let mut blob_store = HashMapBlobStore::default(); + let mut table = table(ty.clone()); + + for row in vals { + prop_assume!(table.insert(&mut blob_store, &row).is_ok()); + } + + prop_assert_eq!(table.bytes_used_by_rows(), table.reconstruct_bytes_used_by_rows()); + prop_assert_eq!(table.num_rows(), table.reconstruct_num_rows()); + + // TODO(testing): Determine if there's a meaningful way to test that the blob store reporting is correct. + // I (pgoldman 2025-01-27) doubt it, as the test would be "visit every blob and sum their size," + // which is already what the actual implementation does. + + // TODO(testing): Put one or more indexes on `table` and verify that they report the right usage. 
+ } } fn insert_bsatn<'a>( From 7b4a4e54de7eb9a1a91720c2236800e5264aa1c1 Mon Sep 17 00:00:00 2001 From: Phoebe Goldman Date: Mon, 27 Jan 2025 14:44:42 -0500 Subject: [PATCH 09/11] Test that single-column indexes report usage as expected --- crates/table/src/table.rs | 66 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 3 deletions(-) diff --git a/crates/table/src/table.rs b/crates/table/src/table.rs index f5811cbaf32..e48fb6a1c12 100644 --- a/crates/table/src/table.rs +++ b/crates/table/src/table.rs @@ -1836,6 +1836,19 @@ pub(crate) mod test { insert_retrieve_body(ty, AlgebraicValue::from(arr)).unwrap(); } + fn reconstruct_index_num_key_bytes(table: &Table, blob_store: &dyn BlobStore, index_id: IndexId) -> u64 { + let index = table.get_index_by_id(index_id).unwrap(); + + index + .seek(&(..)) + .map(|row_ptr| { + let row_ref = table.get_row_ref(blob_store, row_ptr).unwrap(); + let key = row_ref.project(&index.indexed_columns).unwrap(); + crate::btree_index::KeySize::key_size_in_bytes(&key) as u64 + }) + .sum() + } + proptest! { #![proptest_config(ProptestConfig { max_shrink_iters: 0x10000000, ..Default::default() })] @@ -1930,18 +1943,65 @@ pub(crate) mod test { let mut blob_store = HashMapBlobStore::default(); let mut table = table(ty.clone()); - for row in vals { - prop_assume!(table.insert(&mut blob_store, &row).is_ok()); + for row in &vals { + prop_assume!(table.insert(&mut blob_store, row).is_ok()); } prop_assert_eq!(table.bytes_used_by_rows(), table.reconstruct_bytes_used_by_rows()); prop_assert_eq!(table.num_rows(), table.reconstruct_num_rows()); + prop_assert_eq!(table.num_rows(), vals.len() as u64); // TODO(testing): Determine if there's a meaningful way to test that the blob store reporting is correct. // I (pgoldman 2025-01-27) doubt it, as the test would be "visit every blob and sum their size," // which is already what the actual implementation does. 
+ } + + #[test] + fn index_size_reporting_matches_slow_implementations_single_column((ty, vals) in generate_typed_row_vec(128, 2048)) { + let mut blob_store = HashMapBlobStore::default(); + let mut table = table(ty.clone()); + + for row in &vals { + prop_assume!(table.insert(&mut blob_store, row).is_ok()); + } + + // We haven't added any indexes yet, so there should be 0 rows in indexes. + prop_assert_eq!(table.num_rows_in_indexes(), 0); + + let index_id = IndexId(0); + + // Add an index on column 0. + table.insert_index(&mut blob_store, index_id, BTreeIndex::new(&ty, ColList::from(ColId(0)), false).unwrap()); + + // We have one index, which should be fully populated, + // so in total we should have the same number of rows in indexes as we have rows. + prop_assert_eq!(table.num_rows_in_indexes(), table.num_rows()); + + let index = table.get_index_by_id(index_id).unwrap(); + + // One index, so table's reporting of bytes used should match that index's reporting. + prop_assert_eq!(table.bytes_used_by_index_keys(), index.num_key_bytes()); + + // Walk all the rows in the index, sum their key size, + // and assert it matches the `index.num_key_bytes()` + prop_assert_eq!( + index.num_key_bytes(), + reconstruct_index_num_key_bytes(&table, &blob_store, index_id) + ); + + // Walk all the rows we inserted, project them to the cols that will be their keys, + // sum their key size, + // and assert it matches the `index.num_key_bytes()` + let key_size_in_pvs = vals.iter().map(|row| { + crate::btree_index::KeySize::key_size_in_bytes(&row.elements[0]) as u64 + }).sum(); + prop_assert_eq!(index.num_key_bytes(), key_size_in_pvs); + + // Add a duplicate of the same index, so we can check that all above quantities double. + table.insert_index(&mut blob_store, IndexId(1), BTreeIndex::new(&ty, ColList::from(ColId(0)), false).unwrap()); - // TODO(testing): Put one or more indexes on `table` and verify that they report the right usage. 
+            prop_assert_eq!(table.num_rows_in_indexes(), table.num_rows() * 2);
+            prop_assert_eq!(table.bytes_used_by_index_keys(), key_size_in_pvs * 2);
     }
 }

From 93cbda0b97cbd118a5cd513952426fc5ce9ae4de Mon Sep 17 00:00:00 2001
From: Phoebe Goldman
Date: Mon, 27 Jan 2025 16:49:24 -0500
Subject: [PATCH 10/11] Also test for two-column indexes

---
 crates/table/src/table.rs | 113 ++++++++++++++++++++++++--------------
 1 file changed, 73 insertions(+), 40 deletions(-)

diff --git a/crates/table/src/table.rs b/crates/table/src/table.rs
index e48fb6a1c12..6888f9b1039 100644
--- a/crates/table/src/table.rs
+++ b/crates/table/src/table.rs
@@ -1849,6 +1849,72 @@
             .sum()
     }
 
+    /// Given a row type `ty`, a set of rows of that type `vals`,
+    /// and a set of columns within that type `indexed_columns`,
+    /// populate a table with `vals`, add an index on the `indexed_columns`,
+    /// and perform various assertions that the reported index size metrics are correct.
+    fn test_index_size_reporting(
+        ty: ProductType,
+        vals: Vec<ProductValue>,
+        indexed_columns: ColList,
+    ) -> Result<(), TestCaseError> {
+        let mut blob_store = HashMapBlobStore::default();
+        let mut table = table(ty.clone());
+
+        for row in &vals {
+            prop_assume!(table.insert(&mut blob_store, row).is_ok());
+        }
+
+        // We haven't added any indexes yet, so there should be 0 rows in indexes.
+        prop_assert_eq!(table.num_rows_in_indexes(), 0);
+
+        let index_id = IndexId(0);
+
+        // Add an index on the `indexed_columns`.
+        table.insert_index(
+            &blob_store,
+            index_id,
+            BTreeIndex::new(&ty, indexed_columns.clone(), false).unwrap(),
+        );
+
+        // We have one index, which should be fully populated,
+        // so in total we should have the same number of rows in indexes as we have rows.
+        prop_assert_eq!(table.num_rows_in_indexes(), table.num_rows());
+
+        let index = table.get_index_by_id(index_id).unwrap();
+
+        // One index, so table's reporting of bytes used should match that index's reporting.
+ prop_assert_eq!(table.bytes_used_by_index_keys(), index.num_key_bytes()); + + // Walk all the rows in the index, sum their key size, + // and assert it matches the `index.num_key_bytes()` + prop_assert_eq!( + index.num_key_bytes(), + reconstruct_index_num_key_bytes(&table, &blob_store, index_id) + ); + + // Walk all the rows we inserted, project them to the cols that will be their keys, + // sum their key size, + // and assert it matches the `index.num_key_bytes()` + let key_size_in_pvs = vals + .iter() + .map(|row| crate::btree_index::KeySize::key_size_in_bytes(&row.project(&indexed_columns).unwrap()) as u64) + .sum(); + prop_assert_eq!(index.num_key_bytes(), key_size_in_pvs); + + // Add a duplicate of the same index, so we can check that all above quantities double. + table.insert_index( + &blob_store, + IndexId(1), + BTreeIndex::new(&ty, indexed_columns, false).unwrap(), + ); + + prop_assert_eq!(table.num_rows_in_indexes(), table.num_rows() * 2); + prop_assert_eq!(table.bytes_used_by_index_keys(), key_size_in_pvs * 2); + + Ok(()) + } + proptest! { #![proptest_config(ProptestConfig { max_shrink_iters: 0x10000000, ..Default::default() })] @@ -1958,50 +2024,17 @@ pub(crate) mod test { #[test] fn index_size_reporting_matches_slow_implementations_single_column((ty, vals) in generate_typed_row_vec(128, 2048)) { - let mut blob_store = HashMapBlobStore::default(); - let mut table = table(ty.clone()); - - for row in &vals { - prop_assume!(table.insert(&mut blob_store, row).is_ok()); - } + prop_assume!(!ty.elements.is_empty()); - // We haven't added any indexes yet, so there should be 0 rows in indexes. - prop_assert_eq!(table.num_rows_in_indexes(), 0); - - let index_id = IndexId(0); - - // Add an index on column 0. - table.insert_index(&mut blob_store, index_id, BTreeIndex::new(&ty, ColList::from(ColId(0)), false).unwrap()); - - // We have one index, which should be fully populated, - // so in total we should have the same number of rows in indexes as we have rows. 
- prop_assert_eq!(table.num_rows_in_indexes(), table.num_rows()); - - let index = table.get_index_by_id(index_id).unwrap(); - - // One index, so table's reporting of bytes used should match that index's reporting. - prop_assert_eq!(table.bytes_used_by_index_keys(), index.num_key_bytes()); - - // Walk all the rows in the index, sum their key size, - // and assert it matches the `index.num_key_bytes()` - prop_assert_eq!( - index.num_key_bytes(), - reconstruct_index_num_key_bytes(&table, &blob_store, index_id) - ); + test_index_size_reporting(ty, vals, ColList::from(ColId(0)))?; + } - // Walk all the rows we inserted, project them to the cols that will be their keys, - // sum their key size, - // and assert it matches the `index.num_key_bytes()` - let key_size_in_pvs = vals.iter().map(|row| { - crate::btree_index::KeySize::key_size_in_bytes(&row.elements[0]) as u64 - }).sum(); - prop_assert_eq!(index.num_key_bytes(), key_size_in_pvs); + #[test] + fn index_size_reporting_matches_slow_implementations_two_column((ty, vals) in generate_typed_row_vec(128, 2048)) { + prop_assume!(ty.elements.len() >= 2); - // Add a duplicate of the same index, so we can check that all above quantities double. 
- table.insert_index(&mut blob_store, IndexId(1), BTreeIndex::new(&ty, ColList::from(ColId(0)), false).unwrap()); - prop_assert_eq!(table.num_rows_in_indexes(), table.num_rows() * 2); - prop_assert_eq!(table.bytes_used_by_index_keys(), key_size_in_pvs * 2); + test_index_size_reporting(ty, vals, ColList::from([ColId(0), ColId(1)]))?; } } From 81906253adeba36f8b43f24fed6534e220223b82 Mon Sep 17 00:00:00 2001 From: Phoebe Goldman Date: Tue, 28 Jan 2025 10:32:09 -0500 Subject: [PATCH 11/11] Add TODO note in response to jeff's comment --- crates/core/src/db/datastore/locking_tx_datastore/datastore.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/core/src/db/datastore/locking_tx_datastore/datastore.rs b/crates/core/src/db/datastore/locking_tx_datastore/datastore.rs index 52ff117033c..8ab90f2b07b 100644 --- a/crates/core/src/db/datastore/locking_tx_datastore/datastore.rs +++ b/crates/core/src/db/datastore/locking_tx_datastore/datastore.rs @@ -686,6 +686,8 @@ pub(super) fn record_metrics( } if let Some(committed_state) = committed_state { + // TODO(cleanliness,bikeshedding): Consider inlining `report_data_size` here, + // or moving the above metric writes into it, for consistency of organization. committed_state.report_data_size(*db); } }