From 36867e052d0bf92ee21427c3b1e7481d43ecb232 Mon Sep 17 00:00:00 2001 From: James Ross Date: Sat, 7 Feb 2026 14:26:31 -0800 Subject: [PATCH 1/2] =?UTF-8?q?feat(echo-cas):=20add=20content-addressed?= =?UTF-8?q?=20blob=20store=20crate=20(Phase=201=20=E2=80=94=20MemoryTier)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces echo-cas, a leaf crate providing the BlobStore trait and an in-memory implementation (MemoryTier) for content-addressed storage keyed by BLAKE3 hash. Sufficient for the in-browser website demo; disk/cold tiers and GC are deferred to Phase 3. --- Cargo.lock | 8 + Cargo.toml | 2 + crates/echo-cas/Cargo.toml | 19 ++ crates/echo-cas/README.md | 6 + crates/echo-cas/src/lib.rs | 145 +++++++++++++++ crates/echo-cas/src/memory.rs | 320 ++++++++++++++++++++++++++++++++++ 6 files changed, 500 insertions(+) create mode 100644 crates/echo-cas/Cargo.toml create mode 100644 crates/echo-cas/README.md create mode 100644 crates/echo-cas/src/lib.rs create mode 100644 crates/echo-cas/src/memory.rs diff --git a/Cargo.lock b/Cargo.lock index c4accd0b..6a7eda0b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1131,6 +1131,14 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "echo-cas" +version = "0.1.0" +dependencies = [ + "blake3", + "thiserror 1.0.69", +] + [[package]] name = "echo-config-fs" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 22362899..9a50f9be 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ members = [ "crates/echo-wasm-bindings", "crates/echo-wesley-gen", "crates/echo-dry-tests", + "crates/echo-cas", "xtask" ] resolver = "2" @@ -35,6 +36,7 @@ rust-version = "1.90.0" [workspace.dependencies] echo-app-core = { version = "0.1.0", path = "crates/echo-app-core" } +echo-cas = { version = "0.1.0", path = "crates/echo-cas" } echo-config-fs = { version = "0.1.0", path = "crates/echo-config-fs" } echo-dind-tests = { version = "0.1.0", path = "crates/echo-dind-tests" } echo-dry-tests = { version = "0.1.0", path = "crates/echo-dry-tests" } diff --git a/crates/echo-cas/Cargo.toml b/crates/echo-cas/Cargo.toml new file mode 100644 index 00000000..dbe6585d --- /dev/null +++ b/crates/echo-cas/Cargo.toml @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: Apache-2.0 +# © James Ross Ω FLYING•ROBOTS +[package] +name = "echo-cas" +version = "0.1.0" +edition = "2021" +license.workspace = true +repository.workspace = true +rust-version.workspace = true +description = "Content-addressed blob store for Echo" +readme = "README.md" +keywords = ["echo", "cas", "content-addressed"] +categories = ["data-structures"] + +[dependencies] +blake3 = "1.5" +thiserror = "1" + +[dev-dependencies] diff --git a/crates/echo-cas/README.md b/crates/echo-cas/README.md new file mode 100644 index 00000000..f7da6e92 --- /dev/null +++ b/crates/echo-cas/README.md @@ -0,0 +1,6 @@ + + + +# echo-cas + +Content-addressed blob store for Echo. diff --git a/crates/echo-cas/src/lib.rs b/crates/echo-cas/src/lib.rs new file mode 100644 index 00000000..b55cc680 --- /dev/null +++ b/crates/echo-cas/src/lib.rs @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: Apache-2.0 +// © James Ross Ω FLYING•ROBOTS +//! Content-addressed blob store for Echo. +//! +//! `echo-cas` provides a [`BlobStore`] trait for content-addressed storage keyed by +//! BLAKE3 hash. Phase 1 ships [`MemoryTier`] — sufficient for the in-browser website +//! demo. Disk/cold tiers, wire protocol, and GC come in Phase 3. +//! +//! # Hash Domain Policy +//! +//! CAS hash is content-only: `BLAKE3(bytes)` with no domain prefix. Two blobs with +//! identical bytes are the same CAS blob regardless of semantic type. This is by +//! design — deduplication is a feature, not a bug. Domain separation happens at the +//! typed-reference layer above (`TypedRef`: `schema_hash` + `type_id` + `layout_hash` + +//! `value_hash`). +//! +//! # Determinism Invariant +//! +//! No public API exposes store iteration order. CAS determinism is content-level +//! (same bytes → same hash), not collection-level. Any future `list`/`iter` API must +//! return results sorted by [`BlobHash`]. +#![forbid(unsafe_code)] +#![deny(missing_docs, rust_2018_idioms, unused_must_use)] +#![deny( + clippy::all, + clippy::pedantic, + clippy::nursery, + clippy::cargo, + clippy::unwrap_used, + clippy::expect_used, + clippy::panic, + clippy::todo, + clippy::unimplemented, + clippy::dbg_macro, + clippy::print_stdout, + clippy::print_stderr +)] +#![allow( + clippy::must_use_candidate, + clippy::return_self_not_must_use, + clippy::unreadable_literal, + clippy::missing_const_for_fn, + clippy::suboptimal_flops, + clippy::redundant_pub_crate, + clippy::many_single_char_names, + clippy::module_name_repetitions, + clippy::use_self +)] + +mod memory; +pub use memory::MemoryTier; + +use std::sync::Arc; + +/// A 32-byte BLAKE3 content hash. +/// +/// Thin newtype over `[u8; 32]` following the `NodeId`/`TypeId` pattern from +/// `warp-core`. The inner bytes are public for zero-cost access; the `Display` +/// impl renders lowercase hex for logging and error messages. +#[repr(transparent)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] +pub struct BlobHash(pub [u8; 32]); + +impl BlobHash { + /// View the hash as a byte slice. + pub fn as_bytes(&self) -> &[u8; 32] { + &self.0 + } +} + +impl std::fmt::Display for BlobHash { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + for byte in &self.0 { + write!(f, "{byte:02x}")?; + } + Ok(()) + } +} + +/// Compute the BLAKE3 content hash of `bytes`. +/// +/// No domain prefix — the content IS the identity. See module-level docs for +/// hash domain policy. +pub fn blob_hash(bytes: &[u8]) -> BlobHash { + let hash = blake3::hash(bytes); + BlobHash(*hash.as_bytes()) +} + +/// Errors that can occur during CAS operations. +#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)] +pub enum CasError { + /// Blob bytes did not match the declared hash. + #[error("[CAS_HASH_MISMATCH] expected {expected}, computed {computed}")] + HashMismatch { + /// The hash that was declared/expected. + expected: BlobHash, + /// The hash actually computed from the bytes. + computed: BlobHash, + }, +} + +/// Content-addressed blob store. +/// +/// Implementations store opaque byte blobs keyed by their BLAKE3 hash. The trait +/// is intentionally synchronous and object-safe for Phase 1. Async methods will be +/// added (likely as a separate `AsyncBlobStore` trait) when disk/network tiers +/// demand it. +/// +/// # Absence Semantics +/// +/// [`get`](BlobStore::get) returns `None` for missing blobs — this is **not** an +/// error. CAS is a lookup table: missing blobs are expected (not-yet-fetched, +/// GC'd, never stored). Error variants are reserved for integrity violations. +pub trait BlobStore { + /// Compute hash and store. Returns the content hash. + fn put(&mut self, bytes: &[u8]) -> BlobHash; + + /// Store with a pre-computed hash. Rejects if `BLAKE3(bytes) != expected`. + /// + /// On mismatch the store is unchanged and a [`CasError::HashMismatch`] is + /// returned. This method exists for receivers of `WANT`/`PROVIDE` messages + /// who already possess the hash. + /// + /// # Errors + /// + /// Returns [`CasError::HashMismatch`] if the computed hash differs from + /// `expected`. + fn put_verified(&mut self, expected: BlobHash, bytes: &[u8]) -> Result<(), CasError>; + + /// Retrieve blob by hash. Returns `None` if not stored — absence is not an + /// error. + fn get(&self, hash: &BlobHash) -> Option>; + + /// Check existence without retrieving. + fn has(&self, hash: &BlobHash) -> bool; + + /// Mark hash as a retention root. + /// + /// Legal on missing blobs (pre-pin intent). Pin semantics are set-based (not + /// reference-counted) in Phase 1. + fn pin(&mut self, hash: &BlobHash); + + /// Remove retention root. No-op if not pinned or not stored. + fn unpin(&mut self, hash: &BlobHash); +} diff --git a/crates/echo-cas/src/memory.rs b/crates/echo-cas/src/memory.rs new file mode 100644 index 00000000..eb10f60e --- /dev/null +++ b/crates/echo-cas/src/memory.rs @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: Apache-2.0 +// © James Ross Ω FLYING•ROBOTS +//! In-memory content-addressed blob store. +//! +//! [`MemoryTier`] is the Phase 1 `BlobStore` implementation — sufficient for the +//! in-browser website demo (single tab, no persistence). Disk and cold tiers are +//! deferred to Phase 3. + +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use crate::{blob_hash, BlobHash, BlobStore, CasError}; + +/// In-memory content-addressed blob store. +/// +/// Stores blobs in a `HashMap>` and tracks a pin-set for +/// retention roots. An optional byte budget is advisory — `put` always succeeds +/// but [`is_over_budget`](MemoryTier::is_over_budget) reports when the budget is +/// exceeded. Enforcement (eviction of unpinned blobs) is Phase 3 GC's job. +/// +/// # Pinning Invariants +/// +/// - `pin` on a missing blob is legal (records intent before the blob arrives). +/// - `put` of a pre-pinned hash preserves the pin. +/// - `unpin` on a missing blob is a no-op. +/// - Pin count is set cardinality, not reference count. +pub struct MemoryTier { + blobs: HashMap>, + pins: HashSet, + byte_count: usize, + max_bytes: Option, +} + +impl MemoryTier { + /// Create an empty store with no byte limit. + pub fn new() -> Self { + Self { + blobs: HashMap::new(), + pins: HashSet::new(), + byte_count: 0, + max_bytes: None, + } + } + + /// Create an empty store with an advisory byte budget. + /// + /// When the budget is exceeded, [`is_over_budget`](MemoryTier::is_over_budget) + /// returns `true`. Puts still succeed — enforcement is deferred to Phase 3 GC. + pub fn with_limits(max_bytes: usize) -> Self { + Self { + blobs: HashMap::new(), + pins: HashSet::new(), + byte_count: 0, + max_bytes: Some(max_bytes), + } + } + + /// Number of blobs currently stored. + pub fn len(&self) -> usize { + self.blobs.len() + } + + /// Returns `true` if no blobs are stored. + pub fn is_empty(&self) -> bool { + self.blobs.is_empty() + } + + /// Returns `true` if the given hash is in the pin-set. + pub fn is_pinned(&self, hash: &BlobHash) -> bool { + self.pins.contains(hash) + } + + /// Number of hashes in the pin-set. + pub fn pinned_count(&self) -> usize { + self.pins.len() + } + + /// Total bytes stored across all blobs. + pub fn byte_count(&self) -> usize { + self.byte_count + } + + /// Returns `true` if `byte_count` exceeds the configured budget. + /// + /// Always returns `false` if no budget was set. + pub fn is_over_budget(&self) -> bool { + self.max_bytes.is_some_and(|max| self.byte_count > max) + } +} + +impl Default for MemoryTier { + fn default() -> Self { + Self::new() + } +} + +impl BlobStore for MemoryTier { + fn put(&mut self, bytes: &[u8]) -> BlobHash { + let hash = blob_hash(bytes); + if self.blobs.contains_key(&hash) { + return hash; + } + self.byte_count += bytes.len(); + self.blobs.insert(hash, Arc::from(bytes)); + hash + } + + fn put_verified(&mut self, expected: BlobHash, bytes: &[u8]) -> Result<(), CasError> { + let computed = blob_hash(bytes); + if computed != expected { + return Err(CasError::HashMismatch { expected, computed }); + } + if !self.blobs.contains_key(&computed) { + self.byte_count += bytes.len(); + self.blobs.insert(computed, Arc::from(bytes)); + } + Ok(()) + } + + fn get(&self, hash: &BlobHash) -> Option> { + self.blobs.get(hash).cloned() + } + + fn has(&self, hash: &BlobHash) -> bool { + self.blobs.contains_key(hash) + } + + fn pin(&mut self, hash: &BlobHash) { + self.pins.insert(*hash); + } + + fn unpin(&mut self, hash: &BlobHash) { + self.pins.remove(hash); + } +} + +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod tests { + use super::*; + + // ── 1. put + get round-trip ────────────────────────────────────────── + + #[test] + fn put_get_round_trip() { + let mut store = MemoryTier::new(); + let data = b"hello echo-cas"; + let hash = store.put(data); + let got = store.get(&hash); + assert!(got.is_some()); + assert_eq!(&*got.unwrap(), data); + } + + // ── 2. put_verified rejects hash mismatch ─────────────────────────── + + #[test] + fn put_verified_rejects_mismatch() { + let mut store = MemoryTier::new(); + let bad_hash = BlobHash([0xFF; 32]); + let result = store.put_verified(bad_hash, b"some bytes"); + assert!(result.is_err()); + let err = result.unwrap_err(); + match err { + CasError::HashMismatch { expected, .. } => { + assert_eq!(expected, bad_hash); + } + } + } + + // ── 3. put_verified mismatch does NOT mutate store ────────────────── + + #[test] + fn put_verified_mismatch_leaves_store_unchanged() { + let mut store = MemoryTier::new(); + let bad_hash = BlobHash([0xFF; 32]); + let _ = store.put_verified(bad_hash, b"should not be stored"); + assert_eq!(store.len(), 0); + assert_eq!(store.byte_count(), 0); + } + + // ── 4. has returns false for missing, true for stored ──────────────── + + #[test] + fn has_missing_and_present() { + let mut store = MemoryTier::new(); + let hash = blob_hash(b"test"); + assert!(!store.has(&hash)); + store.put(b"test"); + assert!(store.has(&hash)); + } + + // ── 5. put idempotence ────────────────────────────────────────────── + + #[test] + fn put_idempotence() { + let mut store = MemoryTier::new(); + let h1 = store.put(b"duplicate"); + let h2 = store.put(b"duplicate"); + assert_eq!(h1, h2); + assert_eq!(store.len(), 1); + } + + // ── 6. pre-pin then put ───────────────────────────────────────────── + + #[test] + fn pre_pin_then_put() { + let mut store = MemoryTier::new(); + let hash = blob_hash(b"arriving later"); + // Pin before the blob exists. + store.pin(&hash); + assert!(store.is_pinned(&hash)); + assert!(!store.has(&hash)); + // Now store the blob. + let stored_hash = store.put(b"arriving later"); + assert_eq!(hash, stored_hash); + // Pin must survive the put. + assert!(store.is_pinned(&hash)); + assert!(store.has(&hash)); + } + + // ── 7. pin/unpin lifecycle ────────────────────────────────────────── + + #[test] + fn pin_unpin_lifecycle() { + let mut store = MemoryTier::new(); + let hash = store.put(b"pinnable"); + assert!(!store.is_pinned(&hash)); + store.pin(&hash); + assert!(store.is_pinned(&hash)); + assert_eq!(store.pinned_count(), 1); + store.unpin(&hash); + assert!(!store.is_pinned(&hash)); + assert_eq!(store.pinned_count(), 0); + } + + // ── 8. unpin on missing blob = no-op ──────────────────────────────── + + #[test] + fn unpin_missing_is_noop() { + let mut store = MemoryTier::new(); + let hash = BlobHash([0xAA; 32]); + // Must not panic. + store.unpin(&hash); + assert!(!store.is_pinned(&hash)); + } + + // ── 9. get returns None for missing hash ──────────────────────────── + + #[test] + fn get_missing_returns_none() { + let store = MemoryTier::new(); + let hash = BlobHash([0xBB; 32]); + assert!(store.get(&hash).is_none()); + } + + // ── 10. empty store invariants ────────────────────────────────────── + + #[test] + fn empty_store_invariants() { + let store = MemoryTier::new(); + assert_eq!(store.len(), 0); + assert!(store.is_empty()); + assert_eq!(store.byte_count(), 0); + assert_eq!(store.pinned_count(), 0); + assert!(!store.is_over_budget()); + } + + // ── 11. byte_count tracks correctly across puts ───────────────────── + + #[test] + fn byte_count_tracking() { + let mut store = MemoryTier::new(); + store.put(b"aaaa"); // 4 bytes + assert_eq!(store.byte_count(), 4); + store.put(b"bbbbbb"); // 6 bytes + assert_eq!(store.byte_count(), 10); + // Duplicate put should NOT add bytes again. + store.put(b"aaaa"); + assert_eq!(store.byte_count(), 10); + } + + // ── 12. with_limits + is_over_budget ──────────────────────────────── + + #[test] + fn with_limits_and_over_budget() { + let mut store = MemoryTier::with_limits(10); + assert!(!store.is_over_budget()); + store.put(b"12345"); // 5 bytes, within budget + assert!(!store.is_over_budget()); + store.put(b"1234567"); // +7 = 12, over budget + assert!(store.is_over_budget()); + // Put still succeeds — budget is advisory. + assert_eq!(store.len(), 2); + } + + // ── 13. large blob smoke test ─────────────────────────────────────── + + #[test] + fn large_blob_round_trip() { + let mut store = MemoryTier::new(); + let big = vec![0x42u8; 8 * 1024 * 1024]; // 8 MiB + let hash = store.put(&big); + let got = store.get(&hash); + assert!(got.is_some()); + assert_eq!(got.unwrap().len(), 8 * 1024 * 1024); + // Verify the hash matches the free function. + assert_eq!(hash, blob_hash(&big)); + } + + // ── 14. put convenience returns correct hash ──────────────────────── + + #[test] + fn put_returns_correct_hash() { + let mut store = MemoryTier::new(); + let data = b"verify hash correctness"; + let expected = blob_hash(data); + let got = store.put(data); + assert_eq!(got, expected); + } +} From b02444626aa7e8175081fae0d76a88fc5a41d36e Mon Sep 17 00:00:00 2001 From: James Ross Date: Sat, 7 Feb 2026 15:55:24 -0800 Subject: [PATCH 2/2] fix(echo-cas): address PR review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Upgrade thiserror 1 → 2 (leaf crate, no compatibility concern) - Add put_verified happy-path test (store, retrieve, idempotence) - Add pin idempotence test (set semantics: double-pin keeps count at 1) - Document is_over_budget budget semantics (exclusive) and byte_count monotonic invariant (no removal until Phase 3 GC) - Remove empty [dev-dependencies] section - Update bytes 1.11.0 → 1.11.1 (RUSTSEC-2026-0007, workspace-wide) --- Cargo.lock | 6 +++--- crates/echo-cas/Cargo.toml | 4 +--- crates/echo-cas/src/memory.rs | 34 +++++++++++++++++++++++++++++++++- 3 files changed, 37 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6a7eda0b..2d56a1cc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -499,9 +499,9 @@ checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495" [[package]] name = "bytes" -version = "1.11.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" dependencies = [ "serde", ] @@ -1136,7 +1136,7 @@ name = "echo-cas" version = "0.1.0" dependencies = [ "blake3", - "thiserror 1.0.69", + "thiserror 2.0.17", ] [[package]] diff --git a/crates/echo-cas/Cargo.toml b/crates/echo-cas/Cargo.toml index dbe6585d..bd62f5a1 100644 --- a/crates/echo-cas/Cargo.toml +++ b/crates/echo-cas/Cargo.toml @@ -14,6 +14,4 @@ categories = ["data-structures"] [dependencies] blake3 = "1.5" -thiserror = "1" - -[dev-dependencies] +thiserror = "2" diff --git a/crates/echo-cas/src/memory.rs b/crates/echo-cas/src/memory.rs index eb10f60e..f0ece338 100644 --- a/crates/echo-cas/src/memory.rs +++ b/crates/echo-cas/src/memory.rs @@ -80,7 +80,9 @@ impl MemoryTier { self.byte_count } - /// Returns `true` if `byte_count` exceeds the configured budget. + /// Returns `true` if `byte_count` exceeds the configured budget (exclusive: + /// exactly `max_bytes` is within budget). `byte_count` is monotonically + /// increasing in Phase 1 — there is no blob removal pathway until Phase 3 GC. /// /// Always returns `false` if no budget was set. pub fn is_over_budget(&self) -> bool { @@ -317,4 +319,34 @@ mod tests { let got = store.put(data); assert_eq!(got, expected); } + + // ── 15. put_verified happy path ───────────────────────────────────── + + #[test] + fn put_verified_happy_path() { + let mut store = MemoryTier::new(); + let data = b"verified blob"; + let hash = blob_hash(data); + assert!(store.put_verified(hash, data).is_ok()); + assert_eq!(store.len(), 1); + assert_eq!(store.byte_count(), data.len()); + assert_eq!(&*store.get(&hash).unwrap(), data); + // Idempotent: second call doesn't double-count. + assert!(store.put_verified(hash, data).is_ok()); + assert_eq!(store.len(), 1); + assert_eq!(store.byte_count(), data.len()); + } + + // ── 16. pin idempotence (set semantics) ───────────────────────────── + + #[test] + fn pin_idempotence() { + let mut store = MemoryTier::new(); + let hash = store.put(b"pin me twice"); + store.pin(&hash); + store.pin(&hash); + assert_eq!(store.pinned_count(), 1); + store.unpin(&hash); + assert!(!store.is_pinned(&hash)); + } }