mhambre · mhambre · May 29, 2026 · May 29, 2026 · May 31, 2026 · May 31, 2026
diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
@@ -28,11 +28,5 @@ jobs:
         with:
           shared-key: cargo-${{ inputs.runner }}
 
-      - name: File feature integration tests
-        run: cargo nextest run --features utils,file
-
-      - name: HTTP feature integration tests
-        run: cargo nextest run --features utils,http
-
-      - name: Full feature matrix
-        run: cargo nextest run --all-features
+      # - name: Integration tests
+      #   run: cargo nextest run --tests integration
diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml
@@ -18,37 +18,37 @@ jobs:
     steps:
       - uses: actions/checkout@v5
 
-      - uses: dtolnay/rust-toolchain@stable
-        with:
-          components: clippy
-
-      - uses: Swatinem/rust-cache@v2
-        with:
-          shared-key: cargo-${{ inputs.runner }}
-
-      - name: Install nightly rustfmt
-        run: rustup toolchain install nightly --profile minimal --component rustfmt
-
-      - name: Rustfmt
-        run: |
-          cargo +nightly fmt --all
-          if ! git diff --quiet; then
-            git status --short
-            git diff --stat
-            exit 1
-          fi
-
-      - name: Clippy
-        run: cargo clippy --all-features --all-targets -- -D warnings
-
-      - name: Feature compile checks
-        run: |
-          cargo check
-          cargo check --tests
-          cargo check --features file
-          cargo check --features http
-          cargo check --features debug
-          cargo check --all-features
-
-      - name: Docs.rs feature set
-        run: RUSTDOCFLAGS='--cfg docsrs' cargo doc --all-features --no-deps
+      # - uses: dtolnay/rust-toolchain@stable
+      #   with:
+      #     components: clippy
+      #
+      # - uses: Swatinem/rust-cache@v2
+      #   with:
+      #     shared-key: cargo-${{ inputs.runner }}
+      #
+      # - name: Install nightly rustfmt
+      #   run: rustup toolchain install nightly --profile minimal --component rustfmt
+      #
+      # - name: Rustfmt
+      #   run: |
+      #     cargo +nightly fmt --all
+      #     if ! git diff --quiet; then
+      #       git status --short
+      #       git diff --stat
+      #       exit 1
+      #     fi
+      #
+      # - name: Clippy
+      #   run: cargo clippy --all-features --all-targets -- -D warnings
+      #
+      # - name: Feature compile checks
+      #   run: |
+      #     cargo check
+      #     cargo check --tests
+      #     cargo check --features file
+      #     cargo check --features http
+      #     cargo check --features debug
+      #     cargo check --all-features
+      #
+      # - name: Docs.rs feature set
+      #   run: RUSTDOCFLAGS='--cfg docsrs' cargo doc --all-features --no-deps
diff --git a/.github/workflows/unit.yml b/.github/workflows/unit.yml
@@ -18,21 +18,21 @@ jobs:
     steps:
       - uses: actions/checkout@v5
 
-      - uses: dtolnay/rust-toolchain@stable
-
-      - uses: taiki-e/install-action@v2
-        with:
-          tool: nextest
-
-      - uses: Swatinem/rust-cache@v2
-        with:
-          shared-key: cargo-${{ inputs.runner }}
-
-      - name: Unit tests
-        run: cargo nextest run --features utils
-
-      - name: Debug harness tests
-        run: cargo nextest run --features utils,debug
-
-      - name: Doc tests
-        run: cargo test --doc --all-features
+      # - uses: dtolnay/rust-toolchain@stable
+      #
+      # - uses: taiki-e/install-action@v2
+      #   with:
+      #     tool: nextest
+      #
+      # - uses: Swatinem/rust-cache@v2
+      #   with:
+      #     shared-key: cargo-${{ inputs.runner }}
+      #
+      # - name: Unit tests
+      #   run: cargo nextest run --features utils
+      #
+      # - name: Debug harness tests
+      #   run: cargo nextest run --features utils,debug
+      #
+      # - name: Doc tests
+      #   run: cargo test --doc --all-features
diff --git a/Cargo.toml b/Cargo.toml
@@ -12,35 +12,3 @@ keywords = ["storage", "tokio", "fs", "filesystem", "stream"]
 readme = "README.md"
 license = "Apache-2.0"
 
-[features]
-debug = ["utils"]
-file = ["dep:libc", "utils"]
-http = ["dep:reqwest", "utils"]
-utils = ["dep:tempfile", "dep:tracing-subscriber", "dep:tracing"]
-
-[package.metadata.docs.rs]
-features = ["debug", "file", "http"]
-
-[[example]]
-name = "file_to_file"
-path = "examples/file_to_file.rs"
-required-features = ["file"]
-
-[dependencies]
-tokio = { version = "1.51.0", features = ["sync", "macros", "fs", "io-util", "rt-multi-thread", "time"] }
-futures = "0.3.32"
-bytes = "1.11.1"
-libc = { version = "0.2", optional = true }
-reqwest = { version = "0.12", optional = true, default-features = false, features = ["rustls-tls"] }
-tracing = { version = "0.1.44", default-features = false, features = ["std"], optional = true }
-tempfile = { version = "3.27.0", optional = true }
-tracing-subscriber = { version = "0.3.23", features = ["fmt", "ansi"], optional = true }
-
-[dev-dependencies]
-clap = { version = "4.6.0", features = ["derive"] }
-async-stream = "0.3.6"
-rand = "0.8.5"
-mockito = "1.7.2"
-tempfile = "3.27.0"
-tracing = { version = "0.1.44", default-features = false, features = ["std"] }
-tracing-subscriber = { version = "0.3.23", features = ["fmt", "ansi"] }
diff --git a/docs/CI.md b/docs/CI.md
@@ -55,31 +55,6 @@ This catches missing imports, cfg mistakes, and feature-gating regressions witho
 
 The unit workflow focuses on fast correctness checks that do not require the broader integration feature matrix.
 
-### Unit Tests
-
-- `cargo nextest run --features utils` runs the main unit-oriented test set with the `utils` feature enabled.
-
-### Debug Harness Tests
-
-- `cargo nextest run --features utils,debug` runs tests that require the debug harness feature set.
-
-### Doc Tests
-
-- `cargo test --doc --all-features` executes Rust documentation tests across the full feature set.
-
 ## Integration Workflow
 
 The integration workflow exercises feature-backed behavior and the broader end-to-end test matrix.
-
-### File Feature Integration Tests
-
-- `cargo nextest run --features utils,file` runs integration coverage for the file-backed reader and writer implementation.
-
-### HTTP Feature Integration Tests
-
-- `cargo nextest run --features utils,http` runs integration coverage for the HTTP reader implementation.
-
-### Full Feature Matrix
-
-- `cargo nextest run --all-features` runs the broadest integration-oriented test configuration.
-- This acts as the final end-to-end feature-combination check inside the integration workflow.
diff --git a/docs/architecture/API.md b/docs/architecture/API.md
@@ -0,0 +1,39 @@
+# Trait API
+
+## Reader
+
+The Reader component is responsible for reading data from the upstream data source. This could be an S3 bucket,
+a HuggingFace repository, a remote FTP server, or any other distant data source. The only requirement is the
+ability to read explicit byte ranges from the data source. As such the user must provide two functions to
+produce a functional Reader:
+
+- `func len() -> usize`: The number of bytes in the data source.
+- `func read(offset: usize, length: usize) -> io::Result<Bytes>`: Read a byte range from the data source.
+
+**Note**: It is up to the Developer to ensure access patterns made by the Reader are efficient for the underlying
+data source (i.e. managing rate limits, connection pooling, etc.). SparseIO will not attempt to optimize access patterns
+for the Reader to keep genericity and flexibility.
+
+## Writer
+
+The Writer component is responsible for writing data to the downstream cache in order to optimize access
+speeds on future reads. This could be a local disk, a remote cache server, or any other location where data
+can be written. As such the user is expected to provide three functions to produce a functional Writer:
+
+- `func write(key: &str, offset: usize, data: Bytes)`: Write a byte range to the cache.
+- `func read(key: &str, offset: usize, length: usize) -> Result<Option<Bytes>>`: Read a byte range from the cache.
+- `func delete(key: &str)`: Delete a key from the cache.
+
+## Metadata
+
+The Metadata component is responsible for keeping track of the data in the cache, the state of cache coverage for
+individual data sources, and other relevant metadata for the application. As such it is just a generic interface
+to a key-value store and the user is expected to provide three functions to produce a functional Metadata store:
+
+- `func get(key: &str) -> Result<Option<Bytes>>`: Get a value from the metadata store.
+- `func set(key: &str, value: Bytes)`: Set a value in the metadata store.
+- `func delete(key: &str)`: Delete a key from the metadata store.
+
+## Sample User Application Diagram
+
+<img src="../static/sparseio-sample-implementation.png" alt="User implementation architecture diagram" width="1000"/>
diff --git a/docs/architecture/CAS.md b/docs/architecture/CAS.md
@@ -0,0 +1,29 @@
+# Content-Addressable Storage (CAS)
+
+In order to optimize our cache storage efficiency we utilize a content-addressable storage (CAS) system.
+Large storage services take advantage of this for files that may be duplicated with only partial differences.
+This can be especially useful for AI/ML workloads and is exactly how HuggingFace stores their models and
+datasets through their XET architecture. It also has its uses in storage of database backups, ISOs, and more.
+
+Below you can see a simple example of how CAS deduplicates data. In this example we have two documents that are
+mostly the same except for the middle paragraph. Rather than caching 6 chunks of data total (3 for each document),
+we are able to dedupe the first and last chunk, resulting in only 4 chunks of data being stored in our cache. In
+something like a SFT (Supervised Finetuned) model, this can be a huge space saver as large amounts of tensors may
+remain unchanged, or for full database backup where only a few records may have changed since the last backup.
+
+<img src="../static/sparseio-cas-split-diagram.png" alt="CAS Example" width="1200"/>
+
+In our architecture when a read request is initially made for a file we first check in our Metadata store to see
+if we have the data for that file at the requested offset, this is precisely why chunk lengths are declared immutable,
+we assert that you cannot construct a SparseIO instance from a metadata store tracking a different chunk size.
+If we don't have the chuk we retrieve the data using a Reader and calculate the SHA256 hash of the requested byte range
+and check if it exists already in our cache. If it does we can just map the key for that specific offset in the file to
+the hash of the respective chunk. Otherwise we use our Writer to write the chunk to cache while also mapping the key.
+This is where the content-addressable part comes in, we are using the hash of the content to map it as a location
+in our cache.
+
+The problem with this approach however is that we are unable to invalidate cache when a file is removed
+or an eviction policy is triggered. As such we use a separate key in our metadata store to keep track of the
+reference count for each hash in our cache. If we attempt to delete a chunk from cache and the reference count
+is greater than 1 we just decrement the reference count and leave the chunk in cache. In the case that it ends up
+being 1 we know we can safely delete the chunk.
diff --git a/docs/architecture/OBJECTS.md b/docs/architecture/OBJECTS.md
@@ -0,0 +1,55 @@
+# Construction
+
+## SparseIO Object
+
+The SparseIO object is the main interface for users of the library. It provides the
+backing logic management behind the interaction between the `Reader`, `Writer`,
+and `Metadata` objects to maintain the abstraction from the user.
+
+When a user creates a SparseIO object, they do it through a SparseIOBuilder, which
+is a [builder pattern](https://www.lurklurk.org/effective-rust/builders.html) making
+it easy to construct the object without having to worry about defaults.
+
+Three things are required to construct a SparseIO object:
+
+- A [Writer](./API.md#Writer) implementation
+- A [Metadata](./API.md#Metadata) implementation
+- A [ReaderRegistry](#ReaderRegistry) implementation
+
+There are some other tunable parameters that can be set in the builder, such as the
+`chunk_size`, control over prefetching behavior, etc. However these are explained a
+bit further in the [SparseIOBuilder docs](http://docs.rs/sparseio/latest/sparseio/struct.SparseIOBuilder.html).
+
+## ReaderRegistry
+
+An important design decision of SparseIO was how to cleanly support the construction
+and reconstruction of SparseIO objects when our metadata must be serializable. To
+overcome this challenge we designed a type called a `ReaderRegistry`, which is
+essentially a mapping of a `Reader` implementation to a recipe for reconstruction
+based on serialized metadata.
+
+This does leave it up to the user to ensure that they properly manage the `ReaderRegistry`
+to ensure that existing metadata is fully supported by all `Reader` implementations
+in the registry.
+
+This also defines how users do interact with the `Reader` itself when attempting to
+read an object. For example, if a user constructed a `Reader` dedicated to reading from their
+website that just takes raw paths from a URI defined explicitly in their reader the
+registry key could be "mysite". As such, the canonicalized path for interacting with this
+reader would be `mysite+/path/to/object`. This allows users to easily manage multiple readers
+with different sources in one SparseIO object.
+
+## Viewer
+
+The Viewer is a non-user contructable object but is one of the most interacted with
+objects in our architecture. It provides the API abstracting the grunt-work we
+manage behind the scenes in the interactions between the `Reader`, `Writer`, and `Metadata`
+objects.
+
+When a user attempts to read an object through the `SparseIO` object through `SparseIO::open`
+(e.g. `open("mysite+/path/to/object")`) they are actually getting back a `Viewer` object.
+The Viewer provides a couple of core methods for interacting with the data including:
+
+- `read(offset: usize, length: usize) -> io::Result<Bytes>`: Read a byte range from the object.
+- `len() -> usize`: Get the total length of the object in bytes.
+- `to_bytestream() -> impl Stream<Item = io::Result<Bytes>>`: Convert the Viewer to a bytestream for easier integration with async applications.
diff --git a/docs/architecture/index.md b/docs/architecture/index.md
@@ -0,0 +1,24 @@
+# SparseIO General Architecture
+
+## Overview
+
+The general goal of SparseIO is to provide a flexible framework for building applications
+managing large and complex data where access patterns are non-linear. This includes applications
+in scientific computing, machine learning, and data analytics.
+
+The goal is to make it as easy as possible for a developer to choose one-or-more upstream data
+sources to read from ([Reader](./API.md#Reader)), a downstream location to write cache to ([Writer](./API.md#Writer)),
+and a metadata management KV store to keep track of the data ([Metadata](./API.md#Metadata)).
+
+## Table of Contents
+
+- [Core Objects](./OBJECTS.md): An explanation of the core objects in our architecture, and
+  the decisions behind their design.
+  - [SparseIO Object](./OBJECTS.md#SparseIOObject)
+  - [ReaderRegistry](./OBJECTS.md#ReaderRegistry)
+  - [Viewer](./OBJECTS.md#Viewer)
+- [Trait API](./API.md): An explanation of the Developer-facing API for leveraging SparseIO with custom
+  resources.
+  - [Reader](./API.md#Reader)
+  - [Writer](./API.md#Writer)
+  - [Metadata](./API.md#Metadata)
diff --git a/docs/index.md b/docs/index.md
@@ -3,3 +3,4 @@
 ## Table of Contents
 
 - [CI Pipelines](./CI.md): Documentation on GitHub Actions pipelines
+- [Architecture Overview](./architecture/index.md): An overview of the architecture of SparseIO
Original file line number	Diff line number	Diff line change
Expand Up		@@ -3,3 +3,4 @@
		## Table of Contents

		- [CI Pipelines](./CI.md): Documentation on GitHub Actions pipelines
		- [Architecture Overview](./architecture/index.md): An overview of the architecture of SparseIO