-
Notifications
You must be signed in to change notification settings - Fork 0
feat(permanent): chunking for values > 600 bytes (book/page envelope) #8
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,60 @@ | ||
| /** | ||
| * Chunking for memories that don't fit in a single Algorand note. | ||
| * | ||
| * Algorand caps tx notes at 1024 bytes. Subtract the JSON envelope | ||
| * (~150 bytes with book/page/total fields) and the encryption layer | ||
| * (envelope adds ~40 bytes, base64 inflates by 4/3) and you have | ||
| * ~600 bytes of plaintext per chunk that reliably fit. | ||
| * | ||
| * On save: the caller decides whether to chunk. `chunkValue` splits at | ||
| * a fixed byte budget, backing up to a codepoint boundary; `joinChunks` | ||
| * concatenates pages the caller has already ordered by `page` ascending. | ||
| * | ||
| * On recall: callers collect all txs/ASAs for a given key, group by the | ||
| * save's `created` timestamp (one save = one book), require all pages | ||
| * to be present, and concatenate. | ||
| */ | ||
|
|
||
/**
 * Max plaintext bytes per chunk, measured in UTF-8 bytes. Conservative:
 * leaves headroom for the envelope JSON and the encryption envelope
 * overhead in `@corvidlabs/ts-algochat`.
 */
export const MAX_CLEARTEXT_PER_CHUNK = 600;

/**
 * Split `value` into chunks of at most `MAX_CLEARTEXT_PER_CHUNK` UTF-8
 * bytes each. If a multi-byte codepoint straddles a cut, the boundary is
 * pulled back to that codepoint's leading byte, so a chunk never ends
 * mid-character and every chunk decodes cleanly.
 *
 * @param value - Text to split.
 * @returns `[value]` unchanged when it already fits (including the empty
 *   string); otherwise the chunks in order, whose concatenation
 *   reproduces the encoded text.
 * @throws RangeError if a chunk boundary cannot advance, which can only
 *   happen when the byte budget is smaller than a single encoded
 *   codepoint. Without this check the loop would spin forever.
 */
export function chunkValue(value: string): string[] {
  const bytes = Buffer.from(value, "utf-8");
  if (bytes.length <= MAX_CLEARTEXT_PER_CHUNK) return [value];

  const chunks: string[] = [];
  let offset = 0;
  while (offset < bytes.length) {
    let end = Math.min(offset + MAX_CLEARTEXT_PER_CHUNK, bytes.length);
    // `end` is exclusive. If the byte at `end` is a UTF-8 continuation
    // byte (0b10xxxxxx), the cut falls inside a codepoint, so step back
    // until `end` sits on that codepoint's leading byte.
    while (end < bytes.length && (bytes[end] & 0xc0) === 0x80) end--;
    if (end <= offset) {
      // The whole window was one partial codepoint: no progress possible.
      throw new RangeError(
        `chunk limit of ${MAX_CLEARTEXT_PER_CHUNK} bytes is too small to hold the codepoint at byte offset ${offset}`,
      );
    }
    // `subarray` is the non-deprecated equivalent of `Buffer.slice`.
    chunks.push(bytes.subarray(offset, end).toString("utf-8"));
    offset = end;
  }
  return chunks;
}
|
|
||
/**
 * Reassemble pages back into the original value by concatenating them
 * in the order given. No sorting happens here: the caller is responsible
 * for passing the pages in correct order (page 1..N).
 *
 * @param chunks - Pages in ascending page order. The array is only read,
 *   so readonly arrays and tuples are accepted.
 * @returns The concatenated value; an empty array yields `""`.
 */
export function joinChunks(chunks: readonly string[]): string {
  return chunks.join("");
}
|
|
||
/**
 * Heuristic: reports whether `value` is too large for a single chunk,
 * i.e. whether its UTF-8 encoding is longer than
 * `MAX_CLEARTEXT_PER_CHUNK` bytes. The size is counted in encoded bytes,
 * not in JS string length.
 */
export function needsChunking(value: string): boolean {
  const encodedBytes = Buffer.byteLength(value, "utf-8");
  return encodedBytes > MAX_CLEARTEXT_PER_CHUNK;
}
Binary file not shown.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,83 @@ | ||
| import { test, expect, describe } from "bun:test"; | ||
| import { chunkValue, joinChunks, needsChunking, MAX_CLEARTEXT_PER_CHUNK } from "../src/chunking.js"; | ||
|
|
||
// Behavioural tests for `chunkValue` and its round trip through `joinChunks`.
describe("chunkValue", () => {
  test("short input returns a single chunk", () => {
    const v = "hello world";
    const chunks = chunkValue(v);
    expect(chunks).toEqual([v]);
  });

  test("input at exactly MAX_CLEARTEXT_PER_CHUNK returns single chunk", () => {
    const v = "a".repeat(MAX_CLEARTEXT_PER_CHUNK);
    const chunks = chunkValue(v);
    expect(chunks.length).toBe(1);
    expect(chunks[0].length).toBe(MAX_CLEARTEXT_PER_CHUNK);
  });

  test("input one byte over the limit produces two chunks", () => {
    const v = "a".repeat(MAX_CLEARTEXT_PER_CHUNK + 1);
    const chunks = chunkValue(v);
    expect(chunks.length).toBe(2);
    expect(chunks[0].length).toBe(MAX_CLEARTEXT_PER_CHUNK);
    expect(chunks[1].length).toBe(1);
  });

  test("3000-byte input produces 5 chunks of 600 bytes", () => {
    const v = "a".repeat(3000);
    const chunks = chunkValue(v);
    expect(chunks.length).toBe(5);
    for (const c of chunks) {
      // Pin the exact size the test name promises, not just an upper bound.
      expect(Buffer.byteLength(c, "utf-8")).toBe(600);
    }
  });

  test("round-trip: chunkValue → joinChunks preserves ASCII content", () => {
    const v = "x".repeat(2500);
    expect(joinChunks(chunkValue(v))).toBe(v);
  });

  test("multi-byte UTF-8 codepoints are never split mid-character", () => {
    // 🌟 is 4 bytes in UTF-8. The leading "a" shifts every emoji off the
    // 4-byte grid, so the cut at the byte limit lands inside an emoji and
    // the boundary has to be walked back. Without the shift, a limit that
    // is a multiple of 4 always cuts on a leading byte and the walk-back
    // is never exercised.
    const v = "a" + "🌟".repeat(200); // 801 bytes
    const chunks = chunkValue(v);
    expect(chunks.length).toBeGreaterThan(1);
    for (const c of chunks) {
      expect(Buffer.byteLength(c, "utf-8")).toBeLessThanOrEqual(MAX_CLEARTEXT_PER_CHUNK);
      // A split codepoint would decode to U+FFFD replacement chars.
      expect(c).not.toContain("\uFFFD");
    }
    expect(joinChunks(chunks)).toBe(v);
  });

  test("round-trip preserves mixed ASCII + emoji + CJK + accented", () => {
    const segment = "Hello 世界 ñoño 🚀 — مرحبا — ";
    const v = segment.repeat(60);
    expect(joinChunks(chunkValue(v))).toBe(v);
  });

  test("empty string returns one empty chunk", () => {
    expect(chunkValue("")).toEqual([""]);
  });
});
|
|
||
// `needsChunking` compares encoded UTF-8 size, not JS string length,
// against MAX_CLEARTEXT_PER_CHUNK.
describe("needsChunking", () => {
  test("short ASCII does not need chunking", () => {
    const verdict = needsChunking("hello");
    expect(verdict).toBe(false);
  });

  test("input at boundary does not need chunking", () => {
    const atLimit = "a".repeat(MAX_CLEARTEXT_PER_CHUNK);
    expect(needsChunking(atLimit)).toBe(false);
  });

  test("input over boundary needs chunking", () => {
    const overLimit = "a".repeat(MAX_CLEARTEXT_PER_CHUNK + 1);
    expect(needsChunking(overLimit)).toBe(true);
  });

  test("UTF-8 multi-byte expansion can push a short string over", () => {
    // Each 🌟 is 4 bytes in UTF-8, so 200 of them encode to 800 bytes.
    const stars = "🌟".repeat(200);
    expect(needsChunking(stars)).toBe(true);
    // Yet the JS .length of the same string is only 400, well under MAX.
    expect(stars.length).toBe(400);
  });
});
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,99 @@ | ||
| import { test, expect, describe } from "bun:test"; | ||
| import { __test } from "../src/permanent.js"; | ||
|
|
||
| const { reassemble } = __test; | ||
|
|
||
/** Entry shape fed to `reassemble` in this suite. */
interface E {
  key: string;
  value: string;
  txid: string;
  created: string;
  round: number;
  tombstone: boolean;
  // Chunk metadata: absent on the single-chunk fixtures below.
  book?: string;
  page?: number;
  total?: number;
}

/**
 * Fixture factory: returns a default entry with any fields in `over`
 * replacing the defaults.
 */
function ent(over: Partial<E>): E {
  const defaults: E = {
    key: "k",
    value: "",
    txid: "tx",
    created: "2026-05-18T00:00:00Z",
    round: 1,
    tombstone: false,
  };
  return { ...defaults, ...over };
}
|
|
||
// Exercises `reassemble`: single entries and tombstones pass through,
// complete page sets are joined, incomplete sets are dropped.
describe("permanent reassemble", () => {
  test("legacy single-chunk entries pass through unchanged", () => {
    const entries: E[] = [ent({ key: "a", value: "hello", round: 5 })];
    const result = reassemble(entries);
    expect(result).toEqual(entries);
  });

  test("tombstones pass through unchanged", () => {
    const entries: E[] = [ent({ key: "a", tombstone: true, round: 10 })];
    const result = reassemble(entries);
    expect(result).toEqual(entries);
  });

  test("two pages with matching key+created are joined in page order", () => {
    // Deliberately supplied out of order: page 2 first, then page 1.
    const secondPage = ent({ key: "k", value: "World", round: 2, book: "k", page: 2, total: 2, txid: "tx2" });
    const firstPage = ent({ key: "k", value: "Hello ", round: 1, book: "k", page: 1, total: 2, txid: "tx1" });
    const merged = reassemble([secondPage, firstPage]);
    expect(merged).toHaveLength(1);
    expect(merged[0].value).toBe("Hello World");
    expect(merged[0].round).toBe(2); // highest round among the pages
    expect(merged[0].total).toBe(2);
  });

  test("missing pages drop the whole record (not partial)", () => {
    // Three pages declared, but page 2 never arrives.
    const partial: E[] = [
      ent({ key: "k", value: "A", page: 1, total: 3 }),
      ent({ key: "k", value: "C", page: 3, total: 3 }),
    ];
    expect(reassemble(partial)).toHaveLength(0);
  });

  test("two separate saves of same key produce two reassembled records", () => {
    // Two saves of the same key, two chunks each, told apart by `created`.
    const olderSave = "2026-05-17T00:00:00Z";
    const newerSave = "2026-05-18T00:00:00Z";
    const entries: E[] = [
      ent({ key: "k", value: "OldA", created: olderSave, round: 1, page: 1, total: 2 }),
      ent({ key: "k", value: "OldB", created: olderSave, round: 2, page: 2, total: 2 }),
      ent({ key: "k", value: "NewA", created: newerSave, round: 5, page: 1, total: 2 }),
      ent({ key: "k", value: "NewB", created: newerSave, round: 6, page: 2, total: 2 }),
    ];
    const records = reassemble(entries);
    expect(records).toHaveLength(2);
    // Both survive; choosing the newest by round is left to the caller.
    const joinedValues = records.map(({ value }) => value);
    joinedValues.sort();
    expect(joinedValues).toEqual(["NewANewB", "OldAOldB"]);
  });

  test("mixed single-chunk and multi-chunk entries are both preserved", () => {
    const entries: E[] = [
      ent({ key: "single", value: "lonely" }),
      ent({ key: "multi", value: "X", page: 1, total: 2 }),
      ent({ key: "multi", value: "Y", page: 2, total: 2 }),
    ];
    const records = reassemble(entries);
    expect(records).toHaveLength(2);
    const valueByKey: Record<string, string> = {};
    for (const { key, value } of records) {
      valueByKey[key] = value;
    }
    expect(valueByKey).toEqual({ single: "lonely", multi: "XY" });
  });

  test("page numbering must be contiguous 1..total — gap drops the record", () => {
    // Three entries for total=3, but page 1 appears twice and page 2 is absent.
    const entries: E[] = [
      ent({ key: "k", value: "A", page: 1, total: 3, txid: "t1" }),
      ent({ key: "k", value: "A'", page: 1, total: 3, txid: "t2" }),
      ent({ key: "k", value: "C", page: 3, total: 3, txid: "t3" }),
    ];
    expect(reassemble(entries)).toHaveLength(0);
  });
});
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `chunkValue` function is susceptible to an infinite loop if `MAX_CLEARTEXT_PER_CHUNK` is set to a value smaller than the byte length of a single UTF-8 character (e.g., if it were reduced to 2 for testing or future changes). If `end` walks back all the way to `offset`, `offset` will never advance. While 600 bytes is plenty for any valid UTF-8 character (max 4 bytes), adding a guard ensures robustness against configuration changes or malformed input.