Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions src/chunking.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/**
* Chunking for memories that don't fit in a single Algorand note.
*
* Algorand caps tx notes at 1024 bytes. Subtract the JSON envelope
* (~150 bytes with book/page/total fields) and the encryption layer
* (envelope adds ~40 bytes, base64 inflates by 4/3) and you have
* ~600 bytes of plaintext per chunk that reliably fit.
*
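* On save: the caller decides whether to chunk. `chunkValue` splits on
* a fixed byte boundary, backing up to the previous codepoint start when
* the cut would land inside a multi-byte character. `joinChunks` simply
* concatenates the chunks in the order given, so callers must sort by
* `page` ascending before joining.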
*
* On recall: callers collect all txs/ASAs for a given key, group by the
* save's `created` timestamp (one save = one book), require all pages
* to be present, and concatenate.
*/

/**
* Max plaintext bytes per chunk. Conservative — leaves headroom for
* UTF-8 multi-byte expansion, envelope JSON, and the encryption
* envelope overhead in `@corvidlabs/ts-algochat`.
*
* The limit is compared against the value's UTF-8 byte length, not its
* JS string length (`chunkValue` and `needsChunking` both measure bytes).
*
* Rough budget using the estimates in the file header, assuming the
* pipeline is encrypt -> base64 -> wrap in JSON (TODO confirm order):
* (600 + ~40) * 4/3 is about 854 bytes, plus ~150 bytes of JSON envelope
* is about 1004 bytes, which stays under the 1024-byte note cap.
*/
export const MAX_CLEARTEXT_PER_CHUNK = 600;

/**
* Split `value` into N chunks of at most `MAX_CLEARTEXT_PER_CHUNK`
* bytes each. Operates on UTF-8 byte boundaries — if a multi-byte
* codepoint straddles the cut, the chunk boundary is pushed back to
* the prior codepoint start so we never produce invalid UTF-8.
*/
export function chunkValue(value: string): string[] {
const bytes = Buffer.from(value, "utf-8");
if (bytes.length <= MAX_CLEARTEXT_PER_CHUNK) return [value];

const chunks: string[] = [];
let offset = 0;
while (offset < bytes.length) {
let end = Math.min(offset + MAX_CLEARTEXT_PER_CHUNK, bytes.length);
// Walk back if we landed in the middle of a UTF-8 continuation byte
// (0b10xxxxxx, i.e. (byte & 0xC0) === 0x80). We stop walking once
// we hit a leading byte; this caps regression at 3 bytes.
while (end < bytes.length && (bytes[end] & 0xc0) === 0x80) end--;
chunks.push(bytes.slice(offset, end).toString("utf-8"));
offset = end;
}
Comment on lines +37 to +45
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The chunkValue function is susceptible to an infinite loop if MAX_CLEARTEXT_PER_CHUNK is set to a value smaller than the byte length of a single UTF-8 character (e.g., if it were reduced to 2 for testing or future changes). If end walks back all the way to offset, the offset will never advance. While 600 bytes is plenty for any valid UTF-8 character (max 4 bytes), adding a guard ensures robustness against configuration changes or malformed input.

  while (offset < bytes.length) {
    let end = Math.min(offset + MAX_CLEARTEXT_PER_CHUNK, bytes.length);
    // Walk back if we landed in the middle of a UTF-8 continuation byte
    // (0b10xxxxxx, i.e. (byte & 0xC0) === 0x80). We stop walking once
    // we hit a leading byte; this caps regression at 3 bytes.
    while (end > offset && end < bytes.length && (bytes[end] & 0xc0) === 0x80) end--;
    
    // If the chunk size is too small to fit even one full character, 
    // force advance to avoid an infinite loop.
    if (end === offset) end = Math.min(offset + MAX_CLEARTEXT_PER_CHUNK, bytes.length);

    chunks.push(bytes.slice(offset, end).toString("utf-8"));
    offset = end;
  }

return chunks;
}

/**
 * Concatenate chunk strings back into a single value.
 *
 * The chunks are glued together in exactly the order they are given; no
 * sorting or validation happens here, so the caller must pass the pages
 * already ordered (page 1..N).
 *
 * @param chunks - Chunk strings in page order.
 * @returns The concatenation of all chunks (`""` for an empty array).
 */
export function joinChunks(chunks: string[]): string {
  return chunks.reduce((assembled, piece) => assembled + piece, "");
}

/**
 * Reports whether `value` is too large for a single chunk.
 *
 * The size is measured as the UTF-8 encoded byte length, so a string
 * with multi-byte characters can need chunking even when its JS
 * `.length` is below the limit.
 *
 * @param value - The plaintext to measure.
 * @returns `true` when the byte length exceeds `MAX_CLEARTEXT_PER_CHUNK`.
 */
export function needsChunking(value: string): boolean {
  const encodedSize = Buffer.byteLength(value, "utf-8");
  return encodedSize > MAX_CLEARTEXT_PER_CHUNK;
}
Binary file modified src/permanent.ts
Binary file not shown.
83 changes: 83 additions & 0 deletions test/chunking.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import { test, expect, describe } from "bun:test";
import { chunkValue, joinChunks, needsChunking, MAX_CLEARTEXT_PER_CHUNK } from "../src/chunking.js";

// Behavioural tests for `chunkValue`: size boundaries, round-tripping
// through `joinChunks`, and UTF-8 safety at chunk boundaries.
describe("chunkValue", () => {
  test("short input returns a single chunk", () => {
    const v = "hello world";
    const chunks = chunkValue(v);
    expect(chunks).toEqual([v]);
  });

  test("input at exactly MAX_CLEARTEXT_PER_CHUNK returns single chunk", () => {
    const v = "a".repeat(MAX_CLEARTEXT_PER_CHUNK);
    const chunks = chunkValue(v);
    expect(chunks.length).toBe(1);
    expect(chunks[0].length).toBe(MAX_CLEARTEXT_PER_CHUNK);
  });

  test("input one byte over the limit produces two chunks", () => {
    const v = "a".repeat(MAX_CLEARTEXT_PER_CHUNK + 1);
    const chunks = chunkValue(v);
    expect(chunks.length).toBe(2);
    expect(chunks[0].length).toBe(MAX_CLEARTEXT_PER_CHUNK);
    expect(chunks[1].length).toBe(1);
  });

  test("3000-byte input produces 5 chunks of 600 bytes", () => {
    const v = "a".repeat(3000);
    const chunks = chunkValue(v);
    expect(chunks.length).toBe(5);
    for (const c of chunks) {
      expect(Buffer.byteLength(c, "utf-8")).toBeLessThanOrEqual(MAX_CLEARTEXT_PER_CHUNK);
    }
  });

  test("round-trip: chunkValue → joinChunks preserves ASCII content", () => {
    const v = "x".repeat(2500);
    expect(joinChunks(chunkValue(v))).toBe(v);
  });

  test("multi-byte UTF-8 codepoints are never split mid-character", () => {
    // 🌟 is 4 bytes in UTF-8. A bare run of them only exercises the
    // walk-back when the chunk limit is not a multiple of 4 (with the
    // current 600 it never does). Prefixing 0–3 ASCII bytes shifts the
    // alignment so that, for any limit >= 4, at least one variant forces
    // the cut to land inside a codepoint.
    for (const prefix of ["", "a", "ab", "abc"]) {
      const v = prefix + "🌟".repeat(200); // 800–803 bytes
      const chunks = chunkValue(v);
      expect(chunks.length).toBeGreaterThan(1);
      for (const c of chunks) {
        // A chunk cut mid-codepoint decodes with U+FFFD replacement chars.
        // (Re-encoding and decoding `c` would always round-trip, so the
        // replacement-character check is the meaningful one.)
        expect(c).not.toContain("\uFFFD");
        // Walking the boundary back must never make a chunk exceed the cap.
        expect(Buffer.byteLength(c, "utf-8")).toBeLessThanOrEqual(MAX_CLEARTEXT_PER_CHUNK);
      }
      expect(joinChunks(chunks)).toBe(v);
    }
  });

  test("round-trip preserves mixed ASCII + emoji + CJK + accented", () => {
    const segment = "Hello 世界 ñoño 🚀 — مرحبا — ";
    const v = segment.repeat(60);
    expect(joinChunks(chunkValue(v))).toBe(v);
  });

  test("empty string returns one empty chunk", () => {
    expect(chunkValue("")).toEqual([""]);
  });
});

// `needsChunking` compares UTF-8 byte length (not JS string length)
// against MAX_CLEARTEXT_PER_CHUNK; these cases pin both sides of the limit.
describe("needsChunking", () => {
  test("short ASCII does not need chunking", () => {
    const verdict = needsChunking("hello");
    expect(verdict).toBe(false);
  });

  test("input at boundary does not need chunking", () => {
    const atLimit = "a".repeat(MAX_CLEARTEXT_PER_CHUNK);
    expect(needsChunking(atLimit)).toBe(false);
  });

  test("input over boundary needs chunking", () => {
    const overLimit = "a".repeat(MAX_CLEARTEXT_PER_CHUNK + 1);
    expect(needsChunking(overLimit)).toBe(true);
  });

  test("UTF-8 multi-byte expansion can push a short string over", () => {
    // Each 🌟 encodes to 4 UTF-8 bytes but counts as 2 UTF-16 code units.
    const stars = "🌟".repeat(200);
    // 200 emoji = 800 bytes, which is over the limit...
    expect(needsChunking(stars)).toBe(true);
    // ...even though the JS .length is only 400, well under MAX.
    expect(stars.length).toBe(400);
  });
});
99 changes: 99 additions & 0 deletions test/permanent-reassemble.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import { test, expect, describe } from "bun:test";
import { __test } from "../src/permanent.js";

const { reassemble } = __test;

/**
 * Local shape of the entries fed to `reassemble` in these tests.
 * NOTE(review): presumably mirrors the entry type in `src/permanent` —
 * confirm against that module.
 */
interface E {
  key: string;
  value: string;
  txid: string;
  /** Save timestamp; the tests treat entries with different `created` as separate saves. */
  created: string;
  round: number;
  tombstone: boolean;
  // Chunking metadata; the single-chunk and tombstone fixtures in these tests omit it.
  book?: string;
  /** 1-based page index within a multi-page save. */
  page?: number;
  /** Number of pages the save was split into. */
  total?: number;
}

/**
 * Fixture factory: builds an entry from fixed defaults, with any fields
 * in `over` taking precedence.
 */
const ent = (over: Partial<E>): E => {
  const defaults: E = {
    key: "k",
    value: "",
    txid: "tx",
    created: "2026-05-18T00:00:00Z",
    round: 1,
    tombstone: false,
  };
  return { ...defaults, ...over };
};

// Exercises `reassemble`: single-chunk and tombstone entries pass through,
// complete page sets are merged into one record, incomplete sets are dropped.
describe("permanent reassemble", () => {
  test("legacy single-chunk entries pass through unchanged", () => {
    const entries: E[] = [ent({ key: "a", value: "hello", round: 5 })];
    const result = reassemble(entries);
    expect(result).toEqual(entries);
  });

  test("tombstones pass through unchanged", () => {
    const entries: E[] = [ent({ key: "a", tombstone: true, round: 10 })];
    const result = reassemble(entries);
    expect(result).toEqual(entries);
  });

  test("two pages with matching key+created are joined in page order", () => {
    // Deliberately supplied out of order: page 2 first, then page 1.
    const pageTwo = ent({ key: "k", value: "World", round: 2, book: "k", page: 2, total: 2, txid: "tx2" });
    const pageOne = ent({ key: "k", value: "Hello ", round: 1, book: "k", page: 1, total: 2, txid: "tx1" });
    const entries: E[] = [pageTwo, pageOne];

    const result = reassemble(entries);

    expect(result.length).toBe(1);
    expect(result[0].value).toBe("Hello World");
    expect(result[0].round).toBe(2); // max round across pages
    expect(result[0].total).toBe(2);
  });

  test("missing pages drop the whole record (not partial)", () => {
    // total=3 but only pages 1 and 3 are present
    const entries: E[] = [
      ent({ key: "k", value: "A", page: 1, total: 3 }),
      ent({ key: "k", value: "C", page: 3, total: 3 }),
    ];
    const result = reassemble(entries);
    expect(result.length).toBe(0);
  });

  test("two separate saves of same key produce two reassembled records", () => {
    // Two saves of the same key, two chunks each, at different timestamps.
    const olderSave = "2026-05-17T00:00:00Z";
    const newerSave = "2026-05-18T00:00:00Z";
    const entries: E[] = [
      ent({ key: "k", value: "OldA", created: olderSave, round: 1, page: 1, total: 2 }),
      ent({ key: "k", value: "OldB", created: olderSave, round: 2, page: 2, total: 2 }),
      ent({ key: "k", value: "NewA", created: newerSave, round: 5, page: 1, total: 2 }),
      ent({ key: "k", value: "NewB", created: newerSave, round: 6, page: 2, total: 2 }),
    ];

    const result = reassemble(entries);

    expect(result.length).toBe(2);
    // Both reassembled — caller's "latest by round" picks the newer one
    const joinedValues = result.map(record => record.value).sort();
    expect(joinedValues).toEqual(["NewANewB", "OldAOldB"]);
  });

  test("mixed single-chunk and multi-chunk entries are both preserved", () => {
    const entries: E[] = [
      ent({ key: "single", value: "lonely" }),
      ent({ key: "multi", value: "X", page: 1, total: 2 }),
      ent({ key: "multi", value: "Y", page: 2, total: 2 }),
    ];

    const result = reassemble(entries);

    expect(result.length).toBe(2);
    const valueByKey = Object.fromEntries(result.map(record => [record.key, record.value]));
    expect(valueByKey).toEqual({ single: "lonely", multi: "XY" });
  });

  test("page numbering must be contiguous 1..total — gap drops the record", () => {
    // total=3 but pages [1, 1, 3] — duplicate page 1, missing page 2
    const entries: E[] = [
      ent({ key: "k", value: "A", page: 1, total: 3, txid: "t1" }),
      ent({ key: "k", value: "A'", page: 1, total: 3, txid: "t2" }),
      ent({ key: "k", value: "C", page: 3, total: 3, txid: "t3" }),
    ];
    const result = reassemble(entries);
    expect(result.length).toBe(0);
  });
});
Loading