@@ -12,8 +12,9 @@ import { LexicalModelTypes } from '@keymanapp/common-types';
 import { deepCopy, KMWString } from "@keymanapp/web-utils";
 
 import { SearchPath } from "./search-path.js";
-import { SearchSpace, TokenInputSource } from "./search-space.js";
+import { SearchSpace, PathInputProperties } from "./search-space.js";
 import { TokenSplitMap } from "./context-tokenization.js";
+import { generateSubsetId } from './tokenization-subsets.js';
 
 import Distribution = LexicalModelTypes.Distribution;
 import LexicalModel = LexicalModelTypes.LexicalModel;
@@ -110,9 +111,13 @@ export class ContextToken {
 
     rawTransformDistributions.forEach((entry) => {
       searchSpace = new SearchPath(searchSpace, entry, {
-        trueTransform: entry[0].sample,
-        inputStartIndex: 0,
-        bestProbFromSet: 1
+        segment: {
+          trueTransform: entry[0].sample,
+          transitionId: entry[0].sample.id,
+          start: 0
+        },
+        bestProbFromSet: 1,
+        subsetId: generateSubsetId()
       });
     });

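For orientation, `PathInputProperties` replaces the old flat `TokenInputSource` shape: the per-keystroke data now sits in a nested `segment` record, alongside set-level metadata. The real declaration lives in search-space.ts and is not shown in this diff; the call sites imply roughly the sketch below (the type names here and all field types are assumptions).

// Sketch only - inferred from this diff's call sites; the actual
// declarations in search-space.ts may differ.
import { LexicalModelTypes } from '@keymanapp/common-types';
import Transform = LexicalModelTypes.Transform;

// Hypothetical name; only the fields below actually appear in the diff.
interface PathSegmentSketch {
  trueTransform: Transform; // the Transform the keystroke actually produced
  transitionId: number;     // mirrors trueTransform.id; the constructor asserts they match
  start: number;            // offset into the transform's insert where this token's
                            // slice begins (successor to the old inputStartIndex)
}

interface PathInputPropertiesSketch {
  segment: PathSegmentSketch;
  bestProbFromSet: number;  // best probability within the represented distribution subset
  subsetId: number;         // from generateSubsetId(); numeric type is an assumption
}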
@@ -124,7 +129,7 @@ export class ContextToken {
    * Call this to record the original keystroke Transforms for the context range
    * corresponding to this token.
    */
-  addInput(inputSource: TokenInputSource, distribution: Distribution<Transform>) {
+  addInput(inputSource: PathInputProperties, distribution: Distribution<Transform>) {
     this._searchSpace = new SearchPath(this._searchSpace, distribution, inputSource);
   }
 
@@ -143,8 +148,8 @@ export class ContextToken {
    * Denotes the original keystroke Transforms comprising the range corresponding
    * to this token.
    */
-  get inputRange() {
-    return this.searchSpace.sourceIdentifiers;
+  get inputSegments() {
+    return this.searchSpace.inputSegments;
   }
 
   /**
@@ -163,15 +168,6 @@ export class ContextToken {
     return this.searchSpace.sourceRangeKey;
   }
 
-  /**
-   * Gets a simple, compact string-based representation of `inputRange`.
-   *
-   * This should only ever be used for debugging purposes.
-   */
-  get sourceText(): string {
-    return this.searchSpace.likeliestSourceText;
-  }
-
   /**
    * Generates text corresponding to the net effects of the most likely inputs
    * received that can correspond to the current instance.
@@ -192,7 +188,7 @@ export class ContextToken {
     // Thus, we don't set the .isWhitespace flag field.
     const resultToken = new ContextToken(lexicalModel);
 
-    let lastSourceInput: TokenInputSource;
+    let lastSourceInput: PathInputProperties;
     let lastInputDistrib: Distribution<Transform>;
     for(const token of tokensToMerge) {
       const inputCount = token.inputCount;
@@ -203,7 +199,7 @@ export class ContextToken {
       }
 
       // Are we re-merging on a previously split transform?
-      if(lastSourceInput?.trueTransform != token.inputRange[0].trueTransform) {
+      if(lastSourceInput?.segment.trueTransform != token.inputSegments[0].segment.trueTransform) {
         if(lastSourceInput) {
           resultToken.addInput(lastSourceInput, lastInputDistrib);
         } // else: there's nothing to add as input
@@ -232,9 +228,9 @@ export class ContextToken {
       // Ignore the last entry for now - it may need to merge with a matching
       // entry in the next token!
       for(let i = startIndex; i < inputCount - 1; i++) {
-        resultToken.addInput(token.inputRange[i], token.searchSpace.inputSequence[i]);
+        resultToken.addInput(token.inputSegments[i], token.searchSpace.inputSequence[i]);
       }
-      lastSourceInput = token.inputRange[inputCount-1];
+      lastSourceInput = token.inputSegments[inputCount-1];
       lastInputDistrib = token.searchSpace.inputSequence[inputCount-1];
     }
 
@@ -257,7 +253,7 @@ export class ContextToken {
 
     // Build an alternate version of the transforms: if we preprocess all deleteLefts,
     // what text remains from each?
-    const alteredSources = preprocessInputSources(this.inputRange);
+    const alteredSources = preprocessInputSources(this.inputSegments);
 
     const blankContext = { left: '', startOfBuffer: true, endOfBuffer: true };
     const splitSpecs = split.matches.slice();
@@ -313,16 +309,18 @@ export class ContextToken {
         };
       });
 
-      const priorSourceInput = overextendedToken.inputRange[lastInputIndex];
+      const priorSourceInput = overextendedToken.inputSegments[lastInputIndex];
       constructingToken.addInput(priorSourceInput, headDistribution);
       tokensFromSplit.push(constructingToken);
 
       constructingToken = new ContextToken(lexicalModel);
       backupToken = new ContextToken(constructingToken);
       constructingToken.addInput({
-        trueTransform: priorSourceInput.trueTransform,
-        inputStartIndex: priorSourceInput.inputStartIndex + extraCharsAdded,
-        bestProbFromSet: priorSourceInput.bestProbFromSet
+        ...priorSourceInput,
+        segment: {
+          ...priorSourceInput.segment,
+          start: priorSourceInput.segment.start + extraCharsAdded
+        }
       }, tailDistribution);
 
       const lenToCommit = lenBeforeLastApply + extraCharsAdded;
@@ -338,34 +336,34 @@ export class ContextToken {
 
       backupToken = new ContextToken(constructingToken);
       lenBeforeLastApply = KMWString.length(currentText.left);
-      currentText = applyTransform(alteredSources[transformIndex].trueTransform, currentText);
-      constructingToken.addInput(this.inputRange[transformIndex], this.searchSpace.inputSequence[transformIndex]);
+      currentText = applyTransform(alteredSources[transformIndex].segment.trueTransform, currentText);
+      constructingToken.addInput(this.inputSegments[transformIndex], this.searchSpace.inputSequence[transformIndex]);
       transformIndex++;
     }
 
     return tokensFromSplit;
   }
 }
 
-export function preprocessInputSources(inputSources: ReadonlyArray<TokenInputSource>) {
+export function preprocessInputSources(inputSources: ReadonlyArray<PathInputProperties>) {
   const alteredSources = deepCopy(inputSources);
   let trickledDeleteLeft = 0;
   for(let i = alteredSources.length - 1; i >= 0; i--) {
     const source = alteredSources[i];
     if(trickledDeleteLeft) {
-      const insLen = KMWString.length(source.trueTransform.insert);
+      const insLen = KMWString.length(source.segment.trueTransform.insert);
       if(insLen <= trickledDeleteLeft) {
-        source.trueTransform.insert = '';
+        source.segment.trueTransform.insert = '';
         trickledDeleteLeft -= insLen;
       } else {
-        source.trueTransform.insert = KMWString.substring(source.trueTransform.insert, 0, insLen - trickledDeleteLeft);
+        source.segment.trueTransform.insert = KMWString.substring(source.segment.trueTransform.insert, 0, insLen - trickledDeleteLeft);
         trickledDeleteLeft = 0;
       }
     }
-    trickledDeleteLeft += source.trueTransform.deleteLeft;
-    source.trueTransform.deleteLeft = 0;
+    trickledDeleteLeft += source.segment.trueTransform.deleteLeft;
+    source.segment.trueTransform.deleteLeft = 0;
   }
 
-  alteredSources[0].trueTransform.deleteLeft = trickledDeleteLeft;
+  alteredSources[0].segment.trueTransform.deleteLeft = trickledDeleteLeft;
   return alteredSources;
}
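To round out this file's changes, here is a hedged walkthrough of the deleteLeft trickling above: each transform's deleteLeft is replayed against earlier inserts, and only an unconsumed remainder survives on the first transform. The `seg` helper and all ids are hypothetical, and the shapes follow the earlier sketch.

import { preprocessInputSources } from './context-token.js'; // path assumed

// Hypothetical helper: wraps a bare Transform in the PathInputProperties shape.
const seg = (id: number, insert: string, deleteLeft: number) => ({
  segment: { trueTransform: { insert, deleteLeft, id }, transitionId: id, start: 0 },
  bestProbFromSet: 1,
  subsetId: 0
});

const altered = preprocessInputSources([
  seg(1, 'ab', 0), // types "ab"
  seg(2, 'c', 0),  // types "c"
  seg(3, 'd', 2)   // erases "bc", then types "d"
]);

// Every deleteLeft is now pre-applied against the earlier inserts:
//   altered[0].segment.trueTransform -> { insert: 'a', deleteLeft: 0 }
//   altered[1].segment.trueTransform -> { insert: '',  deleteLeft: 0 }
//   altered[2].segment.trueTransform -> { insert: 'd', deleteLeft: 0 }
// The net text is unchanged: "ab" + "c" minus two chars plus "d" == "ad".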
@@ -158,16 +158,6 @@ export class ContextTokenization {
     return this.tail.spaceId;
   }
 
-  /**
-   * Returns plain-text strings representing the most probable representation for all
-   * tokens represented by this tokenization instance.
-   *
-   * Intended for debugging use only.
-   */
-  get sourceText() {
-    return this.tokens.map(token => token.sourceText);
-  }
-
   /**
    * Returns a plain-text string representing the most probable representation for all
    * tokens represented by this tokenization instance.
@@ -596,9 +586,13 @@ export class ContextTokenization {
         distribution = distribution.map((mass) => ({sample: { ...mass.sample, deleteLeft: 0 }, p: mass.p }));
       }
       affectedToken.addInput({
-        trueTransform: sourceInput,
-        inputStartIndex: appliedLength,
-        bestProbFromSet: bestProbFromSet
+        segment: {
+          trueTransform: sourceInput,
+          transitionId: sourceInput.id,
+          start: appliedLength
+        },
+        bestProbFromSet: bestProbFromSet,
+        subsetId: pendingTokenization.inputSubsetId
       }, distribution);
       appliedLength += KMWString.length(distribution[0].sample.insert);
 
@@ -10,12 +10,11 @@
 
 import { QueueComparator as Comparator, KMWString, PriorityQueue } from '@keymanapp/web-utils';
 import { LexicalModelTypes } from '@keymanapp/common-types';
-import { applyTransform } from '@keymanapp/models-templates';
 
 import { EDIT_DISTANCE_COST_SCALE, SearchNode, SearchResult } from './distance-modeler.js';
-import { generateSpaceSeed, PathResult, SearchSpace, TokenInputSource } from './search-space.js';
+import { generateSpaceSeed, PathResult, SearchSpace, PathInputProperties } from './search-space.js';
+import { generateSubsetId } from './tokenization-subsets.js';
 
-import Context = LexicalModelTypes.Context;
 import Distribution = LexicalModelTypes.Distribution;
 import LexicalModel = LexicalModelTypes.LexicalModel;
 import ProbabilityMass = LexicalModelTypes.ProbabilityMass;
@@ -30,7 +29,7 @@ export const QUEUE_NODE_COMPARATOR: Comparator<SearchNode> = function(arg1, arg2
 export class SearchPath implements SearchSpace {
   private selectionQueue: PriorityQueue<SearchNode> = new PriorityQueue(QUEUE_NODE_COMPARATOR);
   readonly inputs?: Distribution<Transform>;
-  readonly inputSource?: TokenInputSource;
+  readonly inputSource?: PathInputProperties;
 
   readonly parentSpace: SearchSpace;
   readonly spaceId: number;
@@ -62,6 +61,7 @@ export class SearchPath implements SearchSpace {
   /**
    * Extends an existing SearchSpace (and its correction data) by a keystroke based
    * on a subset of the incoming keystroke's fat-finger distribution.
+   *
    * @param space
    * @param inputs
    * @param srcKeystroke The sample from the incoming distribution that represents data actually
@@ -79,31 +79,35 @@ export class SearchPath implements SearchSpace {
    * @param srcKeystroke Data about the actual context range represented by `inputs` and
    *                     its underlying keystroke.
    */
-  constructor(space: SearchSpace, inputs: Distribution<Transform>, srcKeystroke: TokenInputSource);
-  constructor(arg1: LexicalModel | SearchSpace, inputs?: Distribution<Transform>, inputSource?: TokenInputSource | ProbabilityMass<Transform>) {
+  constructor(space: SearchSpace, inputs: Distribution<Transform>, srcKeystroke: PathInputProperties);
+  constructor(arg1: LexicalModel | SearchSpace, inputs?: Distribution<Transform>, inputSource?: PathInputProperties | ProbabilityMass<Transform>) {
     // If we're taking in a pre-constructed search node, it's got an associated,
     // pre-assigned spaceID - so use that.
     const isExtending = (arg1 instanceof SearchPath);
     this.spaceId = generateSpaceSeed();
 
     // Coerce inputSource to TokenInputSource format.
-    if(inputSource && (inputSource as TokenInputSource).trueTransform == undefined) {
+    if(inputSource && (inputSource as ProbabilityMass<Transform>).sample != undefined) {
       const keystroke = inputSource as ProbabilityMass<Transform>;
       inputSource = {
-        trueTransform: keystroke.sample,
+        segment: {
+          trueTransform: keystroke.sample,
+          transitionId: keystroke.sample.id,
+          start: 0
+        },
         bestProbFromSet: keystroke.p,
-        inputStartIndex: 0
+        subsetId: generateSubsetId()
       }
     };
 
-    const inputSrc = inputSource as TokenInputSource;
+    const inputSrc = inputSource as PathInputProperties;
 
     if(isExtending) {
       const parentSpace = arg1 as SearchSpace;
       const logTierCost = -Math.log(inputSrc.bestProbFromSet);
 
       const transitionId = (inputs?.[0].sample.id);
-      if(transitionId !== undefined && inputSrc.trueTransform.id != transitionId) {
+      if(transitionId !== undefined && inputSrc.segment.transitionId != transitionId) {
         throw new Error("Input distribution and input-source transition IDs must match");
       }

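Given the coercion above, the extending constructor accepts its third argument in two forms. A minimal sketch of both call shapes, assuming dummy ids and probabilities and a pre-existing `space`:

// Paths assumed; the real import locations may differ.
import { SearchPath } from './search-path.js';
import { SearchSpace } from './search-space.js';
import { LexicalModelTypes } from '@keymanapp/common-types';
import Distribution = LexicalModelTypes.Distribution;
import Transform = LexicalModelTypes.Transform;

declare const space: SearchSpace; // any existing search space

// One fat-fingered keystroke: "a" was actually struck; "s" was a near-miss.
const dist: Distribution<Transform> = [
  { sample: { insert: 'a', deleteLeft: 0, id: 11 }, p: 0.9 },
  { sample: { insert: 's', deleteLeft: 0, id: 11 }, p: 0.1 }
];

// Form 1: a bare ProbabilityMass<Transform>; the constructor wraps it into a
// single-segment PathInputProperties with start 0 and a freshly generated subsetId.
const extended = new SearchPath(space, dist, dist[0]);

// Form 2: a pre-built PathInputProperties, e.g. to resume partway (start: 2)
// through a transform whose head an earlier token already consumed.
const resumed = new SearchPath(space, dist, {
  segment: { trueTransform: dist[0].sample, transitionId: 11, start: 2 },
  bestProbFromSet: 0.9,
  subsetId: 42 // would normally carry over the originating subset's id
});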
@@ -198,23 +202,6 @@ export class SearchPath implements SearchSpace {
     }
   }
 
-  get likeliestSourceText(): string {
-    let prefixContext: Context = { left: this.parentSpace?.likeliestSourceText ?? '', startOfBuffer: true, endOfBuffer: true };
-    const inputTransform = this.inputSource?.trueTransform ?? { insert: '', deleteLeft: 0 };
-
-    const excessDeletes = inputTransform.deleteLeft - KMWString.length(prefixContext.left);
-    if(excessDeletes > 0) {
-      prefixContext = {
-        ...prefixContext,
-        // \u{2421} = ␡ (Unicode symbol for Delete)
-        left: '\u{2421}'.repeat(excessDeletes) + prefixContext.left
-      };
-    }
-
-    const result = applyTransform(inputTransform, prefixContext);
-    return result.left;
-  }
-
   get parents() {
     // The SearchPath class may only have a single parent.
     return this.parentSpace ? [this.parentSpace] : [];
@@ -362,15 +349,15 @@ export class SearchPath implements SearchSpace {
     return Object.values(this.returnedValues ?? {}).map(v => new SearchResult(v));
   }
 
-  public get sourceIdentifiers(): TokenInputSource[] {
+  public get inputSegments(): PathInputProperties[] {
     if(!this.parentSpace) {
       return [];
     }
 
-    const parentSources = this.parentSpace.sourceIdentifiers;
+    const parentSources = this.parentSpace.inputSegments;
     if(this.inputSource) {
-      const inputId = this.inputSource.trueTransform.id;
-      if(inputId && parentSources.length > 0 && parentSources[parentSources.length - 1].trueTransform.id == inputId) {
+      const inputId = this.inputSource.segment.transitionId;
+      if(inputId && parentSources.length > 0 && parentSources[parentSources.length - 1].segment.transitionId == inputId) {
         return parentSources;
       }
 
@@ -386,11 +373,11 @@ export class SearchPath implements SearchSpace {
    */
   get sourceRangeKey(): string {
     const components: string[] = [];
-    const sources = this.sourceIdentifiers;
+    const sources = this.inputSegments;
 
     for(const source of sources) {
-      const i = source.inputStartIndex;
-      components.push(`T${source.trueTransform.id}${i != 0 ? '@' + i : ''}`);
+      const i = source.segment.start;
+      components.push(`T${source.segment.transitionId}${i != 0 ? '@' + i : ''}`);
     }
 
     return components.join('+');
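The resulting key is compact: each segment contributes a `T` plus its transition id, and `@offset` is appended only when the segment begins partway through its transform. A standalone mirror of the logic above, for illustration (ids and offsets are hypothetical):

// Mirrors the sourceRangeKey construction above, outside the class.
function rangeKey(segments: { transitionId: number, start: number }[]): string {
  return segments
    .map(s => `T${s.transitionId}${s.start != 0 ? '@' + s.start : ''}`)
    .join('+');
}

// A token built from all of transform 7 plus the tail of transform 9,
// starting at character 2:
console.log(rangeKey([
  { transitionId: 7, start: 0 },
  { transitionId: 9, start: 2 }
])); // -> "T7+T9@2"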