216 changes: 173 additions & 43 deletions lua/diffview/scene/inline_diff.lua
@@ -156,23 +156,13 @@ local function is_word_token(s)
end

-- Classify a UTF-8 character into a subword class for `tokenize`:
-- "lower" (ASCII a-z, plus any non-ASCII byte — see below), "upper"
-- (ASCII A-Z), "digit" (ASCII 0-9), "under" (`_`), or `nil` for ASCII
-- non-word characters.
--
-- ASCII-only classification by design: every multi-byte character and
-- every stray high byte from malformed UTF-8 falls into "lower". This
-- keeps multi-byte runs joined with adjacent ASCII lowercase rather
-- than splitting per-character, and avoids carrying a Unicode category
-- table in pure Lua. Known trade-offs that follow from this:
-- * Accented uppercase (`Ü`, `É`) does not trigger a lower→upper
-- split, so `fooÜber` stays one token.
-- * Unicode punctuation/symbols (`—`, emoji) are not treated as
-- non-word and stay inside word tokens rather than splitting them.
-- ASCII camelCase/PascalCase/snake_case/digit splits — the common case
-- in source code — work as expected. Stray high bytes share the
-- "lower" bucket so classification stays consistent with
-- `is_word_byte`, which treats them as word bytes.
-- "lower" (ASCII a-z), "upper" (ASCII A-Z), "digit" (ASCII 0-9),
-- "under" (`_`), or `nil` for ASCII non-word characters. Multi-byte
-- characters and stray high bytes from malformed UTF-8 bucket as
-- "lower" so non-Latin runs join adjacent ASCII lowercase. Known
-- consequences: accented uppercase (`Ü`, `É`) doesn't trigger a
-- lower→upper split, and Unicode punctuation/symbols stay inside word
-- tokens rather than splitting them.
---@param ch string
---@return "lower"|"upper"|"digit"|"under"|nil
local function subword_class(ch)
@@ -201,33 +191,172 @@ local function subword_class(ch)
return nil
end

-- Tokenize `s` for subword-level intraline diffing. Splits both on
-- ASCII non-word characters AND on subword boundaries within word-char
-- runs: camelCase (`fooBar` → `foo`, `Bar`), acronym→word
-- (`XMLParser` → `XML`, `Parser`), digit↔letter (`error123abc` →
-- `error`, `123`, `abc`), and underscore (`audio_preservation` →
-- `audio`, `_`, `preservation`). Each ASCII non-word character becomes
-- its own token; each subword run becomes one token. Multi-byte chars
-- bucket as lowercase letters; see `subword_class` for the trade-offs.
-- Returns the token list and a parallel byte-range map.
-- Minimum length for `coalesce_hex_runs` to consider folding subword
-- splits back into a single hex-literal token. Sized to catch full
-- SHAs and longer abbreviations without lumping in short identifier
-- suffixes; 7-char abbreviations stay subword-split.
local HEX_TOKEN_MIN_LEN = 8

-- Upper bound on the longest contiguous letter run permitted in a
-- hex-literal candidate. Random hex averages a max letter run of ~3;
-- pseudo-hex words like `cafef00d`, `decade`, `deadbeef`, `cafebabe`
-- exceed the threshold. Hashes that roll a 5+ letter streak skip the
-- coalesce and fall back to the existing `INTRALINE_MAX_HUNKS` gate.
local HEX_TOKEN_MAX_LETTER_RUN = 4

-- True when the concatenation of `tokens[i..j]` (with combined byte
-- length `total_len`) looks like a hash or numeric literal rather
-- than a coincidentally-all-hex identifier. Combines four cheap
-- signals in one O(n) pass over the token slice's bytes:
-- * length >= `HEX_TOKEN_MIN_LEN`,
-- * single-case hex (every letter in `[a-f]` or every letter in
-- `[A-F]`),
-- * digit/letter transitions >= `ceil(length / 4)` (rejects
-- word-prefixed candidates like `decade1234567`),
-- * max contiguous letter run <= `HEX_TOKEN_MAX_LETTER_RUN`
-- (rejects dictionary-shaped runs like `cafef00d1234`).
-- Walking the slice directly lets `coalesce_hex_runs` decide before
-- allocating a merged string, so a non-hex word run (e.g. most
-- camelCase identifiers) bails on its first non-[0-9A-Fa-f] byte.
---@param tokens string[]
---@param i integer
---@param j integer
---@param total_len integer
---@return boolean
local function is_hex_run_in_tokens(tokens, i, j, total_len)
if total_len < HEX_TOKEN_MIN_LEN then
return false
end
local has_lo, has_up = false, false
local prev_class = 0 -- 0 = none, 1 = digit, 2 = letter.
local transitions = 0
local cur_letter_run, max_letter_run = 0, 0
for k = i, j do
local t = tokens[k]
for p = 1, #t do
local b = t:byte(p)
local class
if b >= 0x30 and b <= 0x39 then
class = 1
cur_letter_run = 0
elseif b >= 0x61 and b <= 0x66 then
class = 2
has_lo = true
cur_letter_run = cur_letter_run + 1
if cur_letter_run > max_letter_run then
max_letter_run = cur_letter_run
end
elseif b >= 0x41 and b <= 0x46 then
class = 2
has_up = true
cur_letter_run = cur_letter_run + 1
if cur_letter_run > max_letter_run then
max_letter_run = cur_letter_run
end
else
return false
end
if prev_class ~= 0 and prev_class ~= class then
transitions = transitions + 1
end
prev_class = class
end
end
if has_lo and has_up then
return false
end
if max_letter_run > HEX_TOKEN_MAX_LETTER_RUN then
return false
end
if transitions < math.ceil(total_len / 4) then
return false
end
return true
end
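
-- For illustration, walking the checks above on the 40-char SHA used
-- in the spec, "1c9dfb261b8be35f689c6d83dfd3e92b7f59ecf8": length is
-- 40 >= HEX_TOKEN_MIN_LEN, every letter is lowercase hex, there are 24
-- digit/letter transitions against a required ceil(40 / 4) = 10, and
-- the longest letter run is 3 <= HEX_TOKEN_MAX_LETTER_RUN, so it
-- qualifies. "decade1234567" does not: its 6-letter run exceeds the
-- run cap and its single transition falls short of ceil(13 / 4) = 4.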

-- Thin string wrapper around `is_hex_run_in_tokens`. The production
-- path (`coalesce_hex_runs`) drives the token-slice variant directly;
-- this wrapper exists so the predicate stays exercisable from tests
-- with a single string input.
---@param s string
---@return boolean
local function is_hex_run(s)
return is_hex_run_in_tokens({ s }, 1, 1, #s)
end

-- Walk a fresh `tokenize` token list and merge runs of byte-adjacent
-- word tokens whose joined string passes `is_hex_run_in_tokens`. A
-- 40-char hash that would otherwise fragment into ~25 alternating
-- digit/letter subwords folds back into one token, so a 1:1 hash
-- replacement renders as a whole-token swap instead of slipping under
-- `INTRALINE_MAX_HUNKS` from coincidental subword matches between two
-- unrelated hashes.
--
-- Per-character tokenization was tried first, but `vim.diff --minimal`
-- matches coincidental letters between dissimilar lines (e.g.
-- "something." and "any tracked metric." share 't', 'e', 'm', 'i', '.')
-- and splits the diff into small hunks. Rendered in overleaf style those
-- fragments interleave deleted and inserted text into unreadable
-- character-level noise.
-- The walk is bounded by byte adjacency so it can't span a non-word
-- char: `"v1.0-1c9dfb..."` keeps `v`, dots, and dashes as natural
-- boundaries. Within one source word run the walk is all-or-nothing.
---@param tokens string[]
---@param byte_map { byte: integer, byte_len: integer }[]
---@return string[]
---@return { byte: integer, byte_len: integer }[]
local function coalesce_hex_runs(tokens, byte_map)
local out, out_map = {}, {}
local i = 1
while i <= #tokens do
if not is_word_token(tokens[i]) then
out[#out + 1] = tokens[i]
out_map[#out_map + 1] = byte_map[i]
i = i + 1
else
local j = i
while
j + 1 <= #tokens
and is_word_token(tokens[j + 1])
and byte_map[j + 1].byte == byte_map[j].byte + byte_map[j].byte_len
do
j = j + 1
end

if j > i then
local merged_len = byte_map[j].byte + byte_map[j].byte_len - byte_map[i].byte
if is_hex_run_in_tokens(tokens, i, j, merged_len) then
out[#out + 1] = table.concat(tokens, "", i, j)
out_map[#out_map + 1] = {
byte = byte_map[i].byte,
byte_len = merged_len,
}
else
for k = i, j do
out[#out + 1] = tokens[k]
out_map[#out_map + 1] = byte_map[k]
end
end
else
out[#out + 1] = tokens[i]
out_map[#out_map + 1] = byte_map[i]
end
i = j + 1
end
end
return out, out_map
end
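
-- For illustration with a made-up version string, "v1.0-ab12cd34ef56"
-- first subword-tokenizes as
--   { "v", "1", ".", "0", "-", "ab", "12", "cd", "34", "ef", "56" }.
-- The walk then sees two byte-adjacent word runs: "v".."1" (2 bytes,
-- fails the length gate, kept split) and "ab".."56" (12 bytes of
-- single-case hex, 5 transitions >= ceil(12 / 4) = 3, max letter run
-- 2), which folds back into "ab12cd34ef56". The dot and dash never
-- join either run.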

-- Tokenize `s` for intraline diffing. Splits at ASCII non-word
-- characters and at subword boundaries within word-char runs:
-- camelCase (`fooBar` → `foo`, `Bar`), acronym→word (`XMLParser` →
-- `XML`, `Parser`), digit↔letter (`error123abc` → `error`, `123`,
-- `abc`), and underscore (`audio_preservation` → `audio`, `_`,
-- `preservation`). Each non-word char becomes its own token; each
-- subword run becomes one token. Multi-byte chars bucket as
-- lowercase; see `subword_class`. Returns the token list and a
-- parallel byte-range map.
--
-- Whole-identifier word tokens fixed that but introduced a second
-- failure: a 1:1 token replacement between unrelated identifiers sharing
-- only a structural prefix (`EventStateOpen` / `EventStateClose`)
-- admitted char-level refinement on the prefix-overlap gate, then
-- `vim.diff` latched onto a coincidental letter inside the differing
-- tails (`Open` / `Close` share an `e`) and produced interleaved
-- per-char noise. Subword tokens close the gap: the divergent subword
-- now pairs directly with its replacement, the resulting char-level
-- diff has no shared prefix/suffix to admit refinement, and the hunk
-- renders as a clean whole-token replacement.
-- Subword granularity keeps `vim.diff --minimal` from latching onto
-- coincidental letters in dissimilar lines or in the divergent tails
-- of structurally similar identifiers, both of which would render as
-- per-char noise in overleaf style. A post-pass (`coalesce_hex_runs`)
-- folds the splits back into one token for long hash-like hex runs
-- so git SHAs and similar literals stay atomic.
---@param s string
---@return string[] tokens
---@return { byte: integer, byte_len: integer }[] byte_map
@@ -298,7 +427,7 @@ local function tokenize(s)
end
flush()

return tokens, byte_map
return coalesce_hex_runs(tokens, byte_map)
end
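
-- Usage sketch (inputs made up; behavior follows the rules documented
-- above and the assertions in the spec below):
--   local tokens, map = tokenize('sha = "ab12cd34ef56"')
--   -- tokens → { "sha", " ", "=", " ", '"', "ab12cd34ef56", '"' }
--   -- map[6]  → { byte = 7, byte_len = 12 } (byte offsets are 0-based)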

-- Decompose `s` into UTF-8 characters with byte offsets. Used to refine a
@@ -1713,6 +1842,7 @@ M._test = {
is_word_char = is_word_char,
is_word_token = is_word_token,
subword_class = subword_class,
is_hex_run = is_hex_run,
tokenize = tokenize,
split_chars = split_chars,
diff_units = diff_units,
113 changes: 109 additions & 4 deletions lua/diffview/tests/functional/inline_diff_spec.lua
@@ -771,6 +771,45 @@ describe("diffview.scene.inline_diff", function()
)
end)

it("tokenize coalesces a long single-case hex hash into one token", function()
-- 40-char SHA-1, all-lowercase hex with mixed digit/letter runs:
-- without the post-pass the run subword-splits into ~25 tokens,
-- which fragment the diff against an unrelated hash and admit
-- coincidental subword matches under `INTRALINE_MAX_HUNKS`.
local s = "1c9dfb261b8be35f689c6d83dfd3e92b7f59ecf8"
local tokens, m = h.tokenize(s)
assert.are.same({ s }, tokens)
assert.are.equal(0, m[1].byte)
assert.are.equal(40, m[1].byte_len)
end)

it("tokenize coalesces uppercase hex with alternating digits too", function()
-- All-uppercase run with frequent digit↔letter alternation passes
-- the same predicate.
assert.are.same({ "AB12CD34EF56AB78" }, (h.tokenize("AB12CD34EF56AB78")))
end)

it("tokenize leaves short hex-like runs subword-split", function()
-- 7 chars, below `HEX_TOKEN_MIN_LEN`: stays subword-split so a
-- short identifier suffix isn't collapsed into one opaque token.
assert.are.same({ "abc", "1234" }, (h.tokenize("abc1234")))
end)

it("tokenize rejects pseudo-hex with low transition density", function()
-- 1-2 transitions across 12-13 chars sit below `ceil(len/4)`, so
-- word-prefixed patterns keep their split and `decade` /
-- `1234567` stay diffable as semantic units.
assert.are.same({ "decade", "1234567" }, (h.tokenize("decade1234567")))
assert.are.same({ "cafebabe", "1234" }, (h.tokenize("cafebabe1234")))
assert.are.same({ "face", "1234", "abcd" }, (h.tokenize("face1234abcd")))
end)

it("tokenize rejects pseudo-hex with a long letter run", function()
-- 3 transitions hits `ceil(12/4)=3`, but `cafef` letter run = 5
-- exceeds `HEX_TOKEN_MAX_LETTER_RUN`, so the predicate rejects.
assert.are.same({ "cafef", "00", "d", "1234" }, (h.tokenize("cafef00d1234")))
end)

it("subword_class buckets ASCII and multi-byte chars correctly", function()
assert.are.equal("lower", h.subword_class("a"))
assert.are.equal("upper", h.subword_class("Z"))
@@ -795,15 +834,51 @@ describe("diffview.scene.inline_diff", function()
assert.is_false(h.is_word_token(""))
end)

it("is_hex_run accepts long single-case hex with high transition density", function()
assert.is_true(h.is_hex_run("1c9dfb261b8be35f689c6d83dfd3e92b7f59ecf8"))
assert.is_true(h.is_hex_run("AB12CD34EF56AB78"))
end)

it("is_hex_run rejects strings shorter than HEX_TOKEN_MIN_LEN", function()
assert.is_false(h.is_hex_run("abc1234"))
assert.is_false(h.is_hex_run(""))
end)

it("is_hex_run rejects mixed-case hex", function()
-- `tokenize`'s upper→lower split already separates the cases, so a
-- mixed-case merge candidate never reaches `is_hex_run` in practice;
-- the predicate rejects defensively for correctness.
assert.is_false(h.is_hex_run("AbCd1234"))
end)

it("is_hex_run rejects strings with non-hex chars", function()
assert.is_false(h.is_hex_run("hello123abc"))
assert.is_false(h.is_hex_run("1234567890_"))
end)

it("is_hex_run rejects low transition density", function()
assert.is_false(h.is_hex_run("decade1234567"))
assert.is_false(h.is_hex_run("cafebabe1234"))
end)

it("is_hex_run rejects long letter runs", function()
-- `cafef` = 5-char letter run, exceeds `HEX_TOKEN_MAX_LETTER_RUN`.
assert.is_false(h.is_hex_run("cafef00d1234"))
end)

it("is_hex_run rejects pure-digit and pure-letter runs", function()
-- These can't reach the predicate from `coalesce_hex_runs` (their
-- source word run is one subword token), but rejecting them on
-- their own merits keeps the contract sharp: zero digit/letter
-- transitions (and, for the pure-letter case, an over-long run).
assert.is_false(h.is_hex_run("12345678"))
assert.is_false(h.is_hex_run("abcdefab"))
end)

it("diff_units returns [] when either side is empty", function()
assert.are.same({}, h.diff_units({}, { "a" }))
assert.are.same({}, h.diff_units({ "a" }, {}))
end)

it("INTRALINE_MAX_HUNKS is a positive integer", function()
assert.is_true(h.INTRALINE_MAX_HUNKS >= 1)
end)

it("refinement_safe admits single-hunk sub-diffs regardless of overlap", function()
-- One hunk cannot interleave, so even unrelated words (foo/bar) are safe.
assert.is_true(h.refinement_safe({ "f", "o", "o" }, { "b", "a", "r" }, 1))
Expand Down Expand Up @@ -895,6 +970,36 @@ describe("diffview.scene.inline_diff", function()
local vls = virt_line_hls(extmarks(bufnr))
assert.are.same({ "DiffviewDiffDeleteInline" }, vls)
end)

it("renders a JSON commit-hash modification as one whole-token swap", function()
-- A pair of lazy-lock.json-style lines differing only in a 40-char
-- SHA. Without `coalesce_hex_runs`, the two unrelated hashes
-- subword-split into ~17-19 alternating digit/lowercase tokens
-- whose coincidental matches kept some pairs under
-- `INTRALINE_MAX_HUNKS` and rendered as fragmented per-token
-- highlights; after coalescing, the diff is a single 1:1 hash
-- replacement covering exactly the new hash's byte range.
local old_line = ' "x": { "commit": "ffa44ee9470743a7697d28df3a1a216fdfe2b09d" }'
local new_line = ' "x": { "commit": "cf4c30892644f01ebfb1e248eeca9e259856f9dc" }'
local bufnr = fresh_buf({ new_line })
inline_diff.render(bufnr, { old_line }, { new_line })
local cr = char_ranges(extmarks(bufnr))
assert.are.equal(1, #cr)
local hash_start = string.find(new_line, "cf4c3089", 1, true) - 1
assert.are.equal(0, cr[1].row)
assert.are.equal(hash_start, cr[1].start)
assert.are.equal(hash_start + 40, cr[1].finish)
end)

it("emits the entire deleted hash as one inline virt_text in overleaf", function()
local old_line = ' "x": { "commit": "ffa44ee9470743a7697d28df3a1a216fdfe2b09d" }'
local new_line = ' "x": { "commit": "cf4c30892644f01ebfb1e248eeca9e259856f9dc" }'
local bufnr = fresh_buf({ new_line })
inline_diff.render(bufnr, { old_line }, { new_line }, { style = "overleaf" })
local virts = inline_virt_texts(extmarks(bufnr), "DiffviewDiffDeleteInline")
assert.are.equal(1, #virts)
assert.are.equal("ffa44ee9470743a7697d28df3a1a216fdfe2b09d", virts[1].text)
end)
end)

describe("intraline tokenization", function()