Skip to content

Commit 4f99bc2

Browse files
committed
Use precomputed index to get mutation coordinate
Signed-off-by: Keshav Priyadarshi <git@keshav.space>
1 parent da158f4 commit 4f99bc2

File tree

9 files changed: 33 additions (+33) and 20 deletions (-20).

Makefile

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -19,11 +19,11 @@ dev:
1919

2020
isort:
2121
@echo "-> Apply isort changes to ensure proper imports ordering"
22-
${VENV}/bin/isort --sl -l 100 src tests setup.py --skip="tests/testfiles/stemming/"
22+
${VENV}/bin/isort --sl -l 100 src tests setup.py --skip="tests/testfiles/"
2323

2424
black:
2525
@echo "-> Apply black code formatter"
26-
${VENV}/bin/black -l 100 src tests setup.py --exclude="tests/testfiles/stemming/"
26+
${VENV}/bin/black -l 100 src tests setup.py --exclude="tests/testfiles/"
2727

2828
doc8:
2929
@echo "-> Run doc8 validation"
@@ -35,9 +35,9 @@ check:
3535
@echo "-> Run pycodestyle (PEP8) validation"
3636
@${ACTIVATE} pycodestyle --max-line-length=100 --exclude=.eggs,venv,lib,thirdparty,docs,migrations,settings.py,.cache,tests/testfiles/stemming/ .
3737
@echo "-> Run isort imports ordering validation"
38-
@${ACTIVATE} isort --sl --check-only -l 100 setup.py src tests . --skip="tests/testfiles/stemming/"
38+
@${ACTIVATE} isort --sl --check-only -l 100 setup.py src tests . --skip="tests/testfiles/"
3939
@echo "-> Run black validation"
40-
@${ACTIVATE} black --check --check -l 100 src tests setup.py --exclude="tests/testfiles/stemming/"
40+
@${ACTIVATE} black --check --check -l 100 src tests setup.py --exclude="tests/testfiles/"
4141

4242
clean:
4343
@echo "-> Clean the Python env"

src/matchcode_toolkit/stemming.py

Lines changed: 22 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -59,7 +59,7 @@ class TreeSitterWheelNotInstalled(Exception):
5959

6060
def get_parser(location):
6161
"""
62-
Get the appropriate tree-sitter parser and string identifier for
62+
Get the appropriate tree-sitter parser and grammar config for
6363
file at location.
6464
"""
6565
file_type = Type(location)
@@ -108,27 +108,34 @@ def traverse(node, language_info, mutation_index):
108108
traverse(child, language_info, mutation_index)
109109

110110

111-
def apply_mutation(text, start_point, end_point, replacement):
111+
def apply_mutation(text, start_point, end_point, replacement, successive_line_count):
112+
"""Mutate tokens between start and end points with replacement string."""
113+
112114
start_row, start_col = start_point
113115
end_row, end_col = end_point
114116

115-
lines = text.splitlines()
116-
117-
# Compute the start and end indices, +1 for newline.
118-
start_index = sum(len(line) + 1 for line in lines[:start_row]) + start_col
119-
end_index = sum(len(line) + 1 for line in lines[:end_row]) + end_col
117+
# Compute 1D mutation position from 2D coordinates
118+
start_index = successive_line_count[start_row] + start_col
119+
end_index = successive_line_count[end_row] + end_col
120120

121121
modified_text = text[:start_index] + replacement + text[end_index:]
122+
modified_lines = modified_text.splitlines(keepends=True)
122123

123-
modified_lines = modified_text.splitlines()
124124
# Remove empty comment lines.
125125
if not replacement and modified_lines[start_row].strip() == "":
126126
del modified_lines[start_row]
127127

128-
return "\n".join(modified_lines)
128+
return "".join(modified_lines)
129129

130130

131131
def get_stem_code(location):
132+
"""
133+
Return the stemmed code for the code file at the specified `location`.
134+
135+
Parse the code using tree-sitter, create a mutation index for tokens that
136+
need to be replaced or removed, and apply these mutations bottom-up to
137+
generate the stemmed code.
138+
"""
132139
parser_result = get_parser(location)
133140
if not parser_result:
134141
return
@@ -143,11 +150,17 @@ def get_stem_code(location):
143150
# Apply mutations bottom-up
144151
mutations = dict(sorted(mutations.items(), reverse=True))
145152
text = source.decode()
153+
cur_count = 0
154+
lines = text.splitlines(keepends=True)
155+
successive_line_count = [cur_count := cur_count + len(line) for line in lines]
156+
successive_line_count.insert(0, 0)
157+
146158
for value in mutations.values():
147159
text = apply_mutation(
148160
text=text,
149161
end_point=value["end_point"],
150162
start_point=value["start_point"],
151163
replacement=("idf" if value["type"] == "identifier" else ""),
164+
successive_line_count=successive_line_count,
152165
)
153166
return text

tests/testfiles/stemming/c/main-stemmed.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -68,4 +68,4 @@ int idf idf(HINSTANCE idf, HINSTANCE idf,
6868
idf();
6969

7070
return idf;
71-
}
71+
}

tests/testfiles/stemming/cpp/string-stemmed.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2220,4 +2220,4 @@ wxString* wxCArrayString::idf()
22202220
wxString *idf = idf();
22212221
idf = NULL;
22222222
return idf;
2223-
}
2223+
}

tests/testfiles/stemming/golang/utils-stemmed.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -124,4 +124,4 @@ func idf(idf []string, idf bool) {
124124

125125
}
126126

127-
}
127+
}

tests/testfiles/stemming/java/contenttype-stemmed.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -113,4 +113,4 @@ boolean idf() {
113113
return (idf+1 == idf);
114114
}
115115

116-
}
116+
}

tests/testfiles/stemming/javascript/utils-stemmed.js

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,4 +43,4 @@ function idf(idf, idf) {
4343
idf.exports = {
4444
addMetaDataFilesToPackage,
4545
buildPackageArchive,
46-
};
46+
};

tests/testfiles/stemming/python/sync_scancode_scans-stemmed.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -100,4 +100,4 @@ def idf(idf, idf):
100100
idf, idf = idf.idf.idf(idf=idf(idf), idf=idf.idf)
101101
idf = idf.idf(idf=idf)
102102
idf.idf(idf=idf, idf=idf)
103-
return idf
103+
return idf

tests/testfiles/stemming/rust/metrics-stemmeds.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -524,4 +524,4 @@ mod idf {
524524
"GRPC_ENDPOINT_WHITELIST must be sorted in code to allow binary search"
525525
);
526526
}
527-
}
527+
}

0 commit comments

Comments
 (0)