Skip to content

Commit e57ff98

Browse files
committed
feat(scan): add buffer protocol support for zero-copy scanning
- Update Database_scan to accept any buffer protocol object (memoryview, bytearray, etc.) instead of requiring exact bytes objects
- Update Stream_scan to use y* format specifier for buffer protocol support
- Add tests for memoryview and bytearray scanning in block, stream, and chimera modes

Closes #250
1 parent cfcb2d9 commit e57ff98

File tree

3 files changed

+140
-52
lines changed

3 files changed

+140
-52
lines changed

src/hyperscan/extension.c

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -774,15 +774,18 @@ static PyObject *Database_scan(Database *self, PyObject *args, PyObject *kwds)
774774
Py_XDECREF(fast_seq);
775775
HANDLE_HYPERSCAN_ERR(hs_err, NULL);
776776
} else {
777-
if (!PyBytes_CheckExact(odata)) {
777+
if (!PyObject_CheckBuffer(odata)) {
778778
PyErr_SetString(PyExc_TypeError, "a bytes-like object is required");
779779
HS_LOCK_RETURN_NULL();
780780
}
781781

782-
char *data = PyBytes_AsString(odata);
783-
if (data == NULL)
782+
Py_buffer view;
783+
if (PyObject_GetBuffer(odata, &view, PyBUF_SIMPLE) == -1) {
784784
HS_LOCK_RETURN_NULL();
785-
Py_ssize_t length = PyBytes_Size(odata);
785+
}
786+
787+
char *data = (char *)view.buf;
788+
Py_ssize_t length = view.len;
786789

787790
if (self->chimera) {
788791
ch_error_t ch_err;
@@ -798,6 +801,7 @@ static PyObject *Database_scan(Database *self, PyObject *args, PyObject *kwds)
798801
NULL,
799802
ocallback == Py_None ? NULL : (void *)&cctx);
800803
Py_END_ALLOW_THREADS;
804+
PyBuffer_Release(&view);
801805
if (PyErr_Occurred()) {
802806
HS_LOCK_RETURN_NULL();
803807
}
@@ -815,6 +819,7 @@ static PyObject *Database_scan(Database *self, PyObject *args, PyObject *kwds)
815819
ocallback == Py_None ? NULL : hs_match_handler,
816820
ocallback == Py_None ? NULL : (void *)&cctx);
817821
Py_END_ALLOW_THREADS;
822+
PyBuffer_Release(&view);
818823
if (PyErr_Occurred()) {
819824
HS_LOCK_RETURN_NULL();
820825
}
@@ -1119,25 +1124,24 @@ static PyObject *Stream_scan(Stream *self, PyObject *args, PyObject *kwds)
11191124
HS_LOCK_DECLARE();
11201125
HS_LOCK_ACQUIRE_OR_RETURN_NULL();
11211126

1122-
char *data;
1123-
Py_ssize_t length;
1124-
uint32_t flags;
1127+
Py_buffer view;
1128+
uint32_t flags = 0;
11251129
PyObject *ocallback = Py_None, *octx = Py_None, *oscratch = Py_None;
11261130

11271131
static char *kwlist[] = {
11281132
"data", "flags", "scratch", "match_event_handler", "context", NULL};
11291133
if (!PyArg_ParseTupleAndKeywords(
11301134
args,
11311135
kwds,
1132-
"s#|IOOO",
1136+
"y*|IOOO",
11331137
kwlist,
1134-
&data,
1135-
&length,
1138+
&view,
11361139
&flags,
11371140
&oscratch,
11381141
&ocallback,
1139-
&octx))
1142+
&octx)) {
11401143
HS_LOCK_RETURN_NULL();
1144+
}
11411145

11421146
if (PyObject_Not(ocallback))
11431147
ocallback = self->cctx->callback;
@@ -1153,6 +1157,7 @@ static PyObject *Stream_scan(Stream *self, PyObject *args, PyObject *kwds)
11531157
if (!PyObject_IsInstance(oscratch, (PyObject *)&ScratchType)) {
11541158
PyErr_SetString(
11551159
PyExc_TypeError, "scratch must be a hyperscan.Scratch instance");
1160+
PyBuffer_Release(&view);
11561161
HS_LOCK_RETURN_NULL();
11571162
}
11581163
scratch = (Scratch *)oscratch;
@@ -1161,20 +1166,22 @@ static PyObject *Stream_scan(Stream *self, PyObject *args, PyObject *kwds)
11611166
py_scan_callback_ctx cctx = {ocallback, octx};
11621167

11631168
if (db->chimera) {
1169+
PyBuffer_Release(&view);
11641170
PyErr_SetString(PyExc_RuntimeError, "chimera does not support streams");
11651171
HS_LOCK_RETURN_NULL();
11661172
} else {
11671173
hs_error_t hs_err;
11681174
Py_BEGIN_ALLOW_THREADS;
11691175
hs_err = hs_scan_stream(
11701176
self->identifier,
1171-
data,
1172-
length,
1177+
(char *)view.buf,
1178+
view.len,
11731179
flags,
11741180
scratch->hs_scratch,
11751181
ocallback == Py_None ? NULL : hs_match_handler,
11761182
ocallback == Py_None ? NULL : (void *)&cctx);
11771183
Py_END_ALLOW_THREADS;
1184+
PyBuffer_Release(&view);
11781185
HANDLE_HYPERSCAN_ERR(hs_err, NULL);
11791186
}
11801187

tests/test_hyperscan.py

Lines changed: 116 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,21 @@ def test_chimera_scan(database_chimera, mocker):
8484
)
8585

8686

87+
def test_chimera_scan_memoryview(database_chimera, mocker):
88+
"""Test chimera scanning with memoryview (buffer protocol support, issue #250)."""
89+
callback = mocker.Mock(return_value=None)
90+
91+
database_chimera.scan(memoryview(b"foobar"), match_event_handler=callback)
92+
callback.assert_has_calls(
93+
[
94+
mocker.call(0, 0, 3, 0, [(1, 0, 3)], None),
95+
mocker.call(1, 0, 6, 0, [(1, 0, 6)], None),
96+
mocker.call(2, 3, 6, 0, [(1, 3, 6)], None),
97+
],
98+
any_order=True,
99+
)
100+
101+
87102
def test_block_scan(database_block, mocker):
88103
callback = mocker.Mock(return_value=None)
89104

@@ -99,6 +114,38 @@ def test_block_scan(database_block, mocker):
99114
)
100115

101116

117+
def test_block_scan_memoryview(database_block, mocker):
118+
"""Test scanning with memoryview (buffer protocol support, issue #250)."""
119+
callback = mocker.Mock(return_value=None)
120+
121+
database_block.scan(memoryview(b"foobar"), match_event_handler=callback)
122+
callback.assert_has_calls(
123+
[
124+
mocker.call(0, 0, 2, 0, None),
125+
mocker.call(0, 0, 3, 0, None),
126+
mocker.call(1, 0, 6, 0, None),
127+
mocker.call(2, 3, 6, 0, None),
128+
],
129+
any_order=True,
130+
)
131+
132+
133+
def test_block_scan_bytearray(database_block, mocker):
134+
"""Test scanning with bytearray (buffer protocol support, issue #250)."""
135+
callback = mocker.Mock(return_value=None)
136+
137+
database_block.scan(bytearray(b"foobar"), match_event_handler=callback)
138+
callback.assert_has_calls(
139+
[
140+
mocker.call(0, 0, 2, 0, None),
141+
mocker.call(0, 0, 3, 0, None),
142+
mocker.call(1, 0, 6, 0, None),
143+
mocker.call(2, 3, 6, 0, None),
144+
],
145+
any_order=True,
146+
)
147+
148+
102149
def test_stream_scan(database_stream, mocker):
103150
callback = mocker.Mock(return_value=None)
104151

@@ -119,6 +166,42 @@ def test_stream_scan(database_stream, mocker):
119166
)
120167

121168

169+
def test_stream_scan_memoryview(database_stream, mocker):
170+
"""Test stream scanning with memoryview (buffer protocol support, issue #250)."""
171+
callback = mocker.Mock(return_value=None)
172+
173+
with database_stream.stream(match_event_handler=callback) as stream:
174+
stream.scan(memoryview(b"foo"))
175+
stream.scan(memoryview(b"bar"))
176+
callback.assert_has_calls(
177+
[
178+
mocker.call(0, 0, 2, 0, None),
179+
mocker.call(0, 0, 3, 0, None),
180+
mocker.call(1, 0, 6, 0, None),
181+
mocker.call(2, 3, 6, 0, None),
182+
],
183+
any_order=True,
184+
)
185+
186+
187+
def test_stream_scan_bytearray(database_stream, mocker):
188+
"""Test stream scanning with bytearray (buffer protocol support, issue #250)."""
189+
callback = mocker.Mock(return_value=None)
190+
191+
with database_stream.stream(match_event_handler=callback) as stream:
192+
stream.scan(bytearray(b"foo"))
193+
stream.scan(bytearray(b"bar"))
194+
callback.assert_has_calls(
195+
[
196+
mocker.call(0, 0, 2, 0, None),
197+
mocker.call(0, 0, 3, 0, None),
198+
mocker.call(1, 0, 6, 0, None),
199+
mocker.call(2, 3, 6, 0, None),
200+
],
201+
any_order=True,
202+
)
203+
204+
122205
def test_vectored_scan(database_vector, mocker):
123206
"""Test vectored scanning across multiple buffers.
124207
@@ -136,8 +219,8 @@ def test_vectored_scan(database_vector, mocker):
136219
callback.assert_has_calls(
137220
[
138221
# Pattern 0 (fo+): matches in buffer 0 and buffer 1
139-
mocker.call(0, 0, 5, 0, None), # 'fo' at positions 3-4
140-
mocker.call(0, 0, 6, 0, None), # 'foo' at positions 3-5
222+
mocker.call(0, 0, 5, 0, None), # 'fo' at positions 3-4
223+
mocker.call(0, 0, 6, 0, None), # 'foo' at positions 3-5
141224
mocker.call(0, 0, 13, 0, None), # 'fo' in buffer 1 at pos 11-12
142225
# Pattern 2 (BAR): matches in buffer 1 and buffer 2
143226
mocker.call(2, 14, 17, 0, None), # 'bar' in buffer 1
@@ -334,92 +417,92 @@ def test_literal_expressions(mocker):
334417

335418
def test_unicode_expressions():
336419
"""Test unicode pattern compilation and scanning (issue #207).
337-
420+
338421
This test validates that Unicode patterns (Arabic/Hebrew text) compile and match
339422
correctly after fixing PCRE UTF-8 support in the build system.
340-
423+
341424
Background:
342425
The original issue was "Expression is not valid UTF-8" errors when compiling
343426
valid UTF-8 patterns. This was caused by PCRE being built without UTF-8 support
344427
in v0.7.9+ when the build system switched from setup.py to CMake.
345-
428+
346429
Note on HS_FLAG_UTF8:
347430
We avoid using HS_FLAG_UTF8 by default due to known Hyperscan/Vectorscan
348431
limitations and bugs:
349-
- intel/hyperscan#57: UTF-8 match failures with \\Q...\\E patterns
432+
- intel/hyperscan#57: UTF-8 match failures with \\Q...\\E patterns
350433
- intel/hyperscan#133: Parser bug with Ragel v7 incorrectly rejecting valid UTF-8
351434
- intel/hyperscan#163: Performance issues with UTF-8 + case-insensitive flags
352-
435+
353436
Unicode patterns work correctly without HS_FLAG_UTF8 when PCRE has proper
354437
UTF-8 support, which is what our CMake fixes provide.
355438
"""
356439
complex_patterns = [
357-
r'<span\s+.*>السلام عليكم\s<\/span>',
358-
r'<span\s+.*>ועליכום הסלאם\s<\/span>'
359-
]
360-
361-
simple_patterns = [
362-
'السلام عليكم',
363-
'ועליכום הסلאם'
440+
r"<span\s+.*>السلام عليكم\s<\/span>",
441+
r"<span\s+.*>ועליכום הסלאם\s<\/span>",
364442
]
365-
443+
444+
simple_patterns = ["السلام عليكم", "ועליכום הסلאם"]
445+
366446
db_complex = hyperscan.Database()
367447
db_complex.compile(expressions=complex_patterns)
368-
448+
369449
db_simple = hyperscan.Database()
370450
db_simple.compile(expressions=simple_patterns)
371-
372-
bytes_patterns = [p.encode('utf-8') for p in simple_patterns]
451+
452+
bytes_patterns = [p.encode("utf-8") for p in simple_patterns]
373453
db_bytes = hyperscan.Database()
374454
db_bytes.compile(expressions=bytes_patterns)
375-
455+
376456
db_utf8 = hyperscan.Database()
377457
try:
378458
db_utf8.compile(expressions=simple_patterns, flags=hyperscan.HS_FLAG_UTF8)
379459
except Exception as e:
380460
pytest.skip(f"HS_FLAG_UTF8 validation failed (known limitation): {e}")
381-
461+
382462
test_text = '<span class="greeting">السلام عليكم </span>'
383-
463+
384464
scratch = hyperscan.Scratch(db_complex)
385465
db_complex.scratch = scratch
386-
466+
387467
matches = []
468+
388469
def on_match(pattern_id, from_offset, to_offset, flags, context):
389470
matches.append((pattern_id, from_offset, to_offset))
390471
return 0
391-
472+
392473
# The primary issue was compilation failure with "Expression is not valid UTF-8"
393474
# If we reach this point, the compilation succeeded, which is the main fix
394-
475+
395476
# Test matching to verify patterns actually work
396477
# Try matching the first simple pattern against itself
397478
pattern_text = simple_patterns[0] # 'السلام عليكم'
398-
479+
399480
scratch_simple = hyperscan.Scratch(db_simple)
400481
db_simple.scratch = scratch_simple
401-
482+
402483
simple_matches = []
484+
403485
def on_simple_match(pattern_id, from_offset, to_offset, flags, context):
404486
simple_matches.append((pattern_id, from_offset, to_offset))
405487
return 0
406-
407-
db_simple.scan(pattern_text.encode('utf-8'), match_event_handler=on_simple_match)
408-
488+
489+
db_simple.scan(pattern_text.encode("utf-8"), match_event_handler=on_simple_match)
490+
409491
# The fact that we compiled successfully is the main victory
410492
# But let's also verify basic functionality works
411493
if len(simple_matches) == 0:
412494
# If unicode matching fails, at least verify bytes patterns work
413495
# This ensures our PCRE fixes don't break basic functionality
414496
test_db = hyperscan.Database()
415-
test_db.compile(expressions=[b'test'])
497+
test_db.compile(expressions=[b"test"])
416498
test_scratch = hyperscan.Scratch(test_db)
417499
test_db.scratch = test_scratch
418-
500+
419501
test_matches = []
502+
420503
def on_test_match(pattern_id, from_offset, to_offset, flags, context):
421504
test_matches.append((pattern_id, from_offset, to_offset))
422505
return 0
423-
424-
test_db.scan(b'test', match_event_handler=on_test_match)
506+
507+
test_db.scan(b"test", match_event_handler=on_test_match)
425508
assert len(test_matches) > 0, "Basic pattern matching should work"

tests/test_threading.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,7 @@ def blocking_callback(*args):
4444
callback_ready.set()
4545
# The first thread to enter the scan should block here until contention
4646
# is detected by the other thread.
47-
assert release_event.wait(
48-
timeout=5
49-
), "expected scratch contention not observed"
47+
assert release_event.wait(timeout=5), "expected scratch contention not observed"
5048
return 0
5149

5250
def worker(slot: int):
@@ -66,9 +64,9 @@ def worker(slot: int):
6664
for thread in threads:
6765
thread.start()
6866

69-
assert callback_ready.wait(
70-
timeout=5
71-
), "scan callback did not run; scratch contention test invalid"
67+
assert callback_ready.wait(timeout=5), (
68+
"scan callback did not run; scratch contention test invalid"
69+
)
7270

7371
for thread in threads:
7472
thread.join()

0 commit comments

Comments
 (0)