Skip to content

Commit a5b3c10

Browse files
committed
allow iterating only over a range of indices
1 parent 2dee91e commit a5b3c10

File tree

2 files changed

+52
-26
lines changed

2 files changed

+52
-26
lines changed

Modules/clinic/unicodedata.c.h

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -408,31 +408,35 @@ unicodedata_UCD_lookup(PyObject *self, PyObject *arg)
408408
}
409409

410410
PyDoc_STRVAR(unicodedata_UCD_iter_graphemes__doc__,
411-
"iter_graphemes($self, unistr, /)\n"
411+
"iter_graphemes($self, unistr, start=0, end=sys.maxsize, /)\n"
412412
"--\n"
413413
"\n"
414414
"Returns an iterator to iterate over grapheme clusters in unistr.\n"
415415
"\n"
416416
"It uses extended grapheme cluster rules from TR29.");
417417

418418
#define UNICODEDATA_UCD_ITER_GRAPHEMES_METHODDEF \
419-
{"iter_graphemes", (PyCFunction)unicodedata_UCD_iter_graphemes, METH_O, unicodedata_UCD_iter_graphemes__doc__},
419+
{"iter_graphemes", (PyCFunction)unicodedata_UCD_iter_graphemes, METH_FASTCALL, unicodedata_UCD_iter_graphemes__doc__},
420420

421421
static PyObject *
422-
unicodedata_UCD_iter_graphemes_impl(PyObject *self, PyObject *unistr);
422+
unicodedata_UCD_iter_graphemes_impl(PyObject *self, PyObject *unistr,
423+
int start, Py_ssize_t end);
423424

424425
static PyObject *
425-
unicodedata_UCD_iter_graphemes(PyObject *self, PyObject *arg)
426+
unicodedata_UCD_iter_graphemes(PyObject *self, PyObject **args, Py_ssize_t nargs)
426427
{
427428
PyObject *return_value = NULL;
428429
PyObject *unistr;
430+
int start = 0;
431+
Py_ssize_t end = PY_SSIZE_T_MAX - 1;
429432

430-
if (!PyArg_Parse(arg, "U:iter_graphemes", &unistr)) {
433+
if (!_PyArg_ParseStack(args, nargs, "U|in:iter_graphemes",
434+
&unistr, &start, &end)) {
431435
goto exit;
432436
}
433-
return_value = unicodedata_UCD_iter_graphemes_impl(self, unistr);
437+
return_value = unicodedata_UCD_iter_graphemes_impl(self, unistr, start, end);
434438

435439
exit:
436440
return return_value;
437441
}
438-
/*[clinic end generated code: output=88c185f6e080eec9 input=a9049054013a1b77]*/
442+
/*[clinic end generated code: output=7d4b4e2561674e6e input=a9049054013a1b77]*/

Modules/unicodedata.c

Lines changed: 41 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,21 @@
1919
#include "ucnhash.h"
2020
#include "structmember.h"
2121

22+
/* Helper macro to fix up start/end slice values in place.
 *
 * `start` and `end` must be modifiable lvalues of a signed integer type;
 * `len` is the length of the sequence being sliced and is evaluated more
 * than once, so it must be free of side effects.
 *
 * After the macro runs:
 *   - `end` is clamped to `len`; a negative `end` counts from the back of
 *     the sequence and is clamped to 0.
 *   - a negative `start` counts from the back and is clamped to 0.
 *   - `start` never exceeds `end`, so 0 <= start <= end <= len.  An
 *     inverted or out-of-range request therefore collapses to an empty
 *     range instead of leaving `start` beyond `end`.
 *
 * The body is wrapped in do { ... } while (0) so that the macro behaves as
 * a single statement (safe as the body of an if/else) and requires a
 * trailing semicolon at the call site.
 */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
        if ((start) > (end)) {                  \
            (start) = (end);                    \
        }                                       \
    } while (0)
36+
2237
/*[clinic input]
2338
module unicodedata
2439
class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
@@ -1278,6 +1293,7 @@ typedef struct {
12781293
PyObject_HEAD
12791294
PyObject* str;
12801295
Py_ssize_t pos;
1296+
Py_ssize_t end;
12811297
} GraphemeClusterIterator;
12821298

12831299
static void
@@ -1308,24 +1324,24 @@ GCI_iternext(GraphemeClusterIterator *self)
13081324
{
13091325
int kind = PyUnicode_KIND(self->str);
13101326
void *pstr = PyUnicode_DATA(self->str);
1311-
if (PyUnicode_READ(kind, pstr, self->pos)) {
1312-
int start = self->pos;
1313-
GCBState s = STATE_sot;
1314-
while (1) {
1315-
if (!PyUnicode_READ(kind, pstr, self->pos)) {
1316-
return PyUnicode_Substring(self->str, start, self->pos);
1317-
}
1318-
Py_UCS4 chr = PyUnicode_READ(kind, pstr, self->pos);
1319-
int prop = _getrecord_ex(chr)->grapheme_cluster_break;
1320-
s = GRAPH_CLUSTER_AUTOMATON[s][prop];
1321-
if (s == STATE_BREAK) {
1322-
return PyUnicode_Substring(self->str, start, self->pos);
1323-
}
1324-
++self->pos;
1325-
}
1326-
} else {
1327+
if (self->pos == self->end) {
13271328
return NULL;
13281329
}
1330+
1331+
int start = self->pos;
1332+
GCBState s = STATE_sot;
1333+
while (1) {
1334+
if (self->pos == self->end) {
1335+
return PyUnicode_Substring(self->str, start, self->pos);
1336+
}
1337+
Py_UCS4 chr = PyUnicode_READ(kind, pstr, self->pos);
1338+
int prop = _getrecord_ex(chr)->grapheme_cluster_break;
1339+
s = GRAPH_CLUSTER_AUTOMATON[s][prop];
1340+
if (s == STATE_BREAK) {
1341+
return PyUnicode_Substring(self->str, start, self->pos);
1342+
}
1343+
++self->pos;
1344+
}
13291345
}
13301346

13311347
static PyTypeObject GraphemeClusterIteratorType = {
@@ -1346,6 +1362,8 @@ unicodedata.UCD.iter_graphemes
13461362
13471363
self: self
13481364
unistr: unicode
1365+
start: int = 0
1366+
end: Py_ssize_t(c_default="PY_SSIZE_T_MAX - 1") = sys.maxsize
13491367
/
13501368
13511369
Returns an iterator to iterate over grapheme clusters in unistr.
@@ -1354,19 +1372,23 @@ It uses extended grapheme cluster rules from TR29.
13541372
[clinic start generated code]*/
13551373

13561374
static PyObject *
1357-
unicodedata_UCD_iter_graphemes_impl(PyObject *self, PyObject *unistr)
1358-
/*[clinic end generated code: output=92374c1d94db4165 input=59c4794a7f2e6742]*/
1375+
unicodedata_UCD_iter_graphemes_impl(PyObject *self, PyObject *unistr,
1376+
int start, Py_ssize_t end)
1377+
/*[clinic end generated code: output=96aa5bb59138ea9c input=5667e0efb55be68a]*/
13591378
{
13601379
GraphemeClusterIterator *gci = PyObject_GC_New(GraphemeClusterIterator,
13611380
&GraphemeClusterIteratorType);
13621381

13631382
if (!gci)
13641383
return NULL;
13651384

1385+
Py_ssize_t len = PyUnicode_GET_LENGTH(unistr);
1386+
ADJUST_INDICES(start, end, len);
13661387
gci->str = unistr;
13671388
Py_INCREF(unistr);
13681389
PyObject_GC_Track(gci);
1369-
gci->pos = 0;
1390+
gci->pos = start;
1391+
gci->end = end;
13701392
return (PyObject*)gci;
13711393
}
13721394

0 commit comments

Comments
 (0)