Skip to content

Commit dd1a53f

Browse files
committed
dynamic attributes, descriptors and first concurrency examples
1 parent 0618105 commit dd1a53f

27 files changed

+1157
-222
lines changed

concurrency/charfinder.py

Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
#!/usr/bin/env python3
2+
3+
"""
4+
Unicode character finder utility:
5+
find characters based on words in their official names.
6+
7+
This can be used from the command line, just pass words as arguments.
8+
9+
Here is the ``main`` function which makes it happen::
10+
11+
>>> main('rook') # doctest: +NORMALIZE_WHITESPACE
12+
U+2656 ♖ WHITE CHESS ROOK
13+
U+265C ♜ BLACK CHESS ROOK
14+
(2 matches for 'rook')
15+
>>> main('rook', 'black') # doctest: +NORMALIZE_WHITESPACE
16+
U+265C ♜ BLACK CHESS ROOK
17+
(1 match for 'rook black')
18+
>>> main('white bishop') # doctest: +NORMALIZE_WHITESPACE
19+
U+2657 ♗ WHITE CHESS BISHOP
20+
(1 match for 'white bishop')
21+
>>> main("jabberwocky's vest")
22+
(No match for "jabberwocky's vest")
23+
24+
25+
For exploring words that occur in the character names, there is the
26+
``word_report`` function::
27+
28+
>>> index = UnicodeNameIndex(sample_chars)
29+
>>> index.word_report()
30+
3 SIGN
31+
2 A
32+
2 EURO
33+
2 LATIN
34+
2 LETTER
35+
1 CAPITAL
36+
1 CURRENCY
37+
1 DOLLAR
38+
1 SMALL
39+
>>> index = UnicodeNameIndex()
40+
>>> index.word_report(7)
41+
13196 SYLLABLE
42+
11735 HANGUL
43+
7616 LETTER
44+
2232 WITH
45+
2180 SIGN
46+
2122 SMALL
47+
1709 CAPITAL
48+
49+
Note: character names starting with the string ``'CJK UNIFIED IDEOGRAPH'``
50+
are not indexed. Those names are not useful for searching, since the only
51+
unique part of the name is the codepoint in hexadecimal.
52+
53+
"""
54+
55+
import sys
56+
import re
57+
import unicodedata
58+
import pickle
59+
import warnings
60+
61+
# Matches one "word": a maximal run of letters, digits or underscores.
# Raw string, so that ``\w`` reaches the regex engine instead of being
# treated as an (invalid) string escape sequence.
RE_WORD = re.compile(r'\w+')

# File used to cache the pickled index between runs.
INDEX_NAME = 'charfinder_index.pickle'
# An index is written to INDEX_NAME only if it has more words than this.
MINIMUM_SAVE_LEN = 10000
# Names starting with this prefix are indexed under the prefix words only.
CJK_PREFIX = 'CJK UNIFIED IDEOGRAPH'

# Small character set used by the doctests to build a tiny index.
sample_chars = [
    '$',       # DOLLAR SIGN
    'A',       # LATIN CAPITAL LETTER A
    'a',       # LATIN SMALL LETTER A
    '\u20a0',  # EURO-CURRENCY SIGN
    '\u20ac',  # EURO SIGN
]
74+
75+
76+
def tokenize(text):
    """Yield, in order of appearance, each word of ``text`` in uppercase.

    A word is whatever ``RE_WORD`` matches.
    """
    yield from (word.upper() for word in RE_WORD.findall(text))
80+
81+
82+
class UnicodeNameIndex:
    """Inverted index from words in Unicode character names to code points.

    ``self.index`` is a dict mapping each uppercased word to the set of
    integer code points whose name (as reported by ``unicodedata.name``)
    contains that word.
    """

    def __init__(self, chars=None):
        """Load or build the index; ``chars`` is handed to ``load``."""
        self.load(chars)

    def load(self, chars=None):
        """Populate ``self.index`` from the pickle cache or by building it.

        When ``chars`` is None the cache file ``INDEX_NAME`` is tried first.
        If nothing could be loaded, the index is built with
        ``build_index(chars)`` and, when it holds more than
        ``MINIMUM_SAVE_LEN`` words, saved to ``INDEX_NAME``. Failing to save
        only emits a warning.
        """
        self.index = None
        if chars is None:
            try:
                with open(INDEX_NAME, 'rb') as fp:
                    # NOTE(security): unpickling can execute arbitrary code;
                    # the cache file must never come from an untrusted source.
                    self.index = pickle.load(fp)
            except (OSError, EOFError, pickle.UnpicklingError):
                # missing, unreadable, truncated or corrupt cache:
                # fall through and rebuild the index from scratch
                self.index = None
        if self.index is None:
            self.build_index(chars)
            # Only a freshly built index is saved: one that was just read
            # from INDEX_NAME does not need to be written back to it.
            if len(self.index) > MINIMUM_SAVE_LEN:
                try:
                    self.save()
                except OSError as exc:
                    warnings.warn('Could not save {!r}: {}'
                                  .format(INDEX_NAME, exc))

    def save(self):
        """Pickle ``self.index`` into the file ``INDEX_NAME``."""
        with open(INDEX_NAME, 'wb') as fp:
            pickle.dump(self.index, fp)

    def build_index(self, chars=None):
        """Build ``self.index`` from the names of the characters in ``chars``.

        With ``chars=None`` every code point from 32 up to and including
        ``sys.maxunicode`` is examined. Characters without a name are
        skipped.
        """
        if chars is None:
            # range() excludes its stop value, hence the + 1
            chars = (chr(i) for i in range(32, sys.maxunicode + 1))
        index = {}
        for char in chars:
            try:
                name = unicodedata.name(char)
            except ValueError:  # this character has no name
                continue
            if name.startswith(CJK_PREFIX):
                # keep only the shared prefix, so that the per-character
                # hexadecimal suffix is not indexed as a word
                name = CJK_PREFIX
            code = ord(char)

            for word in tokenize(name):
                index.setdefault(word, set()).add(code)

        self.index = index

    def __len__(self):
        """Return the number of distinct words in the index."""
        return len(self.index)

    def word_rank(self, top=None):
        """Return ``(count, word)`` pairs, most frequent words first.

        Ties are broken alphabetically. When ``top`` is not None, only the
        first ``top`` pairs are returned.
        """
        res = sorted(
            ((len(codes), word) for word, codes in self.index.items()),
            key=lambda item: (-item[0], item[1]))
        if top is not None:
            res = res[:top]
        return res

    def word_report(self, top=None):
        """
        Generate report with most frequent words

        Prints one ``count word`` line for each pair of ``word_rank(top)``.

            >>> index = UnicodeNameIndex()
            >>> index.word_report(7)
            13196 SYLLABLE
            11735 HANGUL
             7616 LETTER
             2232 WITH
             2180 SIGN
             2122 SMALL
             1709 CAPITAL

        NOTE(review): the figures above are kept from the original example;
        actual counts depend on the Unicode database shipped with the
        interpreter -- confirm before running this as a doctest.
        """
        for postings, key in self.word_rank(top):
            print('{:5} {}'.format(postings, key))

    def find_codes(self, query):
        """Yield, in ascending order, the code points matching ``query``.

        A code point matches when its name contains every word of the
        query. Nothing is yielded when the query has no words or when any
        of its words is not in the index.
        """
        result_sets = []
        for word in tokenize(query):
            postings = self.index.get(word)
            if postings is None:  # shortcut: no such word
                return
            result_sets.append(postings)
        if not result_sets:
            return
        # set.intersection() builds a NEW set; the sets stored in the index
        # are shared by all queries and must never be modified in place.
        yield from sorted(set.intersection(*result_sets))

    def describe(self, code):
        """Return ``'U+XXXX<TAB>char<TAB>NAME'`` for the code point ``code``.

        Raises ValueError if the character has no name.
        """
        code_str = 'U+{:04X}'.format(code)
        char = chr(code)
        name = unicodedata.name(char)
        return '{:7}\t{}\t{}'.format(code_str, char, name)

    def find_descriptions(self, query):
        """Yield the ``describe`` line of each code point matching ``query``."""
        for code in self.find_codes(query):
            yield self.describe(code)
179+
180+
181+
def main(*args):
    """Print the characters whose names contain all the words in ``args``.

    The arguments are joined with spaces into a single query. One
    description line is printed per match, followed by a parenthesized
    summary with the number of matches and the query.
    """
    index = UnicodeNameIndex()
    query = ' '.join(args)
    counter = 0
    for counter, line in enumerate(index.find_descriptions(query), start=1):
        print(line)
    if not counter:
        msg = 'No match'
    else:
        msg = '1 match' if counter == 1 else '{} matches'.format(counter)
    print('({} for {!r})'.format(msg, query))
195+
196+
197+
if __name__ == '__main__':
    # Every argument after the script name is a search word; with no
    # search words at all, just show how to call the script.
    if len(sys.argv) <= 1:
        print('Usage: {} word1 [word2]...'.format(sys.argv[0]))
    else:
        main(*sys.argv[1:])

concurrency/http_charserver.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
import asyncio
2+
from aiohttp import web
3+
4+
from charfinder import UnicodeNameIndex
5+
6+
# HTML page returned for every request. ``handle`` fills the {query},
# {message} and {result} placeholders using str.format.
TEMPLATE = '''
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>title</title>
</head>
<body>
<form action="/">
<input type="search" name="query" value="{query}">
<input type="submit" value="find">
</form>
<p>{message}</p>
<hr>
<pre>
{result}
</pre>
</body>
</html>
'''

# Passed as ``content_type`` when ``handle`` builds its response.
CONTENT_TYPE = 'text/html; charset=UTF-8'

# Index searched by ``handle``; it is assigned in the ``__main__`` block
# at the end of the module, before the server starts.
index = None  # a UnicodeNameIndex instance
30+
31+
32+
@asyncio.coroutine
def handle(request):
    """Answer a GET request with the search page.

    The ``query`` parameter of the request, if not empty, is looked up in
    the module-level ``index``; the matching description lines and a
    summary message are rendered into ``TEMPLATE``. Without a query, the
    page only shows a usage hint.

    Returns a ``web.Response`` holding the rendered HTML.
    """
    import html  # local import: only this handler needs it

    query = request.GET.get('query', '')
    print('Query: {!r}'.format(query))
    if query:
        lines = list(index.find_descriptions(query))
        res = '\n'.join(lines)
        # "1 match", but "0 matches" and "2 matches"
        plural = '' if len(lines) == 1 else 'es'
        msg = '{} match{} for {!r}'.format(len(lines), plural, query)
    else:
        res = ''
        msg = 'Type words describing characters, e.g. chess.'

    # The query comes straight from the URL, and the result lines contain
    # the characters found (which may be '<', '>' or '&'), so everything
    # must be escaped before being placed in the HTML page.
    text = TEMPLATE.format(query=html.escape(query),
                           result=html.escape(res),
                           message=html.escape(msg))
    return web.Response(content_type=CONTENT_TYPE, text=text)
48+
49+
50+
@asyncio.coroutine
def init(loop):
    """Start the web application on 127.0.0.1:8080 using ``loop``.

    Registers ``handle`` for GET requests to ``/``, creates the listening
    server and prints the address it is bound to.
    """
    application = web.Application(loop=loop)
    application.router.add_route('GET', '/', handle)

    request_handler = application.make_handler()
    server = yield from loop.create_server(request_handler,
                                           '127.0.0.1', 8080)
    address = server.sockets[0].getsockname()
    print('Serving on {}. Hit CTRL-C to stop.'.format(address))
59+
60+
61+
def main():
    """Start the HTTP server and run the event loop until CTRL-C is hit."""
    loop = asyncio.get_event_loop()
    loop.run_until_complete(init(loop))
    try:
        loop.run_forever()
    except KeyboardInterrupt:  # CTRL+C pressed, as the banner suggests
        pass

    loop.close()
65+
66+
67+
if __name__ == '__main__':
    # Bind the module-level ``index`` read by ``handle`` before serving.
    index = UnicodeNameIndex()
    main()

concurrency/tcp_charserver.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#!/usr/bin/env python3
2+
3+
import asyncio
4+
5+
from charfinder import UnicodeNameIndex
6+
7+
# Line terminator that ``writeln`` appends to every line it sends.
CRLF = b'\r\n'
# Prompt written to the client before each query is read.
PROMPT = b'?> '

# Index searched by ``handle_queries``; it is assigned in the ``__main__``
# block at the end of the module, before the server starts.
index = None  # a UnicodeNameIndex instance
11+
12+
13+
def writeln(writer, arg):
    """Write text to ``writer`` as encoded lines, each ending with CRLF.

    ``arg`` is either a single str, sent as one line, or an iterable of
    str, sent as one line per item.
    """
    texts = [arg] if isinstance(arg, str) else arg
    encoded_lines = (text.encode() + CRLF for text in texts)
    writer.writelines(encoded_lines)
19+
20+
21+
@asyncio.coroutine
def handle_queries(reader, writer):
    """Serve one client: read queries line by line and send the matches.

    For each line received, the matching description lines from the
    module-level ``index`` are sent back, followed by a summary line. A
    blank line is ignored and the client is prompted again. The session
    ends when the client closes the connection, sends bytes that cannot be
    decoded, or sends a line starting with a control character. The
    client socket is closed on the way out, even after an error.
    """
    client = writer.get_extra_info('peername')
    try:
        while True:
            writer.write(PROMPT)
            yield from writer.drain()
            data = yield from reader.readline()
            if not data:  # EOF: the client closed the connection
                break
            try:
                query = data.decode().strip()
            except UnicodeDecodeError:
                query = '\x00'
            if not query:  # blank line: nothing to search, prompt again
                continue
            if ord(query[0]) < 32:  # control character: end the session
                break
            print('Received from {}: {}'.format(client, query))
            lines = list(index.find_descriptions(query))
            if lines:
                writeln(writer, lines)
                plural = 'es' if len(lines) > 1 else ''
                msg = '({} match{} for {!r})'.format(len(lines), plural, query)
                writeln(writer, msg)
                print('Sent: {} lines + total'.format(len(lines)))
            else:
                writeln(writer, '(No match for {!r})'.format(query))
                print('Sent: 1 line, no match')
            yield from writer.drain()
    finally:
        print('Close the client socket')
        writer.close()
49+
50+
51+
def main():
    """Run the TCP server on 127.0.0.1:8888 until CTRL-C is hit.

    After the interruption the server is closed, the loop waits for it to
    finish closing, and then the loop itself is closed.
    """
    loop = asyncio.get_event_loop()
    server = loop.run_until_complete(
        asyncio.start_server(handle_queries, '127.0.0.1', 8888, loop=loop))

    address = server.sockets[0].getsockname()
    print('Serving on {}. Hit CTRL-C to stop.'.format(address))
    try:
        loop.run_forever()
    except KeyboardInterrupt:  # CTRL+C pressed
        pass

    server.close()
    closing = server.wait_closed()
    loop.run_until_complete(closing)
    loop.close()
66+
67+
68+
if __name__ == '__main__':
    # Bind the module-level ``index`` read by ``handle_queries``
    # before serving.
    index = UnicodeNameIndex()
    main()

0 commit comments

Comments
 (0)