Skip to content

Commit ab6ce5b

Browse files
committed
wikipedia pictures download example
1 parent 73d98de commit ab6ce5b

37 files changed

+2043
-38
lines changed
Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="utf-8">
5+
<title>Charserver</title>
6+
<script type="text/javascript">
7+
//(function() {
8+
// Endpoint that receives the character queries.
var BASE_URL = 'http://127.0.0.1:8888/chars';

// Value sent as the `limit` query parameter of each request.
var RESULTS_PER_REQUEST = 10;

// Delay used when scheduling a follow-up request, in milliseconds.
var REQUEST_DELAY = 1000;

// A single request object is shared by every query; its state changes are
// routed to processResponse.
var httpRequest = new XMLHttpRequest();
httpRequest.onreadystatechange = processResponse;
13+
14+
// Build the event handler that starts a character search.
//
// start: offset of the first result wanted.
//   NOTE(review): currently unused -- the request URL carries only `query`
//   and `limit`. Confirm which offset parameter (if any) the server accepts
//   before wiring it in.
//
// Returns a function suitable for `onsubmit` / `onclick`. When called it
// sends the GET request through the shared httpRequest, shows the query in
// #message, removes every child of #results, and returns false so that a
// form bound through `onsubmit` is not submitted.
function requestMaker(start) {
  return function makeRequest(event) {
    var query = document.getElementById('queryField').value;
    // Escape the user's text: characters such as '&', '#', '+' or spaces
    // would otherwise be parsed as URL syntax instead of query content.
    var url = BASE_URL +
        '?query=' + encodeURIComponent(query) +
        '&limit=' + RESULTS_PER_REQUEST;
    httpRequest.open('GET', url);
    httpRequest.send();
    document.getElementById('message').textContent = 'Query: ' + query;
    // Clear the rows left by the previous search.
    var table = document.getElementById('results');
    while (table.lastChild) {
      table.removeChild(table.lastChild);
    }
    return false; // don't submit form
  };
}
28+
29+
// State-change handler for the shared httpRequest.
// Nothing happens until readyState reaches 4. At that point a 200 response
// body is handed to fillTable; any other status is reported through alert()
// together with the current content of the query field.
function processResponse() {
  var DONE = 4;
  var OK = 200;
  if (httpRequest.readyState !== DONE) {
    return;
  }
  var query = document.getElementById('queryField').value;
  if (httpRequest.status !== OK) {
    alert('query: ' + query + '\nstatus: ' + httpRequest.status);
    return;
  }
  fillTable(httpRequest.responseText);
}
39+
40+
// Split a string into an array of symbols, one entry per Unicode character.
// Needed for iterating over characters after U+FFFF, which occupy two
// UTF-16 code units (a high surrogate followed by a low surrogate).
//
// string: the text to split.
// Returns an array of strings; a well-formed surrogate pair becomes a single
// two-unit entry, every other code unit becomes its own entry.
function getSymbols(string) {
  var length = string.length;
  var output = [];
  var index = 0;
  var unit;
  var next;
  var width;
  while (index < length) {
    unit = string.charCodeAt(index);
    width = 1;
    if (unit >= 0xD800 && unit <= 0xDBFF && index + 1 < length) {
      // Only join the two units when the second really is a low surrogate;
      // an unpaired high surrogate must not swallow the character after it.
      next = string.charCodeAt(index + 1);
      if (next >= 0xDC00 && next <= 0xDFFF) {
        width = 2;
      }
    }
    output.push(string.slice(index, index + width));
    index += width;
  }
  return output;
}
58+
59+
// Adapted from: https://developer.mozilla.org/...
// en-US/docs/Web/JavaScript/Reference/Global_Objects/String/charCodeAt
//
// Return the code point of the idx-th character of str, where a surrogate
// pair counts as one character.
//
// str: value to inspect; it is converted to a string first.
// idx: character position; every surrogate pair that starts before the
//      current position moves it one code unit further.
// Returns the code point as a number, or NaN when the position falls
// outside the string.
function knownCharCodeAt(str, idx) {
  var text = str + '';
  var pairPattern = /[\uD800-\uDBFF][\uDC00-\uDFFF]/g;

  // Walk the pairs from the left; stop at the first one that does not start
  // before the (already shifted) position.
  while (pairPattern.exec(text) !== null) {
    if (!(pairPattern.lastIndex - 2 < idx)) {
      break;
    }
    idx++;
  }

  if (idx < 0 || idx >= text.length) {
    return NaN;
  }

  var unit = text.charCodeAt(idx);
  var isHighSurrogate = unit >= 0xD800 && unit <= 0xDBFF;
  if (!isHighSurrogate) {
    return unit;
  }
  // The unit opens a surrogate pair: combine it with the following unit.
  var trailing = text.charCodeAt(idx + 1);
  return ((unit - 0xD800) * 0x400) + (trailing - 0xDC00) + 0x10000;
}
89+
90+
// Format a character as its code point label, e.g. 'A' -> 'U+0041'.
//
// uniChar: a string holding one character (one UTF-16 unit, or a surrogate
//          pair for characters after U+FFFF).
// Returns 'U+' followed by the upper-case hexadecimal code point, left-padded
// with zeros to at least four digits.
function codePointStr(uniChar) {
  // A single unit is its own code point; longer input goes through
  // knownCharCodeAt, which combines surrogate pairs.
  var code = uniChar.length === 1
      ? uniChar.charCodeAt(0)
      : knownCharCodeAt(uniChar, 0);
  var codeStr = code.toString(16).toUpperCase();
  // '0000'.slice(n) yields the zeros still missing, and '' once n >= 4.
  var padding = '0000'.slice(codeStr.length);
  return 'U+' + padding + codeStr;
}
100+
101+
// Append one table row per character found in a server response.
//
// responseData: JSON text with the fields read below -- `chars` (string of
//               characters), `start`, `stop` (index range to display) and
//               `total`.
//               NOTE(review): the loop indexes the symbols of `chars` with
//               start..stop as the original did; confirm against the server
//               whether `chars` holds the full result or only one page.
//
// Each row gets the code point label in a <td>, the character itself in a
// <th>, and the label as the row id. Newlines are skipped, a NUL character
// ends the listing. When the response says more results exist
// (stop < total), one follow-up request is scheduled after REQUEST_DELAY.
function fillTable(responseData) {
  var results = JSON.parse(responseData);
  var table = document.getElementById('results');
  var characters = getSymbols(results.chars);
  for (var i = results.start; i < results.stop; i++) {
    var ch = characters[i];
    // Fewer symbols than the announced range: nothing left to show.
    if (ch === undefined) break;
    if (ch === '\n') continue;
    if (ch === '\x00') break;
    var hexCode = codePointStr(ch);
    var codeCell = document.createElement('td');
    codeCell.appendChild(document.createTextNode(hexCode));
    var charCell = document.createElement('th');
    charCell.appendChild(document.createTextNode(ch));
    var tr = document.createElement('tr');
    tr.appendChild(codeCell);
    tr.appendChild(charCell);
    tr.id = hexCode;
    table.appendChild(tr);
  }
  if (results.stop < results.total) {
    // Pass the handler itself so the timer runs it after the delay; calling
    // it here would fire the request immediately and hand its return value
    // to setTimeout. Scheduled once per response, not once per row.
    setTimeout(requestMaker(results.stop), REQUEST_DELAY);
  }
}
124+
// Once the page has loaded, make both submitting the form and clicking the
// "find" button start a search from offset 0 with the same handler.
window.onload = function () {
  var startSearch = requestMaker(0);
  var form = document.getElementById('queryForm');
  form.onsubmit = startSearch;
  var button = document.getElementById('queryButton');
  button.onclick = startSearch;
};
129+
//})();
130+
</script>
131+
</head>
132+
<body>
133+
<p>
134+
<form id="queryForm">
135+
<input id="queryField" type="search" name="query" value="">
136+
<input id="queryButton" type="button" value="find">
137+
Examples: {links}
138+
</form>
139+
</p>
140+
<p id="message">{message}</p>
141+
<hr>
142+
<table id="results">
143+
</table>
144+
</body>
145+
</html>
Lines changed: 37 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -60,14 +60,17 @@
6060
import unicodedata
6161
import pickle
6262
import warnings
63+
import itertools
64+
from collections import namedtuple
6365

6466
RE_WORD = re.compile('\w+')
6567
RE_UNICODE_NAME = re.compile('^[A-Z0-9 -]+$')
6668
RE_CODEPOINT = re.compile('U\+([0-9A-F]{4,6})')
6769

6870
INDEX_NAME = 'charfinder_index.pickle'
6971
MINIMUM_SAVE_LEN = 10000
70-
CJK_PREFIX = 'CJK UNIFIED IDEOGRAPH'
72+
CJK_UNI_PREFIX = 'CJK UNIFIED IDEOGRAPH'
73+
CJK_CMP_PREFIX = 'CJK COMPATIBILITY IDEOGRAPH'
7174

7275
sample_chars = [
7376
'$', # DOLLAR SIGN
@@ -83,6 +86,7 @@ def tokenize(text):
8386
for match in RE_WORD.finditer(text):
8487
yield match.group().upper()
8588

89+
8690
def query_type(text):
8791
text_upper = text.upper()
8892
if 'U+' in text_upper:
@@ -92,6 +96,7 @@ def query_type(text):
9296
else:
9397
return 'CHARACTERS'
9498

99+
CharDescription = namedtuple('CharDescription', 'code_str char name')
95100

96101
class UnicodeNameIndex:
97102

@@ -128,12 +133,13 @@ def build_index(self, chars=None):
128133
name = unicodedata.name(char)
129134
except ValueError:
130135
continue
131-
if name.startswith(CJK_PREFIX):
132-
name = CJK_PREFIX
133-
code = ord(char)
136+
if name.startswith(CJK_UNI_PREFIX):
137+
name = CJK_UNI_PREFIX
138+
elif name.startswith(CJK_CMP_PREFIX):
139+
name = CJK_CMP_PREFIX
134140

135141
for word in tokenize(name):
136-
index.setdefault(word, set()).add(code)
142+
index.setdefault(word, set()).add(char)
137143

138144
self.index = index
139145

@@ -151,7 +157,8 @@ def word_report(self, top=None):
151157
for postings, key in self.word_rank(top):
152158
print('{:5} {}'.format(postings, key))
153159

154-
def find_codes(self, query):
160+
def find_chars(self, query, start=0, stop=None):
161+
stop = sys.maxsize if stop is None else stop
155162
result_sets = []
156163
for word in tokenize(query):
157164
if word in self.index:
@@ -160,23 +167,30 @@ def find_codes(self, query):
160167
result_sets = []
161168
break
162169
if result_sets:
163-
result = result_sets[0]
164-
result.intersection_update(*result_sets[1:])
165-
else:
166-
result = set()
167-
if len(result) > 0:
168-
for code in sorted(result):
169-
yield code
170-
171-
def describe(self, code):
172-
code_str = 'U+{:04X}'.format(code)
173-
char = chr(code)
170+
result = result_sets[0].intersection(*result_sets[1:])
171+
result = sorted(result) # must sort for consistency
172+
for char in itertools.islice(result, start, stop):
173+
yield char
174+
175+
def find_codes(self, query, start=0, stop=None):
176+
return (ord(char) for char
177+
in self.find_chars(query, start, stop))
178+
179+
def describe(self, char):
180+
code_str = 'U+{:04X}'.format(ord(char))
174181
name = unicodedata.name(char)
175-
return '{:7}\t{}\t{}'.format(code_str, char, name)
182+
return CharDescription(code_str, char, name)
183+
184+
def find_descriptions(self, query, start=0, stop=None):
185+
for char in self.find_chars(query, start, stop):
186+
yield self.describe(char)
187+
188+
def describe_str(self, char):
189+
return '{:7}\t{}\t{}'.format(*self.describe(char))
176190

177-
def find_descriptions(self, query):
178-
for code in self.find_codes(query):
179-
yield self.describe(code)
191+
def find_description_strs(self, query, start=0, stop=None):
192+
for char in self.find_chars(query, start, stop):
193+
yield self.describe_str(char)
180194

181195
@staticmethod # not an instance method due to concurrency
182196
def status(query, counter):
@@ -192,7 +206,8 @@ def status(query, counter):
192206
def main(*args):
193207
index = UnicodeNameIndex()
194208
query = ' '.join(args)
195-
for n, line in enumerate(index.find_descriptions(query), 1):
209+
n = 0
210+
for n, line in enumerate(index.find_description_strs(query), 1):
196211
print(line)
197212
print('({})'.format(index.status(query, n)))
198213

3.29 MB
Binary file not shown.
Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,19 +23,25 @@
2323
</p>
2424
<p>{message}</p>
2525
<hr>
26-
<pre>
26+
<table>
2727
{result}
28-
</pre>
28+
</table>
2929
</body>
3030
</html>
3131
'''
3232

33-
CONTENT_TYPE = 'text/html; charset=UTF-8'
34-
35-
EXAMPLE_WORDS = ('chess cat circled Malayalam digit Roman face Ethiopic'
33+
EXAMPLE_WORDS = ('bismillah chess cat circled Malayalam digit Roman face Ethiopic'
3634
' black mark symbol dot operator Braille hexagram').split()
35+
3736
LINK_TPL = '<a href="/?query={0}" title="find &quot;{0}&quot;">{0}</a>'
3837

38+
LINKS_HTML = ', '.join(LINK_TPL.format(word)
39+
for word in sorted(EXAMPLE_WORDS, key=str.upper))
40+
41+
ROW_TPL = '<tr><td>{code_str}</td><th>{char}</th><td>{name}</td></tr>'
42+
43+
CONTENT_TYPE = 'text/html; charset=UTF-8'
44+
3945
index = None # a UnicodeNameIndex instance
4046

4147

@@ -44,19 +50,18 @@ def handle(request):
4450
query = request.GET.get('query', '')
4551
print('Query: {!r}'.format(query))
4652
if query:
47-
lines = list(index.find_descriptions(query))
48-
res = '\n'.join(lines)
49-
msg = index.status(query, len(lines))
53+
descriptions = list(index.find_descriptions(query))
54+
res = '\n'.join(ROW_TPL.format(**vars(descr))
55+
for descr in descriptions)
56+
msg = index.status(query, len(descriptions))
5057
else:
51-
lines = []
58+
descriptions = []
5259
res = ''
5360
msg = 'Type words describing characters.'
5461

55-
links = ', '.join(LINK_TPL.format(word)
56-
for word in sorted(EXAMPLE_WORDS, key=str.upper))
5762
text = PAGE_TPL.format(query=query, result=res,
58-
message=msg, links=links)
59-
print('Sending {} results'.format(len(lines)))
63+
message=msg, links=LINKS_HTML)
64+
print('Sending {} results'.format(len(descriptions)))
6065
return web.Response(content_type=CONTENT_TYPE, text=text)
6166

6267

@@ -77,7 +82,7 @@ def main(address="127.0.0.1", port=8888):
7782
loop.run_until_complete(init(loop, address, port))
7883
loop.run_forever()
7984

80-
85+
8186
if __name__ == '__main__':
8287
index = UnicodeNameIndex()
8388
main(*sys.argv[1:])

0 commit comments

Comments
 (0)