Skip to content

Commit c6df60b

Browse files
committed
updated wikipedia examples
1 parent a38ab95 commit c6df60b

File tree

3 files changed

+130
-59
lines changed

3 files changed

+130
-59
lines changed

concurrency/wikipedia/build_fixture.py

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,24 @@
11
import sys
22
import argparse
33
import os
4+
import urllib
45

5-
from daypicts import get_picture_url, validate_date, gen_dates
6-
from daypicts import NoPictureForDate
7-
from daypicts import POTD_PATH
6+
import requests
87

9-
FIXTURE_DIR = 'fixture/'
8+
from daypicts import get_picture_url, get_picture_urls
9+
from daypicts import validate_date, gen_dates, picture_type
10+
from daypicts import NoPictureForDate
11+
from daypicts import REMOTE_PICT_BASE_URL, PICT_EXCEPTIONS
1012

13+
FIXTURE_DOC_DIR = 'fixture/docroot/'
14+
FIXTURE_TEMPLATE_POTD_DIR = FIXTURE_DOC_DIR + 'Template-POTD/'
1115

1216
def parse_args(argv):
1317
parser = argparse.ArgumentParser(description=main.__doc__)
1418
date_help = 'YYYY-MM-DD or YYYY-MM or YYYY: year, month and day'
1519
parser.add_argument('date', help=date_help)
20+
parser.add_argument('-u', '--url_only', action='store_true',
21+
help='get picture URLS only')
1622

1723
args = parser.parse_args(argv)
1824

@@ -47,18 +53,45 @@ def save_picture_urls(dates, save_path):
4753
fp.write(snippet)
4854

4955

56+
def save_pictures(dates, save_path, verbose=False):
    """Download the POTD picture for each date into *save_path*.

    The remote directory layout under REMOTE_PICT_BASE_URL is mirrored
    below *save_path*. Returns the list of URLs actually written.
    """
    # `import urllib` alone does not reliably bind the `urllib.parse`
    # submodule; import the needed name explicitly.
    from urllib.parse import unquote

    urls_ok = []
    for date, url in get_picture_urls(dates, verbose):
        response = requests.get(url)
        # Re-create the remote path layout under save_path.
        file_path = os.path.join(save_path,
                                 url.replace(REMOTE_PICT_BASE_URL, ''))
        file_path = unquote(file_path)
        octets = response.content
        # Dates in PICT_EXCEPTIONS are known non-images, e.g. the .webm at
        # http://en.wikipedia.org/wiki/Template:POTD/2013-06-15
        if date not in PICT_EXCEPTIONS:
            assert picture_type(octets) is not None, url

        try:
            os.makedirs(os.path.dirname(file_path))
        except FileExistsError:
            pass
        with open(file_path, 'wb') as fp:
            fp.write(octets)
        # BUG FIX: urls_ok was never appended to, so the function
        # always returned an empty list.
        urls_ok.append(url)

        print(file_path)
    return urls_ok
78+
79+
5080
def main(argv):
    """Build test fixture from Wikipedia "POTD" data"""

    # Validate the command line first so that a bad invocation exits
    # without leaving fixture directories behind.
    dates, args = parse_args(argv)

    # exist_ok replaces the older try/except FileExistsError dance;
    # the file already requires Python >= 3.3 (FileExistsError).
    os.makedirs(FIXTURE_TEMPLATE_POTD_DIR, exist_ok=True)

    if args.url_only:
        save_picture_urls(dates, FIXTURE_TEMPLATE_POTD_DIR)
    else:
        save_pictures(dates, FIXTURE_DOC_DIR)
94+
6295

6396
if __name__ == '__main__':
6497
main(sys.argv[1:])

concurrency/wikipedia/daypicts.py

Lines changed: 70 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,23 @@
1818
import sys
1919
import argparse
2020
import re
21-
import imghdr
2221
import time
2322
import datetime
23+
import os
24+
import imghdr
25+
import warnings
2426

2527
import requests
2628

27-
SAVE_DIR = 'pictures/'
28-
POTD_PATH = 'Template:POTD/'
29-
POTD_BASE_URL = 'http://en.wikipedia.org/wiki/' + POTD_PATH
29+
SAVE_DIR = 'downloaded/'
30+
31+
#POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
32+
POTD_BASE_URL = 'http://127.0.0.1:8001/Template-POTD/'
33+
34+
REMOTE_PICT_BASE_URL = 'http://upload.wikimedia.org/wikipedia/'
35+
LOCAL_PICT_BASE_URL = 'http://127.0.0.1:8001/'
36+
PICT_BASE_URL = LOCAL_PICT_BASE_URL
37+
3038
POTD_IMAGE_RE = re.compile(r'src="(//upload\..*?)"')
3139
PODT_EARLIEST_TEMPLATE = '2007-01-01'
3240

@@ -35,12 +43,11 @@
3543
RE_DATE = RE_MONTH + r'-([0-3]\d)'
3644
ISO_DATE_FMT = '%Y-%m-%d'
3745

38-
DATEFORMS = [
39-
('date', re.compile('^' + RE_DATE + '$')),
40-
('month', re.compile('^' + RE_MONTH + '$')),
41-
('year', re.compile('^' + RE_YEAR + '$'))
42-
]
46+
PICT_EXCEPTIONS = {
47+
'2013-06-15', # .webm movie [1]
48+
}
4349

50+
#[1] http://en.wikipedia.org/wiki/Template:POTD/2013-06-15
4451

4552
class NoPictureForDate(Exception):
4653
'''No Picture of the Day found for {iso_date}'''
@@ -51,31 +58,15 @@ class NoPictureTemplateBefore(ValueError):
5158

5259

5360
def get_picture_url(iso_date):
    """Return the Picture-Of-The-Day image URL for *iso_date* (YYYY-MM-DD).

    Fetches the POTD template page and scrapes the image src from it.
    Raises NoPictureForDate when the page contains no picture.
    """
    page_url = POTD_BASE_URL + iso_date
    # Removed stray debug print(page_url): a library helper should not
    # write ungated output to stdout (callers already handle verbosity).
    response = requests.get(page_url)
    match = POTD_IMAGE_RE.search(response.text)
    if match is None:
        raise NoPictureForDate(iso_date)
    # The scraped src is protocol-relative ("//upload..."); prefix it.
    return 'http:' + match.group(1)
6068

6169

62-
def get_picture(iso_date):
63-
pict_url = get_picture_url(iso_date)
64-
response = requests.get(pict_url)
65-
octets = response.content
66-
return octets
67-
68-
69-
def get_picture_type(octets):
70-
pict_type = imghdr.what(None, octets)
71-
if pict_type is None:
72-
if (octets.startswith(b'<') and
73-
b'<svg' in octets[:200] and
74-
octets.rstrip().endswith(b'</svg>')):
75-
pict_type = 'svg'
76-
return pict_type
77-
78-
7970
def validate_date(text):
8071
try:
8172
parts = [int(part) for part in text.split('-')]
@@ -116,8 +107,8 @@ def gen_dates(iso_parts):
116107
yield iso_parts
117108

118109

119-
def get_picture_urls(dates, verbose=False, save_fixture=False):
120-
urls = []
110+
def get_picture_urls(dates, verbose=False):
111+
date_urls = []
121112
count = 0
122113
for date in dates:
123114
try:
@@ -132,8 +123,50 @@ def get_picture_urls(dates, verbose=False, save_fixture=False):
132123
print(url.split('/')[-1])
133124
else:
134125
print(url)
135-
urls.append(url)
136-
return urls
126+
date_urls.append((date, url))
127+
return date_urls
128+
129+
130+
def picture_type(octets):
    """Return the image type of *octets* ('gif', 'png', ..., 'svg'), or None.

    Raster formats are recognized via imghdr; SVG, which imghdr does not
    know, is detected by its XML markers.
    """
    kind = imghdr.what(None, octets)
    if kind is not None:
        return kind
    looks_like_svg = (octets.startswith(b'<')
                      and b'<svg' in octets[:200]
                      and octets.rstrip().endswith(b'</svg>'))
    return 'svg' if looks_like_svg else None
138+
139+
140+
def get_pictures(dates, verbose=False):
    """Download the POTD picture for each date into SAVE_DIR/<year>/.

    Failed downloads (non-200 responses) are reported with a warning and
    skipped. Returns the list of URLs successfully downloaded.
    """
    urls_ok = []
    # exist_ok replaces try/except FileExistsError (Python >= 3.3).
    os.makedirs(SAVE_DIR, exist_ok=True)
    for date, url in get_picture_urls(dates, verbose):
        if PICT_BASE_URL == LOCAL_PICT_BASE_URL:
            # Testing against the local fixture server: rewrite the
            # remote upload URL to the local base.
            url = url.replace(REMOTE_PICT_BASE_URL, PICT_BASE_URL)
        response = requests.get(url)
        if response.status_code != 200:
            warnings.warn('HTTP code {}: {}'.format(response.status_code, url))
            continue
        octets = response.content
        # Dates in PICT_EXCEPTIONS are known non-images and skip the check.
        if date not in PICT_EXCEPTIONS:
            assert picture_type(octets) is not None, url
        # Removed leftover debug line (#import pdb; pdb.set_trace()) and
        # the double assignment of file_path whose first value was only
        # ever used for its basename.
        file_name = os.path.basename(url.replace(PICT_BASE_URL, ''))
        year_dir = os.path.join(SAVE_DIR, date.split('-')[0])
        file_path = os.path.join(year_dir, file_name)
        os.makedirs(year_dir, exist_ok=True)
        with open(file_path, 'wb') as fp:
            fp.write(octets)
        urls_ok.append(url)
        print(file_path)
    return urls_ok
137170

138171

139172
def parse_args(argv):
@@ -175,7 +208,12 @@ def main(argv, get_picture_urls):
175208

176209
t0 = time.time()
177210

178-
urls = get_picture_urls(dates, args.verbose, args.fixture_save)
211+
if args.url_only:
212+
urls = get_picture_urls(dates, args.verbose)
213+
else:
214+
urls = get_pictures(dates, args.verbose)
215+
216+
179217

180218
elapsed = time.time() - t0
181219
if args.verbose:

concurrency/wikipedia/test_daypicts.py

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,6 @@
77
from daypicts import *
88

99

10-
GIF_MIN = (b'GIF89a\x01\x00\x01\x00\x00\xff\x00,\x00\x00'
11-
b'\x00\x00\x01\x00\x01\x00\x00\x02\x00;')
12-
SVG_MIN = b'<svg xmlns="http://www.w3.org/2000/svg"></svg>'
13-
SVG_XML_DECL = b'<?xml version="1.0" encoding="UTF-8"?>' + SVG_MIN
14-
NOISE = b'\xb0\x0bU\xbe]L\n\x92\xbe\xc6\xf65"\xcc\xa3\xe3'
15-
16-
1710
@pytest.mark.network
1811
def test_get_picture_url_existing():
1912
url = get_picture_url('2012-01-01')
@@ -28,19 +21,6 @@ def test_get_picture_url_not_existing():
2821
get_picture_url('2013-09-12')
2922

3023

31-
def test_get_picture_type_imghdr():
32-
assert get_picture_type(GIF_MIN) == 'gif'
33-
34-
35-
def test_get_picture_type_svg():
36-
assert get_picture_type(SVG_MIN) == 'svg'
37-
assert get_picture_type(SVG_XML_DECL) == 'svg'
38-
39-
40-
def test_get_picture_type_unknown():
41-
assert get_picture_type(NOISE) is None
42-
43-
4424
def test_validate_full_date():
4525
parts = validate_date('2015-1-2')
4626
assert parts == '2015-01-02'
@@ -85,3 +65,23 @@ def test_gen_year_dates_leap():
8565
dates = list(gen_year_dates('2012'))
8666
assert len(dates) == 366
8767
assert dates[365] == '2012-12-31'
68+
69+
70+
GIF_MIN = (b'GIF89a\x01\x00\x01\x00\x00\xff\x00,\x00\x00'
71+
b'\x00\x00\x01\x00\x01\x00\x00\x02\x00;')
72+
SVG_MIN = b'<svg xmlns="http://www.w3.org/2000/svg"></svg>'
73+
SVG_XML_DECL = b'<?xml version="1.0" encoding="UTF-8"?>' + SVG_MIN
74+
NOISE = b'\xb0\x0bU\xbe]L\n\x92\xbe\xc6\xf65"\xcc\xa3\xe3'
75+
76+
def test_picture_type_imghdr():
    """Raster formats are delegated to imghdr."""
    detected = picture_type(GIF_MIN)
    assert detected == 'gif'
78+
79+
80+
def test_picture_type_svg():
    """SVG is detected with or without a leading XML declaration."""
    for sample in (SVG_MIN, SVG_XML_DECL):
        assert picture_type(sample) == 'svg'
83+
84+
85+
def test_picture_type_unknown():
    """Arbitrary binary noise yields None."""
    detected = picture_type(NOISE)
    assert detected is None
87+

0 commit comments

Comments
 (0)