1818import sys
1919import argparse
2020import re
21- import imghdr
2221import time
2322import datetime
23+ import os
24+ import imghdr
25+ import warnings
2426
2527import requests
2628
27- SAVE_DIR = 'pictures/'
28- POTD_PATH = 'Template:POTD/'
29- POTD_BASE_URL = 'http://en.wikipedia.org/wiki/' + POTD_PATH
29+ SAVE_DIR = 'downloaded/'
30+
31+ #POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
32+ POTD_BASE_URL = 'http://127.0.0.1:8001/Template-POTD/'
33+
34+ REMOTE_PICT_BASE_URL = 'http://upload.wikimedia.org/wikipedia/'
35+ LOCAL_PICT_BASE_URL = 'http://127.0.0.1:8001/'
36+ PICT_BASE_URL = LOCAL_PICT_BASE_URL
37+
3038POTD_IMAGE_RE = re .compile (r'src="(//upload\..*?)"' )
3139PODT_EARLIEST_TEMPLATE = '2007-01-01'
3240
3543RE_DATE = RE_MONTH + r'-([0-3]\d)'
3644ISO_DATE_FMT = '%Y-%m-%d'
3745
38- DATEFORMS = [
39- ('date' , re .compile ('^' + RE_DATE + '$' )),
40- ('month' , re .compile ('^' + RE_MONTH + '$' )),
41- ('year' , re .compile ('^' + RE_YEAR + '$' ))
42- ]
46+ PICT_EXCEPTIONS = {
47+ '2013-06-15' , # .webm movie [1]
48+ }
4349
50+ #[1] http://en.wikipedia.org/wiki/Template:POTD/2013-06-15
4451
4552class NoPictureForDate (Exception ):
4653 '''No Picture of the Day found for {iso_date}'''
@@ -51,31 +58,15 @@ class NoPictureTemplateBefore(ValueError):
5158
5259
def get_picture_url(iso_date):
    '''Return the picture URL found on the POTD template page for iso_date.

    The page address is POTD_BASE_URL with iso_date appended. The address
    is printed before the page is fetched, and the page text is searched
    with POTD_IMAGE_RE. The captured group is a scheme-less address, so
    'http:' is put in front of it.

    Raises NoPictureForDate(iso_date) when the pattern does not match
    the page text.
    '''
    template_url = POTD_BASE_URL + iso_date
    print(template_url)
    page_text = requests.get(template_url).text
    match = POTD_IMAGE_RE.search(page_text)
    if match is not None:
        return 'http:' + match.group(1)
    raise NoPictureForDate(iso_date)
6068
6169
62- def get_picture (iso_date ):
63- pict_url = get_picture_url (iso_date )
64- response = requests .get (pict_url )
65- octets = response .content
66- return octets
67-
68-
69- def get_picture_type (octets ):
70- pict_type = imghdr .what (None , octets )
71- if pict_type is None :
72- if (octets .startswith (b'<' ) and
73- b'<svg' in octets [:200 ] and
74- octets .rstrip ().endswith (b'</svg>' )):
75- pict_type = 'svg'
76- return pict_type
77-
78-
7970def validate_date (text ):
8071 try :
8172 parts = [int (part ) for part in text .split ('-' )]
@@ -116,8 +107,8 @@ def gen_dates(iso_parts):
116107 yield iso_parts
117108
118109
119- def get_picture_urls (dates , verbose = False , save_fixture = False ):
120- urls = []
110+ def get_picture_urls (dates , verbose = False ):
111+ date_urls = []
121112 count = 0
122113 for date in dates :
123114 try :
@@ -132,8 +123,50 @@ def get_picture_urls(dates, verbose=False, save_fixture=False):
132123 print (url .split ('/' )[- 1 ])
133124 else :
134125 print (url )
135- urls .append (url )
136- return urls
126+ date_urls .append ((date , url ))
127+ return date_urls
128+
129+
def picture_type(octets):
    '''Return a short type name for the picture held in octets, or None.

    octets is the raw content of a file, as bytes. Whatever imghdr
    reports is returned as is. When imghdr does not recognise the data,
    a small heuristic decides whether it is an SVG document: it must
    start with b'<', contain b'<svg' within its first 200 bytes, and end
    with b'</svg>' once trailing whitespace is ignored. In that case
    'svg' is returned; otherwise the result is None.
    '''
    detected = imghdr.what(None, octets)
    if detected is not None:
        return detected
    looks_like_svg = (
        octets.startswith(b'<')
        and b'<svg' in octets[:200]
        and octets.rstrip().endswith(b'</svg>')
    )
    return 'svg' if looks_like_svg else None
138+
139+
def get_pictures(dates, verbose=False):
    '''Download the pictures for dates and save them below SAVE_DIR.

    dates and verbose are passed straight to get_picture_urls, which
    supplies (date, url) pairs; date is an ISO date string. Each picture
    is written to SAVE_DIR/<year>/<file name>, where <year> is the part
    of the date before the first '-' and <file name> is the last path
    segment of the URL. The path of every saved file is printed.

    A response whose status code is not 200 produces a warning and the
    picture is skipped.

    Returns the list of URLs that were downloaded and saved.

    Raises AssertionError(url) when the downloaded content is not
    recognised by picture_type and the date is not listed in
    PICT_EXCEPTIONS.
    '''
    urls_ok = []
    # exist_ok=True tolerates an already existing directory but still
    # fails loudly if the path exists and is not a directory.
    os.makedirs(SAVE_DIR, exist_ok=True)
    for date, url in get_picture_urls(dates, verbose):
        if PICT_BASE_URL == LOCAL_PICT_BASE_URL:
            # The local base URL is selected: point the remote address
            # at it instead.
            url = url.replace(REMOTE_PICT_BASE_URL, PICT_BASE_URL)
        response = requests.get(url)
        if response.status_code != 200:
            warnings.warn('HTTP code {}: {}'.format(response.status_code, url))
            continue
        octets = response.content
        # Raised explicitly instead of using an assert statement so the
        # check is not stripped when Python runs with -O; the exception
        # type is the same as before.
        if date not in PICT_EXCEPTIONS and picture_type(octets) is None:
            raise AssertionError(url)
        year_dir = os.path.join(SAVE_DIR, date.split('-')[0])
        file_path = os.path.join(year_dir, os.path.basename(url))
        os.makedirs(year_dir, exist_ok=True)
        with open(file_path, 'wb') as fp:
            fp.write(octets)
        urls_ok.append(url)
        print(file_path)
    return urls_ok
137170
138171
139172def parse_args (argv ):
@@ -175,7 +208,12 @@ def main(argv, get_picture_urls):
175208
176209 t0 = time .time ()
177210
178- urls = get_picture_urls (dates , args .verbose , args .fixture_save )
211+ if args .url_only :
212+ urls = get_picture_urls (dates , args .verbose )
213+ else :
214+ urls = get_pictures (dates , args .verbose )
215+
216+
179217
180218 elapsed = time .time () - t0
181219 if args .verbose :
0 commit comments