|
8 | 8 |
|
9 | 9 | from __future__ import annotations |
10 | 10 |
|
11 | | -__version__ = "3.0.5" |
| 11 | +__version__ = "3.0.6" |
12 | 12 |
|
13 | 13 | import array |
14 | 14 | import doctest |
|
18 | 18 | import sys |
19 | 19 | import tempfile |
20 | 20 | import time |
| 21 | +import warnings |
21 | 22 | import zipfile |
22 | 23 | from collections.abc import Container, Iterable, Iterator, Reversible, Sequence |
23 | 24 | from datetime import date |
|
42 | 43 | overload, |
43 | 44 | ) |
44 | 45 | from urllib.error import HTTPError |
45 | | -from urllib.parse import urlparse, urlunparse |
| 46 | +from urllib.parse import urlparse, urlunparse, ParseResult |
46 | 47 | from urllib.request import Request, urlopen |
47 | 48 |
|
48 | 49 | # Create named logger |
@@ -2195,6 +2196,129 @@ def __geo_interface__(self) -> GeoJSONFeatureCollection: |
2195 | 2196 | ) |
2196 | 2197 |
|
2197 | 2198 |
|
| 2199 | + |
def _save_to_named_tmp_file(
    bytes_stream: ReadableBinStream,
    initial_bytes: bytes = b"",
    suffix: str | None = None,
) -> tempfile._TemporaryFileWrapper[bytes]:
    """Copy a binary stream into a new read+write named temporary file.

    Args:
        bytes_stream: Object whose ``read()`` returns the remaining bytes.
        initial_bytes: Bytes already consumed from the stream (e.g. while
            sniffing its content); written before the rest of the stream.
        suffix: Optional file name suffix for the temporary file.

    Returns:
        The temporary file object, rewound to position 0.  It was created
        with ``delete=True``, so the file is removed once it is closed
        (including when it is garbage collected).

    Raises:
        Whatever ``bytes_stream.read()`` or the file writes raise; the
        temporary file is closed before the exception propagates.
    """
    tmp_file_obj = tempfile.NamedTemporaryFile(
        mode="w+b", suffix=suffix, delete=True
    )
    try:
        if initial_bytes:
            tmp_file_obj.write(initial_bytes)
        tmp_file_obj.write(bytes_stream.read())
        tmp_file_obj.seek(0)
    except BaseException:
        # The caller never receives the file object on failure, so close it
        # here instead of leaving the handle open until garbage collection.
        tmp_file_obj.close()
        raise
    return tmp_file_obj
| 2216 | + |
# Upper-case byte prefixes that identify a response body as an HTML page
# (compared against the upper-cased first bytes of a download).
HTML_SIGNATURES_UC = (b"<!DOCTYPE", b"<HTML", b"<HEAD", b"<BODY")


class UnsuccessfulFileDownload(Warning):
    """Warning category for a file that could not be downloaded from a URL."""


# URL schemes that are treated as remote downloads.  Must be lower case.
SUPPORTED_URL_SCHEMES = frozenset({"http", "https"})

# User-Agent header sent with download requests unless the caller overrides it.
DEFAULT_USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/35.0.1916.47 Safari/537.36"
)
| 2228 | + |
| 2229 | + |
@overload
def _try_to_download_binary_file(
    urlinfo: ParseResult,
) -> tuple[bytes, ReadableBinStream]: ...
@overload
def _try_to_download_binary_file(
    urlinfo: ParseResult,
    ext: str | None,
    suppress_http_errors: bool,
    user_agent: str,
) -> tuple[bytes, ReadableBinStream | None]: ...
@overload
def _try_to_download_binary_file(
    urlinfo: ParseResult,
    ext: str | None,
    suppress_http_errors: bool,
) -> tuple[bytes, ReadableBinStream | None]: ...
def _try_to_download_binary_file(
    urlinfo: ParseResult,
    ext: str | None = None,
    suppress_http_errors: bool = False,
    user_agent: str = DEFAULT_USER_AGENT,
) -> tuple[bytes, ReadableBinStream | None]:
    """Try to open a parsed url and download the file served from it.

    Args:
        urlinfo: The parsed url to download from.
        ext: If given, replaces the extension of the url's path before the
            request is made.  Accepted with or without the leading dot
            (``"shp"`` and ``".shp"`` are equivalent).
        suppress_http_errors: If true, an ``HTTPError`` is not raised;
            ``(b"", None)`` is returned instead, with a warning unless the
            requested extension is ``shx``.
        user_agent: Value of the ``User-agent`` request header.

    Returns:
        ``(initial_bytes, stream)``.  ``initial_bytes`` holds the bytes
        already read from ``stream`` for content sniffing, and must be
        prepended to whatever is read from ``stream`` afterwards.
        ``(b"", None)`` is returned when an HTTP error was suppressed or
        when the server declared an HTML Content-Type.
        NOTE(review): the first overload promises a non-``None`` stream,
        but the HTML Content-Type path below returns ``None`` regardless
        of the arguments -- callers should confirm they handle that.

    Raises:
        HTTPError: If the request fails and ``suppress_http_errors`` is
            false.  The url and reason are added to the error's message.

    Warns:
        UnsuccessfulFileDownload: On a suppressed HTTP error (except for
            ``shx``), on an HTML Content-Type, or if the first bytes of the
            body look like HTML.
    """
    if ext is not None:
        # Normalise so that a dotted extension does not yield "name..shp",
        # and so the "shx" comparison below works for either spelling.
        ext = ext.lstrip(".")
        # splitext removes e.g. ".shp", including the ".".
        urlpath, _ = os.path.splitext(urlinfo.path)
        urlinfo = urlinfo._replace(path=f"{urlpath}.{ext}")

    url = urlunparse(urlinfo)

    # Don't enforce method="GET"; let urllib pick whichever defaults it
    # thinks are best, to allow possible future support for shapefiles via
    # ftp or on local network addresses.
    req = Request(url, headers={"User-agent": user_agent})

    try:
        resp = urlopen(req)
    except HTTPError as e:
        msg = f"{e.msg}, occurred when trying to open: {url}, reason: {e.reason}. "
        if not suppress_http_errors:
            e.msg = msg  # Add helpful info to the default abrupt 404 message.
            raise
        if ext is None or ext.lower() != "shx":
            # Technically the .shx is required for an ESRI Shapefile, but
            # it's not needed for PyShp (it only contains indices of
            # shapes), so a missing .shx is skipped silently.
            warnings.warn(msg, category=UnsuccessfulFileDownload)
        return b"", None

    content_type = resp.headers.get("Content-Type", "")
    # Media types are case-insensitive, so compare in lower case.
    if "text/html" in content_type.lower():
        msg = f"Server returned HTML (Content-Type: {content_type})"

        # It is preferable not to add special cases for every possible
        # hosting service, but Github is a frequent source of frustration:
        # its file pages are HTML unless "?raw=true" is appended.
        # Match on the host name only, so that ports are ignored and
        # look-alike domains such as "notgithub.com" are not matched.
        host = (urlinfo.hostname or "").lower()
        if host == "github.com" or host.endswith(".github.com"):
            msg = f'{msg}\nAppend "?raw=true" after the file name to download from Github repos. '
        warnings.warn(msg, category=UnsuccessfulFileDownload)
        # The response is not handed to the caller, so release it here.
        resp.close()
        return b"", None

    initial_bytes = resp.read(40)
    # Ignore leading whitespace, which commonly precedes the first HTML tag.
    if initial_bytes.lstrip().upper().startswith(HTML_SIGNATURES_UC):
        msg = f"Response body appears to be HTML despite Content-Type: '{content_type}'"
        warnings.warn(msg, category=UnsuccessfulFileDownload)

    # All PyShp cares about is that the response has a .read method that
    # returns bytes.  At the cost of importing http.client this could be
    # typed more strictly as tuple[bytes, HTTPResponse]: "For HTTP and
    # HTTPS URLs, this function returns a http.client.HTTPResponse object
    # slightly modified."
    return initial_bytes, cast(ReadableBinStream, resp)
| 2318 | + |
| 2319 | + |
| 2320 | + |
| 2321 | + |
2198 | 2322 | class ShapefileException(Exception): |
2199 | 2323 | """An exception to handle shapefile specific problems.""" |
2200 | 2324 |
|
@@ -2287,25 +2411,26 @@ def __init__( |
2287 | 2411 | tempfile._TemporaryFileWrapper[bytes] | io.BufferedReader |
2288 | 2412 | ) |
2289 | 2413 | # Create a zip file handle |
2290 | | - if zpath.startswith("http"): |
| 2414 | + urlinfo = urlparse(zpath) |
| 2415 | + |
| 2416 | + resp: ReadableBinStream | None |
| 2417 | + if urlinfo.scheme in SUPPORTED_URL_SCHEMES: |
2291 | 2418 | # Zipfile is from a url |
2292 | | - # Download to a temporary url and treat as normal zipfile |
2293 | | - req = Request( |
2294 | | - zpath, |
2295 | | - headers={ |
2296 | | - "User-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36" |
2297 | | - }, |
2298 | | - ) |
2299 | | - resp = urlopen(req) |
2300 | | - # write zipfile data to a read+write tempfile and use as source, gets deleted when garbage collected |
2301 | | - zipfileobj = tempfile.NamedTemporaryFile( |
2302 | | - mode="w+b", suffix=".zip", delete=True |
| 2419 | + # Download to a temporary file and treat as normal zipfile |
| 2420 | + sniffed_bytes, resp = _try_to_download_binary_file(urlinfo=urlinfo) |
| 2421 | + |
| 2422 | + |
| 2423 | + # Use named tmp file as source for zip file data. |
| 2424 | + zipfileobj = _save_to_named_tmp_file( |
| 2425 | + resp, |
| 2426 | + initial_bytes = sniffed_bytes, |
| 2427 | + suffix=".zip", |
2303 | 2428 | ) |
2304 | | - zipfileobj.write(resp.read()) |
2305 | | - zipfileobj.seek(0) |
| 2429 | + |
2306 | 2430 | else: |
2307 | 2431 | # Zipfile is from a file |
2308 | 2432 | zipfileobj = open(zpath, mode="rb") |
| 2433 | + |
2309 | 2434 | # Open the zipfile archive |
2310 | 2435 | with zipfile.ZipFile(zipfileobj, "r") as archive: |
2311 | 2436 | if not shapefile: |
@@ -2336,12 +2461,8 @@ def __init__( |
2336 | 2461 | for cased_ext in [lower_ext, lower_ext.upper()]: |
2337 | 2462 | try: |
2338 | 2463 | member = archive.open(f"{shapefile}.{cased_ext}") |
2339 | | - # write zipfile member data to a read+write tempfile and use as source, gets deleted on close() |
2340 | | - fileobj = tempfile.NamedTemporaryFile( |
2341 | | - mode="w+b", delete=True |
2342 | | - ) |
2343 | | - fileobj.write(member.read()) |
2344 | | - fileobj.seek(0) |
| 2464 | + # Use read+write tempfile as source for member data. |
| 2465 | + fileobj = _save_to_named_tmp_file(member) |
2345 | 2466 | setattr(self, lower_ext, fileobj) |
2346 | 2467 | self._files_to_close.append(fileobj) |
2347 | 2468 | except (OSError, AttributeError, KeyError): |
@@ -2369,31 +2490,19 @@ def __init__( |
2369 | 2490 | # Shapefile is from a url |
2370 | 2491 | # Download each file to temporary path and treat as normal shapefile path |
2371 | 2492 | urlinfo = urlparse(path) |
2372 | | - urlpath = urlinfo[2] |
2373 | | - urlpath, _ = os.path.splitext(urlpath) |
2374 | | - shapefile = os.path.basename(urlpath) |
2375 | 2493 | for ext in ["shp", "shx", "dbf"]: |
2376 | | - try: |
2377 | | - _urlinfo = list(urlinfo) |
2378 | | - _urlinfo[2] = urlpath + "." + ext |
2379 | | - _path = urlunparse(_urlinfo) |
2380 | | - req = Request( |
2381 | | - _path, |
2382 | | - headers={ |
2383 | | - "User-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36" |
2384 | | - }, |
2385 | | - ) |
2386 | | - resp = urlopen(req) |
2387 | | - # write url data to a read+write tempfile and use as source, gets deleted on close() |
2388 | | - fileobj = tempfile.NamedTemporaryFile( |
2389 | | - mode="w+b", delete=True |
| 2494 | + |
| 2495 | + sniffed_bytes, resp = _try_to_download_binary_file( |
| 2496 | + urlinfo=urlinfo, |
| 2497 | + ext=ext, |
| 2498 | + suppress_http_errors=True, |
2390 | 2499 | ) |
2391 | | - fileobj.write(resp.read()) |
2392 | | - fileobj.seek(0) |
2393 | | - setattr(self, ext, fileobj) |
2394 | | - self._files_to_close.append(fileobj) |
2395 | | - except HTTPError: |
2396 | | - pass |
| 2500 | + if resp is None: |
| 2501 | + continue |
| 2502 | + # Use tempfile as source for url data. |
| 2503 | + fileobj = _save_to_named_tmp_file(resp, initial_bytes = sniffed_bytes) |
| 2504 | + setattr(self, ext, fileobj) |
| 2505 | + self._files_to_close.append(fileobj) |
2397 | 2506 | if self.shp or self.dbf: |
2398 | 2507 | # Load and exit early |
2399 | 2508 | self.load() |
|
0 commit comments