Skip to content

Commit f10ba79

Browse files
authored
Merge pull request #389 from JamesParrott/Separate_download_and_tempfile_creation
Make downloading from Github slightly less annoying. Refactor common URL and NamedTemporaryFile code.
2 parents 6b63fea + 0ee5025 commit f10ba79

3 files changed

Lines changed: 172 additions & 49 deletions

File tree

README.md

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ The Python Shapefile Library (PyShp) reads and writes ESRI Shapefiles in pure Py
88

99
- **Author**: [Joel Lawhead](https://github.com/GeospatialPython)
1010
- **Maintainers**: [Karim Bahgat](https://github.com/karimbahgat)
11-
- **Version**: 3.0.5
12-
- **Date**: 18th May 2026
11+
- **Version**: 3.0.6
12+
- **Date**: 19th May 2026
1313
- **License**: [MIT](https://github.com/GeospatialPython/pyshp/blob/master/LICENSE.TXT)
1414

1515
## Contents
@@ -93,7 +93,13 @@ part of your geospatial project.
9393

9494
# Version Changes
9595

96-
## LATEST
96+
## 3.0.6
97+
98+
### URL Downloading
99+
- Unify tempfile creation and shapefile download logic.
100+
- Check "Content-Type" header and sniff initial bytes
101+
of the response, so that likely HTML responses can be rejected before they are parsed as a shapefile, giving users a more useful error.
102+
- Special-case shapefiles hosted in GitHub repos: suggest appending the query string `?raw=true`.
97103

98104
### Testing:
99105
- Add shapefile from Open Natural Hazard Modelling ([Paula Spannring](https://github.com/PaulaSp3)

changelog.txt

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,12 @@
1-
LATEST
1+
VERSION 3.0.6
2+
3+
2026-05-19
4+
URL Downloading:
5+
* Unify tempfile creation and shapefile download logic.
6+
* Check "Content-Type" header and sniff initial bytes
7+
of the response, so that likely HTML responses can be rejected before they are parsed as a shapefile, giving users a more useful error.
8+
* Special-case shapefiles hosted in GitHub repos: suggest appending the query string `?raw=true`.
9+
210

311
2026-05-18
412
Testing:

src/shapefile.py

Lines changed: 154 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
from __future__ import annotations
1010

11-
__version__ = "3.0.5"
11+
__version__ = "3.0.6"
1212

1313
import array
1414
import doctest
@@ -18,6 +18,7 @@
1818
import sys
1919
import tempfile
2020
import time
21+
import warnings
2122
import zipfile
2223
from collections.abc import Container, Iterable, Iterator, Reversible, Sequence
2324
from datetime import date
@@ -42,7 +43,7 @@
4243
overload,
4344
)
4445
from urllib.error import HTTPError
45-
from urllib.parse import urlparse, urlunparse
46+
from urllib.parse import urlparse, urlunparse, ParseResult
4647
from urllib.request import Request, urlopen
4748

4849
# Create named logger
@@ -2195,6 +2196,129 @@ def __geo_interface__(self) -> GeoJSONFeatureCollection:
21952196
)
21962197

21972198

2199+
2200+
def _save_to_named_tmp_file(
    bytes_stream: ReadableBinStream,
    initial_bytes: bytes = b"",
    suffix: str | None = None,
) -> tempfile._TemporaryFileWrapper[bytes]:
    """Copy a binary stream into a named read+write temporary file.

    Args:
        bytes_stream: Any object whose ``read()`` returns the remaining bytes.
        initial_bytes: Bytes that were already consumed from ``bytes_stream``
            (e.g. while sniffing its content); written before the rest.
        suffix: Optional file name suffix for the temporary file, e.g. ".zip".

    Returns:
        The open temporary file, rewound to position 0.  It is created with
        ``delete=True``, so it is removed when closed or garbage collected.

    Raises:
        Whatever ``bytes_stream.read()`` or the file writes raise.  The
        temporary file is closed (and thereby deleted) before re-raising.
    """
    tmp_file_obj = tempfile.NamedTemporaryFile(
        mode="w+b", suffix=suffix, delete=True
    )
    try:
        if initial_bytes:
            tmp_file_obj.write(initial_bytes)
        tmp_file_obj.write(bytes_stream.read())
        tmp_file_obj.seek(0)
    except BaseException:
        # Don't leave a half written file open on disk until the garbage
        # collector gets round to it; closing deletes it immediately.
        tmp_file_obj.close()
        raise
    return tmp_file_obj
2216+
2217+
# Upper-cased byte prefixes that indicate a response body is an HTML page
# rather than binary shapefile data.
HTML_SIGNATURES_UC = (
    b"<!DOCTYPE",
    b"<HTML",
    b"<HEAD",
    b"<BODY",
)


class UnsuccessfulFileDownload(Warning):
    """Warning category for downloads that failed or look like HTML."""


SUPPORTED_URL_SCHEMES = frozenset(["http", "https"])  # must be lower case
DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36"


@overload
def _try_to_download_binary_file(
    urlinfo: ParseResult,
) -> tuple[bytes, ReadableBinStream]: ...
@overload
def _try_to_download_binary_file(
    urlinfo: ParseResult,
    ext: str | None,
    suppress_http_errors: bool,
    user_agent: str,
) -> tuple[bytes, ReadableBinStream | None]: ...
@overload
def _try_to_download_binary_file(
    urlinfo: ParseResult,
    ext: str | None,
    suppress_http_errors: bool,
) -> tuple[bytes, ReadableBinStream | None]: ...
def _try_to_download_binary_file(
    urlinfo: ParseResult,
    ext: str | None = None,  # "shp", "shx" or "dbf" (a leading "." is tolerated)
    suppress_http_errors: bool = False,
    user_agent: str = DEFAULT_USER_AGENT,
) -> tuple[bytes, ReadableBinStream | None]:
    """Open a parsed url and start downloading the file served from it.

    Args:
        urlinfo: The parsed url to request.
        ext: If given, replaces the extension of the url's path before the
            request is made.  With or without a leading ".".
        suppress_http_errors: If False, failures raise.  If True, failures
            are reported as ``UnsuccessfulFileDownload`` warnings (except a
            missing .shx file, which is skipped silently) and
            ``(b"", None)`` is returned.
        user_agent: Value for the request's "User-agent" header.

    Returns:
        ``(initial_bytes, response)``.  ``initial_bytes`` are the first bytes
        already read from ``response`` in order to sniff for HTML, and must
        be prepended to whatever is subsequently read from it.
        ``(b"", None)`` if the download failed and errors are suppressed.

    Raises:
        HTTPError: If opening the url fails and errors are not suppressed.
        ShapefileException: If the server declares an HTML Content-Type and
            errors are not suppressed.

    A body that merely looks like HTML, without an HTML Content-Type, only
    triggers a warning; the response is still returned.
    """
    if ext is not None:
        # Normalise, so that both "shp" and ".shp" are accepted.
        ext = ext.lstrip(".")
        # Removes e.g. ".shp", including the "."
        urlpath, _ = os.path.splitext(urlinfo.path)
        urlinfo = urlinfo._replace(path=f"{urlpath}.{ext}")

    url = urlunparse(urlinfo)

    # Don't enforce method="GET", let urllib pick whichever defaults it
    # thinks are best, to allow possible future support for shapefiles
    # via ftp or on local network addresses.
    req = Request(url, headers={"User-agent": user_agent})

    try:
        resp = urlopen(req)
    except HTTPError as e:
        msg = f"{e.msg}, occurred when trying to open: {url}, reason: {e.reason}. "
        if not suppress_http_errors:
            e.msg = msg  # Add helpful info to the default abrupt 404 message.
            raise
        if ext is None or ext.lower() != "shx":
            # Technically the .shx is required for an ESRI Shapefile,
            # but it's not needed for PyShp, it only contains indices of
            # shapes.  So only warn about the other files.
            warnings.warn(msg, category=UnsuccessfulFileDownload)
        return b"", None

    content_type = resp.headers.get("Content-Type", "")
    if "text/html" in content_type.lower():
        msg = f"Server returned HTML Content-Type: {content_type}"

        # It is preferable not to add special cases for every possible
        # hosting service, but Github is a frequent source of frustration
        # in our own tests.  Match the host name exactly (or a subdomain),
        # ignoring any port or credentials in the netloc.
        hostname = urlinfo.hostname or ""
        if hostname == "github.com" or hostname.endswith(".github.com"):
            msg = f'{msg}\nAppend "?raw=true" after the file name to download from Github repos. '

        resp.close()  # The HTML body is never read, so release the connection.
        if not suppress_http_errors:
            # Callers that don't suppress errors rely on getting a usable
            # stream back, so fail here with a meaningful message.
            raise ShapefileException(msg)
        warnings.warn(msg, category=UnsuccessfulFileDownload)
        return b"", None

    # Sniff the start of the body.  These bytes are handed back to the
    # caller, as they can't be read from the response a second time.
    initial_bytes = resp.read(40)
    if initial_bytes.lstrip().upper().startswith(HTML_SIGNATURES_UC):
        msg = f"Response body appears to be HTML despite Content-Type: '{content_type}'"
        warnings.warn(msg, category=UnsuccessfulFileDownload)

    # All PyShp cares about is that the response has a .read method
    # that returns bytes.  But at the cost of importing http.client
    # we could type this stricter as tuple[bytes, HTTPResponse]:
    # "For HTTP and HTTPS URLs, this function returns a
    # http.client.HTTPResponse object slightly modified."
    return initial_bytes, cast("ReadableBinStream", resp)
2318+
2319+
2320+
2321+
21982322
class ShapefileException(Exception):
21992323
"""An exception to handle shapefile specific problems."""
22002324

@@ -2287,25 +2411,26 @@ def __init__(
22872411
tempfile._TemporaryFileWrapper[bytes] | io.BufferedReader
22882412
)
22892413
# Create a zip file handle
2290-
if zpath.startswith("http"):
2414+
urlinfo = urlparse(zpath)
2415+
2416+
resp: ReadableBinStream | None
2417+
if urlinfo.scheme in SUPPORTED_URL_SCHEMES:
22912418
# Zipfile is from a url
2292-
# Download to a temporary url and treat as normal zipfile
2293-
req = Request(
2294-
zpath,
2295-
headers={
2296-
"User-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36"
2297-
},
2298-
)
2299-
resp = urlopen(req)
2300-
# write zipfile data to a read+write tempfile and use as source, gets deleted when garbage collected
2301-
zipfileobj = tempfile.NamedTemporaryFile(
2302-
mode="w+b", suffix=".zip", delete=True
2419+
# Download to a temporary file and treat as normal zipfile
2420+
sniffed_bytes, resp = _try_to_download_binary_file(urlinfo=urlinfo)
2421+
2422+
2423+
# Use named tmp file as source for zip file data.
2424+
zipfileobj = _save_to_named_tmp_file(
2425+
resp,
2426+
initial_bytes = sniffed_bytes,
2427+
suffix=".zip",
23032428
)
2304-
zipfileobj.write(resp.read())
2305-
zipfileobj.seek(0)
2429+
23062430
else:
23072431
# Zipfile is from a file
23082432
zipfileobj = open(zpath, mode="rb")
2433+
23092434
# Open the zipfile archive
23102435
with zipfile.ZipFile(zipfileobj, "r") as archive:
23112436
if not shapefile:
@@ -2336,12 +2461,8 @@ def __init__(
23362461
for cased_ext in [lower_ext, lower_ext.upper()]:
23372462
try:
23382463
member = archive.open(f"{shapefile}.{cased_ext}")
2339-
# write zipfile member data to a read+write tempfile and use as source, gets deleted on close()
2340-
fileobj = tempfile.NamedTemporaryFile(
2341-
mode="w+b", delete=True
2342-
)
2343-
fileobj.write(member.read())
2344-
fileobj.seek(0)
2464+
# Use read+write tempfile as source for member data.
2465+
fileobj = _save_to_named_tmp_file(member)
23452466
setattr(self, lower_ext, fileobj)
23462467
self._files_to_close.append(fileobj)
23472468
except (OSError, AttributeError, KeyError):
@@ -2369,31 +2490,19 @@ def __init__(
23692490
# Shapefile is from a url
23702491
# Download each file to temporary path and treat as normal shapefile path
23712492
urlinfo = urlparse(path)
2372-
urlpath = urlinfo[2]
2373-
urlpath, _ = os.path.splitext(urlpath)
2374-
shapefile = os.path.basename(urlpath)
23752493
for ext in ["shp", "shx", "dbf"]:
2376-
try:
2377-
_urlinfo = list(urlinfo)
2378-
_urlinfo[2] = urlpath + "." + ext
2379-
_path = urlunparse(_urlinfo)
2380-
req = Request(
2381-
_path,
2382-
headers={
2383-
"User-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36"
2384-
},
2385-
)
2386-
resp = urlopen(req)
2387-
# write url data to a read+write tempfile and use as source, gets deleted on close()
2388-
fileobj = tempfile.NamedTemporaryFile(
2389-
mode="w+b", delete=True
2494+
2495+
sniffed_bytes, resp = _try_to_download_binary_file(
2496+
urlinfo=urlinfo,
2497+
ext=ext,
2498+
suppress_http_errors=True,
23902499
)
2391-
fileobj.write(resp.read())
2392-
fileobj.seek(0)
2393-
setattr(self, ext, fileobj)
2394-
self._files_to_close.append(fileobj)
2395-
except HTTPError:
2396-
pass
2500+
if resp is None:
2501+
continue
2502+
# Use tempfile as source for url data.
2503+
fileobj = _save_to_named_tmp_file(resp, initial_bytes = sniffed_bytes)
2504+
setattr(self, ext, fileobj)
2505+
self._files_to_close.append(fileobj)
23972506
if self.shp or self.dbf:
23982507
# Load and exit early
23992508
self.load()

0 commit comments

Comments
 (0)