Skip to content

Commit 6d0b1ae

Browse files
committed
Add response sniffer to mitigate html in shapefiles
1 parent 2b63981 commit 6d0b1ae

1 file changed

Lines changed: 79 additions & 10 deletions

File tree

dl.py

Lines changed: 79 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import io
12
import os
23
import pathlib
34
from struct import unpack
@@ -8,7 +9,7 @@
89
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36"
910

1011

11-
def dl(path, ext=".shp"):
12+
def dl(path, ext=".shp", user_agent=USER_AGENT):
1213
urlinfo = urlparse(path)
1314
urlpath = urlinfo[2]
1415
urlpath, _ = os.path.splitext(urlpath)
@@ -18,21 +19,89 @@ def dl(path, ext=".shp"):
1819
req = Request(
1920
_path,
2021
headers={
21-
"User-agent": USER_AGENT,
22+
"User-agent": user_agent,
2223
},
2324
)
2425
resp = urlopen(req)
2526
return resp
2627

2728

29+
# Byte prefixes that mark the start of an HTML document. Only the common
# all-lower / all-upper spellings are listed; is_html() compares
# case-insensitively, so mixed-case variants are covered as well.
HTML_SIGNATURES = (
    b"<!DOCTYPE",
    b"<!doctype",
    b"<html",
    b"<HTML",
    b"<head",
    b"<HEAD",
    b"<body",
    b"<BODY",
)


def is_html(data: bytes) -> bool:
    """Return True when *data* looks like the beginning of an HTML document.

    Args:
        data: The first bytes of a response body. Only the leading bytes
            are inspected, so a short prefix is sufficient.

    Returns:
        True if, after skipping an optional UTF-8 BOM and leading
        whitespace, *data* starts with one of HTML_SIGNATURES in any
        letter case; False otherwise (including for empty input).
    """
    # Skip optional BOM and leading whitespace. lstrip() treats its argument
    # as a set of bytes, which is deliberately lenient here.
    stripped = data.lstrip(b"\xef\xbb\xbf \t\r\n")

    # HTML tag names and the doctype keyword are case-insensitive, so
    # "<!DocType" or "<Html" must match too. Fold both sides to lower case.
    signatures = tuple({sig.lower() for sig in HTML_SIGNATURES})
    longest = max(map(len, signatures), default=0)

    # Lower-case only as many bytes as the longest signature needs.
    return stripped[:longest].lower().startswith(signatures)
45+
46+
47+
class PeekedResponse:
    """Wraps a response, pre-reading a prefix to allow sniffing.

    The first ``peek_size`` bytes are consumed from the wrapped response at
    construction time and kept in ``_prefix``. ``read()`` replays those bytes
    before continuing with the wrapped response, so a consumer using
    ``read()`` sees the complete, unmodified body.

    NOTE(review): every attribute other than ``read`` is proxied straight to
    the wrapped response. Other reading methods (``readinto``, ``readline``,
    iteration, ...) therefore bypass the buffered prefix; use ``read()`` only.
    """

    def __init__(self, resp, peek_size=512):
        """Read up to *peek_size* bytes from *resp* and buffer them."""
        self._resp = resp
        self._prefix = resp.read(peek_size)
        self._stream = io.BytesIO(self._prefix)
        # True once the buffered prefix has been handed out completely.
        self._exhausted = False

    def read(self, n=-1):
        """Read up to *n* bytes; ``None`` or a negative *n* reads everything.

        Bytes come from the buffered prefix first and from the wrapped
        response once the prefix is used up.
        """
        if n is None or n < 0:
            # Read-all: drain whatever is left of the prefix, then the rest
            # of the wrapped response. ``None`` is the default size of
            # http.client.HTTPResponse.read, so callers may well pass it.
            data = self._stream.read()
            self._exhausted = True
            return data + self._resp.read()

        if self._exhausted:
            return self._resp.read(n)

        data = self._stream.read(n)
        if len(data) < n:
            # The prefix ran out mid-request; top up from the response.
            self._exhausted = True
            data += self._resp.read(n - len(data))
        return data

    def __enter__(self):
        # Special methods are looked up on the type, not via __getattr__,
        # so context-manager support has to be defined explicitly.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._resp.close()
        return False

    # Proxy everything else the parser might need
    def __getattr__(self, name):
        if name == "_resp":
            # Only reached when _resp is not set yet (e.g. instance created
            # without __init__); fail cleanly instead of recursing forever.
            raise AttributeError(name)
        return getattr(self._resp, name)
69+
70+
def sniff_first_then_dl(path, ext=".shp", user_agent=USER_AGENT):
71+
urlinfo = urlparse(path)
72+
urlpath, _ = os.path.splitext(urlinfo[2])
73+
_urlinfo = list(urlinfo)
74+
_urlinfo[2] = urlpath + "." + ext
75+
_path = urlunparse(_urlinfo)
76+
77+
req = Request(_path, headers={"User-agent": user_agent})
78+
resp = urlopen(req)
79+
80+
# Layer 1: Content-Type header
81+
content_type = resp.headers.get("Content-Type", "")
82+
if "text/html" in content_type:
83+
raise ValueError(
84+
f"Server returned HTML, not a binary file (Content-Type: {content_type})"
85+
)
86+
87+
# Layer 2: Byte sniffing
88+
peeked = PeekedResponse(resp)
89+
if is_html(peeked._prefix):
90+
raise ValueError(
91+
f"Response body appears to be HTML despite Content-Type: '{content_type}'"
92+
)
93+
94+
return peeked
95+
96+
2897
def dl_all(path):
2998
for ext in [
30-
# "shp",
31-
# "shx",
32-
# "dbf",
99+
"shp",
100+
"shx",
101+
"dbf",
33102
]:
34103
try:
35-
resp = dl(path, ext)
104+
resp = sniff_first_then_dl(path, ext)
36105
except HTTPError as e:
37106
raise e
38107
print(resp.status)
@@ -52,8 +121,8 @@ def test_downloads():
52121
# "https://www.example.com",
53122
# "https://github.com/OpenNHM/AvaFrameData/blob/main/avaPopeletzbach/eventArea20090407.shp?raw=true",
54123
# "https://github.com/OpenNHM/AvaFrameData/blob/main/avaPopeletzbach/eventArea20090407.dbf",
55-
# "https://github.com/nvkelso/natural-earth-vector/blob/master/110m_cultural/ne_110m_admin_0_tiny_countries.shp?raw=true"
56-
"https://github.com/nvkelso/natural-earth-vector/blob/master/110m_cultural/ne_110m_admin_0_tiny_countries.dbf"
124+
"https://github.com/nvkelso/natural-earth-vector/blob/master/110m_cultural/ne_110m_admin_0_tiny_countries.shp?raw=true"
125+
# "https://github.com/nvkelso/natural-earth-vector/blob/master/110m_cultural/ne_110m_admin_0_tiny_countries.shp",
57126
]:
58127
print(dl_all(url))
59128

@@ -77,9 +146,9 @@ def direct_unpack():
77146

78147

79148
def main():
80-
# test_downloads()
149+
test_downloads()
81150
# test_header_parse()
82-
direct_unpack()
151+
# direct_unpack()
83152

84153

85154
if __name__ == "__main__":

0 commit comments

Comments
 (0)