1+ import io
12import os
23import pathlib
34from struct import unpack
# Default value sent in the "User-agent" request header by the download helpers.
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/35.0.1916.47 Safari/537.36"
)
910
1011
11- def dl(path, ext=".shp"):
12+ def dl (path , ext = ".shp" , user_agent = USER_AGENT ):
1213 urlinfo = urlparse (path )
1314 urlpath = urlinfo [2 ]
1415 urlpath , _ = os .path .splitext (urlpath )
@@ -18,21 +19,89 @@ def dl(path, ext=".shp"):
1819 req = Request (
1920 _path ,
2021 headers = {
21- "User-agent": USER_AGENT ,
22+ "User-agent" : user_agent ,
2223 },
2324 )
2425 resp = urlopen (req )
2526 return resp
2627
2728
# Byte prefixes that mark the start of an HTML document.  Kept in both cases
# for anyone matching against the tuple directly; is_html() itself compares
# case-insensitively.
HTML_SIGNATURES = (
    b"<!DOCTYPE",
    b"<!doctype",
    b"<html",
    b"<HTML",
    b"<head",
    b"<HEAD",
    b"<body",
    b"<BODY",
)

# Lower-cased, de-duplicated signatures and the longest length among them, so
# only a short slice of the payload has to be lower-cased per call.
_HTML_SIGNATURES_LOWER = tuple(dict.fromkeys(s.lower() for s in HTML_SIGNATURES))
_HTML_SIGNATURE_MAX_LEN = max(len(s) for s in _HTML_SIGNATURES_LOWER)


def is_html(data: bytes) -> bool:
    """Return True if *data* looks like the beginning of an HTML document.

    Leading UTF-8 BOM bytes and ASCII whitespace (space, tab, CR, LF) are
    skipped, then the remaining bytes are compared case-insensitively against
    ``HTML_SIGNATURES``, so mixed-case markup such as ``<!DocType html>`` or
    ``<Html>`` is recognised as well.

    Args:
        data: The first bytes of a payload (for example a response prefix).

    Returns:
        True when the payload starts with a known HTML signature, False
        otherwise (including for empty input).
    """
    # Skip optional BOM and leading whitespace
    stripped = data.lstrip(b"\xef\xbb\xbf \t\r\n")
    head = stripped[:_HTML_SIGNATURE_MAX_LEN].lower()
    return head.startswith(_HTML_SIGNATURES_LOWER)
45+
46+
class PeekedResponse:
    """Wrap a response, pre-reading a prefix so the content can be sniffed.

    On construction the first ``peek_size`` bytes are read from ``resp`` and
    kept in ``_prefix``.  Subsequent ``read`` / ``readinto`` calls replay that
    prefix first and then continue with the underlying response, so a consumer
    sees the complete, unmodified byte stream.

    Any attribute not defined here is forwarded to the wrapped response.

    NOTE(review): other reading methods of the wrapped object that are not
    overridden here (e.g. ``readline`` or iteration) are forwarded as-is and
    would therefore skip the peeked prefix.
    """

    def __init__(self, resp, peek_size=512):
        """Read up to ``peek_size`` bytes from ``resp`` and buffer them."""
        self._resp = resp
        self._prefix = resp.read(peek_size)
        self._stream = io.BytesIO(self._prefix)
        self._exhausted = False

    def read(self, n=-1):
        """Read up to ``n`` bytes; ``None`` or a negative ``n`` reads everything.

        Bytes come from the buffered prefix first and, once that is used up,
        from the underlying response.
        """
        read_all = n is None or n < 0
        if self._exhausted:
            # Call read() without a size for "everything": not every
            # file-like object accepts a negative size.
            return self._resp.read() if read_all else self._resp.read(n)
        if read_all:
            self._exhausted = True
            return self._stream.read() + self._resp.read()
        data = self._stream.read(n)
        if len(data) < n:
            # Prefix ran out mid-request: top up from the real response.
            self._exhausted = True
            data += self._resp.read(n - len(data))
        return data

    def readinto(self, b):
        """Fill the writable buffer ``b`` via ``read``; return the byte count."""
        view = memoryview(b).cast("B")
        data = self.read(len(view))
        view[: len(data)] = data
        return len(data)

    # Dunder methods are looked up on the type, so __getattr__ cannot provide
    # context-manager support; define it explicitly.
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._resp.close()

    # Proxy everything else the parser might need
    def __getattr__(self, name):
        if name == "_resp":
            # _resp itself is missing (e.g. __init__ failed before assigning
            # it); looking it up again would recurse without bound.
            raise AttributeError(name)
        return getattr(self._resp, name)
68+
69+
def sniff_first_then_dl(path, ext=".shp", user_agent=USER_AGENT):
    """Download ``path`` with its extension swapped for ``ext``, rejecting HTML.

    The extension of the URL path is replaced by ``ext`` and the resulting URL
    is requested.  The response is refused when it looks like an HTML page
    (for example an error or landing page served instead of the binary file),
    checked in two layers: the ``Content-Type`` header and the first bytes of
    the body.

    Args:
        path: URL of the resource; its current extension is discarded.
        ext: Extension to request, with or without a leading dot
            (``"shp"`` and ``".shp"`` are equivalent).
        user_agent: Value sent in the ``User-agent`` request header.

    Returns:
        A ``PeekedResponse`` wrapping the open response; reading from it
        yields the full body, including the bytes already sniffed.

    Raises:
        ValueError: If the response is HTML according to either check.  The
            underlying response is closed before the error propagates.
    """
    urlinfo = urlparse(path)
    urlpath, _ = os.path.splitext(urlinfo[2])
    _urlinfo = list(urlinfo)
    # Tolerate a leading dot so the default ".shp" does not produce "..shp".
    _urlinfo[2] = urlpath + "." + ext.lstrip(".")
    _path = urlunparse(_urlinfo)

    req = Request(_path, headers={"User-agent": user_agent})
    resp = urlopen(req)

    try:
        # Layer 1: Content-Type header (media types are case-insensitive)
        content_type = resp.headers.get("Content-Type", "")
        if "text/html" in content_type.lower():
            raise ValueError(
                f"Server returned HTML, not a binary file (Content-Type: {content_type} )"
            )

        # Layer 2: Byte sniffing
        peeked = PeekedResponse(resp)
        if is_html(peeked._prefix):
            raise ValueError(
                f"Response body appears to be HTML despite Content-Type: '{content_type} '"
            )
    except BaseException:
        # Do not leak the connection when the response is rejected or the
        # sniffing read fails; the caller never receives a handle to close.
        resp.close()
        raise

    return peeked
95+
96+
2897def dl_all (path ):
2998 for ext in [
30- # "shp",
31- # "shx",
32- # "dbf",
99+ "shp" ,
100+ "shx" ,
101+ "dbf" ,
33102 ]:
34103 try :
35- resp = dl (path, ext)
104+ resp = sniff_first_then_dl (path , ext )
36105 except HTTPError as e :
37106 raise e
38107 print (resp .status )
@@ -52,8 +121,8 @@ def test_downloads():
52121 # "https://www.example.com",
53122 # "https://github.com/OpenNHM/AvaFrameData/blob/main/avaPopeletzbach/eventArea20090407.shp?raw=true",
54123 # "https://github.com/OpenNHM/AvaFrameData/blob/main/avaPopeletzbach/eventArea20090407.dbf",
55- # "https://github.com/nvkelso/natural-earth-vector/blob/master/110m_cultural/ne_110m_admin_0_tiny_countries.shp?raw=true"
56- "https://github.com/nvkelso/natural-earth-vector/blob/master/110m_cultural/ne_110m_admin_0_tiny_countries.dbf"
124+ "https://github.com/nvkelso/natural-earth-vector/blob/master/110m_cultural/ne_110m_admin_0_tiny_countries.shp?raw=true"
125+ # "https://github.com/nvkelso/natural-earth-vector/blob/master/110m_cultural/ne_110m_admin_0_tiny_countries.shp",
57126 ]:
58127 print (dl_all (url ))
59128
@@ -77,9 +146,9 @@ def direct_unpack():
77146
78147
def main():
    """Script entry point: run the download check."""
    test_downloads()
    # test_header_parse()
    # direct_unpack()
83152
84153
85154if __name__ == "__main__" :
0 commit comments