-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathparse.py
More file actions
146 lines (124 loc) · 3.94 KB
/
parse.py
File metadata and controls
146 lines (124 loc) · 3.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import re
from tld import get_tld
# Length bounds (in characters, after surrounding whitespace is stripped)
# outside of which URLParsed.sanitize_url() rejects a URL.
MAX_URL_LEN = 1000
MIN_URL_LEN = 4


class URLParsed(object):
    """
    Parse a URL string into its components.

    Attributes set by the constructor (None when not available):
        original      the URL exactly as it was passed in
        url           sanitized URL handed to ``get_tld``; None if rejected
        subdomain
        suffix
        tld
        domain        ``subdomain + '.' + tld``, or just ``tld``
        path          text of ``original`` that follows the domain
        path_list     '/'-separated path elements, query string removed
        query_string  text after the first '?' of the path
        local (bool)  True when no domain could be extracted; stays None
                      when the URL was rejected by ``sanitize_url``
    """

    def _tld_result(self):
        """
        Return the ``get_tld(..., as_object=True)`` result for ``self.url``,
        or None when the lookup fails for any reason.

        The result is cached per URL so that the three accessors below
        share a single lookup instead of re-parsing the same URL.
        """
        url = getattr(self, 'url', None)
        cached = getattr(self, '_tld_cache', None)
        if cached is not None and cached[0] == url:
            return cached[1]
        try:
            result = get_tld(url, as_object=True)
        except Exception:
            # Best effort: an unparsable URL simply has no TLD parts.
            result = None
        self._tld_cache = (url, result)
        return result

    # NOTE(review): the accessors below read ``.subdomain``, ``.suffix`` and
    # ``.tld`` from the result object; the attribute names exposed by the
    # ``tld`` package appear to differ between releases -- confirm against
    # the installed version. A missing attribute is treated as "no value".

    def get_subdomain(self):
        """Return the subdomain of ``self.url``, or None if there is none."""
        return getattr(self._tld_result(), 'subdomain', None) or None

    def get_suffix(self):
        """Return the suffix of ``self.url``, or None if there is none."""
        return getattr(self._tld_result(), 'suffix', None) or None

    def parse_tld(self):
        """
        Return the ``tld`` attribute of the lookup result, or None when the
        lookup failed or produced no suffix.
        """
        result = self._tld_result()
        if not getattr(result, 'suffix', None):
            return None
        return getattr(result, 'tld', None)

    def get_domain(self):
        """
        Return ``subdomain.tld`` when both are known, ``tld`` alone when
        there is no subdomain, and None when ``tld`` is unknown.
        """
        if not self.tld:
            # Without a tld there is nothing to join a subdomain onto.
            return None
        if self.subdomain:
            return self.subdomain + '.' + self.tld
        return self.tld

    def get_path(self):
        """
        Return the part of ``original`` that follows the domain.

        Appends local '/' where doesn't exist. Returns None when nothing
        follows the domain or when ``original`` is not a string.
        """
        try:
            if self.domain:
                # Split at the FIRST occurrence: the domain may legitimately
                # show up again later (redirect targets, share links, ...).
                _head, found, tail = self.original.partition(self.domain)
                path = tail if found else self.original
            else:
                # No domain: the whole (local) URL is the path.
                path = self.original.strip()
        except AttributeError:
            return None
        if not path:
            return None
        if path[:1] != '/':
            return '/' + path
        return path

    def get_path_list(self):
        """
        Returns list of elements in the path
        Removes query string

        When the path contains a '?', everything after the first one is
        stored in ``self.query_string`` (without the '?'). Returns None
        when ``self.path`` is not a string.
        """
        try:
            # Cut the query off before splitting on '/', so that slashes
            # inside the query string are not mistaken for path separators.
            path_part, marker, query = self.path.partition('?')
        except AttributeError:
            return None
        if marker:
            self.query_string = query
        # TODO add query string parser to dict
        return path_part.strip('/').split('/')

    def get_local(self):
        """Return True when no domain was extracted, False otherwise."""
        return not self.domain

    def sanitize_url(self, url=None):
        """
        Remove non-link '/' edge case urls
        add 'http://' for tld parsing trickery

        Returns None for:
        - values that are not strings (e.g. None)
        - 'javascript:' pseudo-URLs (any letter case)
        - urls longer than MAX_URL_LEN chars
        - urls shorter than MIN_URL_LEN chars
        Otherwise returns the whitespace-stripped URL, prefixed with
        'http://' unless it already starts with 'http://' or 'https://'.
        """
        try:
            str_url = url.strip()  # strip whitespace
        except AttributeError:
            return None
        # outlying cases
        if not MIN_URL_LEN <= len(str_url) <= MAX_URL_LEN:
            return None
        lowered = str_url.lower()
        if lowered.startswith('javascript:'):
            return None
        # Require the full scheme separator so that hosts which merely begin
        # with "http" (e.g. "httpbin.org") still get a scheme prepended.
        if not lowered.startswith(('http://', 'https://')):
            return 'http://' + str_url
        return str_url

    def __init__(self, url, test=False):
        """
        Parse ``url`` and populate the attributes listed on the class.

        url  -- the URL string to parse
        test -- when true, print every parsed attribute for debugging
        """
        self.original = url  # store original url but process sanitized url
        self.url = self.sanitize_url(url)
        self.subdomain = None
        self.suffix = None
        self.tld = None
        self.domain = None
        self.path = None
        self.local = None
        self.path_list = None
        self.query_string = None
        if self.url:
            # Order matters: domain needs subdomain/tld, path and local need
            # domain, and path_list (which may set query_string) needs path.
            self.subdomain = self.get_subdomain()
            self.suffix = self.get_suffix()
            self.tld = self.parse_tld()
            self.domain = self.get_domain()
            self.path = self.get_path()
            self.local = self.get_local()
            self.path_list = self.get_path_list()
        if test:
            # Single-argument print(...) gives the same output on Python 2
            # and Python 3 (label, colon, two spaces, value).
            for label in ('url', 'subdomain', 'suffix', 'tld', 'domain',
                          'path', 'local', 'path_list', 'query_string'):
                print('%s:  %s' % (label, getattr(self, label)))