-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdatawrangle_ps_q2
More file actions
executable file
·109 lines (83 loc) · 2.9 KB
/
datawrangle_ps_q2
File metadata and controls
executable file
·109 lines (83 loc) · 2.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
In this problem set you work with cities infobox data, audit it, come up with a cleaning idea and then clean it up.
Since in the previous quiz you made a decision on which value to keep for the "areaLand" field,
you now know what has to be done.
Finish the function fix_area(). It will receive a string as an input, and it has to return a float
representing the value of the area or None.
You have to change the function fix_area. You can use extra functions if you like, but changes to process_file
will not be taken into account.
The rest of the code is just an example of how this function can be used.
"""
import codecs
import csv
import json
import pprint
# Path of the cities CSV that test() passes to process_file().
CITIES = 'cities.csv'
def is_value_int(value):
    """Report whether ``value`` can be converted with ``int()``.

    Args:
        value: Any object; typically a raw CSV field string.

    Returns:
        True if ``int(value)`` succeeds, False otherwise.
    """
    try:
        int(value)
    except (ValueError, TypeError, OverflowError):
        # The tuple is required: the unparenthesised Python 2 form
        # "except ValueError, TypeError" catches only ValueError and binds
        # the exception to the name TypeError.
        # OverflowError covers infinite floats, which int() rejects.
        return False
    return True
def is_value_float(value):
    """Report whether ``value`` can be converted with ``float()``.

    Args:
        value: Any object; typically a raw CSV field string.

    Returns:
        True if ``float(value)`` succeeds, False otherwise.
    """
    try:
        float(value)
    except (ValueError, TypeError, OverflowError):
        # The tuple is required: the unparenthesised Python 2 form
        # "except ValueError, TypeError" catches only ValueError and binds
        # the exception to the name TypeError.
        # OverflowError covers integers too large to fit in a float.
        return False
    return True
def audit_field(fieldvalue):
    """Classify a raw field value by the Python type it appears to hold.

    Returns:
        NoneType for a missing value (None, "" or "NULL"); int or float when
        is_value_int() / is_value_float() accept the value (checked in that
        order); list when the value starts with "{"; str for anything else.
    """
    if fieldvalue in (None, "", "NULL"):
        return type(None)
    if is_value_int(fieldvalue):
        return int
    if is_value_float(fieldvalue):
        return float
    # Anything left is text; a leading brace marks a "{a|b}" style list.
    return list if fieldvalue.startswith('{') else str
def keywithmaxval(d):
    """Return the key of ``d`` that maps to the largest value.

    When several keys share the largest value, the one that comes first in
    the dict's iteration order is returned. An empty dict raises ValueError
    (from ``max``).
    """
    pairs = list(d.items())
    values = [value for _, value in pairs]
    winner = values.index(max(values))
    return pairs[winner][0]
def fix_area(area):
    """Normalise a raw "areaLand" field to a single float, or None.

    The field is classified once with audit_field() and handled by kind:

    * missing value or non-numeric text -> None
    * a single number -> that number as a float
    * a "{a|b|...}" list -> the entry whose float form has the most
      significant digits; ties go to the entry listed first

    Args:
        area: Raw field value as read from the CSV (a string or None).

    Returns:
        The area as a float, or None when no usable number is present.
    """
    kind = audit_field(area)

    if kind in (int, float):
        return float(area)

    if kind is not list:
        # NoneType and plain strings carry no usable area.
        return None

    candidates = []
    for part in area.strip('{}').split('|'):
        try:
            candidates.append(float(part))
        except ValueError:
            # Skip empty or non-numeric entries (e.g. the one left by "{}").
            continue
    if not candidates:
        return None

    # max() keeps the first maximal element, so ties resolve to list order.
    return max(candidates, key=_significant_digits)


def _significant_digits(number):
    """Count characters of str(number) after dropping '.' and trailing zeros."""
    return len(str(number).replace('.', '').rstrip('0'))
def process_file(filename):
    """Read a cities CSV and return its rows with "areaLand" cleaned.

    The first row is consumed by csv.DictReader as the header; the next
    three rows are discarded as metadata. Every remaining row is appended to
    the result, with its "areaLand" value (when that column exists) replaced
    by the output of fix_area().

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of row dicts in file order.
    """
    # CHANGES TO THIS FUNCTION WILL BE IGNORED WHEN YOU SUBMIT THE EXERCISE
    data = []
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        # skipping the extra metadata
        for _ in range(3):
            # Builtin next() works on Python 2.6+ and Python 3, unlike the
            # Python-2-only reader.next() method.
            next(reader)
        # processing file
        for line in reader:
            # calling your function to fix the area value
            if "areaLand" in line:
                line["areaLand"] = fix_area(line["areaLand"])
            data.append(line)
    return data
def test():
    """Process CITIES, print three cleaned areas and check two known rows."""
    rows = process_file(CITIES)
    print("Printing three example results:")
    for row in rows[5:8]:
        pprint.pprint(row["areaLand"])
    assert rows[8]["areaLand"] == 55166700.0
    assert rows[3]["areaLand"] is None
# Run the self-check only when executed as a script, not when imported.
if __name__ == "__main__":
    test()