-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathread_call_copy.py
More file actions
67 lines (55 loc) · 2 KB
/
read_call_copy.py
File metadata and controls
67 lines (55 loc) · 2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# read_call_copy.py
#
#!/usr/bin/env python
# coding: utf-8
import pandas as pd
# Shortcut dataset name -> download location.
_TITANIC_URL = "https://github.com/EpistasisLab/pmlb/raw/refs/heads/master/datasets/titanic/titanic.tsv.gz"

# Separator keywords (compared case-insensitively) -> value handed to pandas.
# "NONE" maps to None so pandas' python engine detects the separator itself.
_SEPARATOR_KEYWORDS = {
    "T": "\t",
    "TAB": "\t",
    "C": ",",
    "COMMA": ",",
    "SPACE": " ",
    "SEMICOLON": ";",
    "PIPE": "|",
    "COLON": ":",
    "NONE": None,
}


def read_file(filepath=None, file_separator=None, target=None):
    """Load a delimited file, drop incomplete rows, and split off the target.

    Args:
        filepath: Path or URL passed to ``pandas.read_csv``. The keyword
            ``"titanic"`` (any case) selects the PMLB Titanic dataset URL.
        file_separator: Either a literal separator string or one of the
            keywords (any case, surrounding whitespace ignored):
            ``t``/``tab``, ``c``/``comma``, ``space``, ``semicolon``,
            ``pipe``, ``colon``, ``none`` (let pandas detect the separator)
            or ``other`` (prompt interactively for a custom separator).
            Any other value is passed to pandas unchanged.
        target: Label of the target column.

    Returns:
        A tuple ``(df_new, df_features, df_target)``: the frame with rows
        containing missing values removed, that frame without the target
        column, and the target column on its own.

    Raises:
        SystemExit: With exit code 1, after printing a message, when any of
            the three arguments is missing (``None`` or an empty string).
        KeyError: When ``target`` is not a column of the loaded file.
    """
    # Validate every argument up front so the user is never prompted for a
    # custom separator only to be rejected afterwards. SystemExit is raised
    # directly because this module does not import ``sys``.
    if filepath is None or filepath == "":
        print("Please provide a file path.")
        raise SystemExit(1)
    if file_separator is None or file_separator == "":
        print("Please provide a file separator.")
        raise SystemExit(1)
    if target is None or target == "":
        print("Please provide a target variable.")
        raise SystemExit(1)

    if filepath.strip().upper() == "TITANIC":
        filepath = _TITANIC_URL

    # Only the keyword comparison is normalised; a literal separator such as
    # a single space must reach pandas untouched.
    keyword = file_separator.strip().upper()
    if keyword == "OTHER":
        file_separator = input("Please provide a custom file separator: ")
    elif keyword in _SEPARATOR_KEYWORDS:
        file_separator = _SEPARATOR_KEYWORDS[keyword]

    # read the file
    df = pd.read_csv(filepath, sep=file_separator, engine="python")
    if target not in df.columns:
        raise KeyError(
            f"Target column {target!r} not found; "
            f"available columns: {list(df.columns)}"
        )

    # Remove rows with missing values, then split into features and target.
    df_new = df.dropna()
    df_features = df_new.drop(columns=[target])
    df_target = df_new[target]
    return df_new, df_features, df_target
# Ask the user where the data lives, how it is delimited, and which column
# is the prediction target.
file_path = input("Full path to dataframe: ")
file_separator = input("Separator used in dataframe (i.e., tab): ")
target_col = input("Target feature in the dataframe (column name): ")

# Load the file and unpack the cleaned frame, its features, and its target.
df, features, target_values = read_file(
    filepath=file_path, file_separator=file_separator, target=target_col
)

# Take the rows without missing values and keep a separate copy of them, so
# the frame returned by read_file stays as loaded.
df_new = df.dropna()
df_new_copy = df_new.copy()