Skip to content

Latest commit

 

History

History
271 lines (197 loc) · 5.43 KB

File metadata and controls

271 lines (197 loc) · 5.43 KB

New empty dataframe

df = pd.DataFrame()

# With column names
df = pd.DataFrame(columns = ['stock', 'beta_capm', 'beta_fama', 'beta_smb', 'beta_hml', 'avg_return'])

Rows

Info about the dataframe

print(df.info())
print(df.shape)

# Column names
print(df.columns.values)

# Datatypes
print(df.dtypes)

Print the Nth row

# The first row
print(myDF.iloc[0])

Print others

# Without index
print(df.to_string(index=False))

Count rows in different ways

# Show nr of rows containing a specific value
print(sum(df['myFruitColumn'] == 'apple'))
print(sum(df['myCountsColumn'] >= 11))

# Count apples, oranges and bananas with groupby
print(df.groupby(['myFruitColumn']).size())

# Show nr of rows
print(len(df.index))
# OR (inde 0 of the shape that gives a row,col tuple)
print(df.shape[0])
# OR
rows = len(df.axes[0])
# OR (slowest)
print(df.count())

Columns

Show

print(len(df.columns))
# OR
print(df.shape[1])
# OR
cols = len(df.axes[1])

Display the Dataframe

pd.set_option('display.max_rows', 5)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.precision', 3)
display(myDF)


#### Print with more characters in Jupyter notebook before breaking to new rows
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 8,
                       'display.width', 1000
                       ):    
  print(df)

# OR
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000):    
    print(df)

Rename Dataframe

# From test to TEST
myNewDF = myDF.rename(columns = {'test':'TEST', 'adi':'ADI', 'w23':'W24'}, inplace = True)

Conditions

IF

if 'A' in df.columns: print("is in")
if 'A' not in df: print("not in")

# One or more columns exist (Not all below are tested!!!)
if set(['A','C']).issubset(df.columns): print("they exist")
if {'A', 'C'}.issubset(df.columns): print("they exist")
if set(('A','B')) <= set(df.columns): print("they exist")
if all(item in df.columns for item in ['A','C']): print("they exist")
if all(item in df.columns for item in ['A','C']): print("they exist")
if df.columns.isin(['A', 'C']).any(): print("they exist")
if not df.columns.isin(['A', 'C']).any(): print("not in")

Loops

For

for i, row in df.iterrows():
    theCar = row["cars"]

Files

csv

# Save
myDf.to_csv(filename, sep=";")

# Load
myDf = pd.read_csv(filename, sep=";", axis=0)

# Load with spearator regex (at least 2 blandspaces)
myDf = pd.read_csv(filename, sep="\s{2,}", axis=0)

# Treat values as na values
data = pd.read_csv(file,na_values=['*', '**', '***'])

# Only import some columns
data = pd.read_csv(filename, sep = "\t", usecols = ["Date", "Items", "Name", "SomeOtherCol"])


# Save to CSV file
Station1.to_csv(file1, sep=",", index=False, float_format="%.2f")

plain text (everything to 1 column)

data = []
with open('filename', "r") as f_in:
  for line in map(str.strip, f_in):
    if now line
      continue
    data.append({"colname": line})
    
myDF = pd.DataFrame(data)

Excel

df = pd.read_excel(myExcelfile, sheet_name="return")

Calculate

# Example
avg_smb = df_factors_12["smb_vw"].mean()

.median()
.round(0)
.cov()
.std()
.var()

# Standard deviation
.std()

# Unique occurences
.nunique()

String stuff

.astype(str)
.str.slice(start = 0, stop= 6)

Re-type

.astype(int)


# Print the types
print(df.dtypes)

Get different items from a DF

Station1_may = Station1.loc[Station1["YR--MODAHRMN"].astype(str).str.slice(start = 0, stop= 6) == '201705']

Get different items from a DF

# Print the top rows
myDF.head()
# Print the last rows
myDF.tail()

Time and date

# Convert 'Date' from string to datetime format (example 'Date': ['20200101', '20200201', '20200301', '20200401'])
data['Date'] = pd.to_datetime(data['Date'], format='%Y%m%d')

# Extracting year, month, and day from the 'Date' column
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day

Grouping and multiindex

# Group a DF
df_grp = data.groupby(["Year","Month"]).mean(numeric_only=False)
# Get a subset DF based on the grouping index
df_march = df_grp.loc[2020,3]
# Take out one of the columns
mean_sweden = df_march["Sweden_Daily"]

# Loop through the groups
for idx, df_select in df_grp.groupby(level=[0, 1]):
    # Convert the month number to name
    s = datetime.date(1900, idx[1], 1).strftime('%B')

    # Add the data to the array
    a.append([idx[0], s, round(df_select["Sweden_Daily"].mean(),0)])

# More: https://pandas.pydata.org/docs/reference/groupby.html

Filter based on / Find data based on

# Create 2 new dataframes based on an content in a column (in this case the column AnimalId)
newDF1 = df.loc[selected["AnimalId"] == 2998]
newDF2 = df.loc[selected["AnimalId"] == 2850]

#Identify the peak 
idx_max_Sweden = df['Sweden_Daily'].idxmax()
theDate = data.loc[idx_max_Sweden,"Date"].date()

# Get the sum of the Sweden_Daily values, when Date = March (m)
theSum = df.loc[df["Date"].dt.month==3]["Sweden_Daily"].sum()