67 changes: 67 additions & 0 deletions orders-1-run.log
@@ -0,0 +1,67 @@
Extracting orders.zip...
Reading orders-1.csv...
Read in 533623 rows...
Dropped 65 rows containing nan...
Renamed Customer ID to id...
Parsed dates...
Renamed Order Date to date...
Renamed Price to price...
Dropped 1411 rows containing -1 in id column...

Percentiles for average days between orders for returning customers:
count 51501
mean 32 days 05:31:38.178793
std 24 days 13:12:07.576172
min 0 days 01:42:51.428571
0% 0 days 01:42:51.428571
10% 8 days 06:00:00
20% 13 days 00:00:00
30.0% 17 days 04:00:00
40% 21 days 12:00:00
50% 26 days 03:25:42.857142
60.0% 31 days 10:40:00
70% 38 days 08:00:00
80% 47 days 16:00:00
90% 64 days 00:00:00
max 179 days 00:00:00
Name: date, dtype: object

90% of returning customers average no more than 64 days between orders.
Latest date found in the dataset: 2018-06-27 00:00:00.
Customers who have purchases after 2018-04-24 00:00:00 are considered healthy.
Created 271183 transactions corresponding to healthy customers.
Number of unique customers: 123468.
Number of healthy customers: 39610.
Proportion of healthy customers: 0.32.
Adding secondsSinceRegistration feature...
Adding numOfTransactions feature...
Size of training data set: 398630.
Size of test data set: 133517.
Proportion of transactions corresponding to healthy customers in training dataset: 0.51.
Proportion of transactions corresponding to healthy customers in test dataset: 0.51.
Finding the best random forest model...
Min impurity: 0.000. Area under ROC curve: 0.764.
Min impurity: 0.020. Area under ROC curve: 0.686.
Min impurity: 0.040. Area under ROC curve: 0.684.
Min impurity: 0.060. Area under ROC curve: 0.500.
Min impurity: 0.080. Area under ROC curve: 0.500.
Min impurity: 0.100. Area under ROC curve: 0.500.
Min impurity: 0.120. Area under ROC curve: 0.500.
Min impurity: 0.140. Area under ROC curve: 0.500.
Min impurity: 0.160. Area under ROC curve: 0.500.
Min impurity: 0.180. Area under ROC curve: 0.500.
Min impurity: 0.200. Area under ROC curve: 0.500.
Min impurity: 0.220. Area under ROC curve: 0.500.
Min impurity: 0.240. Area under ROC curve: 0.500.
Min impurity: 0.260. Area under ROC curve: 0.500.
Min impurity: 0.280. Area under ROC curve: 0.500.
Most important features:
[('secondsSinceRegistration', 0.45706872620175287), ('price', 0.41632602073665953), ('numOfTransactions', 0.1266052530615876)]
Adding health score column...
Saving full csv...
Printing csv...

id,health_score
1000001170897,0.2505100887996181
1000015470766,1.0
...
65 changes: 65 additions & 0 deletions orders-2-run.log
@@ -0,0 +1,65 @@
Extracting orders.zip...
Reading orders-2.csv...
Read in 499999 rows...
Dropped 5 rows containing nan...
Parsed dates...
Dropped 1 rows containing -1 in id column...

Percentiles for average days between orders for returning customers:
count 51720
mean 16 days 18:23:21.946281
std 14 days 01:22:24.347529
min 0 days 00:00:01
0% 0 days 00:00:01
10% 3 days 00:41:38.900000
20% 5 days 12:14:16.070000
30.0% 7 days 20:25:11.034782
40% 10 days 08:18:29.133333
50% 13 days 01:51:06.333333
60.0% 16 days 05:22:57.191111
70% 20 days 05:32:30.519999
80% 25 days 18:30:30.033333
90% 35 days 11:43:19.660000
max 89 days 11:15:48.500000
Name: date, dtype: object

90% of returning customers average no more than 35 days between orders.
Latest date found in the dataset: 2018-05-01 23:56:52.
Customers who have purchases after 2018-03-27 12:13:32.340000 are considered healthy.
Created 385972 transactions corresponding to healthy customers.
Number of unique customers: 86207.
Number of healthy customers: 39508.
Proportion of healthy customers: 0.46.
Adding secondsSinceRegistration feature...
Adding numOfTransactions feature...
Size of training data set: 374392.
Size of test data set: 125601.
Proportion of transactions corresponding to healthy customers in training dataset: 0.77.
Proportion of transactions corresponding to healthy customers in test dataset: 0.77.
Finding the best random forest model...
Min impurity: 0.000. Area under ROC curve: 0.738.
Min impurity: 0.020. Area under ROC curve: 0.724.
Min impurity: 0.040. Area under ROC curve: 0.724.
Min impurity: 0.060. Area under ROC curve: 0.500.
Min impurity: 0.080. Area under ROC curve: 0.500.
Min impurity: 0.100. Area under ROC curve: 0.500.
Min impurity: 0.120. Area under ROC curve: 0.500.
Min impurity: 0.140. Area under ROC curve: 0.500.
Min impurity: 0.160. Area under ROC curve: 0.500.
Min impurity: 0.180. Area under ROC curve: 0.500.
Min impurity: 0.200. Area under ROC curve: 0.500.
Min impurity: 0.220. Area under ROC curve: 0.500.
Min impurity: 0.240. Area under ROC curve: 0.500.
Min impurity: 0.260. Area under ROC curve: 0.500.
Min impurity: 0.280. Area under ROC curve: 0.500.
Most important features:
[('secondsSinceRegistration', 0.32592714760036945), ('price', 0.27353931173752621), ('profit', 0.20867555823625111), ('numOfTransactions', 0.19185798242585314)]
Adding health score column...
Saving full csv...
Printing csv...

id,health_score
0000b0000ac761edbe8480f8273fdb07,0.7
00017b2e0c5ecad4d92f6d7b7f4fa1f0,1.0
00018653eaf26f920973426529654866,0.325
...
3 changes: 3 additions & 0 deletions requirements.txt
@@ -0,0 +1,3 @@
numpy
pandas
scikit-learn
196 changes: 196 additions & 0 deletions solution.py
@@ -0,0 +1,196 @@
#!/usr/bin/env python3

import argparse
import zipfile
import datetime

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import sklearn.metrics
import sklearn.ensemble


def read_df(file_name):
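    """Load a raw orders CSV and normalize it: drop NaN rows, unify the
    column names to 'id', 'date' and 'price', parse dates, and drop rows
    with the placeholder id '-1'."""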
print("Reading %s..." % file_name, flush=True)
df = pd.read_csv(file_name, dtype={'id': str, 'Customer ID': str,
'Product SKU': str, 'Price': str,
'price': float, 'profit': float})
original_len = len(df)
print('Read in %d rows...' % original_len, flush=True)
df.dropna(inplace=True)
print("Dropped %d rows containing nan..." % (original_len - len(df)), flush=True)

if 'Customer ID' in df.columns:
df.rename(columns={'Customer ID': 'id'}, inplace=True)
print("Renamed Customer ID to id...", flush=True)

if 'Order Date' in df.columns:
df['Order Date'] = pd.to_datetime(df['Order Date'].str[:10], format="%d/%m/%Y")
else:
df['date'] = pd.to_datetime(df['date'], format="%Y-%m-%d %H:%M:%S")
print("Parsed dates...", flush=True)

if 'Order Date' in df.columns:
df.rename(columns={'Order Date': 'date'}, inplace=True)
print("Renamed Order Date to date...", flush=True)

if 'Price' in df.columns:
df['Price'] = df['Price'].str.replace(',', '').astype(float)
df.rename(columns={'Price': 'price'}, inplace=True)
print("Renamed Price to price...", flush=True)

original_len = len(df)
df = df[df['id'] != '-1']
print("Dropped %d rows containing -1 in id column..." % (original_len - len(df)), flush=True)
print(flush=True)

return df


def create_healthy_binary_target(df):
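    """Label a transaction healthy when its customer's most recent purchase
    falls within the 90th-percentile average inter-order gap of the latest
    date in the dataset."""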
    # n orders span n - 1 gaps; dividing by len(x) underestimated the average.
    # Single-order customers yield a 0 gap and are filtered out below.
    average_days_between_orders = df.groupby('id')['date'].apply(
        lambda x: (x.max() - x.min()) / max(len(x) - 1, 1))
    average_days_between_orders = average_days_between_orders[
        average_days_between_orders != datetime.timedelta(0)]

print("Percentiles for average days between orders for returning customers:", flush=True)
print(average_days_between_orders.describe(percentiles=np.arange(0, 1, 0.1)), flush=True)
print(flush=True)

quantile = average_days_between_orders.quantile(q=0.9)
print("90%% of the customers that make repeated purchase make it after %d days." %
quantile.days, flush=True)
latest_date = df['date'].max()

cut_off = latest_date - quantile
print("Latest date found in the dataset: %s." % latest_date, flush=True)
print("Customers who have purchases after %s are considered healthy." % cut_off, flush=True)

    df['healthy'] = df.groupby('id')['date'].transform('max') > cut_off
print("Created %d transactions corresponding to healthy customers." % np.sum(df['healthy']),
flush=True)

unique_customers = df.groupby('id')['healthy'].tail(1)
num_unique = len(unique_customers)
num_healthy = np.sum(unique_customers)
print("Number of unique customers: %d." % num_unique, flush=True)
print("Number of healthy customers: %d." % num_healthy, flush=True)
print("Proportion of healthy customers: %0.2f." % (num_healthy / num_unique), flush=True)

return df


def add_features(df):
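    """Add per-transaction features: seconds elapsed since the customer's
    first order, and the ordinal number of the transaction for that customer."""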
print("Adding secondsSinceRegistration feature...", flush=True)
df['secondsSinceRegistration'] = df.groupby('id')['date'].transform(
lambda x: (x - x.min())).apply(lambda x: x.total_seconds())
print("Adding numOfTransactions features...", flush=True)
df['numOfTransactions'] = df.groupby('id')['date'].transform(lambda x: np.argsort(x) + 1)

return df


def train_test_splitting(df):
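    """Split by customer id rather than by row, so every transaction of a
    customer lands in the same set and no customer leaks between train and test."""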
train_ids, _ = train_test_split(df['id'].unique())
train_mask = df['id'].isin(train_ids)
df_train = df[train_mask]
df_test = df[~train_mask]
print("Size of training data set: %d." % len(df_train), flush=True)
print("Size of test data set: %d." % len(df_test), flush=True)
train_mean = df_train['healthy'].mean()
print("Proportion of transactions corresponding to healthy customers in training dataset: "
"%0.2f." % train_mean, flush=True)
test_mean = df_test['healthy'].mean()
print("Proportion of transactions corresponding to healthy customers in test dataset: %0.2f." %
test_mean, flush=True)

return df_train, df_test


def features(df):
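    """Return the model's feature columns; 'profit' is only present in orders-2."""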
features_ = ['price', 'secondsSinceRegistration', 'numOfTransactions']
if 'profit' in df.columns:
features_.append('profit')
return features_


def create_find_roc_metric(df_test):
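    """Build a metric that scores a model by AUC on each test customer's
    latest transaction, i.e. the point at which health would be predicted."""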
orders_test_latest = df_test.sort_values('date').groupby('id').tail(1)

def find_roc_metric(model):
healthy_proba = model.predict_proba(orders_test_latest[features(df_test)])[:, 1]
roc_metric = sklearn.metrics.roc_auc_score(orders_test_latest['healthy'], healthy_proba)
return roc_metric

return find_roc_metric


def find_best_model(metric_func, df_train):
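    """Sweep min_impurity_decrease for a random forest and keep the model
    with the highest AUC under metric_func."""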
print("Finding the best random forest model...", flush=True)
    curr_area = -np.inf
    curr_model = None

    for min_impurity_decrease in np.arange(0.0, 0.3, 0.02):
        rf = sklearn.ensemble.RandomForestClassifier(min_impurity_decrease=min_impurity_decrease)
        # df_train[features] relied on pandas' callable indexing; call it explicitly.
        rf.fit(df_train[features(df_train)], df_train['healthy'])
        area = metric_func(rf)
        if area > curr_area:
            curr_area = area
            curr_model = rf

        print("Min impurity: %0.3f. Area under ROC curve: %0.3f." % (min_impurity_decrease, area),
              flush=True)

return curr_model


def print_feature_importances(model, df):
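    """Print (feature, importance) pairs, most important first."""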
print("Most important features:", flush=True)
print(sorted(list(zip(features(df), model.feature_importances_)), key=lambda x: x[1],
reverse=True), flush=True)


def customers_health(model, df):
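    """Score each customer's latest transaction with the model and join the
    resulting health_score back onto every transaction of that customer."""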
if 'health_score' in df:
del df['health_score']

print("Adding health score column...", flush=True)
orders_latest = df.sort_values('date').groupby('id').tail(1)
proba = model.predict_proba(orders_latest[features(df)])[:, 1]
health_score = pd.DataFrame({'id': orders_latest['id'], 'health_score': proba})
return df.join(health_score.set_index('id'), how='inner', on='id')


def print_csv(df):
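    """Print one id,health_score line per customer (scores are identical
    across a customer's rows, so max simply deduplicates)."""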
print("Printig csv...", flush=True)
print()
print("id,health_score", flush=True)
print(df.groupby('id')['health_score'].max().to_csv())


if __name__ == "__main__":
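    # Usage: python3 solution.py orders-1.csv  (expects orders.zip in the working directory)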
parser = argparse.ArgumentParser(description='Train health model.')
parser.add_argument('file_name', type=str, help='orders-1.csv or orders-2.csv.')

args = parser.parse_args()
file_name = args.file_name

with zipfile.ZipFile("orders.zip", 'r') as zip_ref:
print("Extracting orders.zip...", flush=True)
zip_ref.extractall('.')

df = read_df(file_name)
df = create_healthy_binary_target(df)
df = add_features(df)
df_train, df_test = train_test_splitting(df)
roc_metric = create_find_roc_metric(df_test)
model = find_best_model(roc_metric, df_train)
print_feature_importances(model, df)
df = customers_health(model, df)
print("Saving full csv...")
df.to_csv(file_name[:-4] + '-new.csv', index=False)
print_csv(df)