udacity · CameronJLSweeney · May 4, 2025 · May 4, 2025 · May 4, 2025 · May 4, 2025
@@ -2,17 +2,18 @@
 
 import requests
 
-# TODO: send a GET using the URL http://127.0.0.1:8000
-r = None # Your code here
+# Send a GET using the URL http://127.0.0.1:8000
+url="http://127.0.0.1:8000/predict" # Your code here
 
+r = requests.get("http://127.0.0.1:8000")
 # TODO: print the status code
-# print()
+print("Status Code:", r.status_code)
 # TODO: print the welcome message
-# print()
+print("Results:", r.json())
 
 
 
-data = {
+input_data = {
     "age": 37,
     "workclass": "Private",
     "fnlgt": 178356,
@@ -26,13 +27,19 @@
     "capital-gain": 0,
     "capital-loss": 0,
     "hours-per-week": 40,
-    "native-country": "United-States",
+    "native-country": "United-States"
 }
 
-# TODO: send a POST using the data above
-r = None # Your code here
 
-# TODO: print the status code
-# print()
-# TODO: print the result
-# print()
+# Send a POST using the data above
+response = requests.post(url, json=input_data) # Your code here
+
+# Print the status code
+print("Status Code: {response.status_code}")
+
+print("Response text:", response.text)
+# Print the result
+if response.status_code == 200:
+    print("Result:", response.json())
+else:
+    print(f"Failed with status code {response.status_code}. Response text: {response.text}")
@@ -1,8 +1,9 @@
 import os
-
+import pickle
 import pandas as pd
-from fastapi import FastAPI
+from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel, Field
+import joblib
 
 from ml.data import apply_label, process_data
 from ml.model import inference, load_model
@@ -25,26 +26,33 @@ class Data(BaseModel):
     capital_loss: int = Field(..., example=0, alias="capital-loss")
     hours_per_week: int = Field(..., example=40, alias="hours-per-week")
     native_country: str = Field(..., example="United-States", alias="native-country")
+try:
+    encoder_path = os.path.join(os.getcwd(), "model", "encoder.pkl") # TODO: enter the path for the saved encoder 
+    encoder = load_model(encoder_path)
+    print("Encoder loaded successfully.")
+
+    model_path = os.path.join(os.getcwd(), "model","model.pkl") # TODO: enter the path for the saved model 
+    model = load_model(model_path)
+    print("Model loaded successfully.")
 
-path = None # TODO: enter the path for the saved encoder 
-encoder = load_model(path)
 
-path = None # TODO: enter the path for the saved model 
-model = load_model(path)
+except Exception as e:
+    print(f"Error loading model, encoder, or label binarizer: {e}")
+    raise HTTPException(status_code=500,  detail=f"Error loading model, encoder, or label binarizer: {str(e)}")
 
-# TODO: create a RESTful API using FastAPI
-app = None # your code here
+# Create a RESTful API using FastAPI
+app = FastAPI() # your code here
 
-# TODO: create a GET on the root giving a welcome message
+# Create a GET on the root giving a welcome message
 @app.get("/")
 async def get_root():
     """ Say hello!"""
     # your code here
-    pass
+    return {"message": "Hello from the API!"}
 
 
-# TODO: create a POST on a different path that does model inference
-@app.post("/data/")
+# Create a POST on a different path that does model inference
+@app.post("/predict")
 async def post_inference(data: Data):
     # DO NOT MODIFY: turn the Pydantic model into a dict.
     data_dict = data.dict()
@@ -65,10 +73,14 @@ async def post_inference(data: Data):
         "native-country",
     ]
     data_processed, _, _, _ = process_data(
-        # your code here
+        data,
+        categorical_features=cat_features,
+        training=False,# your code here
+        encoder=encoder
         # use data as data input
         # use training = False
         # do not need to pass lb as input
     )
-    _inference = None # your code here to predict the result using data_processed
-    return {"result": apply_label(_inference)}
+    _inference = model.predict(data_processed)
+    result = apply_label(_inference)
+    return {"result": result}
@@ -53,12 +53,15 @@ def process_data(
     X_categorical = X[categorical_features].values
     X_continuous = X.drop(*[categorical_features], axis=1)
 
-    if training is True:
+    if training:
         encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
         lb = LabelBinarizer()
         X_categorical = encoder.fit_transform(X_categorical)
         y = lb.fit_transform(y.values).ravel()
     else:
+        if encoder is None:
+            raise ValueError("Encoder and LabelBinarizer must be provided during inference")
+
         X_categorical = encoder.transform(X_categorical)
         try:
             y = lb.transform(y.values).ravel()

@@ -2,9 +2,12 @@
 from sklearn.metrics import fbeta_score, precision_score, recall_score
 from ml.data import process_data
 # TODO: add necessary import
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import GridSearchCV, StratifiedKFold
+import joblib
 
 # Optional: implement hyperparameter tuning.
-def train_model(X_train, y_train):
+def train_model(X_train, y_train, cv=None):
     """
     Trains a machine learning model and returns it.
 
@@ -19,8 +22,19 @@ def train_model(X_train, y_train):
     model
         Trained machine learning model.
     """
-    # TODO: implement the function
-    pass
+    # Train and return a model
+    if cv is None:
+        cv = StratifiedKFold(n_splits=5)
+
+    param_grid = {
+        'n_estimators': [100, 200],
+        'max_depth': [None, 10, 20],
+    }
+    clf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=cv)
+    clf.fit(X_train, y_train)
+    return clf.best_estimator_
+
+
 
 
 def compute_model_metrics(y, preds):
@@ -50,7 +64,7 @@ def inference(model, X):
 
     Inputs
     ------
-    model : ???
+    model : sklearn.base.BaseEstimator
         Trained machine learning model.
     X : np.array
         Data used for prediction.
@@ -59,8 +73,8 @@ def inference(model, X):
     preds : np.array
         Predictions from the model.
     """
-    # TODO: implement the function
-    pass
+    # Run model inferences and return the predictions
+    return model.predict(X)
 
 def save_model(model, path):
     """ Serializes model to a file.
@@ -72,13 +86,13 @@ def save_model(model, path):
     path : str
         Path to save pickle file.
     """
-    # TODO: implement the function
-    pass
+    # Save a model
+    joblib.dump(model, path)
 
 def load_model(path):
     """ Loads pickle file from `path` and returns it."""
-    # TODO: implement the function
-    pass
+    # Load a model
+    return joblib.load(path)
 
 
 def performance_on_categorical_slice(
@@ -107,7 +121,7 @@ def performance_on_categorical_slice(
         Trained sklearn OneHotEncoder, only used if training=False.
     lb : sklearn.preprocessing._label.LabelBinarizer
         Trained sklearn LabelBinarizer, only used if training=False.
-    model : ???
+    model : RandomForestClassifier
         Model used for the task.
 
     Returns
@@ -117,12 +131,33 @@ def performance_on_categorical_slice(
     fbeta : float
 
     """
-    # TODO: implement the function
+    # Computes the metrics on a slice of the data
+    data_slice = data[data[column_name]==slice_value]
+
     X_slice, y_slice, _, _ = process_data(
         # your code here
         # for input data, use data in column given as "column_name", with the slice_value 
         # use training = False
+        data_slice,
+        categorical_features=categorical_features,
+        label=label,
+        encoder=encoder,
+        lb=lb,
+        training=False
     )
-    preds = None # your code here to get prediction on X_slice using the inference function
+    preds = inference(model, X_slice) # your code here to get prediction on X_slice using the inference function
     precision, recall, fbeta = compute_model_metrics(y_slice, preds)
+
+    #Prepare the log message for this slice
+    log_message = (
+        f"Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {fbeta:.4f}\n"
+        f"{column_name}: {slice_value}, Count: {len(data_slice)}\n"
+    )
+    # Append the results to slice_output.txt
+    with open('slice_output.txt', 'a') as f:
+        f.write(log_message)
+
+    #Print the result for terminal log
+    print(log_message)
+
     return precision, recall, fbeta
@@ -3,16 +3,53 @@
 For additional information see the Model Card paper: https://arxiv.org/pdf/1810.03993.pdf
 
 ## Model Details
-
+Model Name: Random Forest Classifier
+Version: v1.0
+Type: Supervised Classification (Binary Classification)
+Model Architecture: Random Forest with hyperparameter tuning via GridSearchCV (n_estimators: 100, 200; max_depth: None, 10, 20)
+Training Time: Approximately 2 hours (estimated)
+Last Trained: 2025-05-04
 ## Intended Use
+The model is designed to predict whether an individual's income exceeds $50k per year, based on demographic and employment-related features from U.S. Census data. It is intended for educational purposes, especially for demonstrating machine learning workflows and deployment pipelines.
 
 ## Training Data
+Source: UC Irvine Adult Census Income Dataset (loaded from census.csv)
+Features:
+    -Categorical: workclass, education, marital-status, occupation, relationship, race, sex, native-country
+    -Numerical: age, fnlgt, education-num, capital-gain, capital-loss, hours-per-week
+Target Label: salary (<=50k, >50k)
+Preprocessing:
+    -OneHotEncoding for categorical variables
+    -LabelBinarizer for the target variable
+    -Train-test split: 80/20
 
 ## Evaluation Data
+The test dataset is a 20% hold-out sample from the original dataset.
+It is used to evaluate generalization performance and to perform slice-based fairness analysis.
 
 ## Metrics
 _Please include the metrics used and your model's performance on those metrics._
+Precision: 0.7866
+Recall: 0.6149
+F1 Score: 0.6902
+
+These results suggest that the model strikes a moderate balance between identifying positive cases (recall) and minimizing false positives (precision), with precision noticeably higher than recall.
+
+See slice_output.txt for the precision, recall, and F1 score computed on each categorical feature slice.
 
 ## Ethical Considerations
+Bias & Fairness: The model may reflect historical biases present in the U.S. Census dataset. For instance, features such as race and gender could lead to disparate performance on different groups.
+
+Interpretability: Random Forests are not easily interpretable by default, which may limit their use in regulated environments.
+
+Use Limitations: This model should not be used for automated income predictions affecting real-world decisions without additional audits and mitigation techniques.
+
+Data Privacy: The dataset does not contain personally identifiable information (PII), but proper care should be taken if extended to real-world data.
 
 ## Caveats and Recommendations
+Performance may vary significantly across different population subgroups. Always check model performance using slice-based analysis.
+Further model robustness checks are recommended before deployment.
+For production use, consider model interpretability enhancements such as SHAP or LIME for local explanations. 
+LIME https://lime-ml.readthedocs.io/en/latest/
+SHAP https://shap.readthedocs.io/en/latest/ 
+