Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 19 additions & 12 deletions local_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,18 @@

import requests

# TODO: send a GET using the URL http://127.0.0.1:8000
r = None # Your code here
# Send a GET using the URL http://127.0.0.1:8000
url="http://127.0.0.1:8000/predict" # Your code here

r = requests.get("http://127.0.0.1:8000")
# TODO: print the status code
# print()
print("Status Code:", r.status_code)
# TODO: print the welcome message
# print()
print("Results:", r.json())



data = {
input_data = {
"age": 37,
"workclass": "Private",
"fnlgt": 178356,
Expand All @@ -26,13 +27,19 @@
"capital-gain": 0,
"capital-loss": 0,
"hours-per-week": 40,
"native-country": "United-States",
"native-country": "United-States"
}

# TODO: send a POST using the data above
r = None # Your code here

# TODO: print the status code
# print()
# TODO: print the result
# print()
# Send the sample record above as a JSON body to the inference endpoint.
response = requests.post(url, json=input_data)

# Print the status code. The f-prefix is required; without it the literal
# text "{response.status_code}" is printed instead of the actual code.
print(f"Status Code: {response.status_code}")

# The raw body is printed unconditionally so that error payloads stay
# visible even when the response cannot be decoded as JSON.
print("Response text:", response.text)
# Print the decoded result only on success; otherwise report the failure.
if response.status_code == 200:
    print("Result:", response.json())
else:
    print(f"Failed with status code {response.status_code}. Response text: {response.text}")
42 changes: 27 additions & 15 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import os

import pickle
import pandas as pd
from fastapi import FastAPI
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
import joblib

from ml.data import apply_label, process_data
from ml.model import inference, load_model
Expand All @@ -25,26 +26,33 @@ class Data(BaseModel):
capital_loss: int = Field(..., example=0, alias="capital-loss")
hours_per_week: int = Field(..., example=40, alias="hours-per-week")
native_country: str = Field(..., example="United-States", alias="native-country")
try:
encoder_path = os.path.join(os.getcwd(), "model", "encoder.pkl") # TODO: enter the path for the saved encoder
encoder = load_model(encoder_path)
print("Encoder loaded successfully.")

model_path = os.path.join(os.getcwd(), "model","model.pkl") # TODO: enter the path for the saved model
model = load_model(model_path)
print("Model loaded successfully.")

path = None # TODO: enter the path for the saved encoder
encoder = load_model(path)

path = None # TODO: enter the path for the saved model
model = load_model(path)
except Exception as e:
print(f"Error loading model, encoder, or label binarizer: {e}")
raise HTTPException(status_code=500, detail=f"Error loading model, encoder, or label binarizer: {str(e)}")

# TODO: create a RESTful API using FastAPI
app = None # your code here
# Create a RESTful API using FastAPI
app = FastAPI() # your code here

# TODO: create a GET on the root giving a welcome message
# Create a GET on the root giving a welcome message
@app.get("/")
async def get_root():
""" Say hello!"""
# your code here
pass
return {"message": "Hello from the API!"}


# TODO: create a POST on a different path that does model inference
@app.post("/data/")
# Create a POST on a different path that does model inference
@app.post("/predict")
async def post_inference(data: Data):
# DO NOT MODIFY: turn the Pydantic model into a dict.
data_dict = data.dict()
Expand All @@ -65,10 +73,14 @@ async def post_inference(data: Data):
"native-country",
]
data_processed, _, _, _ = process_data(
# your code here
data,
categorical_features=cat_features,
training=False,# your code here
encoder=encoder
# use data as data input
# use training = False
# do not need to pass lb as input
)
_inference = None # your code here to predict the result using data_processed
return {"result": apply_label(_inference)}
_inference = model.predict(data_processed)
result = apply_label(_inference)
return {"result": result}
5 changes: 4 additions & 1 deletion ml/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,15 @@ def process_data(
X_categorical = X[categorical_features].values
X_continuous = X.drop(*[categorical_features], axis=1)

if training is True:
if training:
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
lb = LabelBinarizer()
X_categorical = encoder.fit_transform(X_categorical)
y = lb.fit_transform(y.values).ravel()
else:
if encoder is None:
raise ValueError("Encoder and LabelBinarizer must be provided during inference")

X_categorical = encoder.transform(X_categorical)
try:
y = lb.transform(y.values).ravel()
Expand Down
61 changes: 48 additions & 13 deletions ml/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@
from sklearn.metrics import fbeta_score, precision_score, recall_score
from ml.data import process_data
# TODO: add necessary import
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import joblib

# Optional: implement hyperparameter tuning.
def train_model(X_train, y_train):
def train_model(X_train, y_train, cv=None):
"""
Trains a machine learning model and returns it.

Expand All @@ -19,8 +22,19 @@ def train_model(X_train, y_train):
model
Trained machine learning model.
"""
# TODO: implement the function
pass
# Train and return a model
if cv is None:
cv = StratifiedKFold(n_splits=5)

param_grid = {
'n_estimators': [100, 200],
'max_depth': [None, 10, 20],
}
clf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=cv)
clf.fit(X_train, y_train)
return clf.best_estimator_




def compute_model_metrics(y, preds):
Expand Down Expand Up @@ -50,7 +64,7 @@ def inference(model, X):

Inputs
------
model : ???
model : sklearn.base.BaseEstimator
Trained machine learning model.
X : np.array
Data used for prediction.
Expand All @@ -59,8 +73,8 @@ def inference(model, X):
preds : np.array
Predictions from the model.
"""
# TODO: implement the function
pass
# Run model inferences and return the predictions
return model.predict(X)

def save_model(model, path):
""" Serializes model to a file.
Expand All @@ -72,13 +86,13 @@ def save_model(model, path):
path : str
Path to save pickle file.
"""
# TODO: implement the function
pass
# Save a model
joblib.dump(model, path)

def load_model(path):
""" Loads pickle file from `path` and returns it."""
# TODO: implement the function
pass
# Load a model
return joblib.load(path)


def performance_on_categorical_slice(
Expand Down Expand Up @@ -107,7 +121,7 @@ def performance_on_categorical_slice(
Trained sklearn OneHotEncoder, only used if training=False.
lb : sklearn.preprocessing._label.LabelBinarizer
Trained sklearn LabelBinarizer, only used if training=False.
model : ???
model : RandomForestClassifier
Model used for the task.

Returns
Expand All @@ -117,12 +131,33 @@ def performance_on_categorical_slice(
fbeta : float

"""
# TODO: implement the function
# Computes the metrics on a slice of the data
data_slice = data[data[column_name]==slice_value]

X_slice, y_slice, _, _ = process_data(
# your code here
# for input data, use data in column given as "column_name", with the slice_value
# use training = False
data_slice,
categorical_features=categorical_features,
label=label,
encoder=encoder,
lb=lb,
training=False
)
preds = None # your code here to get prediction on X_slice using the inference function
preds = inference(model, X_slice) # your code here to get prediction on X_slice using the inference function
precision, recall, fbeta = compute_model_metrics(y_slice, preds)

#Prepare the log message for this slice
log_message = (
f"Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {fbeta:.4f}\n"
f"{column_name}: {slice_value}, Count: {len(data_slice)}\n"
)
# Append the results to slice_output.txt
with open('slice_output.txt', 'a') as f:
f.write(log_message)

#Print the result for terminal log
print(log_message)

return precision, recall, fbeta
Binary file added model/encoder.pkl
Binary file not shown.
Binary file added model/model.pkl
Binary file not shown.
39 changes: 38 additions & 1 deletion model_card_template.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,53 @@
For additional information see the Model Card paper: https://arxiv.org/pdf/1810.03993.pdf

## Model Details

Model Name: Random Forest Classifier
Version: v1.0
Type: Supervised Classification (Binary Classification)
Model Architecture: Random Forest with hyperparameters selected by GridSearchCV using 5-fold stratified cross-validation (grid: n_estimators in {100, 200}; max_depth in {None, 10, 20})
Training Time: Approximately 2 hours (estimated)
Last Trained: 2025-05-04
## Intended Use
The model is designed to predict whether an individual's income exceeds $50k per year, based on demographic and employment-related features from U.S. Census data. Intended for educational purposes, especially for demonstrating machine learning workflows and deployment pipelines.

## Training Data
Source: UC Irvine Adult Census Income Dataset (loaded from census.csv)
Features:
- Categorical: workclass, education, marital-status, occupation, relationship, race, sex, native-country
- Numerical: age, fnlgt, education-num, capital-gain, capital-loss, hours-per-week
Target Label: salary (<=50k, >50k)
Preprocessing:
- One-hot encoding (OneHotEncoder) for categorical variables
- LabelBinarizer for the target variable
- Train-test split: 80/20

## Evaluation Data
The test dataset is a 20% hold-out sample from the original dataset
Used to evaluate generalization performance and perform slice-based fairness analysis

## Metrics
_Please include the metrics used and your model's performance on those metrics._
Precision: 0.7866
Recall: 0.6149
F1 Score: 0.6902

These results show that the model favors precision over recall: about 79% of its >50K predictions are correct, but it identifies only about 61% of the individuals who actually earn more than $50K, missing roughly 39% of them.

See slice_output.txt

## Ethical Considerations
Bias & Fairness: The model may reflect historical biases present in the U.S. Census dataset. For instance, features such as race and gender could lead to disparate performance on different groups.

Interpretability: Random Forests are not easily interpretable by default, which may limit their use in regulated environments.

Use Limitations: This model should not be used for automated income predictions affecting real-world decisions without additional audits and mitigation techniques.

Data Privacy: The dataset does not contain personally identifiable information (PII), but proper care should be taken if extended to real-world data.

## Caveats and Recommendations
Performance may vary significantly across different population subgroups. Always check model performance using slice-based analysis.
Further model robustness checks are recommended before deployment.
For production use, consider model interpretability enhancements such as SHAP or LIME for local explanations.
LIME https://lime-ml.readthedocs.io/en/latest/
SHAP https://shap.readthedocs.io/en/latest/

Binary file added screenshots/local_api.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added screenshots/unit_test.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading