13 changes: 13 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"name": "HYF Week 2 Assignment",
"image": "mcr.microsoft.com/devcontainers/python:3.11",
"customizations": {
"vscode": {
"extensions": [
"ms-python.python",
"ms-python.vscode-pylance"
]
}
},
"postCreateCommand": "python3 -m pip install -r task-1/requirements.txt && echo '✅ HYF Week 2 Assignment Codespace ready. Run the auto-grader locally with: bash .hyf/test.sh && cat .hyf/score.json'"
}
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -156,3 +156,9 @@ dist
vite.config.js.timestamp-*
vite.config.ts.timestamp-*


# Python virtual environments
venv/
.venv/
task-*/venv/
task-*/.venv/
212 changes: 205 additions & 7 deletions .hyf/test.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,13 +1,211 @@
#!/usr/bin/env bash
# Auto-grade Week 2 assignment. Writes score.json next to this script.
# Total = 100, passing = 60.
#
# The auto-grade workflow runs this from the .hyf working directory; we
# resolve the repo root so the script is robust to either invocation
# (cd .hyf && bash test.sh, or bash .hyf/test.sh from the repo root).
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
cd "$REPO_ROOT"

PASSING=60

# --- Task 1: Cleaner Pipeline (60 points) ---
#
# Scoring ladder (each level depends on the previous):
#    0  nothing committed
#   10  required files all present (config.py, models.py, transforms.py,
#       pipeline.py, tests/test_transforms.py, .env.example)
#   20  pipeline runs against messy_sales.csv without crashing (the
#       grader injects INPUT_PATH/OUTPUT_PATH inline; no .env touched)
#   40  output/clean_sales.csv passes structural checks (12 rows, cleaned
#       fields, revenue/vat correctly calculated)
#   60  the *code* also looks engineered: models.py defines a @dataclass
#       with __post_init__; transforms.py uses the {**row, ...} spread
#       pattern; pytest tests/ reports all tests passing.
#
# Why the introspection cap at 40: a script that hardcodes the expected
# JSON literal could pass the structural checks without doing any real
# transformation. The 60-point tier requires the chapter's actual patterns
# (dataclass, spread, tests) to be present in the source.
task1=0
task1_msg="missing required files in task-1/"

required_files=(
  "task-1/src/config.py"
  "task-1/src/models.py"
  "task-1/src/transforms.py"
  "task-1/src/pipeline.py"
  "task-1/tests/test_transforms.py"
  "task-1/.env.example"
)

all_present=true
for f in "${required_files[@]}"; do
  if [ ! -f "$f" ]; then
    all_present=false
    break
  fi
done

if [ "$all_present" = true ]; then
  task1=10
  task1_msg="files exist but pipeline failed to run"

  # Make sure the python-dotenv + pytest deps are available; if a
  # requirements.txt exists, install it quietly.
  if [ -f task-1/requirements.txt ]; then
    python3 -m pip install -q -r task-1/requirements.txt || \
      echo "WARN: pip install failed; pipeline may fail with ModuleNotFoundError" >&2
  fi

  # Force the canonical paths inline so the grader is deterministic
  # regardless of the student's local .env (which may point INPUT_PATH /
  # OUTPUT_PATH at /tmp or some other location during their own debugging).
  # The student's .env is NOT read or modified by the grader.
  PIPELINE_ERR=$(mktemp)
  if ( cd task-1 && env INPUT_PATH=data/messy_sales.csv OUTPUT_PATH=output/clean_sales.csv python3 -m src.pipeline ) >/dev/null 2>"$PIPELINE_ERR"; then
    task1=20
    task1_msg="pipeline ran but output/clean_sales.csv failed structural checks"
    STRUCT_ERR=$(mktemp)
    if python3 - <<'PY' 2>"$STRUCT_ERR"
import csv
from pathlib import Path

p = Path("task-1/output/clean_sales.csv")
assert p.exists(), "output/clean_sales.csv was not created"

with p.open() as f:
    rows = list(csv.DictReader(f))

# 15 input rows - 3 invalid (empty name #6, negative price #7, zero qty #8) = 12
assert len(rows) == 12, f"expected 12 cleaned rows, got {len(rows)}"

# Required columns
required = {"transaction_id", "product_name", "category", "price",
            "quantity", "customer_email", "date", "revenue", "vat"}
missing = required - set(rows[0].keys())
assert not missing, f"output missing columns: {missing}"

# Field-level checks
for row in rows:
    name = row["product_name"]
    assert name == name.strip() and name == name.title(), \
        f"product_name not cleaned: {name!r}"
    email = row["customer_email"]
    assert email == email.strip().lower(), \
        f"customer_email not cleaned: {email!r}"
    cat = row["category"]
    assert cat, f"category empty (should default to 'Unknown') in row {row['transaction_id']}"

# Spot-check the math: row id=1 was 999.99 * 2 = 1999.98 revenue, then
# * 0.21 = 419.9958 vat (rounded to 420.00 at 2 decimals; the 0.01
# tolerance below absorbs either rounding precision the student picks).
row_1 = next(r for r in rows if r["transaction_id"] == "1")
revenue_1 = float(row_1["revenue"])
vat_1 = float(row_1["vat"])
assert abs(revenue_1 - 1999.98) < 0.01, f"row 1 revenue wrong: {revenue_1}"
assert abs(vat_1 - 419.9958) < 0.01, f"row 1 vat wrong: {vat_1}"

# At least one row should have category="Unknown" (row 15 had empty category)
assert any(r["category"].lower() == "unknown" for r in rows), \
    "no row has category='Unknown' (row 15's empty category should default)"
PY
    then
      rm -f "$STRUCT_ERR"
      task1=40
      task1_msg="output passes structural checks but code is missing required engineering patterns (see below)"

      # Introspection caps. The full 60 requires:
      #   - models.py imports `dataclass` AND defines a __post_init__ method
      #   - transforms.py uses the {**row, ...} spread pattern (no mutation)
      #   - pytest tests/test_transforms.py passes (all student tests green)
      models_has_dataclass=$(grep -cE "^[[:space:]]*from dataclasses\b|^[[:space:]]*import dataclasses\b" task-1/src/models.py || true)
      models_has_post_init=$(grep -cE "^[[:space:]]*def __post_init__" task-1/src/models.py || true)
      transforms_has_spread=$(grep -cE '\{\*\*' task-1/src/transforms.py || true)

      tests_pass=false
      if ( cd task-1 && python3 -m pytest tests/ -q ) >/dev/null 2>&1; then
        tests_pass=true
      fi

      if [ "$models_has_dataclass" -gt 0 ] && \
         [ "$models_has_post_init" -gt 0 ] && \
         [ "$transforms_has_spread" -gt 0 ] && \
         [ "$tests_pass" = true ]; then
        task1=60
        task1_msg="output and code structure both pass; tests green"
      else
        missing=()
        [ "$models_has_dataclass" -eq 0 ] && missing+=("from dataclasses import ... in models.py")
        [ "$models_has_post_init" -eq 0 ] && missing+=("__post_init__ in models.py")
        [ "$transforms_has_spread" -eq 0 ] && missing+=("{**row, ...} spread pattern in transforms.py")
        [ "$tests_pass" = false ] && missing+=("pytest tests/ all green")
        task1_msg="output passes but code missing: $(IFS=, ; echo "${missing[*]}")"
      fi
    else
      # Structural checks failed: surface the assertion message.
      err=$(tail -3 "$STRUCT_ERR" | tr '\n' ' ' | sed 's/  */ /g' | sed 's/^ //;s/ $//')
      [ -n "$err" ] && task1_msg="structural check failed: $err"
      rm -f "$STRUCT_ERR"
    fi
  else
    # Pipeline crashed: surface the last few stderr lines.
    err=$(tail -3 "$PIPELINE_ERR" | tr '\n' ' ' | sed 's/  */ /g' | sed 's/^ //;s/ $//')
    [ -n "$err" ] && task1_msg="pipeline failed to run: $err"
  fi
  rm -f "$PIPELINE_ERR"
fi

# --- Task 2: AI Debug Report (20 points) ---
task2=0
task2_msg="missing task-2/AI_DEBUG.md"
if [ -s task-2/AI_DEBUG.md ]; then
  task2=5
  task2_msg="AI_DEBUG.md exists but missing required sections"
  if grep -q "^## The Error" task-2/AI_DEBUG.md && \
     grep -q "^## The Prompt" task-2/AI_DEBUG.md && \
     grep -q "^## The Solution" task-2/AI_DEBUG.md && \
     grep -q "^## Reflection" task-2/AI_DEBUG.md; then
    task2=10
    task2_msg="all sections present but file looks too short to be filled in"
    # Empty template ships at ~1500 chars. Filled-in report should have
    # meaningfully more content; threshold = template + ~600 chars
    # of student writing (4 sections * ~150 chars each).
    if [ "$(wc -c < task-2/AI_DEBUG.md)" -gt 2100 ]; then
      task2=20
      task2_msg="AI_DEBUG.md is filled in"
    fi
  fi
fi

# --- Task 3: HYF Azure Proof (20 points) ---
task3=0
task3_msg="missing task-3/azure_proof.png|jpg|jpeg"
for ext in png jpg jpeg; do
  if [ -s "task-3/azure_proof.$ext" ]; then
    task3=20
    task3_msg="azure_proof.$ext present"
    break
  fi
done

score=$((task1 + task2 + task3))
if [ "$score" -ge "$PASSING" ]; then pass=true; else pass=false; fi

cat > "$SCRIPT_DIR/score.json" <<EOF
{
  "score": $score,
  "pass": $pass,
  "passingScore": $PASSING
}
EOF

echo "Task 1 (Cleaner Pipeline): $task1/60 — $task1_msg"
echo "Task 2 (AI Debug Report): $task2/20 — $task2_msg"
echo "Task 3 (Azure Proof): $task3/20 — $task3_msg"
echo "----------------------------------------"
echo "Total: $score/100 — pass=$pass (passing threshold: $PASSING)"
88 changes: 78 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,17 +1,85 @@
# Data Track — Week 2 Assignment (Template)

The HackYourFuture Data Track Week 2 assignment: **Refactoring to a Clean Pipeline**.

> 👩‍🎓 **Students:** you are in the wrong place. Do **not** fork or use this template.
> Go to your cohort's assignment repo under
> [`HackYourAssignment`](https://github.com/HackYourAssignment) (e.g. `c55-data-week2`,
> `c56-data-week2`, …). Your teacher posts the exact link in your cohort channel.
> Fork the cohort repo, branch, and open a PR back to it. Full instructions live in the
> [Week 2 Assignment on Notion](https://www.notion.so/hackyourfuture/Week-2-Assignment-Refactoring-to-a-Clean-Pipeline-f8c27aa88d144cb18f54c49d02f50b73).

## For instructors / track maintainers

This repo is the **upstream template** for the Week 2 assignment. At the start of each
cohort, generate a cohort-specific repo under the `HackYourAssignment` org from this
template (GitHub: **Use this template → Create a new repository**, owner =
`HackYourAssignment`, name = `c<NN>-data-week2`). Students then fork *that* cohort repo
and open PRs back to it; the auto-grader runs on every push.

Edits to the assignment, dataset, or grader belong here on the template, not on the
cohort copies.

## Tasks at a glance

| Task | Folder | Points | What you build |
|---|---|---|---|
| **Task 1** — Cleaner Pipeline | `task-1/` | 60 | A modular Python pipeline with `config.py` (env-var loading), `models.py` (`Transaction` dataclass with `__post_init__` validation), `transforms.py` (4+ pure composable functions, no mutation), `pipeline.py` (orchestrator), and `tests/test_transforms.py` (4+ pytest tests). Reads `data/messy_sales.csv`, writes `output/clean_sales.csv`. |
| **Task 2** — AI Debug Report | `task-2/` | 20 | Document one debugging session where you used an LLM to fix a bug. Fill in the four sections of `AI_DEBUG.md`. |
| **Task 3** — HYF Azure proof | `task-3/` | 20 | Confirm your HYF Azure tenant access still works. Screenshot proof at `task-3/azure_proof.png` (or `.jpg` / `.jpeg`) showing resource group + region + €0 cost. |

Total: 100 · Passing: 60.

## Repository layout

```text
.
├── task-1/
│ ├── data/
│ │ └── messy_sales.csv # the dataset (committed; do not edit)
│ ├── src/
│ │ ├── config.py # env-var loader — fill in TODOs
│ │ ├── models.py # Transaction dataclass — fill in TODOs
│ │ ├── transforms.py # 4 pure transform functions — fill in TODOs
│ │ └── pipeline.py # orchestrator — fill in TODOs
│ ├── tests/
│ │ └── test_transforms.py # 4 pytest tests — fill in TODOs
│ ├── output/ # your pipeline writes clean_sales.csv here (gitignored)
│ ├── .env.example # copy to .env (gitignored) before running
│ └── requirements.txt # python3 -m pip install -r requirements.txt
├── task-2/
│ └── AI_DEBUG.md # fill in the four sections
├── task-3/
│ └── azure_proof.png # add your screenshot here
├── .hyf/
│ └── test.sh # auto-grader (read it to see exactly what it checks)
└── .github/workflows/
└── grade-assignment.yml # runs .hyf/test.sh on every PR
```

## Run the grader locally

Before opening a PR, run the same checks the auto-grader runs:

```bash
cd task-1
python3 -m pip install -r requirements.txt
cp .env.example .env
cd ..
bash .hyf/test.sh
cat .hyf/score.json
```

The grader prints a per-task breakdown so you can see exactly which check failed and
why. The PR-time grader does the same — your local run and the CI run are identical.

## Scoring ladder (Task 1)

The grader awards points incrementally so partial credit is meaningful:

- **10/60** — required files exist (`config.py`, `models.py`, `transforms.py`, `pipeline.py`, `tests/test_transforms.py`, `.env.example`).
- **20/60** — `python -m src.pipeline` runs from `task-1/` without crashing (the grader injects `INPUT_PATH` and `OUTPUT_PATH` inline; your local `.env` is not used during grading).
- **40/60** — `output/clean_sales.csv` passes structural checks: 12 rows (15 input − 3 invalid/zero-quantity), lowercased emails, title-cased product names, "Unknown" filled in for missing categories, `revenue` and `vat` columns present and correctly calculated.
- **60/60** — code looks engineered: `models.py` defines a `@dataclass` with `__post_init__`; `transforms.py` uses the `{**row, ...}` spread pattern (no mutation); `pytest tests/` reports all tests passing.

The 40-point cap exists to stop a 5-line script that hardcodes the expected JSON from getting full marks. Real engineering patterns (dataclass + spread + tests) are required for the top 20 points.
2 changes: 2 additions & 0 deletions task-1/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
INPUT_PATH=data/messy_sales.csv
OUTPUT_PATH=output/clean_sales.csv
16 changes: 16 additions & 0 deletions task-1/data/messy_sales.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
transaction_id,product_name,category,price,quantity,customer_email,date
1, laptop PRO ,Electronics,999.99,2,alice@example.com,2024-03-15
2,WIRELESS MOUSE,Electronics,29.99,5, BOB@Company.COM ,2024-03-15
3, usb cable,Electronics,4.99,10,,2024-03-16
4, Office Chair ,Furniture,349.50,1,charlie@work.org,2024-03-16
5,standing DESK,Furniture,599.00,1,charlie@work.org,not_a_date
6,,Electronics,19.99,3,dave@email.com,2024-03-17
7, Mechanical Keyboard ,Electronics,-89.99,2,eve@startup.io,2024-03-17
8,monitor ARM,Furniture,79.99,0,frank@corp.com,2024-03-18
9, WEBCAM hd ,Electronics,54.99,1, ,2024-03-18
10,desk lamp,Furniture,34.99,4,grace@university.edu,2024-03-19
11, NOISE CANCELLING headphones,Electronics,199.99,1,alice@example.com,2024-03-19
12,cable management KIT,Furniture,15.99,6,henry@business.com,2024-03-20
13, ergonomic MOUSE PAD ,Furniture,24.99,3,ivan@email.com,2024-03-20
14,laptop STAND,Furniture,45.99,2,jenny@work.org,2024-03-21
15, BLUETOOTH speaker,,39.99,1,karl@startup.io,2024-03-21
File renamed without changes.
2 changes: 2 additions & 0 deletions task-1/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
python-dotenv>=1.0.0
pytest>=7.0
File renamed without changes.
35 changes: 35 additions & 0 deletions task-1/src/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""Configuration loader.

Read INPUT_PATH and OUTPUT_PATH from a .env file (see .env.example for the
expected variable names) and expose them as named imports.

Tasks (see chapter Task 1):
1. Use python-dotenv's load_dotenv() to read the .env file.
2. Read INPUT_PATH and OUTPUT_PATH from os.environ.
3. Raise ValueError if either is missing — do NOT let None silently propagate.
"""
import os

from dotenv import load_dotenv


# Load .env values into os.environ before they're read by _required().
# (Step 1 from the docstring above; already wired up so the rest of the
# module can rely on os.environ being populated.)
load_dotenv()


def _required(name: str) -> str:
"""Read an env var; fail loudly if missing."""
# TODO 2: Read os.environ[name]; if not set, raise ValueError with a
# message that names the missing variable AND points at .env.example.
raise NotImplementedError("Implement _required: see TODO 2 in config.py")


# TODO 3: Replace the placeholder lines below by calling _required(...) for
# each variable. INPUT_PATH and OUTPUT_PATH must be importable from this
# module by the rest of the pipeline as a relative import
# (`from .config import INPUT_PATH, ...`), since the pipeline runs as
# `python -m src.pipeline`.
INPUT_PATH: str = "" # TODO: _required("INPUT_PATH")
OUTPUT_PATH: str = "" # TODO: _required("OUTPUT_PATH")
Loading