13 changes: 13 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"name": "HYF Week 2 Assignment",
"image": "mcr.microsoft.com/devcontainers/python:3.11",
"customizations": {
"vscode": {
"extensions": [
"ms-python.python",
"ms-python.vscode-pylance"
]
}
},
"postCreateCommand": "python3 -m pip install -r task-1/requirements.txt && echo '✅ HYF Week 2 Assignment Codespace ready. Run the auto-grader locally with: bash .hyf/test.sh && cat .hyf/score.json'"
}
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -156,3 +156,9 @@ dist
vite.config.js.timestamp-*
vite.config.ts.timestamp-*


# Python virtual environments
venv/
.venv/
task-*/venv/
task-*/.venv/
212 changes: 205 additions & 7 deletions .hyf/test.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,13 +1,211 @@
#!/usr/bin/env bash
# Auto-grade Week 2 assignment. Writes score.json next to this script.
# Total = 100, passing = 60.
#
# The auto-grade workflow runs this from the .hyf working directory; we
# resolve the repo root so the script is robust to either invocation
# (cd .hyf && bash test.sh, or bash .hyf/test.sh from the repo root).
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
cd "$REPO_ROOT"

PASSING=60

# --- Task 1: Cleaner Pipeline (60 points) ---
#
# Scoring ladder (each level depends on the previous):
#    0  nothing committed
#   10  required files all present (config.py, models.py, transforms.py,
#       pipeline.py, tests/test_transforms.py, .env.example)
#   20  pipeline runs against messy_sales.csv without crashing (the
#       grader injects INPUT_PATH/OUTPUT_PATH inline; no .env touched)
#   40  output/clean_sales.csv passes structural checks (12 rows, cleaned
#       fields, revenue/vat correctly calculated)
#   60  the *code* also looks engineered: models.py defines a @dataclass
#       with __post_init__; transforms.py uses the {**row, ...} spread
#       pattern; pytest tests/ reports all tests passing.
#
# Why the introspection cap at 40: a script that hardcodes the expected
# JSON literal could pass the structural checks without doing any real
# transformation. The 60-point tier requires the chapter's actual patterns
# (dataclass, spread, tests) to be present in the source.
task1=0
task1_msg="missing required files in task-1/"

required_files=(
  "task-1/src/config.py"
  "task-1/src/models.py"
  "task-1/src/transforms.py"
  "task-1/src/pipeline.py"
  "task-1/tests/test_transforms.py"
  "task-1/.env.example"
)

all_present=true
for f in "${required_files[@]}"; do
  if [ ! -f "$f" ]; then
    all_present=false
    break
  fi
done

if [ "$all_present" = true ]; then
  task1=10
  task1_msg="files exist but pipeline failed to run"

  # Make sure the python-dotenv + pytest deps are available; if a
  # requirements.txt exists, install it quietly.
  if [ -f task-1/requirements.txt ]; then
    python3 -m pip install -q -r task-1/requirements.txt || \
      echo "WARN: pip install failed; pipeline may fail with ModuleNotFoundError" >&2
  fi

  # Force the canonical paths inline so the grader is deterministic
  # regardless of the student's local .env (which may point INPUT_PATH /
  # OUTPUT_PATH at /tmp or some other location during their own debugging).
  # The student's .env is NOT read or modified by the grader.
  PIPELINE_ERR=$(mktemp)
  if ( cd task-1 && env INPUT_PATH=data/messy_sales.csv OUTPUT_PATH=output/clean_sales.csv python3 -m src.pipeline ) >/dev/null 2>"$PIPELINE_ERR"; then
    task1=20
    task1_msg="pipeline ran but output/clean_sales.csv failed structural checks"
    STRUCT_ERR=$(mktemp)
    if python3 - <<'PY' 2>"$STRUCT_ERR"
import csv
from pathlib import Path

p = Path("task-1/output/clean_sales.csv")
assert p.exists(), "output/clean_sales.csv was not created"

with p.open() as f:
    rows = list(csv.DictReader(f))

# 15 input rows - 3 invalid (empty name #6, negative price #7, zero qty #8) = 12
assert len(rows) == 12, f"expected 12 cleaned rows, got {len(rows)}"

# Required columns
required = {"transaction_id", "product_name", "category", "price",
            "quantity", "customer_email", "date", "revenue", "vat"}
missing = required - set(rows[0].keys())
assert not missing, f"output missing columns: {missing}"

# Field-level checks
for row in rows:
    name = row["product_name"]
    assert name == name.strip() and name == name.title(), \
        f"product_name not cleaned: {name!r}"
    email = row["customer_email"]
    assert email == email.strip().lower(), \
        f"customer_email not cleaned: {email!r}"
    cat = row["category"]
    assert cat, f"category empty (should default to 'Unknown') in row {row['transaction_id']}"

# Spot-check the math: row id=1 was 999.99 * 2 = 1999.98 revenue, then
# * 0.21 = 419.9958 vat (rounded to 420.00 at 2 decimals; the 0.01
# tolerance below absorbs either rounding precision the student picks).
row_1 = next(r for r in rows if r["transaction_id"] == "1")
revenue_1 = float(row_1["revenue"])
vat_1 = float(row_1["vat"])
assert abs(revenue_1 - 1999.98) < 0.01, f"row 1 revenue wrong: {revenue_1}"
assert abs(vat_1 - 419.9958) < 0.01, f"row 1 vat wrong: {vat_1}"

# At least one row should have category="Unknown" (row 15 had empty category)
assert any(r["category"].lower() == "unknown" for r in rows), \
    "no row has category='Unknown' (row 15's empty category should default)"
PY
    then
      rm -f "$STRUCT_ERR"
      task1=40
      task1_msg="output passes structural checks but code is missing required engineering patterns (see below)"

      # Introspection caps. The full 60 requires:
      #   - models.py imports `dataclass` AND defines a __post_init__ method
      #   - transforms.py uses the {**row, ...} spread pattern (no mutation)
      #   - pytest tests/test_transforms.py passes (all student tests green)
      models_has_dataclass=$(grep -cE "^[[:space:]]*from dataclasses\b|^[[:space:]]*import dataclasses\b" task-1/src/models.py || true)
      models_has_post_init=$(grep -cE "^[[:space:]]*def __post_init__" task-1/src/models.py || true)
      transforms_has_spread=$(grep -cE '\{\*\*' task-1/src/transforms.py || true)

      tests_pass=false
      if ( cd task-1 && python3 -m pytest tests/ -q ) >/dev/null 2>&1; then
        tests_pass=true
      fi

      if [ "$models_has_dataclass" -gt 0 ] && \
         [ "$models_has_post_init" -gt 0 ] && \
         [ "$transforms_has_spread" -gt 0 ] && \
         [ "$tests_pass" = true ]; then
        task1=60
        task1_msg="output and code structure both pass; tests green"
      else
        missing=()
        [ "$models_has_dataclass" -eq 0 ] && missing+=("from dataclasses import ... in models.py")
        [ "$models_has_post_init" -eq 0 ] && missing+=("__post_init__ in models.py")
        [ "$transforms_has_spread" -eq 0 ] && missing+=("{**row, ...} spread pattern in transforms.py")
        [ "$tests_pass" = false ] && missing+=("pytest tests/ all green")
        task1_msg="output passes but code missing: $(IFS=, ; echo "${missing[*]}")"
      fi
    else
      # Structural checks failed: surface the assertion message.
      err=$(tail -3 "$STRUCT_ERR" | tr '\n' ' ' | sed 's/  */ /g' | sed 's/^ //;s/ $//')
      [ -n "$err" ] && task1_msg="structural check failed: $err"
      rm -f "$STRUCT_ERR"
    fi
  else
    # Pipeline crashed: surface the last few stderr lines.
    err=$(tail -3 "$PIPELINE_ERR" | tr '\n' ' ' | sed 's/  */ /g' | sed 's/^ //;s/ $//')
    [ -n "$err" ] && task1_msg="pipeline failed to run: $err"
  fi
  rm -f "$PIPELINE_ERR"
fi

# --- Task 2: AI Debug Report (20 points) ---
task2=0
task2_msg="missing task-2/AI_DEBUG.md"
if [ -s task-2/AI_DEBUG.md ]; then
  task2=5
  task2_msg="AI_DEBUG.md exists but missing required sections"
  if grep -q "^## The Error" task-2/AI_DEBUG.md && \
     grep -q "^## The Prompt" task-2/AI_DEBUG.md && \
     grep -q "^## The Solution" task-2/AI_DEBUG.md && \
     grep -q "^## Reflection" task-2/AI_DEBUG.md; then
    task2=10
    task2_msg="all sections present but file looks too short to be filled in"
    # Empty template ships at ~1500 chars. Filled-in report should have
    # meaningfully more content; threshold = template + ~600 chars
    # of student writing (4 sections * ~150 chars each).
    if [ "$(wc -c < task-2/AI_DEBUG.md)" -gt 2100 ]; then
      task2=20
      task2_msg="AI_DEBUG.md is filled in"
    fi
  fi
fi

# --- Task 3: HYF Azure Proof (20 points) ---
task3=0
task3_msg="missing task-3/azure_proof.png|jpg|jpeg"
for ext in png jpg jpeg; do
  if [ -s "task-3/azure_proof.$ext" ]; then
    task3=20
    task3_msg="azure_proof.$ext present"
    break
  fi
done

score=$((task1 + task2 + task3))
if [ "$score" -ge "$PASSING" ]; then pass=true; else pass=false; fi

cat > "$SCRIPT_DIR/score.json" <<EOF
{
  "score": $score,
  "pass": $pass,
  "passingScore": $PASSING
}
EOF

echo "Task 1 (Cleaner Pipeline): $task1/60 — $task1_msg"
echo "Task 2 (AI Debug Report): $task2/20 — $task2_msg"
echo "Task 3 (Azure Proof): $task3/20 — $task3_msg"
echo "----------------------------------------"
echo "Total: $score/100 — pass=$pass (passing threshold: $PASSING)"
88 changes: 78 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,17 +1,85 @@
# Data Track — Week 2 Assignment (Template)

The HackYourFuture Data Track Week 2 assignment: **Refactoring to a Clean Pipeline**.

> 👩‍🎓 **Students:** you are in the wrong place. Do **not** fork or use this template.
> Go to your cohort's assignment repo under
> [`HackYourAssignment`](https://github.com/HackYourAssignment) (e.g. `c55-data-week2`,
> `c56-data-week2`, …). Your teacher posts the exact link in your cohort channel.
> Fork the cohort repo, branch, and open a PR back to it. Full instructions live in the
> [Week 2 Assignment on Notion](https://www.notion.so/hackyourfuture/Week-2-Assignment-Refactoring-to-a-Clean-Pipeline-f8c27aa88d144cb18f54c49d02f50b73).

## For instructors / track maintainers

This repo is the **upstream template** for the Week 2 assignment. At the start of each
cohort, generate a cohort-specific repo under the `HackYourAssignment` org from this
template (GitHub: **Use this template → Create a new repository**, owner =
`HackYourAssignment`, name = `c<NN>-data-week2`). Students then fork *that* cohort repo
and open PRs back to it; the auto-grader runs on every push.

Edits to the assignment, dataset, or grader belong here on the template, not on the
cohort copies.

## Tasks at a glance

| Task | Folder | Points | What you build |
|---|---|---|---|
| **Task 1** — Cleaner Pipeline | `task-1/` | 60 | A modular Python pipeline with `config.py` (env-var loading), `models.py` (`Transaction` dataclass with `__post_init__` validation), `transforms.py` (4+ pure composable functions, no mutation), `pipeline.py` (orchestrator), and `tests/test_transforms.py` (4+ pytest tests). Reads `data/messy_sales.csv`, writes `output/clean_sales.csv`. |
| **Task 2** — AI Debug Report | `task-2/` | 20 | Document one debugging session where you used an LLM to fix a bug. Fill in the four sections of `AI_DEBUG.md`. |
| **Task 3** — HYF Azure proof | `task-3/` | 20 | Confirm your HYF Azure tenant access still works. Screenshot proof at `task-3/azure_proof.png` (or `.jpg` / `.jpeg`) showing resource group + region + €0 cost. |

Total: 100 · Passing: 60.

## Repository layout

```text
.
├── task-1/
│ ├── data/
│ │ └── messy_sales.csv # the dataset (committed; do not edit)
│ ├── src/
│ │ ├── config.py # env-var loader — fill in TODOs
│ │ ├── models.py # Transaction dataclass — fill in TODOs
│ │ ├── transforms.py # 4 pure transform functions — fill in TODOs
│ │ └── pipeline.py # orchestrator — fill in TODOs
│ ├── tests/
│ │ └── test_transforms.py # 4 pytest tests — fill in TODOs
│ ├── output/ # your pipeline writes clean_sales.csv here (gitignored)
│ ├── .env.example # copy to .env (gitignored) before running
│ └── requirements.txt # python3 -m pip install -r requirements.txt
├── task-2/
│ └── AI_DEBUG.md # fill in the four sections
├── task-3/
│ └── azure_proof.png # add your screenshot here
├── .hyf/
│ └── test.sh # auto-grader (read it to see exactly what it checks)
└── .github/workflows/
└── grade-assignment.yml # runs .hyf/test.sh on every PR
```

## Run the grader locally

Before opening a PR, run the same checks the auto-grader runs:

```bash
cd task-1
python3 -m pip install -r requirements.txt
cp .env.example .env
cd ..
bash .hyf/test.sh
cat .hyf/score.json
```

The grader prints a per-task breakdown so you can see exactly which check failed and
why. The PR-time grader does the same — your local run and the CI run are identical.

## Scoring ladder (Task 1)

The grader awards points incrementally so partial credit is meaningful:

- **10/60** — required files exist (`config.py`, `models.py`, `transforms.py`, `pipeline.py`, `tests/test_transforms.py`, `.env.example`).
- **20/60** — `python -m src.pipeline` runs from `task-1/` without crashing (the grader injects `INPUT_PATH` and `OUTPUT_PATH` inline; your local `.env` is not used during grading).
- **40/60** — `output/clean_sales.csv` passes structural checks: 12 rows (15 input − 3 invalid/zero-quantity), lowercased emails, title-cased product names, "Unknown" filled in for missing categories, `revenue` and `vat` columns present and correctly calculated.
- **60/60** — code looks engineered: `models.py` defines a `@dataclass` with `__post_init__`; `transforms.py` uses the `{**row, ...}` spread pattern (no mutation); `pytest tests/` reports all tests passing.

The 40-point cap exists to stop a 5-line script that hardcodes the expected JSON from getting full marks. Real engineering patterns (dataclass + spread + tests) are required for the top 20 points.
2 changes: 2 additions & 0 deletions task-1/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
INPUT_PATH=data/messy_sales.csv
OUTPUT_PATH=output/clean_sales.csv
16 changes: 16 additions & 0 deletions task-1/data/messy_sales.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
transaction_id,product_name,category,price,quantity,customer_email,date
1, laptop PRO ,Electronics,999.99,2,alice@example.com,2024-03-15
2,WIRELESS MOUSE,Electronics,29.99,5, BOB@Company.COM ,2024-03-15
3, usb cable,Electronics,4.99,10,,2024-03-16
4, Office Chair ,Furniture,349.50,1,charlie@work.org,2024-03-16
5,standing DESK,Furniture,599.00,1,charlie@work.org,not_a_date
6,,Electronics,19.99,3,dave@email.com,2024-03-17
7, Mechanical Keyboard ,Electronics,-89.99,2,eve@startup.io,2024-03-17
8,monitor ARM,Furniture,79.99,0,frank@corp.com,2024-03-18
9, WEBCAM hd ,Electronics,54.99,1, ,2024-03-18
10,desk lamp,Furniture,34.99,4,grace@university.edu,2024-03-19
11, NOISE CANCELLING headphones,Electronics,199.99,1,alice@example.com,2024-03-19
12,cable management KIT,Furniture,15.99,6,henry@business.com,2024-03-20
13, ergonomic MOUSE PAD ,Furniture,24.99,3,ivan@email.com,2024-03-20
14,laptop STAND,Furniture,45.99,2,jenny@work.org,2024-03-21
15, BLUETOOTH speaker,,39.99,1,karl@startup.io,2024-03-21
File renamed without changes.
2 changes: 2 additions & 0 deletions task-1/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
python-dotenv>=1.0.0
pytest>=7.0
File renamed without changes.
35 changes: 35 additions & 0 deletions task-1/src/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""Configuration loader.

Read INPUT_PATH and OUTPUT_PATH from a .env file (see .env.example for the
expected variable names) and expose them as named imports.

Tasks (see chapter Task 1):
1. Use python-dotenv's load_dotenv() to read the .env file.
2. Read INPUT_PATH and OUTPUT_PATH from os.environ.
3. Raise ValueError if either is missing — do NOT let None silently propagate.
"""
import os

from dotenv import load_dotenv


# Load .env values into os.environ before they're read by _required().
# (Step 1 from the docstring above; already wired up so the rest of the
# module can rely on os.environ being populated.)
load_dotenv()


def _required(name: str) -> str:
"""Read an env var; fail loudly if missing."""
# TODO 2: Read os.environ[name]; if not set, raise ValueError with a
# message that names the missing variable AND points at .env.example.
raise NotImplementedError("Implement _required: see TODO 2 in config.py")


# TODO 3: Replace the placeholder lines below by calling _required(...) for
# each variable. INPUT_PATH and OUTPUT_PATH must be importable from this
# module by the rest of the pipeline as a relative import
# (`from .config import INPUT_PATH, ...`), since the pipeline runs as
# `python -m src.pipeline`.
INPUT_PATH: str = "" # TODO: _required("INPUT_PATH")
OUTPUT_PATH: str = "" # TODO: _required("OUTPUT_PATH")
Loading