Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
256 changes: 253 additions & 3 deletions src/notebooks/mainNb.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 28,
"id": "d11a2343",
"metadata": {},
"outputs": [],
Expand All @@ -28,7 +28,8 @@
"from pathlib import Path\n",
"import os\n",
"import sys\n",
"import re"
"import re\n",
"import pandas.testing as pdt"
]
},
{
Expand Down Expand Up @@ -630,7 +631,8 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,

"id": "749ae60a",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -782,6 +784,254 @@
" return self.df"
]
},
{
"cell_type": "markdown",

"id": "3eb6373f",
"metadata": {},
"source": [
"### Sample use of the clean_salary function. "
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "182eac4a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Salary\n",
"0 50000.0\n",
"1 20800.0\n",
"2 104000.0\n",
"3 60000.0\n",
"4 75000.0\n",
"5 100000.0\n",
"6 150000.0\n",
"7 200.0\n",
"8 3000.0\n",
"9 NaN\n",
"10 NaN\n",
"11 NaN\n",
"12 145600.0\n"
]
}
],
"source": [
"# Sample frame mixing shorthand, small (hourly-looking) values, a range, junk text and a bare int\n",
"test_df = pd.DataFrame(\n",
"    {\"Salary\": [\"$50k\", \"10\", \"50\", \"60,000\", \"70,000-80,000\", \"100k\", \"150000\", \"200\", \"3000\", \"5000000\", \"$1.5M\", \"invalid\", 70]}\n",
")\n",
"\n",
"# Wrap the frame and run the salary step, passing 2080 as hours_per_year\n",
"cleaner = DataCleaner(test_df).clean_salary(2080)\n",
"\n",
"# Retrieve the cleaned frame and show it\n",
"result_df = cleaner.finalize()\n",
"print(result_df)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "82806fc9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Salary\n",
"0 NaN\n",
"1 NaN\n",
"2 NaN\n",
"3 NaN\n",
"4 50000.0\n",
"5 5000.0\n",
"6 NaN\n",
"7 NaN\n",
"8 NaN\n",
"9 NaN\n"
]
}
],
"source": [
"# Inputs chosen to stress the salary parser\n",
"fail_df = pd.DataFrame(\n",
"    {\n",
"        \"Salary\": [\n",
"            None,              # missing value\n",
"            \"\",                # empty string\n",
"            \" \",               # whitespace only\n",
"            \"abc123\",          # letters mixed with digits\n",
"            \"50k-abc\",         # range whose second half is junk\n",
"            \"$-5000\",          # minus sign in front of the amount\n",
"            \"∞\",               # infinity symbol\n",
"            \"NaN\",             # the literal text NaN\n",
"            \"$1.5M\",           # millions shorthand\n",
"            \"70,000—80,000\",   # range written with an em dash (—)\n",
"        ]\n",
"    }\n",
")\n",
"\n",
"# Wrap the frame and run the salary step, passing 2080 as hours_per_year\n",
"fail_cleaner = DataCleaner(fail_df).clean_salary(2080)\n",
"\n",
"# Retrieve the cleaned frame and show it\n",
"fail_result_df = fail_cleaner.finalize()\n",
"print(fail_result_df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "123deb70",
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): earlier cells already instantiate DataCleaner, so a previous definition\n",
"# presumably exists above; this cell shadows it -- confirm that is intended.\n",
"class DataCleaner:\n",
"    \"\"\"Hold a private copy of a DataFrame and expose chainable cleaning steps.\"\"\"\n",
"\n",
"    # Multipliers for the shorthand suffixes understood by _parse_salary.\n",
"    _SUFFIX_MULTIPLIERS = {\"k\": 1_000, \"m\": 1_000_000}\n",
"\n",
"    def __init__(self, df: pd.DataFrame):\n",
"        # Copy so the caller's frame is never mutated by the cleaning steps.\n",
"        self.df = df.copy()\n",
"\n",
"    @staticmethod\n",
"    def _parse_salary(val: str):\n",
"        \"\"\"Parse one raw salary string into a float; return None if it cannot be parsed.\"\"\"\n",
"        val = val.strip()\n",
"        if not val or val.lower() in {\"nan\", \"none\"}:\n",
"            return None\n",
"\n",
"        # Normalize en/em dashes to a plain hyphen so every range uses one separator.\n",
"        val = re.sub(r\"[–—]\", \"-\", val)\n",
"\n",
"        # Range such as \"50k-70k\" or \"50,000-70,000\": average the parts that parse.\n",
"        # A leading \"-\" also counts as a separator, so \"$-5000\" yields 5000.0.\n",
"        if \"-\" in val:\n",
"            parts = [DataCleaner._parse_salary(p) for p in val.split(\"-\") if p.strip()]\n",
"            nums = [n for n in parts if n is not None]\n",
"            return sum(nums) / len(nums) if nums else None\n",
"\n",
"        # Remove $, commas and whitespace (e.g. \"$ 50k\", \"50 000\").\n",
"        val = re.sub(r\"[\\$,\\s]\", \"\", val)\n",
"\n",
"        # Shorthand suffix: \"50k\" -> 50000.0, \"1.5M\" -> 1500000.0.\n",
"        match = re.match(r\"^(\\d+(?:\\.\\d+)?)([kKmM])$\", val)\n",
"        if match:\n",
"            number, suffix = match.group(1), match.group(2).lower()\n",
"            return float(number) * DataCleaner._SUFFIX_MULTIPLIERS[suffix]\n",
"\n",
"        # Plain number (integer or float).\n",
"        try:\n",
"            return float(val)\n",
"        except ValueError:\n",
"            return None\n",
"\n",
"    def clean_salary(self, hours_per_year: int = 2080,\n",
"                     hourly_threshold: float = 200,\n",
"                     max_salary: float = 1_000_000):\n",
"        \"\"\"\n",
"        Clean and standardize the \"Salary\" column of ``self.df``.\n",
"\n",
"        Steps performed:\n",
"        1. Stringify each value and parse it with ``_parse_salary``\n",
"           (currency symbol, commas, k/M shorthand, ranges -> average).\n",
"        2. Coerce the parsed values to float64; unparseable entries become NaN.\n",
"        3. Treat values <= ``hourly_threshold`` as hourly wages and multiply\n",
"           them by ``hours_per_year``.\n",
"        4. Set values greater than ``max_salary`` to NaN.\n",
"\n",
"        Nothing happens when the frame has no \"Salary\" column. If any step\n",
"        raises, a warning is printed and the column is left untouched, because\n",
"        the result is only written back after every step has succeeded.\n",
"\n",
"        Parameters\n",
"        ----------\n",
"        hours_per_year : int, optional (default=2080)\n",
"            Number of work hours in a year for converting hourly to annual salary.\n",
"        hourly_threshold : float, optional (default=200)\n",
"            Values at or below this are interpreted as hourly wages.\n",
"        max_salary : float, optional (default=1_000_000)\n",
"            Values above this are considered unrealistic and replaced by NaN.\n",
"\n",
"        Returns\n",
"        -------\n",
"        self : DataCleaner\n",
"            The current instance, so calls can be chained.\n",
"        \"\"\"\n",
"        if \"Salary\" not in self.df.columns:\n",
"            return self\n",
"\n",
"        try:\n",
"            parsed = self.df[\"Salary\"].astype(str).apply(self._parse_salary)\n",
"\n",
"            # When nothing parses, apply() returns an object column of None and the\n",
"            # comparisons below would raise; force a float64 column with NaN instead.\n",
"            salary = pd.to_numeric(parsed, errors=\"coerce\").astype(\"float64\")\n",
"\n",
"            # Convert small numbers (hourly) to annual; NaN compares False and stays NaN.\n",
"            is_hourly = salary <= hourly_threshold\n",
"            salary = salary.where(~is_hourly, salary * hours_per_year)\n",
"\n",
"            # Drop unrealistic salaries (this also removes +inf).\n",
"            salary = salary.mask(salary > max_salary)\n",
"\n",
"            # Single write-back so a failure above cannot leave a half-cleaned column.\n",
"            self.df[\"Salary\"] = salary\n",
"        except Exception as e:\n",
"            # Deliberate best-effort: report and keep the pipeline chainable.\n",
"            print(f\"[Warning] Failed salary cleaning: {e}\")\n",
"\n",
"        return self\n",
"\n",
"    def finalize(self):\n",
"        \"\"\"Return cleaned dataframe.\"\"\"\n",
"        return self.df\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "688bdf74",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ Salary cleaning DataFrame test passed!\n"
]
}
],
"source": [
"# Test DataFrame with edge/fail cases (trailing comment = value expected after cleaning)\n",
"fail_df = pd.DataFrame({\n",
"    \"Salary\": [\n",
"        None,              # NaN\n",
"        \"\",                # NaN\n",
"        \" \",               # NaN\n",
"        \"abc123\",          # NaN\n",
"        \"50k-abc\",         # 50000.0 (only the parseable half of the range counts)\n",
"        \"$-5000\",          # 5000.0 (the \"-\" is read as a range separator, so the sign is dropped)\n",
"        \"∞\",               # NaN\n",
"        \"NaN\",             # NaN\n",
"        \"$1.5M\",           # NaN ( >1,000,000 rule)\n",
"        \"70,000—80,000\"    # 75000.0 (dash normalized)\n",
"    ]\n",
"})\n",
"\n",
"# Run through cleaner\n",
"cleaner = DataCleaner(fail_df)\n",
"result = cleaner.clean_salary().finalize().reset_index(drop=True)\n",
"\n",
"# Expected results as DataFrame (None becomes NaN under dtype float64)\n",
"expected = pd.DataFrame({\n",
"    \"Salary\": [\n",
"        None,      # None\n",
"        None,      # empty string\n",
"        None,      # whitespace\n",
"        None,      # abc123\n",
"        50000.0,   # 50k-abc\n",
"        5000.0,    # \"$-5000\": parsed as a one-sided range, sign dropped\n",
"        None,      # infinity\n",
"        None,      # \"NaN\"\n",
"        None,      # 1.5M filtered out\n",
"        75000.0    # range with em dash\n",
"    ]\n",
"}, dtype=\"float64\").reset_index(drop=True)\n",
"\n",
"# Assertion test (also checks that the cleaned column is float64)\n",
"pdt.assert_frame_equal(result, expected)\n",
"print(\"✅ Salary cleaning DataFrame test passed!\")"
]
},
{
"cell_type": "markdown",
"id": "6ddbb4c0",
Expand Down
Loading