Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
- ONS income: ONS small area income estimates
- Tenure: English Housing Survey
- Private rent: VOA/ONS private rental market statistics
- Council tax bands A-H: VOA Council Tax Stock of Properties (per LA)
- Council tax £ paid (net of CTR): MHCLG taxbase × Band D (England),
Welsh Government Council Tax Income (Wales)
"""

from policyengine_uk import Microsimulation
Expand Down Expand Up @@ -252,6 +255,57 @@ def create_local_authority_target_matrix(
national_rent * la_household_share,
)

# ── Council tax band counts (LA targets) ───────────────────────
# Derived/proxy targets: per-LA VOA dwellings in each band A-H.
# Lineage drift vs the matrix-side household council_tax_band:
# VOA counts dwellings (incl. exempt / empty / second homes);
# matrix counts households. See la_council_tax.py for full
# caveat. Missing cells stay NaN and are masked out by the
# calibrator; this keeps the target direct instead of fabricating
# national-share fallbacks for Scotland or Northern Ireland. Band I
# is Wales-only and rarely populated, so it is intentionally
# excluded.
ct_path = STORAGE_FOLDER / "la_council_tax.csv"
if ct_path.exists():
ct_data = pd.read_csv(ct_path)
ct_columns = ["code"] + [f"count_band_{b}" for b in "ABCDEFGH"]
if "total_council_tax_net" in ct_data.columns:
ct_columns.append("total_council_tax_net")
ct_merged = la_codes.merge(ct_data[ct_columns], on="code", how="left")
ct_band = sim.calculate("council_tax_band").values
for band in "ABCDEFGH":
col = f"voa/council_tax/{band}"
matrix[col] = (ct_band == band).astype(float)
csv_col = f"count_band_{band}"
has_count = ct_merged[csv_col].notna().values
direct = ct_merged[csv_col].values
y[col] = np.where(
has_count,
direct,
np.nan,
)

# ── Council tax £ paid, net of CTR (LA targets) ────────────
# Derived/proxy target: y = MHCLG taxbase × Band D (E) or WG
# Council Tax Income (W). Matrix col is FRS-reported
# council_tax_less_benefit (gross − reported CTB). Same
# intent (household council tax paid net of CTR), different
# construction paths — see la_council_tax.py for the lineage
# caveat flagged in review by @MaxGhenis. Both sides are net
# of CTR, per Max's 28 Apr standup decision on FRS alignment.
# Missing cells remain NaN and are masked out by the calibrator.
if "total_council_tax_net" in ct_merged.columns:
matrix["housing/council_tax_net"] = sim.calculate(
"council_tax_less_benefit"
).values
has_ct_net = ct_merged["total_council_tax_net"].notna().values
direct_net = ct_merged["total_council_tax_net"].values
y["housing/council_tax_net"] = np.where(
has_ct_net,
direct_net,
np.nan,
)

# ── Country mask ───────────────────────────────────────────────
country_mask = create_country_mask(
household_countries=sim.calculate("country").values,
Expand Down
361 changes: 361 additions & 0 deletions policyengine_uk_data/storage/la_council_tax.csv

Large diffs are not rendered by default.

13 changes: 11 additions & 2 deletions policyengine_uk_data/targets/compute/council_tax.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,18 @@ def compute_council_tax_band(target, ctx) -> np.ndarray:


def compute_obr_council_tax(target, ctx) -> np.ndarray:
"""Compute OBR council tax receipts, optionally by country."""
"""Compute OBR council tax receipts, optionally by country.

OBR Table 4.1 reports "Total net council tax receipts" — net of
council tax reduction (CTR) support. The matching household-level
signal is therefore ``council_tax_less_benefit`` (= gross council
tax less the CTR award), not ``council_tax`` (which is the gross
liability before CTR). Using the gross variable here would
systematically push weights down to fit a net target, leaking
bias into adjacent national calibrations.
"""
name = target.name
ct = ctx.pe("council_tax")
ct = ctx.pe("council_tax_less_benefit")

if name == "obr/council_tax":
return ct
Expand Down
240 changes: 240 additions & 0 deletions policyengine_uk_data/targets/sources/la_council_tax.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
"""Local-authority council tax calibration targets (derived proxies).

Produces three kinds of LA-level calibration target from public data:

- ``ons/council_tax_band_d/{code}``: the average Band D council tax
(inclusive of all precepts) charged on a Band D dwelling in billing
authority ``code``. Sourced from MHCLG, Welsh Government and Scottish
Government annual publications.
- ``voa/council_tax/{code}/{band}``: the number of dwellings in band
``A``–``H`` (England) or ``A``–``I`` (Wales) for billing authority
``code``. Sourced from the VOA *Council Tax: Stock of Properties*
summary tables.
- ``housing/council_tax_net/{code}``: net council tax requirement per
LA (net of CTR support). England derived from MHCLG taxbase × Band D;
Wales sourced directly from WG Council Tax Income (Table 3).

Data for all 360 LAs in ``local_authorities_2021.csv`` is joined from
the committed canonical file ``storage/la_council_tax.csv``. No target
is emitted for an LA/metric cell whose source did not provide a value,
so calibrators cleanly skip it.

Lineage caveats (flagged in PR review by @MaxGhenis):

- ``voa/council_tax/{A..H}`` is a **derived proxy**, not a direct
match for the matrix-side household ``council_tax_band``:
* Target counts VOA dwellings; matrix counts policyengine-uk
households. A household ≠ a dwelling in general.
* VOA stock includes exempt, empty, and second-home dwellings,
which contribute zero to the matrix-side sum (no household lives
in them in the FRS).
* VOA covers England and Wales only. Scotland and NI cells are
masked out of the loss matrix unless a direct source is available.
* Banding ratios differ: Scotland diverged from the standard
6/9–18/9 E&W ratios after the 2017 reform; Wales has Band I,
England does not.

- ``housing/council_tax_net`` is a **derived proxy**:
* Target value (England) is MHCLG ``taxbase × Band D``, where
taxbase is Band D equivalent dwellings adjusted for ~7
discount, premium, and exemption classes (single-person,
disabled relief, second-home, empty-home premium, family
annexe, etc.). Wales uses WG-published net council tax income
direct.
* Matrix col is FRS-reported ``council_tax_less_benefit``
(household-reported gross less reported CTB).
* Same intent (what households pay net of CTR), different
construction paths and underlying microdata sources.

Known coverage gaps:

- Northern Ireland is excluded because its domestic rates system is
distinct from council tax. ``loss.py`` masks NI cells rather than
fabricating a fallback.
- Band-count rows for Scottish LAs are absent because the VOA summary
tables do not cover Scotland; Scottish Assessors publishes per-LA
chargeable-dwellings data separately and is a follow-up.
- Band I only exists in Wales (introduced in the 2005 Welsh revaluation);
English rows leave it null.
- City of London has Band A suppressed by VOA for disclosure control;
its other bands are populated.

Sources:
- MHCLG *Council Tax levels set by local authorities in England 2026-27*
https://www.gov.uk/government/statistics/council-tax-levels-set-by-local-authorities-in-england-2026-to-2027
- MHCLG *Council Taxbase 2025 in England* (Table 1.35 taxbase after CTR)
https://www.gov.uk/government/statistics/council-taxbase-2025-in-england
- Welsh Government *Council Tax levels: April 2026 to March 2027*
https://www.gov.wales/council-tax-levels-april-2026-march-2027-html
- Scottish Government *Council Tax Assumptions 2025* (CT by Band, 2025-26)
https://www.gov.scot/publications/council-tax-datasets/
- VOA *Council Tax: Stock of Properties, 2025*
https://www.gov.uk/government/statistics/council-tax-stock-of-properties-2025
"""

from __future__ import annotations

from functools import lru_cache

import pandas as pd

from policyengine_uk_data.targets.schema import (
GeographicLevel,
Target,
Unit,
)
from policyengine_uk_data.targets.sources._common import STORAGE


# File name of the committed per-LA council tax table; ``_load_table``
# resolves it against ``STORAGE``.
_CSV_NAME = "la_council_tax.csv"

# Latest fiscal years covered by each source. The LA Band D amounts are
# structurally single-year snapshots; callers that need longer time
# series should uprate via the existing council-tax uprating index.
# ``get_targets`` also keys the net council tax targets by
# ``_YEAR_BAND_D_ENGLAND``, whichever country the row belongs to.
_YEAR_BAND_D_ENGLAND = 2026
_YEAR_BAND_D_WALES = 2026
_YEAR_BAND_D_SCOTLAND = 2025
_YEAR_BAND_COUNT = 2025

# Band letter -> CSV column holding that band's dwelling count. Band I is
# listed here, so a target is emitted for it wherever the cell is non-null.
_BAND_COUNT_COLUMNS = {band: f"count_band_{band}" for band in "ABCDEFGHI"}

# Reference URLs attached to the emitted targets, one per publication.
_ENGLAND_REF = (
"https://www.gov.uk/government/statistics/"
"council-tax-levels-set-by-local-authorities-in-england-2026-to-2027"
)
_WALES_REF = "https://www.gov.wales/council-tax-levels-april-2026-march-2027-html"
_SCOTLAND_REF = "https://www.gov.scot/publications/council-tax-datasets/"
_VOA_REF = (
"https://www.gov.uk/government/statistics/council-tax-stock-of-properties-2025"
)
# Net council tax requirement per LA. England derived from MHCLG
# Council Taxbase 2025 Table 1.35 ("Tax base after allowance for council
# tax support") × LA Band D amount. Wales sourced directly from the
# Welsh Government Table 3 "Council tax income (£m)" — already net.
_NET_CT_REF_ENG = (
"https://www.gov.uk/government/statistics/council-taxbase-2025-in-england"
)
# The Welsh net figures come from the same publication as the Welsh Band D
# levels, so the reference URL is shared.
_NET_CT_REF_WAL = _WALES_REF


@lru_cache(maxsize=1)
def _load_table() -> pd.DataFrame | None:
    """Read the committed LA council-tax CSV at ``STORAGE / _CSV_NAME``.

    The result is memoised, so the file is parsed at most once per
    process and every caller receives the same object back.

    Returns:
        The parsed table, or ``None`` when the CSV file does not exist.
    """
    table_path = STORAGE / _CSV_NAME
    return pd.read_csv(table_path) if table_path.exists() else None


def load_la_net_council_tax() -> pd.DataFrame:
    """Return the observed per-LA net council tax requirement (£, after CTR).

    Only LAs with a non-null ``total_council_tax_net`` are kept (per the
    module documentation these are the English and Welsh authorities).
    Scotland and NI are absent; loss-matrix callers should mask those
    cells rather than fabricating fallback values.

    Returns:
        A frame with columns ``code`` and ``total_council_tax_net`` and a
        fresh 0-based index. It is empty (same two columns) when the
        table is missing, has no rows, or lacks the net column.
    """
    wanted = ["code", "total_council_tax_net"]
    table = _load_table()
    unavailable = (
        table is None
        or table.empty
        or "total_council_tax_net" not in table.columns
    )
    if unavailable:
        return pd.DataFrame(columns=wanted)

    observed = table["total_council_tax_net"].notna()
    return table.loc[observed, wanted].reset_index(drop=True)


def _year_for_band_d(country: str) -> int:
    """Return the fiscal year of the Band D figures used for ``country``.

    ``"WALES"`` and ``"SCOTLAND"`` map to their own year constants; any
    other value falls back to the England year.
    """
    year_by_country = {
        "WALES": _YEAR_BAND_D_WALES,
        "SCOTLAND": _YEAR_BAND_D_SCOTLAND,
    }
    return year_by_country.get(country, _YEAR_BAND_D_ENGLAND)


def _ref_for_band_d(country: str) -> str:
    """Return the reference URL of the Band D publication for ``country``.

    ``"WALES"`` and ``"SCOTLAND"`` map to their own URLs; any other value
    falls back to the England reference.
    """
    ref_by_country = {
        "WALES": _WALES_REF,
        "SCOTLAND": _SCOTLAND_REF,
    }
    return ref_by_country.get(country, _ENGLAND_REF)


def get_targets() -> list[Target]:
    """Build every LA-level council tax target from the committed table.

    Three groups are emitted, in this order:

    1. ``ons/council_tax_band_d/{code}`` — one per LA with a non-null
       ``band_d_amount``.
    2. ``voa/council_tax/{code}/{band}`` — one per LA and band whose
       count column is non-null.
    3. ``housing/council_tax_net/{code}`` — one per LA with a non-null
       ``total_council_tax_net``; the whole group is skipped when the
       table has no such column.

    A null cell never produces a target.

    Returns:
        The targets, or an empty list when the table is missing or has
        no rows.
    """
    table = _load_table()
    if table is None or table.empty:
        return []

    # Materialise the rows once; each group below walks the same sequence.
    records = [record for _, record in table.iterrows()]

    def band_d_targets():
        # Band D amount targets — one per LA with a reported value.
        for record in records:
            band_d = record.get("band_d_amount")
            if pd.isna(band_d):
                continue
            la_code = str(record["code"])
            la_country = str(record["country"])
            yield Target(
                name=f"ons/council_tax_band_d/{la_code}",
                variable="council_tax_band_d_amount",
                source="ons",
                unit=Unit.GBP,
                geographic_level=GeographicLevel.LOCAL_AUTHORITY,
                geo_code=la_code,
                geo_name=str(record["name"]),
                values={_year_for_band_d(la_country): float(band_d)},
                reference_url=_ref_for_band_d(la_country),
            )

    def band_count_targets():
        # Band count targets — one per (LA, band) where VOA has a value.
        for record in records:
            la_code = str(record["code"])
            la_name = str(record["name"])
            for band_letter, column in _BAND_COUNT_COLUMNS.items():
                dwellings = record.get(column)
                if pd.isna(dwellings):
                    continue
                yield Target(
                    name=f"voa/council_tax/{la_code}/{band_letter}",
                    variable="council_tax_band",
                    source="voa",
                    unit=Unit.COUNT,
                    geographic_level=GeographicLevel.LOCAL_AUTHORITY,
                    geo_code=la_code,
                    geo_name=la_name,
                    values={_YEAR_BAND_COUNT: float(dwellings)},
                    is_count=True,
                    reference_url=_VOA_REF,
                )

    def net_targets():
        # Net council tax £ targets — one per LA with an observed value.
        # Mirrors the FRS net-of-CTR amount; pairs with the band targets
        # to cover both FRS council-tax data points.
        if "total_council_tax_net" not in table.columns:
            return
        for record in records:
            net_amount = record.get("total_council_tax_net")
            if pd.isna(net_amount):
                continue
            la_country = str(record["country"])
            # NOTE(review): the reference defaults to England while the
            # source label defaults to Wales, so a row that is neither
            # ENGLAND nor WALES would get a mismatched pair — confirm
            # such rows never carry a net value.
            if la_country == "WALES":
                reference = _NET_CT_REF_WAL
            else:
                reference = _NET_CT_REF_ENG
            if la_country == "ENGLAND":
                source_label = "mhclg"
            else:
                source_label = "stats_wales"
            yield Target(
                name=f"housing/council_tax_net/{record['code']}",
                variable="council_tax_less_benefit",
                source=source_label,
                unit=Unit.GBP,
                geographic_level=GeographicLevel.LOCAL_AUTHORITY,
                geo_code=str(record["code"]),
                geo_name=str(record["name"]),
                values={_YEAR_BAND_D_ENGLAND: float(net_amount)},
                reference_url=reference,
            )

    return [*band_d_targets(), *band_count_targets(), *net_targets()]
39 changes: 39 additions & 0 deletions policyengine_uk_data/tests/test_calibrate_save.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,3 +120,42 @@ def test_calibrate_local_areas_saves_weights_in_nonverbose_branch(
# Verify the saved weights have the area_count x n_households shape
# produced by the calibrator.
assert weights.shape == (2, 4)


def test_calibrate_local_areas_masks_nan_local_targets(tmp_path, monkeypatch):
    """A NaN local target must not leak into the saved weights.

    Local-authority sources do not cover every area/metric pair, so a NaN
    target cell means "do not train on this cell". The test blanks one
    cell of the toy targets, runs a short calibration, and checks that
    every saved weight is finite.
    """

    import h5py

    from policyengine_uk_data.utils import calibrate as calibrate_module
    from policyengine_uk_data.utils.calibrate import calibrate_local_areas

    # Point the module's storage folder at tmp_path; the weight file is
    # read back from there below.
    monkeypatch.setattr(calibrate_module, "STORAGE_FOLDER", tmp_path)

    dense_matrix_fn, national_matrix_fn = _make_toy_inputs(
        n_households=4, area_count=2
    )

    def matrix_fn_with_gap(dataset):
        # Same toy inputs, with a single target cell knocked out.
        matrix, local_targets, country_mask = dense_matrix_fn(dataset)
        local_targets.iloc[1, 0] = np.nan
        return matrix, local_targets, country_mask

    output_name = "toy_sparse_weights.h5"
    calibrate_local_areas(
        dataset=_StubDataset(np.array([1.0, 1.0, 1.0, 1.0])),
        matrix_fn=matrix_fn_with_gap,
        national_matrix_fn=national_matrix_fn,
        area_count=2,
        weight_file=output_name,
        dataset_key="2025",
        epochs=5,
        verbose=False,
    )

    with h5py.File(tmp_path / output_name, "r") as handle:
        saved_weights = handle["2025"][:]
    assert np.isfinite(saved_weights).all()
Loading