-
Notifications
You must be signed in to change notification settings - Fork 0
Open
Description
Related to issue #5
Code for list of stations (and metadata #1):
I am still working on the automatic metadata download. Currently we use the station coordinates
(latitude and longitude converted from the original EPSG:31287 projection) from the file "messstellen_alle.csv", downloaded from https://ehyd.gv.at.
Code for downloading the data
import requests
import pandas as pd
import re
def get_ehyd_data(
    gauge_id: str,
    variable: str = "streamflow",
    start_date: str = None,
    end_date: str = None,
) -> pd.DataFrame:
    """
    Download and parse an eHYD Austria daily hydrological CSV into a DataFrame.

    Parameters
    ----------
    gauge_id : str or int
        eHYD gauge ID, e.g. 205856. It is interpolated into the request URL.
    variable : str, optional
        One of: 'streamflow' (Q-Tagesmittel in m3s), 'stage' (W-Tagesmittel in cm).
        Matching is case-insensitive.
    start_date, end_date : str, optional
        Inclusive date range filters in 'YYYY-MM-DD' format.

    Returns
    -------
    pd.DataFrame
        Columns ['time', <variable>], sorted by time. An empty frame with the
        same columns is returned (after printing a message) when the download
        fails, the file has no 'Werte' section, no data line can be parsed, or
        nothing falls inside the requested interval.

    Raises
    ------
    ValueError
        If `variable` is not 'streamflow' or 'stage'.
    """
    variable = variable.lower()
    file_map = {"streamflow": 5, "stage": 2}
    if variable not in file_map:
        raise ValueError("Variable must be 'streamflow' or 'stage'.")

    def _empty() -> pd.DataFrame:
        # Single definition of the "nothing to return" result shape.
        return pd.DataFrame(columns=["time", variable])

    url = f"https://ehyd.gv.at/eHYD/MessstellenExtraData/owf?id={gauge_id}&file={file_map[variable]}"
    try:
        r = requests.get(url, timeout=30)
        if r.status_code == 404:
            raise ValueError(f"No file found for gauge {gauge_id} ({variable})")
        r.raise_for_status()
    except (requests.RequestException, ValueError) as e:
        # Best-effort download: report the problem and hand back an empty
        # frame. Only transport/HTTP failures are handled here so that genuine
        # programming errors are not hidden.
        print(f"Failed to fetch data: {e}")
        return _empty()

    lines = r.text.splitlines()

    # The value table starts on the line after the "Werte:" marker.
    data_start = next(
        (i + 1 for i, line in enumerate(lines) if line.strip().startswith("Werte")),
        None,
    )
    if data_start is None:
        print(f"No 'Werte:' section found for gauge {gauge_id}")
        return _empty()

    # Fields are separated by semicolons and/or whitespace; compile once.
    separator = re.compile(r"[;\s]+")
    cleaned = []
    for line in lines[data_start:]:
        # Dropping empty tokens makes leading/trailing separators harmless.
        tokens = [tok for tok in separator.split(line) if tok]
        if len(tokens) < 2:
            continue
        if len(tokens) == 2:
            date_str, second = tokens
            if ":" in second:
                # Date and time present but the value is missing.
                time_str, val_str = second, ""
            else:
                # Date and value only: assume midnight for the timestamp.
                time_str, val_str = "00:00:00", second
        else:
            date_str, time_str, val_str = tokens[:3]
        # Values use a decimal comma; non-numeric markers become NaN below.
        cleaned.append((f"{date_str} {time_str}", val_str.replace(",", ".")))

    if not cleaned:
        print(f"No valid data lines found for gauge {gauge_id}")
        return _empty()

    df = pd.DataFrame(cleaned, columns=["time", variable])
    df["time"] = pd.to_datetime(df["time"], errors="coerce", format="%d.%m.%Y %H:%M:%S")
    df[variable] = pd.to_numeric(df[variable], errors="coerce")

    # Rows whose timestamp could not be parsed are not usable observations.
    # Drop them up front so the result is the same with or without a date
    # filter (comparisons against NaT would discard them anyway).
    df = df.dropna(subset=["time"])

    # Filter by date range (both bounds inclusive)
    if start_date:
        df = df[df["time"] >= pd.to_datetime(start_date)]
    if end_date:
        df = df[df["time"] <= pd.to_datetime(end_date)]

    # Handle empty result
    if df.empty:
        print(f"No data found for gauge {gauge_id} in selected time interval.")
        return _empty()

    return df.sort_values("time").reset_index(drop=True)
Example usage:
# Pick the tenth-from-last gauge in the station table and fetch its daily
# streamflow series for the requested period, then preview the first rows.
site_ids = austria_sites.gauge_id.tolist()
gauge_id = site_ids[-10]
df = get_ehyd_data(
    gauge_id,
    variable="streamflow",
    start_date="1900-01-01",
    end_date="2021-01-05",
)
print(df.head())
simonmoulds
Metadata
Metadata
Assignees
Labels
No labels