Skip to content

Austria (add more countries) #51

@thiagovmdon

Description

@thiagovmdon

Related to issue #5

Code for list of stations (and metadata #1):

I am still working on the automatic metadata download. Currently we use the

austria_sites.csv

(latitude and longitude converted from the original EPSG:31287 coordinates in the file "messstellen_alle.csv", downloaded from https://ehyd.gv.at).

Code for downloading the data

import re
from typing import Optional, Union

import pandas as pd
import requests

# Separator between the date, time and value fields of an eHYD data row:
# any run of semicolons and/or whitespace.
_EHYD_FIELD_SEP = re.compile(r"[;\s]+")


def _empty_ehyd_frame(variable: str) -> pd.DataFrame:
    """Return the empty ['time', <variable>] frame used for every no-data outcome."""
    return pd.DataFrame(columns=["time", variable])


def _parse_ehyd_rows(lines):
    """Turn raw eHYD data rows into (datetime string, value string) pairs.

    ``lines`` is an iterable of text lines following the "Werte" marker.
    Blank lines and lines with fewer than two fields are skipped.
    """
    rows = []
    for raw in lines:
        raw = raw.strip()
        if not raw:
            continue
        fields = _EHYD_FIELD_SEP.split(raw, maxsplit=2)
        if len(fields) < 2:
            continue
        if len(fields) == 3:
            date_str, time_str, val_str = fields
        elif ":" in fields[1]:
            # Timestamp without a value: keep the row, the value becomes NaN.
            date_str, time_str = fields
            val_str = ""
        else:
            # Date and value only: fall back to midnight for the time part.
            date_str, val_str = fields
            time_str = "00:00:00"
        # Values use a decimal comma; normalise it for pd.to_numeric.
        rows.append((f"{date_str} {time_str}", val_str.replace(",", ".")))
    return rows


def get_ehyd_data(
    gauge_id: Union[str, int],
    variable: str = "streamflow",
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
) -> pd.DataFrame:
    """
    Download and parse eHYD Austria daily hydrological CSV data into a DataFrame.

    Parameters
    ----------
    gauge_id : str or int
        eHYD gauge ID, e.g. 205856
    variable : str, optional
        One of: 'streamflow' (Q-Tagesmittel in m3/s), 'stage' (W-Tagesmittel in cm).
        Matching is case-insensitive.
    start_date, end_date : str, optional
        Inclusive date range filters in 'YYYY-MM-DD' format. Each bound is
        compared as a timestamp, so a date-only ``end_date`` means midnight
        at the start of that day.

    Returns
    -------
    pd.DataFrame with columns ['time', <variable>], sorted by time. Rows whose
    timestamp cannot be parsed are dropped; values that cannot be parsed are
    kept as NaN. An empty frame with the same columns is returned (and a
    message printed) when the download fails, the file has no "Werte" section,
    or no rows remain after filtering.

    Raises
    ------
    ValueError
        If ``variable`` is not supported, or if ``start_date`` / ``end_date``
        cannot be parsed as a date.
    """
    variable = variable.lower()
    file_map = {"streamflow": 5, "stage": 2}
    if variable not in file_map:
        raise ValueError("Variable must be 'streamflow' or 'stage'.")

    # Parse the bounds before any network I/O so a malformed date is reported
    # immediately instead of only after a successful download.
    start = pd.to_datetime(start_date) if start_date else None
    end = pd.to_datetime(end_date) if end_date else None

    url = f"https://ehyd.gv.at/eHYD/MessstellenExtraData/owf?id={gauge_id}&file={file_map[variable]}"

    # Only network/HTTP failures are treated as "no data"; programming errors
    # are allowed to propagate instead of being swallowed.
    try:
        response = requests.get(url, timeout=30)
        if response.status_code != 404:
            response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Failed to fetch data: {exc}")
        return _empty_ehyd_frame(variable)
    if response.status_code == 404:
        print(f"Failed to fetch data: No file found for gauge {gauge_id} ({variable})")
        return _empty_ehyd_frame(variable)

    lines = response.text.splitlines()

    # The data rows start on the line after the "Werte:" header.
    data_start = next(
        (i + 1 for i, line in enumerate(lines) if line.strip().startswith("Werte")),
        None,
    )
    if data_start is None:
        print(f"No 'Werte:' section found for gauge {gauge_id}")
        return _empty_ehyd_frame(variable)

    rows = _parse_ehyd_rows(lines[data_start:])
    if not rows:
        print(f"No valid data lines found for gauge {gauge_id}")
        return _empty_ehyd_frame(variable)

    df = pd.DataFrame(rows, columns=["time", variable])
    df["time"] = pd.to_datetime(df["time"], errors="coerce", format="%d.%m.%Y %H:%M:%S")
    df[variable] = pd.to_numeric(df[variable], errors="coerce")

    # Drop rows without a usable timestamp so the result is the same whether
    # or not a date filter is applied (comparisons against NaT are False).
    df = df.dropna(subset=["time"])

    if start is not None:
        df = df[df["time"] >= start]
    if end is not None:
        df = df[df["time"] <= end]

    if df.empty:
        print(f"No data found for gauge {gauge_id} in selected time interval.")
        return _empty_ehyd_frame(variable)

    return df.sort_values("time").reset_index(drop=True)

Example usage:

# Take the tenth-from-last gauge ID from the site table, fetch its daily
# streamflow for the requested period, and show the first rows.
site_ids = austria_sites.gauge_id.tolist()
gauge_id = site_ids[-10]
df = get_ehyd_data(
    gauge_id,
    variable="streamflow",
    start_date="1900-01-01",
    end_date="2021-01-05",
)
print(df.head())

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions