Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 50 additions & 2 deletions dataretrieval/waterdata/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -662,9 +662,24 @@ def _arrange_cols(
plural = output_id.replace("_id", "s_id")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this line confuses me, but the code appears to work as it should

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can revisit, because this might not be necessary. I'll make an issue. Basically, every service returns a straight up "id" column with the data, which is actually different across services. So the package adds the service name to the beginning of the "id" column, e.g. "monitoring_location_id", "daily_id", etc. This part of the function accounts for whether someone enters just "id" into their properties argument, or enters "monitoring_locationS_id" (maybe they notice the pattern that it's service + id, and the sites service is called "monitoring-locationS"). If they enter "id", then the resulting dataframe will have the "monitoring_location_id" column name. But if they enter "monitoring_locations_id" (straight up service name, "monitoring locations", plus "id"), then it will return the column name "monitoring_locations_id". I kinda doubt this will be leveraged at all, and it adds confusion.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if plural in properties:
df = df.rename(columns={"id": plural})
return df.loc[:, [col for col in properties if col in df.columns]]
df = df.loc[:, [col for col in properties if col in df.columns]]
else:
return df.rename(columns={"id": output_id})
df = df.rename(columns={"id": output_id})

# Move meaningless-to-user, extra id columns to the end
# of the dataframe, if they exist
extra_id_cols = set(df.columns).intersection({
"latest_continuous_id",
"latest_daily_id",
"daily_id",
"continuous_id",
"field_measurement_id"
})
if extra_id_cols:
id_col_order = [col for col in df.columns if col not in extra_id_cols] + list(extra_id_cols)
df = df.loc[:, id_col_order]

return df


def _type_cols(df: pd.DataFrame) -> pd.DataFrame:
Expand Down Expand Up @@ -712,6 +727,36 @@ def _type_cols(df: pd.DataFrame) -> pd.DataFrame:
return df


def _sort_rows(df: pd.DataFrame) -> pd.DataFrame:
"""
Sorts rows by 'time' and 'monitoring_location_id' columns if they
exist.

Parameters
----------
df : pd.DataFrame
The input DataFrame containing water data.

Returns
-------
pd.DataFrame
The DataFrame with rows ordered by time and site.

"""
if "time" in df.columns and "monitoring_location_id" in df.columns:
df = df.sort_values(
by=["time", "monitoring_location_id"],
ignore_index=True
)
elif "time" in df.columns:
df = df.sort_values(
by="time",
ignore_index=True
)

return df


def get_ogc_data(
args: Dict[str, Any], output_id: str, service: str
) -> Tuple[pd.DataFrame, BaseMetadata]:
Expand Down Expand Up @@ -769,7 +814,10 @@ def get_ogc_data(
return_list = _type_cols(return_list)

return_list = _arrange_cols(return_list, properties, output_id)

return_list = _sort_rows(return_list)
# Create metadata object from response
metadata = BaseMetadata(response)
return return_list, metadata


4 changes: 3 additions & 1 deletion tests/waterdata_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,16 +122,18 @@ def test_get_daily():
)
assert "daily_id" in df.columns
assert "geometry" in df.columns
assert df.columns[-1] == "daily_id"
assert df.shape[1] == 12
assert df.parameter_code.unique().tolist() == ["00060"]
assert df.monitoring_location_id.unique().tolist() == ["USGS-05427718"]
assert df["time"].apply(lambda x: isinstance(x, datetime.date)).all()
assert df["time"].iloc[0] < df["time"].iloc[-1]
assert hasattr(md, 'url')
assert hasattr(md, 'query_time')
assert df["value"].dtype == "float64"

def test_get_daily_properties():
df, md = get_daily(
df,_ = get_daily(
monitoring_location_id="USGS-05427718",
parameter_code="00060",
time="2025-01-01/..",
Expand Down