Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions dataretrieval/waterdata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
get_latest_continuous,
get_latest_daily,
get_monitoring_locations,
get_reference_table,
get_samples,
get_time_series_metadata,
)
Expand All @@ -37,6 +38,7 @@
"get_latest_continuous",
"get_latest_daily",
"get_monitoring_locations",
"get_reference_table",
"get_samples",
"get_time_series_metadata",
"_check_profiles",
Expand Down
64 changes: 63 additions & 1 deletion dataretrieval/waterdata/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,17 @@
from dataretrieval.utils import BaseMetadata, to_str
from dataretrieval.waterdata.types import (
CODE_SERVICES,
METADATA_COLLECTIONS,
PROFILE_LOOKUP,
PROFILES,
SERVICES,
)
from dataretrieval.waterdata.utils import SAMPLES_URL, get_ogc_data
from dataretrieval.waterdata.utils import (
SAMPLES_URL,
get_ogc_data,
_construct_api_requests,
_walk_pages
)

# Set up logger for this module
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -1388,6 +1394,62 @@ def get_field_measurements(

return get_ogc_data(args, output_id, service)

def get_reference_table(
    collection: str,
    limit: Optional[int] = None,
) -> Tuple[pd.DataFrame, BaseMetadata]:
    """Get metadata reference tables for the USGS Water Data API.

    Reference tables provide the range of allowable values for parameter
    arguments in the waterdata module.

    Parameters
    ----------
    collection : string
        One of the following options: "agency-codes", "altitude-datums",
        "aquifer-codes", "aquifer-types", "coordinate-accuracy-codes",
        "coordinate-datum-codes", "coordinate-method-codes", "counties",
        "hydrologic-unit-codes", "medium-codes", "national-aquifer-codes",
        "parameter-codes", "reliability-codes", "site-types", "states",
        "statistic-codes", "topographic-codes", "time-zone-codes"
    limit : numeric, optional
        The optional limit parameter is used to control the subset of the
        selected features that should be returned in each page. The maximum
        allowable limit is 50000. It may be beneficial to set this number lower
        if your internet connection is spotty. The default (None) will set the
        limit to the maximum allowable limit for the service.

    Returns
    -------
    df : ``pandas.DataFrame``
        The requested reference table. The service's generic "id" column is
        renamed to a collection-specific name (e.g. "agency_code_id").
    md : :obj:`dataretrieval.utils.BaseMetadata`
        Custom metadata object built from the API response.

    Raises
    ------
    ValueError
        If ``collection`` is not one of the supported reference tables.
    """
    valid_collections = get_args(METADATA_COLLECTIONS)
    if collection not in valid_collections:
        raise ValueError(
            f"Invalid collection: '{collection}'. "
            f"Valid options are: {valid_collections}."
        )

    req = _construct_api_requests(
        service=collection,
        limit=limit,
        skip_geometry=True,
    )
    # Run API request and iterate through pages if needed
    return_list, response = _walk_pages(
        geopd=False, req=req
    )

    # Give the generic "id" column a more meaningful, singular name:
    # "agency-codes" -> "agency_code_id", "counties" -> "county_id".
    singular = collection.replace("-", "_")
    if singular.endswith("ies"):
        # Handle "-ies" plurals correctly (a bare [:-1] would produce
        # the misspelled "countie_id" for the "counties" collection).
        singular = singular[:-3] + "y"
    elif singular.endswith("s"):
        singular = singular[:-1]
    return_list = return_list.rename(columns={"id": f"{singular}_id"})

    # Create metadata object from response
    metadata = BaseMetadata(response)
    return return_list, metadata


def get_codes(code_service: CODE_SERVICES) -> pd.DataFrame:
"""Return codes from a Samples code service.
Expand Down
21 changes: 21 additions & 0 deletions dataretrieval/waterdata/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,27 @@
"states",
]

# Reference-table collections exposed by the USGS Water Data API.
# Used to validate the ``collection`` argument of get_reference_table().
# Kept in strict alphabetical order for readability and easy diffing.
METADATA_COLLECTIONS = Literal[
    "agency-codes",
    "altitude-datums",
    "aquifer-codes",
    "aquifer-types",
    "coordinate-accuracy-codes",
    "coordinate-datum-codes",
    "coordinate-method-codes",
    "counties",
    "hydrologic-unit-codes",
    "medium-codes",
    "national-aquifer-codes",
    "parameter-codes",
    "reliability-codes",
    "site-types",
    "states",
    "statistic-codes",
    "time-zone-codes",
    "topographic-codes",
]

SERVICES = Literal[
"activities",
"locations",
Expand Down
43 changes: 23 additions & 20 deletions dataretrieval/waterdata/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,7 +547,7 @@ def _walk_pages(
logger.info("Requesting: %s", req.url)

if not geopd:
logger.warning(
logger.info(
"Geopandas not installed. Geometries will be flattened into pandas DataFrames."
)

Expand Down Expand Up @@ -648,35 +648,38 @@ def _arrange_cols(
pd.DataFrame or gpd.GeoDataFrame
The DataFrame with columns rearranged and/or renamed according to the specified properties and output_id.
"""

# Rename id column to output_id
df = df.rename(columns={"id": output_id})

# If properties are provided, filter to only those columns
# plus geometry if skip_geometry is False
if properties and not all(pd.isna(properties)):
if "id" not in properties:
# If user refers to service-specific output id in properties,
# then rename the "id" column to the output_id (id column is
# automatically included).
if output_id in properties:
df = df.rename(columns={"id": output_id})
# If output id is not in properties, but user requests the plural
# of the output_id (e.g. "monitoring_locations_id"), then rename
# "id" to plural. This is pretty niche.
else:
plural = output_id.replace("_id", "s_id")
if plural in properties:
df = df.rename(columns={"id": plural})
# Make sure geometry stays in the dataframe if skip_geometry is False
if 'geometry' in df.columns and 'geometry' not in properties:
properties.append('geometry')
# id is technically a valid column from the service, but these
# functions make the name more specific. So, if someone requests
# 'id', give them the output_id column
if 'id' in properties:
properties[properties.index('id')] = output_id
df = df.loc[:, [col for col in properties if col in df.columns]]
else:
df = df.rename(columns={"id": output_id})


# Move meaningless-to-user, extra id columns to the end
# of the dataframe, if they exist
extra_id_cols = set(df.columns).intersection({
extra_id_col = set(df.columns).intersection({
"latest_continuous_id",
"latest_daily_id",
"daily_id",
"continuous_id",
"field_measurement_id"
})
if extra_id_cols:
id_col_order = [col for col in df.columns if col not in extra_id_cols] + list(extra_id_cols)

# If the arbitrary id column is returned (either due to properties
# being none or NaN), then move it to the end of the dataframe, but
# if part of properties, keep in requested order
if extra_id_col and (properties is None or all(pd.isna(properties))):
id_col_order = [col for col in df.columns if col not in extra_id_col] + list(extra_id_col)
df = df.loc[:, id_col_order]

return df
Expand Down
35 changes: 32 additions & 3 deletions tests/waterdata_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
get_latest_daily,
get_field_measurements,
get_time_series_metadata,
get_reference_table
)

def mock_request(requests_mock, request_url, file_path):
Expand Down Expand Up @@ -139,11 +140,20 @@ def test_get_daily_properties():
time="2025-01-01/..",
properties=["daily_id", "monitoring_location_id", "parameter_code", "time", "value", "geometry"]
)
assert "daily_id" in df.columns
assert "geometry" in df.columns
assert "daily_id" == df.columns[0]
assert "geometry" == df.columns[-1]
assert df.shape[1] == 6
assert df.parameter_code.unique().tolist() == ["00060"]

def test_get_daily_properties_id():
    """Requesting the generic 'id' property yields the renamed daily_id column."""
    requested_props = [
        "monitoring_location_id",
        "id",
        "parameter_code",
        "time",
        "value",
        "geometry",
    ]
    df, _ = get_daily(
        monitoring_location_id="USGS-05427718",
        parameter_code="00060",
        time="2025-01-01/..",
        properties=requested_props,
    )
    assert df.columns[1] == "daily_id"

def test_get_daily_no_geometry():
df,_ = get_daily(
monitoring_location_id="USGS-05427718",
Expand Down Expand Up @@ -187,7 +197,7 @@ def test_get_latest_continuous():
monitoring_location_id=["USGS-05427718", "USGS-05427719"],
parameter_code=["00060", "00065"]
)
assert "latest_continuous_id" in df.columns
assert "latest_continuous_id" == df.columns[-1]
assert df.shape[0] <= 4
assert df.statistic_id.unique().tolist() == ["00011"]
assert hasattr(md, 'url')
Expand All @@ -204,6 +214,15 @@ def test_get_latest_daily():
assert hasattr(md, 'url')
assert hasattr(md, 'query_time')

def test_get_latest_daily_properties_geometry():
    """Geometry column is kept even when omitted from requested properties."""
    sites = ["USGS-05427718", "USGS-05427719"]
    pcodes = ["00060", "00065"]
    df, md = get_latest_daily(
        monitoring_location_id=sites,
        parameter_code=pcodes,
        properties=['monitoring_location_id', 'parameter_code', 'time', 'value', 'unit_of_measure'],
    )
    assert "geometry" in df.columns
    assert df.shape[1] == 6

def test_get_field_measurements():
df, md = get_field_measurements(
monitoring_location_id="USGS-05427718",
Expand All @@ -227,4 +246,14 @@ def test_get_time_series_metadata():
assert hasattr(md, 'url')
assert hasattr(md, 'query_time')

def test_get_reference_table():
    """agency-codes reference table returns rows plus request metadata."""
    df, md = get_reference_table("agency-codes")
    assert not df.empty
    assert "agency_code_id" in df.columns
    for attr in ("url", "query_time"):
        assert hasattr(md, attr)

def test_get_reference_table_wrong_name():
    """A misspelled collection name is rejected with ValueError."""
    bad_collection = "agency-cod"
    with pytest.raises(ValueError):
        get_reference_table(bad_collection)