Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions PR217.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Recreate PR #216: Remove Defunct NWIS Functions

This PR resubmits the changes from [PR #216](https://github.com/DOI-USGS/dataretrieval-python/pull/216) which removes four defunct NWIS legacy functions (`get_gwlevels`, `get_discharge_measurements`, `get_pmcodes`, and `get_water_use`) by replacing them with `NameError` exceptions that point users to the modernized `waterdata` equivalents.

The original PR #216 was superseded by linting changes (PR #219) on the `main` branch before being merged, which caused conflicts. This PR correctly recreates the exact logical changes from PR #216 directly on top of the newly linted `main` branch, ensuring we maintain `ruff` compliance while formally deprecating the defunct functions as planned.

All related tests and defunct data files have been removed, and the README has been updated to reflect the new API announcements identically to PR #216.

### Notebook Modernization

In addition to the core API changes, a comprehensive review and modernization of **16 demo notebooks** (including all legacy `hydroshare` examples) was performed:

- **API Migration**: Legacy `nwis.get_dv()`, `nwis.get_iv()`, and `nwis.get_info()` calls were upgraded to their modern `waterdata` equivalents.
- **Defunct Removal**: Defunct functions (`get_water_use`, `get_pmcodes`) were commented out or replaced with modern alternatives (`get_reference_table`) across all demos (e.g., `R Python Vignette`, `WaterUse` suite).
- **Execution Validation**: All notebooks were successfully re-executed using `jupyter nbconvert` in the local `.venv` environment to ensure they remain functional and generate correct plots with the new OGC long-format data schema.
- **Clean State**: Each notebook was processed with `nb-clean` to strip execution outputs and counts, ensuring a clean version-controlled state.
- **Dependencies**: `scipy` and `mapclassify` were added to the environment to support advanced plotting and analytical features now required by the modernized examples.

### Performance & Maintenance Optimizations

A series of internal architectural improvements were implemented to enhance scalability and maintainability:

- **Efficient Pagination**: Refactored `_walk_pages` in `waterdata/utils.py` to use list-based aggregation, reducing memory copying overhead from $O(n^2)$ to $O(n)$.
- **Centralized Parameter Handling**: Introduced a shared `_get_args` helper and refactored all 11 API functions in `waterdata/api.py` to use it, eliminating over 100 lines of redundant dictionary comprehension logic.
- **Utility Optimization**: Enhanced `to_str` in `utils.py` with `map(str, ...)` and broader iterable support (sets, tuples, generators), verified with new comprehensive unit tests.
- **Improved Testing**: Added [waterdata_utils_test.py](file:///Users/thodson/Desktop/dev/software/dataretrieval-python/tests/waterdata_utils_test.py) and expanded `tests/utils_test.py` to ensure long-term stability of the new utility logic.
123 changes: 54 additions & 69 deletions dataretrieval/nwis.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import warnings
from io import StringIO
from json import JSONDecodeError

import pandas as pd
import requests
Expand Down Expand Up @@ -481,7 +482,20 @@ def get_dv(
kwargs["multi_index"] = multi_index

response = query_waterservices("dv", format="json", ssl_check=ssl_check, **kwargs)
df = _read_json(response.json())
try:
df = _read_json(response.json())
except (ValueError, JSONDecodeError) as e:
if (
"<html>" in response.text.lower()
or "<!doctype" in response.text.lower()
or "text/html" in response.headers.get("Content-Type", "").lower()
):
raise ValueError(
f"Received HTML response instead of JSON from {response.url} "
f"(Status: {response.status_code}). This often indicates "
"that the service is currently unavailable."
) from e
raise

return format_response(df, **kwargs), NWIS_Metadata(response, **kwargs)

Expand Down Expand Up @@ -667,7 +681,20 @@ def get_iv(
service="iv", format="json", ssl_check=ssl_check, **kwargs
)

df = _read_json(response.json())
try:
df = _read_json(response.json())
except (ValueError, JSONDecodeError) as e:
if (
"<html>" in response.text.lower()
or "<!doctype" in response.text.lower()
or "text/html" in response.headers.get("Content-Type", "").lower()
):
raise ValueError(
f"Received HTML response instead of JSON from {response.url} "
f"(Status: {response.status_code}). This often indicates "
"that the service is currently unavailable."
) from e
raise
return format_response(df, **kwargs), NWIS_Metadata(response, **kwargs)


Expand Down Expand Up @@ -840,11 +867,11 @@ def get_record(
- 'iv' : instantaneous data
- 'dv' : daily mean data
- 'site' : site description
- 'measurements' : discharge measurements
- 'measurements' : (defunct) use `waterdata.get_field_measurements`
- 'peaks': discharge peaks
- 'gwlevels': groundwater levels
- 'pmcodes': get parameter codes
- 'water_use': get water use data
- 'gwlevels': (defunct) use `waterdata.get_field_measurements`
- 'pmcodes': (defunct) use `get_reference_table`
- 'water_use': (defunct) no replacement available
- 'ratings': get rating table
- 'stat': get statistics
ssl_check: bool, optional
Expand All @@ -870,29 +897,10 @@ def get_record(
>>> # Get site description for site 01585200
>>> df = dataretrieval.nwis.get_record(sites="01585200", service="site")

>>> # Get discharge measurements for site 01585200
>>> df = dataretrieval.nwis.get_record(
... sites="01585200", service="measurements"
... )

>>> # Get discharge peaks for site 01585200
>>> df = dataretrieval.nwis.get_record(sites="01585200", service="peaks")

>>> # Get latest groundwater level for site 434400121275801
>>> df = dataretrieval.nwis.get_record(
... sites="434400121275801", service="gwlevels"
... )

>>> # Get information about the discharge parameter code
>>> df = dataretrieval.nwis.get_record(
... service="pmcodes", parameterCd="00060"
... )

>>> # Get water use data for livestock nationally in 2010
>>> df = dataretrieval.nwis.get_record(
... service="water_use", years="2010", categories="L"
... )

>>> # Get rating table for USGS streamgage 01585200
>>> df = dataretrieval.nwis.get_record(sites="01585200", service="ratings")

Expand All @@ -907,7 +915,13 @@ def get_record(
"""
_check_sites_value_types(sites)

if service not in WATERSERVICES_SERVICES + WATERDATA_SERVICES:
defunct_services = ["measurements", "gwlevels", "pmcodes", "water_use"]
if service in defunct_services:
raise NameError(
f"The NWIS service '{service}' is no longer supported by get_record."
)

if service not in WATERSERVICES_SERVICES + WATERDATA_SERVICES + defunct_services:
raise TypeError(f"Unrecognized service: {service}")

if service == "iv":
Expand Down Expand Up @@ -936,43 +950,6 @@ def get_record(
df, _ = get_info(sites=sites, ssl_check=ssl_check, **kwargs)
return df

elif service == "measurements":
df, _ = get_discharge_measurements(
site_no=sites, begin_date=start, end_date=end, ssl_check=ssl_check, **kwargs
)
return df

elif service == "peaks":
df, _ = get_discharge_peaks(
site_no=sites,
begin_date=start,
end_date=end,
multi_index=multi_index,
ssl_check=ssl_check,
**kwargs,
)
return df

elif service == "gwlevels":
df, _ = get_gwlevels(
sites=sites,
startDT=start,
endDT=end,
multi_index=multi_index,
datetime_index=datetime_index,
ssl_check=ssl_check,
**kwargs,
)
return df

elif service == "pmcodes":
df, _ = get_pmcodes(ssl_check=ssl_check, **kwargs)
return df

elif service == "water_use":
df, _ = get_water_use(state=state, ssl_check=ssl_check, **kwargs)
return df

elif service == "ratings":
df, _ = get_ratings(site=sites, ssl_check=ssl_check, **kwargs)
return df
Expand Down Expand Up @@ -1167,8 +1144,8 @@ class NWIS_Metadata(BaseMetadata):
Site information if the query included `site_no`, `sites`, `stateCd`,
`huc`, `countyCd` or `bBox`. `site_no` is preferred over `sites` if
both are present.
variable_info: tuple[pd.DataFrame, NWIS_Metadata] | None
Variable information if the query included `parameterCd`.
variable_info: None
Deprecated. Accessing variable_info via NWIS_Metadata is deprecated.

"""

Expand Down Expand Up @@ -1232,7 +1209,15 @@ def site_info(self) -> tuple[pd.DataFrame, BaseMetadata] | None:
return None # don't set metadata site_info attribute

@property
def variable_info(self) -> tuple[pd.DataFrame, BaseMetadata] | None:
# define variable_info metadata based on parameterCd if available
if "parameterCd" in self._parameters:
return get_pmcodes(parameterCd=self._parameters["parameterCd"])
def variable_info(self) -> None:
"""
Deprecated. Accessing variable_info via NWIS_Metadata is deprecated.
Returns None.
"""
warnings.warn(
"Accessing variable_info via NWIS_Metadata is deprecated as "
"it relies on the defunct get_pmcodes function.",
DeprecationWarning,
stacklevel=2,
)
return None
17 changes: 11 additions & 6 deletions dataretrieval/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

import warnings
from collections.abc import Iterable

import pandas as pd
import requests
Expand Down Expand Up @@ -39,14 +40,13 @@ def to_str(listlike, delimiter=","):
'0+10+42'

"""
if isinstance(listlike, list):
return delimiter.join([str(x) for x in listlike])
if isinstance(listlike, str):
return listlike

elif isinstance(listlike, (pd.core.series.Series, pd.core.indexes.base.Index)):
return delimiter.join(listlike.tolist())
if isinstance(listlike, Iterable):
return delimiter.join(map(str, listlike))

elif isinstance(listlike, str):
return listlike
return None


def format_datetime(df, date_field, time_field, tz_field):
Expand Down Expand Up @@ -212,6 +212,11 @@ def query(url, payload, delimiter=",", ssl_check=True):
+ f"API response reason: {_reason}. Pseudo-code example of how to "
+ f"split your query: \n {_example}"
)
elif response.status_code in [500, 502, 503]:
raise ValueError(
f"Service Unavailable: {response.status_code} {response.reason}. "
+ f"The service at {response.url} may be down or experiencing issues."
)

if response.text.startswith("No sites/data"):
raise NoSitesError(response.url)
Expand Down
64 changes: 14 additions & 50 deletions dataretrieval/waterdata/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
SAMPLES_URL,
_check_profiles,
_default_headers,
_get_args,
get_ogc_data,
get_stats_data,
)
Expand Down Expand Up @@ -208,11 +209,7 @@ def get_daily(
output_id = "daily_id"

# Build argument dictionary, omitting None values
args = {
k: v
for k, v in locals().items()
if k not in {"service", "output_id"} and v is not None
}
args = _get_args(locals())

return get_ogc_data(args, output_id, service)

Expand Down Expand Up @@ -378,11 +375,7 @@ def get_continuous(
output_id = "continuous_id"

# Build argument dictionary, omitting None values
args = {
k: v
for k, v in locals().items()
if k not in {"service", "output_id"} and v is not None
}
args = _get_args(locals())

return get_ogc_data(args, output_id, service)

Expand Down Expand Up @@ -673,11 +666,7 @@ def get_monitoring_locations(
output_id = "monitoring_location_id"

# Build argument dictionary, omitting None values
args = {
k: v
for k, v in locals().items()
if k not in {"service", "output_id"} and v is not None
}
args = _get_args(locals())

return get_ogc_data(args, output_id, service)

Expand Down Expand Up @@ -893,11 +882,7 @@ def get_time_series_metadata(
output_id = "time_series_id"

# Build argument dictionary, omitting None values
args = {
k: v
for k, v in locals().items()
if k not in {"service", "output_id"} and v is not None
}
args = _get_args(locals())

return get_ogc_data(args, output_id, service)

Expand Down Expand Up @@ -1069,11 +1054,7 @@ def get_latest_continuous(
output_id = "latest_continuous_id"

# Build argument dictionary, omitting None values
args = {
k: v
for k, v in locals().items()
if k not in {"service", "output_id"} and v is not None
}
args = _get_args(locals())

return get_ogc_data(args, output_id, service)

Expand Down Expand Up @@ -1247,11 +1228,7 @@ def get_latest_daily(
output_id = "latest_daily_id"

# Build argument dictionary, omitting None values
args = {
k: v
for k, v in locals().items()
if k not in {"service", "output_id"} and v is not None
}
args = _get_args(locals())

return get_ogc_data(args, output_id, service)

Expand Down Expand Up @@ -1424,11 +1401,7 @@ def get_field_measurements(
output_id = "field_measurement_id"

# Build argument dictionary, omitting None values
args = {
k: v
for k, v in locals().items()
if k not in {"service", "output_id"} and v is not None
}
args = _get_args(locals())

return get_ogc_data(args, output_id, service)

Expand Down Expand Up @@ -1735,11 +1708,8 @@ def get_samples(

_check_profiles(service, profile)

params = {
k: v
for k, v in locals().items()
if k not in ["ssl_check", "service", "profile"] and v is not None
}
# Build argument dictionary, omitting None values
params = _get_args(locals(), exclude={"ssl_check", "profile"})

params.update({"mimeType": "text/csv"})

Expand Down Expand Up @@ -1879,11 +1849,8 @@ def get_stats_por(
... end_date="01-31",
... )
"""
params = {
k: v
for k, v in locals().items()
if k not in ["expand_percentiles"] and v is not None
}
# Build argument dictionary, omitting None values
params = _get_args(locals(), exclude={"expand_percentiles"})

return get_stats_data(
args=params, service="observationNormals", expand_percentiles=expand_percentiles
Expand Down Expand Up @@ -2011,11 +1978,8 @@ def get_stats_date_range(
... computation_type=["minimum", "maximum"],
... )
"""
params = {
k: v
for k, v in locals().items()
if k not in ["expand_percentiles"] and v is not None
}
# Build argument dictionary, omitting None values
params = _get_args(locals(), exclude={"expand_percentiles"})

return get_stats_data(
args=params,
Expand Down
Loading
Loading