Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dataretrieval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from dataretrieval.nadp import *
from dataretrieval.nwis import *
from dataretrieval.samples import *
from dataretrieval.streamstats import *
from dataretrieval.utils import *
from dataretrieval.waterwatch import *
Expand Down
351 changes: 351 additions & 0 deletions dataretrieval/samples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,351 @@
"""Functions for downloading data from the USGS Aquarius Samples database
(https://waterdata.usgs.gov/download-samples/).

See https://api.waterdata.usgs.gov/samples-data/docs#/ for API reference
"""

from __future__ import annotations

import json
from io import StringIO
from typing import TYPE_CHECKING, Literal, get_args

import pandas as pd
import requests
from requests.models import PreparedRequest

from dataretrieval.utils import BaseMetadata, to_str

if TYPE_CHECKING:
from typing import Optional, Tuple, Union

from pandas import DataFrame


# Root of the Samples web API; the code-service and service/profile endpoint
# URLs used by the functions in this module are built on top of it.
_BASE_URL = "https://api.waterdata.usgs.gov/samples-data"

# Names accepted by ``get_codes``. The order here is the order in which the
# options are listed in that function's "Invalid code service" error message.
_CODE_SERVICES = Literal[
    "characteristicgroup",
    "characteristics",
    "counties",
    "countries",
    "observedproperty",
    "samplemedia",
    "sitetype",
    "states",
]


# Top-level Samples services; each one forms the first path segment of a
# data request (``{_BASE_URL}/{service}/{profile}``).
_SERVICES = Literal["activities", "locations", "organizations", "projects", "results"]

# Every profile name across all services. Used as the annotation for the
# ``profile`` argument; which profile is valid for which service is decided
# by ``_PROFILE_LOOKUP`` below, not by this type.
_PROFILES = Literal[
    "actgroup",
    "actmetric",
    "basicbio",
    "basicphyschem",
    "count",
    "fullbio",
    "fullphyschem",
    "labsampleprep",
    "narrow",
    "organization",
    "project",
    "projectmonitoringlocationweight",
    "resultdetectionquantitationlimit",
    "sampact",
    "site",
]

# Maps each service to the profiles that may be requested from it.
# ``_check_profiles`` reads this table and prints the matching list verbatim
# in its "Invalid profile" error message.
_PROFILE_LOOKUP = {
    "activities": ["sampact", "actmetric", "actgroup", "count"],
    "locations": ["site", "count"],
    "organizations": ["organization", "count"],
    "projects": ["project", "projectmonitoringlocationweight"],
    "results": [
        "fullphyschem",
        "basicphyschem",
        "fullbio",
        "basicbio",
        "narrow",
        "resultdetectionquantitationlimit",
        "labsampleprep",
        "count",
    ],
}


def get_codes(code_service: _CODE_SERVICES) -> DataFrame:
    """Fetch the lookup table published by one Samples code service.

    Parameters
    ----------
    code_service : string
        One of the following options: "states", "counties", "countries"
        "sitetype", "samplemedia", "characteristicgroup", "characteristics",
        or "observedproperty"

    Returns
    -------
    df : ``pandas.DataFrame``
        Table built from the ``"data"`` entry of the JSON response.

    Raises
    ------
    ValueError
        If ``code_service`` is not one of the supported options.
    requests.HTTPError
        If the service answers with an error status code.
    """
    allowed = get_args(_CODE_SERVICES)
    if code_service not in allowed:
        message = (
            f"Invalid code service: '{code_service}'. "
            f"Valid options are: {allowed}."
        )
        raise ValueError(message)

    # The response format is requested through the (already URL-encoded)
    # mimeType query parameter.
    endpoint = f"{_BASE_URL}/codeservice/{code_service}?mimeType=application%2Fjson"
    reply = requests.get(endpoint)
    reply.raise_for_status()

    payload = json.loads(reply.text)
    return pd.DataFrame(payload["data"])

def get_usgs_samples(
    ssl_check: bool = True,
    service: _SERVICES = "results",
    profile: _PROFILES = "fullphyschem",
    activityMediaName: Optional[Union[str, list[str]]] = None,
    activityStartDateLower: Optional[str] = None,
    activityStartDateUpper: Optional[str] = None,
    activityTypeCode: Optional[Union[str, list[str]]] = None,
    characteristicGroup: Optional[Union[str, list[str]]] = None,
    characteristic: Optional[Union[str, list[str]]] = None,
    characteristicUserSupplied: Optional[Union[str, list[str]]] = None,
    boundingBox: Optional[list[float]] = None,
    countryFips: Optional[Union[str, list[str]]] = None,
    stateFips: Optional[Union[str, list[str]]] = None,
    countyFips: Optional[Union[str, list[str]]] = None,
    siteTypeCode: Optional[Union[str, list[str]]] = None,
    siteTypeName: Optional[Union[str, list[str]]] = None,
    usgsPCode: Optional[Union[str, list[str]]] = None,
    hydrologicUnit: Optional[Union[str, list[str]]] = None,
    monitoringLocationIdentifier: Optional[Union[str, list[str]]] = None,
    organizationIdentifier: Optional[Union[str, list[str]]] = None,
    pointLocationLatitude: Optional[float] = None,
    pointLocationLongitude: Optional[float] = None,
    pointLocationWithinMiles: Optional[float] = None,
    projectIdentifier: Optional[Union[str, list[str]]] = None,
    recordIdentifierUserSupplied: Optional[Union[str, list[str]]] = None,
) -> Tuple[DataFrame, BaseMetadata]:
    """Search Samples database for USGS water quality data.

    This is a wrapper function for the Samples database API. All potential
    filters are provided as arguments to the function, but please do not
    populate all possible filters; leave as many as feasible with their default
    value (None). This is important because overcomplicated web service queries
    can bog down the database's ability to return an applicable dataset before
    it times out.

    The web GUI for the Samples database can be found here:
    https://waterdata.usgs.gov/download-samples/#dataProfile=site

    If you would like more details on feasible query parameters (complete with
    examples), please visit the Samples database swagger docs, here:
    https://api.waterdata.usgs.gov/samples-data/docs#/

    Parameters
    ----------
    ssl_check : bool, optional
        Check the SSL certificate.
    service : string
        One of the available Samples services: "results", "locations",
        "activities", "projects", or "organizations". Defaults to "results".
    profile : string
        One of the available profiles associated with a service. Options for
        each service are:
        results - "fullphyschem", "basicphyschem",
                  "fullbio", "basicbio", "narrow",
                  "resultdetectionquantitationlimit",
                  "labsampleprep", "count"
        locations - "site", "count"
        activities - "sampact", "actmetric",
                     "actgroup", "count"
        projects - "project", "projectmonitoringlocationweight"
        organizations - "organization", "count"
    activityMediaName : string or list of strings, optional
        Name or code indicating environmental medium in which sample was taken.
        Check ``get_codes("samplemedia")`` in this module for all
        possible inputs.
        Example: "Water".
    activityStartDateLower : string, optional
        The start date if using a date range. Takes the format YYYY-MM-DD.
        The logic is inclusive, i.e. it will also return results that
        match the date. If left as None, will pull all data on or before
        activityStartDateUpper, if populated.
    activityStartDateUpper : string, optional
        The end date if using a date range. Takes the format YYYY-MM-DD.
        The logic is inclusive, i.e. it will also return results that
        match the date. If left as None, will pull all data after
        activityStartDateLower up to the most recent available results.
    activityTypeCode : string or list of strings, optional
        Text code that describes type of field activity performed.
        Example: "Sample-Routine, regular".
    characteristicGroup : string or list of strings, optional
        Characteristic group is a broad category of characteristics
        describing one or more results. Check
        ``get_codes("characteristicgroup")`` in this module for all
        possible inputs.
        Example: "Organics, PFAS"
    characteristic : string or list of strings, optional
        Characteristic is a specific category describing one or more results.
        Check ``get_codes("characteristics")`` in this module for all
        possible inputs.
        Example: "Suspended Sediment Discharge"
    characteristicUserSupplied : string or list of strings, optional
        A user supplied characteristic name describing one or more results.
    boundingBox : list of four floats, optional
        Filters on the associated monitoring location's point location
        by checking if it is located within the specified geographic area.
        The logic is inclusive, i.e. it will include locations that overlap
        with the edge of the bounding box. Values are expressed in decimal
        degrees, NAD83, and longitudes west of Greenwich are negative.
        The list consists of, in order:
        - Western-most longitude
        - Southern-most latitude
        - Eastern-most longitude
        - Northern-most latitude
        Example: [-92.8,44.2,-88.9,46.0]
    countryFips : string or list of strings, optional
        Check ``get_codes("countries")`` in this module for all
        possible inputs.
        Example: "US" (United States)
    stateFips : string or list of strings, optional
        Check ``get_codes("states")`` in this module for all
        possible inputs.
        Example: "US:15" (United States: Hawaii)
    countyFips : string or list of strings, optional
        Check ``get_codes("counties")`` in this module for all
        possible inputs.
        Example: "US:15:001" (United States: Hawaii, Hawaii County)
    siteTypeCode : string or list of strings, optional
        An abbreviation for a certain site type. Check
        ``get_codes("sitetype")`` in this module for all possible inputs.
        Example: "GW" (Groundwater site)
    siteTypeName : string or list of strings, optional
        A full name for a certain site type. Check
        ``get_codes("sitetype")`` in this module for all possible inputs.
        Example: "Well"
    usgsPCode : string or list of strings, optional
        5-digit number used in the US Geological Survey computerized
        data system, National Water Information System (NWIS), to
        uniquely identify a specific constituent. Check
        ``get_codes("characteristics")`` in this module for all possible
        inputs.
        Example: "00060" (Discharge, cubic feet per second)
    hydrologicUnit : string or list of strings, optional
        Max 12-digit number used to describe a hydrologic unit.
        Example: "070900020502"
    monitoringLocationIdentifier : string or list of strings, optional
        A monitoring location identifier has two parts: the agency code
        and the location number, separated by a dash (-).
        Example: "USGS-040851385"
    organizationIdentifier : string or list of strings, optional
        Designator used to uniquely identify a specific organization.
        Currently only accepting the organization "USGS".
    pointLocationLatitude : float, optional
        Latitude for a point/radius query (decimal degrees). Must be used
        with pointLocationLongitude and pointLocationWithinMiles.
    pointLocationLongitude : float, optional
        Longitude for a point/radius query (decimal degrees). Must be used
        with pointLocationLatitude and pointLocationWithinMiles.
    pointLocationWithinMiles : float, optional
        Radius for a point/radius query. Must be used with
        pointLocationLatitude and pointLocationLongitude.
    projectIdentifier : string or list of strings, optional
        Designator used to uniquely identify a data collection project. Project
        identifiers are specific to an organization (e.g. USGS).
        Example: "ZH003QW03"
    recordIdentifierUserSupplied : string or list of strings, optional
        Internal AQS record identifier that returns 1 entry. Only available
        for the "results" service.

    Returns
    -------
    df : ``pandas.DataFrame``
        Formatted data returned from the API query.
    md : :obj:`dataretrieval.utils.BaseMetadata`
        Custom ``dataretrieval`` metadata object pertaining to the query.

    Raises
    ------
    ValueError
        If ``service`` is not a known service, or ``profile`` is not valid
        for the chosen service.
    requests.HTTPError
        If the API answers with an error status code.

    Examples
    --------
    .. code::

        >>> # Get PFAS results within a bounding box
        >>> df, md = dataretrieval.samples.get_usgs_samples(
        ...     boundingBox=[-90.2,42.6,-88.7,43.2],
        ...     characteristicGroup="Organics, PFAS"
        ... )

        >>> # Get all activities for the Commonwealth of Virginia over a date range
        >>> df, md = dataretrieval.samples.get_usgs_samples(
        ...     service="activities",
        ...     profile="sampact",
        ...     activityStartDateLower="2023-10-01",
        ...     activityStartDateUpper="2024-01-01",
        ...     stateFips="US:51")

        >>> # Get all pH samples for two sites in Utah
        >>> df, md = dataretrieval.samples.get_usgs_samples(
        ...     monitoringLocationIdentifier=[
        ...         'USGS-393147111462301',
        ...         'USGS-393343111454101',
        ...     ],
        ...     usgsPCode='00400')

    """
    _check_profiles(service, profile)

    # Spell the API filters out explicitly instead of harvesting ``locals()``:
    # the query can then never pick up an unrelated local variable added to
    # this function later. Keys are listed in signature order so the generated
    # query string keeps the same parameter order.
    filters = {
        "activityMediaName": activityMediaName,
        "activityStartDateLower": activityStartDateLower,
        "activityStartDateUpper": activityStartDateUpper,
        "activityTypeCode": activityTypeCode,
        "characteristicGroup": characteristicGroup,
        "characteristic": characteristic,
        "characteristicUserSupplied": characteristicUserSupplied,
        "boundingBox": boundingBox,
        "countryFips": countryFips,
        "stateFips": stateFips,
        "countyFips": countyFips,
        "siteTypeCode": siteTypeCode,
        "siteTypeName": siteTypeName,
        "usgsPCode": usgsPCode,
        "hydrologicUnit": hydrologicUnit,
        "monitoringLocationIdentifier": monitoringLocationIdentifier,
        "organizationIdentifier": organizationIdentifier,
        "pointLocationLatitude": pointLocationLatitude,
        "pointLocationLongitude": pointLocationLongitude,
        "pointLocationWithinMiles": pointLocationWithinMiles,
        "projectIdentifier": projectIdentifier,
        "recordIdentifierUserSupplied": recordIdentifierUserSupplied,
    }

    # Only filters the caller actually set are sent to the API.
    params = {name: value for name, value in filters.items() if value is not None}

    if "boundingBox" in params:
        # Serialised with the shared helper before sending (presumably into a
        # single comma-separated string - see dataretrieval.utils.to_str).
        params["boundingBox"] = to_str(params["boundingBox"])

    # The response is parsed with ``pd.read_csv`` below, so always ask for CSV.
    params["mimeType"] = "text/csv"

    url = f"{_BASE_URL}/{service}/{profile}"

    # Prepare the request once up front purely to show the fully encoded URL
    # before the (potentially slow) query is sent.
    req = PreparedRequest()
    req.prepare_url(url, params=params)
    print(f"Request: {req.url}")

    response = requests.get(url, params=params, verify=ssl_check)

    response.raise_for_status()

    df = pd.read_csv(StringIO(response.text), delimiter=",")

    return df, BaseMetadata(response)

def _check_profiles(
    service: _SERVICES,
    profile: _PROFILES,
) -> None:
    """Validate a service name and the profile requested from it.

    Parameters
    ----------
    service : string
        Name of a Samples service; must be one of the options listed in
        ``_SERVICES``.
    profile : string
        Name of a profile; must be one of the entries ``_PROFILE_LOOKUP``
        holds for ``service``.

    Raises
    ------
    ValueError
        If ``service`` is unknown, or if ``profile`` is not offered by
        ``service``.
    """
    known_services = get_args(_SERVICES)
    if service in known_services:
        # The profile table is only consulted once the service is known good.
        allowed_profiles = _PROFILE_LOOKUP[service]
        if profile in allowed_profiles:
            return
        problem = (
            f"Invalid profile: '{profile}' for service '{service}'. "
            f"Valid options are: {allowed_profiles}."
        )
    else:
        problem = (
            f"Invalid service: '{service}'. "
            f"Valid options are: {known_services}."
        )

    raise ValueError(problem)

1 change: 1 addition & 0 deletions docs/source/reference/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ API reference

nadp
nwis
samples
streamstats
utils
wqp
8 changes: 8 additions & 0 deletions docs/source/reference/samples.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
.. _samples:

dataretrieval.samples
-------------------------

.. automodule:: dataretrieval.samples
:members:
:special-members:
2 changes: 2 additions & 0 deletions docs/source/userguide/dataportals.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ provided below.
+-----------------------------------+---------------------------------------------------------------+
| Mercury Deposition Network | https://nadp.slh.wisc.edu/networks/mercury-deposition-network |
+-----------------------------------+---------------------------------------------------------------+
| USGS Samples | https://waterdata.usgs.gov/download-samples/ |
+-----------------------------------+---------------------------------------------------------------+
| Streamstats | https://streamstats.usgs.gov |
+-----------------------------------+---------------------------------------------------------------+
| Water Quality Portal | https://waterqualitydata.us |
Expand Down
Loading