api_client

`umwelt_apy.api_client`

`fetch_by_ids(ids, search_url=SEARCH_URL, output='Json', **kwargs)`

Fetch multiple specific datasets by their unique identifiers.

Retrieves complete metadata for a list of dataset IDs. If a dataset is not found, logs an error and continues with the next ID.

Parameters:

Name	Type	Description	Default
`ids`	`list`	List of unique dataset identifiers (strings) to fetch.	required
`search_url`	`str`	Address of the server. Defaults to "https://md.umwelt.info".	`SEARCH_URL`
`output`	`str`	Format of the returned data. One of "Json", "Pandas", "Polars". Defaults to "Json".	`'Json'`
`**kwargs`	`Any`	Additional parameters reserved for future extensions.	`{}`

Returns:

Type	Description
`Union[Generator[dict, None, None], DataFrame, DataFrame]`	Generator[dict, None, None] \| pandas.DataFrame \| polars.DataFrame: Depending on the output parameter: - "Json": Generator yielding dataset dicts one by one. - "Pandas": pandas DataFrame containing all datasets. - "Polars": polars DataFrame containing all datasets. Skips datasets that return HTTP errors (e.g., not found).

Raises:

Type	Description
`ValueError`	If output is not one of the allowed formats.
`ImportError`	If pandas or polars is not installed.

Examples:

>>> ids = [
...     "manual/camels_de",
...     "manual/hochwasserzentralen"
... ]
>>> datasets = fetch_by_ids(ids)
>>> for dataset in datasets:
...     print(dataset["title"])

Note

Failed fetches (HTTPError) are logged but do not stop the iteration. Check logs for any missing datasets.

Source code in src/umwelt_apy/api_client.py

def fetch_by_ids(
    ids: list, search_url=SEARCH_URL, output="Json", **kwargs: Any
) -> Union[Generator[dict, None, None], pandas.DataFrame, polars.DataFrame]:
    """
    Fetch the datasets named by ``ids`` and return them in the requested format.

    The IDs are handed to ``_fetch_list_of_datasets`` together with
    ``search_url``. The resulting stream is either returned untouched
    ("Json") or converted with ``to_polars`` / ``to_pandas``.

    Args:
        ids (list): Dataset identifiers to fetch.
        search_url (str, optional): Address of the server. Defaults to ``SEARCH_URL``.
        output (str, optional): Desired result format, matched after
            ``str.capitalize()``: "Json", "Pandas" or "Polars". Defaults to "Json".
        **kwargs (Any): Forwarded to ``to_polars`` / ``to_pandas`` for the
            DataFrame formats. Not used when ``output`` is "Json".

    Returns:
        Generator[dict, None, None] | pandas.DataFrame | polars.DataFrame:
            - "Json": the dataset stream produced by ``_fetch_list_of_datasets``.
            - "Pandas": the result of ``to_pandas`` applied to that stream.
            - "Polars": the result of ``to_polars`` applied to that stream.

    Raises:
        ValueError: If ``output`` is not one of the supported formats.

    Examples:
        >>> ids = [
        ...     "manual/camels_de",
        ...     "manual/hochwasserzentralen"
        ... ]
        >>> datasets = fetch_by_ids(ids)
        >>> for dataset in datasets:
        ...     print(dataset["title"])

    Note:
        This function performs no error handling of its own. Skipping IDs
        whose request fails (and logging them) is presumably done inside
        ``_fetch_list_of_datasets`` -- confirm there. Likewise, a missing
        pandas/polars installation would surface from the converters.
    """
    requested = output.capitalize()
    supported = ["Json", "Polars", "Pandas"]

    # Reject unknown formats before any request is prepared.
    if requested not in supported:
        expected = ", ".join(fmt.upper() for fmt in supported)
        raise ValueError(
            f"Invalid output format: '{output}'. " f"Expected one of: {expected}"
        )

    records = _fetch_list_of_datasets(url=search_url, dataset_list=ids)

    if requested == "Json":
        return records

    # Only the two DataFrame formats remain after validation.
    convert = to_polars if requested == "Polars" else to_pandas
    return convert(records, **kwargs)
`fetch_by_query(query='*', output='Json', search_url=SEARCH_URL, sampling_fraction=None, **kwargs)`

fetch_by_query(query: str = '*', output: Literal['Json', 'Json Ranked'] = 'Json', search_url: str = SEARCH_URL, sampling_fraction: Optional[float] = None, **kwargs: Any) -> Generator[dict, None, None]

fetch_by_query(query: str = '*', output: Literal['Pandas'] = 'Pandas', search_url: str = SEARCH_URL, sampling_fraction: Optional[float] = None, exclude: Optional[List[str]] = None, columns: Optional[List[str]] = None, build_row: Optional[Callable] = None, filter_datasets: Optional[Callable] = None, dataset_list: Optional[List[str]] = None, flatten: bool = False, **kwargs: Any) -> pandas.DataFrame

fetch_by_query(query: str = '*', output: Literal['Polars'] = 'Polars', search_url: str = SEARCH_URL, sampling_fraction: Optional[float] = None, build_row: Optional[Callable] = None, **kwargs: Any) -> polars.DataFrame

Searches the umwelt.info index with a search query and fetches the resulting datasets. Unlike fetch_by_url(), this allows searching for multiple entries (e.g. organisations). If you want to reproduce a given search from our web UI, use fetch_by_url() instead. For information on how to build a query, see https://md.umwelt.info/swagger-ui/#/search/text_search

Parameters:

Name	Type	Description	Default
`query`	`str`	Search query string. Defaults to "*", which returns all datasets.	`'*'`
`output`	`str`	Format of the returned data. One of "Json", "Json Ranked", "Pandas", "Polars". Defaults to "Json".	`'Json'`
`search_url`	`str`	Address of the server. Defaults to "https://md.umwelt.info".	`SEARCH_URL`
`sampling_fraction`	`float`	Fraction of results to return (0.0-1.0). If None, returns all results. Useful for testing with large datasets. Defaults to None.	`None`
`**kwargs`	`Any`	Additional search parameters passed to the underlying fetch function (e.g. filters, facets).	`{}`

Returns:

Type	Description
`Union[Generator[dict, None, None], DataFrame, DataFrame]`	Union[Generator, pandas.DataFrame, polars.DataFrame]: The return type depends on the `output` parameter: - "Json" or "Json Ranked": Returns a generator yielding dataset dictionaries. - "Pandas": Returns a pandas.DataFrame indexed by ['source', 'id']. - "Polars": Returns a polars.DataFrame.

Example

>>> results = list(fetch_by_query(query="organisation:/Land/Bayern/LfU AND Luftqualität"))
>>> print(results[0]["title"])

Source code in src/umwelt_apy/api_client.py

def fetch_by_query(
    query="*",
    output="Json",
    search_url=SEARCH_URL,
    sampling_fraction=None,
    **kwargs: Any,
) -> Union[Generator[dict, None, None], pandas.DataFrame, polars.DataFrame]:
    """
    Search the umwelt.info index with a query and fetch the matching datasets.

    Unlike fetch_by_url(), this allows searching for multiple entries
    (e.g. organisations). To reproduce a search from the web UI, use
    fetch_by_url() instead. For information on how to build a query see
    https://md.umwelt.info/swagger-ui/#/search/text_search

    Args:
        query (str, optional): Search query string. Defaults to "*".
        output (str, optional): Format of the returned data, matched after
            ``str.title()``: "Json", "Json Ranked", "Pandas" or "Polars".
            Defaults to "Json".
        search_url (str, optional): Address of the server. Defaults to ``SEARCH_URL``.
        sampling_fraction (float, optional): Forwarded unchanged to the
            fetch helper. Documented as the fraction of results to return
            (0.0-1.0); not validated here. Defaults to None.
        **kwargs (Any): Forwarded to ``to_pandas`` / ``to_polars`` when
            ``output`` is "Pandas" / "Polars". Ignored (with a warning) for
            "Json" and currently not forwarded for "Json Ranked".

    Returns:
        Union[Generator, pandas.DataFrame, polars.DataFrame]:
            - "Json": the result of ``fetch``.
            - "Json Ranked": the result of ``_fetch_ranked``.
            - "Pandas": the result of ``to_pandas`` applied to the fetched stream.
            - "Polars": the result of ``to_polars`` applied to the fetched stream.

    Raises:
        ValueError: If ``output`` is not one of the supported formats.

    Example:
        >>> results = list(fetch_by_query(query="organisation:/Land/Bayern/LfU AND Luftqualität"))
        >>> print(results[0]["title"])
    """
    output_norm = output.title()
    allowed = ["Json", "Json Ranked", "Polars", "Pandas"]
    if output_norm not in allowed:
        raise ValueError(f"Invalid output: {output}. Expected: {allowed}")

    if output_norm == "Json" and kwargs:
        unused_keys = ", ".join(kwargs.keys())
        logging.warning(
            f"The following parameters are IGNORED because output='Json' (streaming mode) "
            f"does not support additional search filters: [{unused_keys}]. "
            "To use filters or facets, please set output='Json Ranked'."
        )

    if output_norm == "Json Ranked":
        # NOTE(review): the warning above points users to 'Json Ranked' for
        # filters/facets, yet kwargs are not forwarded here. Confirm against
        # _fetch_ranked's signature whether **kwargs should be passed on.
        return _fetch_ranked(
            url=search_url, query=query, sampling_fraction=sampling_fraction
        )

    # All remaining formats are built from the same dataset stream.
    dataset_gen = fetch(
        url=search_url, query=query, sampling_fraction=sampling_fraction
    )

    if output_norm == "Json":
        return dataset_gen
    if output_norm == "Polars":
        # Passed positionally (as fetch_by_ids does): the keyword used here
        # ("dataset") disagreed with fetch_by_url ("datasets"), so one of the
        # two calls could not match to_polars' parameter name.
        return to_polars(dataset_gen, **kwargs)
    return to_pandas(iter=dataset_gen, **kwargs)

`fetch_by_url(api_url, output='Json', sampling_fraction=None, **kwargs)`

fetch_by_url(api_url: str, output: Literal['Json'] = 'Json', sampling_fraction: Optional[float] = None, **kwargs: Any) -> Generator[dict, None, None]

fetch_by_url(api_url: str, output: Literal['Pandas'] = 'Pandas', sampling_fraction: Optional[float] = None, exclude: Optional[List[str]] = None, columns: Optional[List[str]] = None, build_row: Optional[Callable] = None, filter_datasets: Optional[Callable] = None, dataset_list: Optional[List[str]] = None, flatten: bool = False, **kwargs: Any) -> pandas.DataFrame

fetch_by_url(api_url: str, output: Literal['Polars'] = 'Polars', sampling_fraction: Optional[float] = None, build_row: Optional[Callable] = None, **kwargs: Any) -> polars.DataFrame

Fetch datasets using a complete API search URL.

Unlike fetch_by_query(), this function accepts a fully-formed URL including all query parameters. You can get this url by using our web-ui.

Parameters:

Name	Type	Description	Default
`api_url`	`str`	Complete URL with query parameters. Example: https://md.umwelt.info/search/all?query=%2Borganisation%3A%2FBund+%2Btype%3A%22%2FChemische+Verbindung%22&language=de	required
`output`	`str`	Format of the returned data. One of "Json", "Pandas", "Polars". Defaults to "Json".	`'Json'`
`sampling_fraction`	`float`	Fraction of results to return (0.0-1.0). If None, returns all results. Useful for testing with large datasets. Defaults to None.	`None`
`**kwargs`	`Any`	Additional search parameters passed to the underlying fetch function.	`{}`

Returns:

Type	Description
`Union[Generator[dict, None, None], DataFrame, DataFrame]`	depending on the output it is a generator of dataset dictionaries,
`Union[Generator[dict, None, None], DataFrame, DataFrame]`	a pandas.DataFrame or a polars.DataFrame. Skips datasets that
`Union[Generator[dict, None, None], DataFrame, DataFrame]`	return HTTP errors (e.g., not found).

Examples:

>>> url = 'https://md.umwelt.info/search/all?query=%2B%28wasser%29+%2Borganisation%3A%2FLand&language=de'
>>> for dataset in fetch_by_url(url):
...     print(dataset["title"])

Source code in src/umwelt_apy/api_client.py

def fetch_by_url(
    api_url: str, output="Json", sampling_fraction=None, **kwargs: Any
) -> Union[Generator[dict, None, None], pandas.DataFrame, polars.DataFrame]:
    """
    Fetch datasets using a complete API search URL.

    Unlike fetch_by_query(), this function accepts a fully-formed URL
    including all query parameters. You can get this URL from the web UI.

    Args:
        api_url (str): Complete URL with query parameters. Example:
            https://md.umwelt.info/search/all?query=%2Borganisation%3A%2FBund+%2Btype%3A%22%2FChemische+Verbindung%22&language=de
        output (str, optional): Format of the returned data, matched after
            ``str.title()``: "Json", "Pandas" or "Polars". Defaults to "Json".
        sampling_fraction (float, optional): When not None, sent to
            ``_stream_datasets`` as the ``sampling_fraction`` parameter.
            Documented as the fraction of results to return (0.0-1.0); not
            validated here. Defaults to None.
        **kwargs (Any): Forwarded to ``to_pandas`` / ``to_polars`` when
            ``output`` is "Pandas" / "Polars". Ignored (with a warning) for
            "Json".

    Returns:
        Depending on ``output``: the dataset stream from
        ``_stream_datasets`` ("Json"), or the result of ``to_pandas`` /
        ``to_polars`` applied to that stream.

    Raises:
        ValueError: If ``output`` is not one of the supported formats.

    Examples:
        >>> url = 'https://md.umwelt.info/search/all?query=%2B%28wasser%29+%2Borganisation%3A%2FLand&language=de'
        >>> for dataset in fetch_by_url(url):
        ...     print(dataset["title"])
    """
    logging.debug(f"Fetching search results from URL: {api_url}")

    output_norm = output.title()
    allowed = ["Json", "Polars", "Pandas"]
    if output_norm not in allowed:
        raise ValueError(f"Invalid output: {output}. Expected: {allowed}")

    if output_norm == "Json" and kwargs:
        # Mirror fetch_by_query: tell the caller that these arguments are
        # dropped instead of discarding them silently.
        logging.warning(
            "The following parameters are IGNORED because output='Json' "
            "returns the raw dataset stream: [%s]. They are only forwarded "
            "when output is 'Pandas' or 'Polars'.",
            ", ".join(kwargs),
        )

    # Only send sampling_fraction when the caller actually set it.
    params = {}
    if sampling_fraction is not None:
        params["sampling_fraction"] = sampling_fraction

    dataset_gen = _stream_datasets(api_url, params)

    if output_norm == "Json":
        return dataset_gen
    if output_norm == "Pandas":
        return to_pandas(iter=dataset_gen, **kwargs)
    # Passed positionally (as fetch_by_ids does): the keyword used here
    # ("datasets") disagreed with fetch_by_query ("dataset"), so one of the
    # two calls could not match to_polars' parameter name.
    return to_polars(dataset_gen, **kwargs)

`fetch_facet_values(name='type', url=SEARCH_URL)`

Fetches all possible values for a given facet from the umwelt.info API.

Facets are hierarchical groupings used to filter search results. This function queries the /facet/{name} endpoint and returns all available values for the requested facet.

Parameters:

Name	Type	Description	Default
`name`	`str`	Name of the facet to retrieve. Must be one of: - "type" → dataset types (e.g. Taxon, Text, Image) - "topic" → thematic tags (e.g. /Boden, /Wasser) - "organisation" → data providers (e.g. /Bund/UBA, /Land/Bayern) - "license" → licenses (e.g. /offen, /geschlossen) - "language" → languages (e.g. /Deutsch, /Englisch) - "resource_type" → resource types (e.g. /Datei, /Dienst) Defaults to "type".	`'type'`
`url`	`str`	Base URL of the API endpoint. Defaults to SEARCH_URL.	`SEARCH_URL`

Returns:

Name	Type	Description
`list`	`list`	List of facet values as returned by the API. Each entry contains a path (str) and a dataset count (int).

Raises:

Type	Description
`ValueError`	If name is not one of the valid facet names.
`HTTPError`	If the API returns an error.

Examples:

Fetch all resource types (analogue to R: fetch_facet_values("resource_type")):

>>> resource_types = fetch_facet_values("resource_type")
>>> for value in resource_types[:5]:
...     print(value)

Fetch all organisations: This can be e.g. useful if you want to restrict the results to certain organisations when building your own query, so you know which organisations are available.

>>> organisations = fetch_facet_values("organisation")
>>> print(organisations[:3])

Invalid Name gives ValueError:

>>> fetch_facet_values("invalid")
ValueError: Invalid Facette Name: 'invalid'.
Allowed Values: ['type', 'topic', 'organisation', 'license', 'language', 'resource_type']

Source code in src/umwelt_apy/api_client.py

def fetch_facet_values(name: str = "type", url: str = SEARCH_URL) -> list:
    """
    Return the values the API reports for one facet.

    The facet name is checked against ``FACET_NAMES``. The function then
    requests ``facet/{name}`` (resolved against ``url`` with ``urljoin``)
    through the module-level ``session`` and returns the decoded JSON body.

    Args:
        name (str): Facet to look up; must be contained in ``FACET_NAMES``.
            The documented names are "type", "topic", "organisation",
            "license", "language" and "resource_type". Defaults to "type".
        url (str, optional): Base URL of the API. Defaults to ``SEARCH_URL``.

    Returns:
        list: The JSON body of the response as decoded by ``response.json()``.
            Per the API documentation each entry carries a path and a
            dataset count -- the structure is not checked here.

    Raises:
        ValueError: If ``name`` is not in ``FACET_NAMES``.
        requests.exceptions.HTTPError: Raised by ``raise_for_status()`` when
            the API answers with an error status.

    Examples:
        Fetch all resource types:

        >>> resource_types = fetch_facet_values("resource_type")
        >>> for value in resource_types[:5]:
        ...     print(value)

        Fetch all organisations, e.g. to see which ones are available
        before restricting a query to some of them:

        >>> organisations = fetch_facet_values("organisation")
        >>> print(organisations[:3])

        An unknown name such as ``fetch_facet_values("invalid")`` raises
        ``ValueError`` listing the allowed names.
    """
    if name in FACET_NAMES:
        endpoint = urljoin(url, f"facet/{name}")
        logging.debug(f"Fetching facet values for '{name}' from {endpoint}")

        reply = session.get(endpoint)
        # Turn 4xx/5xx answers into an exception before decoding the body.
        reply.raise_for_status()
        return reply.json()

    raise ValueError(f"Invalid Facette Name: '{name}'. Allowed Values: {FACET_NAMES}")