Skip to content

download_resources

umwelt_apy.download_resources

download_resources(df, base_dir='downloads', sleep_time=0.2)

Downloads files into subfolders named after the source/title.

Parameters:

Name Type Description Default
df DataFrame

A DataFrame as provided by preview_resources.

required
base_dir str

Directory where the output is stored. Defaults to "downloads".

'downloads'
sleep_time float

Time in seconds between individual downloads to minimize server load. Defaults to 0.2.

0.2

Returns:

Type Description
None

None

Examples:

Download resources attached to the datasets of a given query This is a four step process. First you retrieve the list of resources from the api (fetch_by_url or fetch_by_query), you unnest and optionally refine it further (unnest_and_filter), you screen the preview (preview_resources) and download the resources (download_resources). Note that the unnesting is a prerequisite for preview_resources() and download_resources(). Currently this workflow only works for output = "Pandas". Exemplary query

>>> url = "https://md.umwelt.info/search/all?query=(Ozon)+AND+organisation%3A%2FLand%2FBayern%2Fopen.bydata"
Execute flow
>>> results = (
...     fetch_by_url(
...     api_url = url,
...     build_row = only_resources,
...     filter_datasets = lambda dataset: "resources" in dataset,
...     output = "Pandas",
...     )
...     .pipe(unnest_and_filter, formats=["Microsoft Excel Spreadsheet", "CSV"], description_regex="Ozon")
...     .pipe(preview_resources)
... )
In case the list of resources in the preview should be downloaded
>>> download_resources(results, base_dir="downloads")
Source code in src/umwelt_apy/download_resources.py
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
def download_resources(
    df: pandas.DataFrame, base_dir: str = "downloads", sleep_time: float = 0.2
) -> None:
    """Downloads files into subfolders named after the source/title.

    Args:
        df (pandas.DataFrame): A DataFrame as provided by preview_resources.
        base_dir (str, optional): Directory where the output is stored.
            Defaults to "downloads".
        sleep_time (float, optional): Time in seconds between individual
            downloads to minimize server load. Defaults to 0.2.

    Returns:
        None

    Examples:
        Download resources attached to the datasets of a given query
        This is a four step process. First you retrieve the list of resources from the api (fetch_by_url or fetch_by_query),
        you unnest and optionally refine it further (unnest_and_filter), you screen the preview (preview_resources) and download the resources (download_resources).
        Note that the unnesting is a prerequisite for preview_resources() and download_resources().
        Currently this workflow only works for output = "Pandas".
        Exemplary query
        >>> url = "https://md.umwelt.info/search/all?query=(Ozon)+AND+organisation%3A%2FLand%2FBayern%2Fopen.bydata"
        Execute flow
        >>> results = (
        ...     fetch_by_url(
        ...     api_url = url,
        ...     build_row = only_resources,
        ...     filter_datasets = lambda dataset: "resources" in dataset,
        ...     output = "Pandas",
        ...     )
        ...     .pipe(unnest_and_filter, formats=["Microsoft Excel Spreadsheet", "CSV"], description_regex="Ozon")
        ...     .pipe(preview_resources)
        ... )
        In case the list of resources in the preview should be downloaded
        >>> download_resources(results, base_dir="downloads")
    """
    if pandas is None:
        raise ImportError(
            "This function requires pandas. Install it with 'pip install pandas'."
        )

    base_path = Path(base_dir)

    for _, row in df.iterrows():
        # Sanitize the title so it is a valid directory name on all platforms
        # (strip characters forbidden on Windows, trailing dots/spaces, cap length).
        clean_title = re.sub(r'[<>:"/\\|?*]', "_", str(row["title"])).strip(". ")[:255]

        # Path: downloads/source/title/filename
        directory = base_path / str(row["source"]) / clean_title
        directory.mkdir(parents=True, exist_ok=True)

        url_path = urlparse(row["url"]).path
        # Fall back to a generic name when the URL path carries no filename
        # (e.g. it ends with a slash); otherwise full_path would collide with
        # the directory itself.
        filename = Path(url_path).name or "download"
        full_path = directory / filename

        if full_path.is_file():
            # BUGFIX: previously printed the literal placeholder "(unknown)"
            # instead of the actual file path.
            print(f"Skipping: {full_path} (already exists)")
            continue

        try:
            response = requests.get(row["url"], timeout=30)
            if response.status_code == 200:
                full_path.write_bytes(response.content)
                # BUGFIX: previously printed "(unknown)" instead of the path.
                print(f"Downloaded: {full_path}")
            else:
                print(f"Error: {row['url']} (Status: {response.status_code})")
        except Exception as e:
            # Best-effort: report and keep downloading the remaining resources.
            print(f"Error at {row['url']}: {e}")

        time.sleep(sleep_time)

preview_resources(df)

Prints a summary and returns the dataframe. This function provides a preview that helps you decide whether the listed resources should be downloaded.

Parameters:

Name Type Description Default
df DataFrame

a pandas.DataFrame as provided by unnest_and_filter

required

Returns:

Type Description
DataFrame

Prints preview

Source code in src/umwelt_apy/download_resources.py
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
def preview_resources(df: pandas.DataFrame) -> pandas.DataFrame:
    """
    Prints a summary and returns the dataframe.
    This function provides a preview which helps to decide whether the enlisted resources should be downloaded

    Args:
        df: a pandas.DataFrame as provided by unnest_and_filter

    Returns:
        Prints preview
    """
    if pandas is None:
        raise ImportError(
            "This function requires pandas. Install it with 'pip install pandas'."
        )

    # Source column -> display name in the printed preview.
    column_map = {
        "id": "id",
        "title": "title",
        "type.label": "resource_type",
        "description": "resource_description",
    }

    # Only pick columns that actually exist in the incoming frame.
    available = [name for name in column_map if name in df.columns]
    summary = df[available].rename(columns=column_map)

    print(summary)
    print(f"\nFound {len(summary)} resources")
    # Return the original frame unchanged so the call can be piped.
    return df

unnest_and_filter(df, formats=None, description_regex=None)

Unnests the resources dataframe and optionally filters the resources by output format and description.

Parameters:

Name Type Description Default
df DataFrame

A DataFrame as provided by fetch_by_url or fetch_by_query, containing a resources column with nested resource dicts.

required
formats list | None

List of accepted output formats. Defaults to CSV, ZIP, JSON, JSON-LD, GeoJSON, TSV, PDF, and Microsoft Excel Spreadsheet. Possible values: run 'fetch_facet_values("resource_type")' to get a list of existing formats

None
description_regex str | None

Regex string to filter only resources whose description contains a match. Defaults to None.

None

Returns:

Type Description
DataFrame

pandas.DataFrame: Unnested and filtered DataFrame with one row per resource.

Source code in src/umwelt_apy/download_resources.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
def unnest_and_filter(
    df: pandas.DataFrame,
    formats: list | None = None,
    description_regex: str | None = None,
) -> pandas.DataFrame:
    """Unnests the resources dataframe and optionally filters the resources by output format and description.

    Args:
        df (pandas.DataFrame): A DataFrame as provided by fetch_by_url or fetch_by_query,
            containing a `resources` column with nested resource dicts.
        formats (list | None, optional): List of accepted output formats.
            Defaults to CSV, ZIP, JSON, JSON-LD, GeoJSON, TSV, PDF,
            and Microsoft Excel Spreadsheet.
            Possible values: run 'fetch_facet_values("resource_type")' to get a list of existing formats
        description_regex (str | None, optional): Regex string to filter
            only resources whose description contains a match.
            Defaults to None.

    Returns:
        pandas.DataFrame: Unnested and filtered DataFrame with one row per resource.
    """
    if pandas is None:
        raise ImportError(
            "This function requires pandas. Install it with 'pip install pandas'."
        )

    if formats is None:
        formats = [
            "CSV",
            "ZIP",
            "JSON",
            "JSON-LD",
            "GeoJSON",
            "TSV",
            "PDF",
            "Microsoft Excel Spreadsheet",
        ]

    res = df.explode(column="resources").reset_index(drop=False)

    res_meta = pandas.json_normalize(res["resources"])
    res = pandas.concat([res.drop(columns=["resources"]), res_meta], axis=1)

    # Filter by format and direct link
    mask = res["direct_link"] & (res["type.label"].isin(formats))
    res = res[mask]

    res = res.drop_duplicates(subset="url")

    # Optional Regex filter
    if description_regex:
        if "description" in res.columns:
            res = res[
                res["description"].str.contains(description_regex, case=False, na=False)
            ]
        else:
            print(
                "Warning: description_regex was provided, but column 'description' not found. Filtering skipped."
            )
    return res