Skip to content

download_resources

umwelt_apy.download_resources

download_resources(df, base_dir='downloads', sleep_time=0.2)

Downloads files into subfolders named after the source/title.

Parameters:

Name Type Description Default
df DataFrame

A DataFrame as provided by preview_resources.

required
base_dir str

Directory where the output is stored. Defaults to "downloads".

'downloads'
sleep_time float

Time in seconds between individual downloads to minimize server load. Defaults to 0.2.

0.2

Returns:

Type Description
None

None

Examples:

Download resources attached to the datasets of a given query This is a four step process. First you retrieve the list of resources from the api (fetch_by_url or fetch_by_query), you unnest and optionally refine it further (unnest_and_filter), you screen the preview (preview_resources) and download the resources (download_resources). Note that the unnesting is a prerequisite for preview_resources() and download_resources(). Currently this workflow only works for output = "Pandas". Exemplary query

>>> url = "https://md.umwelt.info/search/all?query=(Ozon)+AND+organisation%3A%2FLand%2FBayern%2Fopen.bydata"
Execute flow
>>> results = (
...     fetch_by_url(
...     api_url = url,
...     build_row = only_resources,
...     filter_datasets = lambda dataset: "resources" in dataset,
...     output = "Pandas",
...     )
...     .pipe(unnest_and_filter, formats=["Microsoft Excel Spreadsheet", "CSV"], description_regex="Ozon")
...     .pipe(preview_resources)
... )
In case the list of resources in the preview should be downloaded
>>> download_resources(results, base_dir="downloads")
Source code in src/umwelt_apy/download_resources.py
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
def download_resources(
    df: pandas.DataFrame, base_dir: str = "downloads", sleep_time: float = 0.2
) -> None:
    """Downloads files into subfolders named after the source/title.

    Args:
        df (pandas.DataFrame): A DataFrame as provided by preview_resources.
        base_dir (str, optional): Directory where the output is stored.
            Defaults to "downloads".
        sleep_time (float, optional): Time in seconds between individual
            downloads to minimize server load. Defaults to 0.2.

    Returns:
        None

    Examples:
        Download resources attached to the datasets of a given query
        This is a four step process. First you retrieve the list of resources from the api (fetch_by_url or fetch_by_query),
        you unnest and optionally refine it further (unnest_and_filter), you screen the preview (preview_resources) and download the resources (download_resources).
        Note that the unnesting is a prerequisite for preview_resources() and download_resources().
        Currently this workflow only works for output = "Pandas".
        Exemplary query
        >>> url = "https://md.umwelt.info/search/all?query=(Ozon)+AND+organisation%3A%2FLand%2FBayern%2Fopen.bydata"
        Execute flow
        >>> results = (
        ...     fetch_by_url(
        ...     api_url = url,
        ...     build_row = only_resources,
        ...     filter_datasets = lambda dataset: "resources" in dataset,
        ...     output = "Pandas",
        ...     )
        ...     .pipe(unnest_and_filter, formats=["Microsoft Excel Spreadsheet", "CSV"], description_regex="Ozon")
        ...     .pipe(preview_resources)
        ... )
        In case the list of resources in the preview should be downloaded
        >>> download_resources(results, base_dir="downloads")
    """
    if pandas is None:
        raise ImportError(
            "This function requires pandas. Install it with 'pip install pandas'."
        )

    base_path = Path(base_dir)

    for _, row in df.iterrows():
        # Sanitize the title so it is a valid directory name on all platforms
        # (strip characters forbidden on Windows, trailing dots/spaces, cap length).
        clean_title = re.sub(r'[<>:"/\\|?*]', "_", str(row["title"])).strip(". ")[:255]

        # Path: downloads/source/title/filename
        directory = base_path / str(row["source"]) / clean_title
        directory.mkdir(parents=True, exist_ok=True)

        url_path = urlparse(row["url"]).path
        # Fall back to a generic name when the URL path carries no filename
        # (e.g. it ends with a slash); otherwise full_path would collide with
        # the directory itself.
        filename = Path(url_path).name or "download"
        full_path = directory / filename

        if full_path.is_file():
            # BUGFIX: previously printed the literal placeholder "(unknown)"
            # instead of the actual file path.
            print(f"Skipping: {full_path} (already exists)")
            continue

        try:
            response = requests.get(row["url"], timeout=30)
            if response.status_code == 200:
                full_path.write_bytes(response.content)
                # BUGFIX: previously printed "(unknown)" instead of the path.
                print(f"Downloaded: {full_path}")
            else:
                print(f"Error: {row['url']} (Status: {response.status_code})")
        except Exception as e:
            # Best-effort: report and keep downloading the remaining resources.
            print(f"Error at {row['url']}: {e}")

        time.sleep(sleep_time)

preview_resources(df)

Prints a summary and returns the dataframe. This function provides a preview that helps you decide whether the listed resources should be downloaded.

Parameters:

Name Type Description Default
df DataFrame

a pandas.DataFrame as provided by unnest_and_filter

required

Returns:

Type Description
DataFrame

Prints preview

Source code in src/umwelt_apy/download_resources.py
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
def preview_resources(df: pandas.DataFrame) -> pandas.DataFrame:
    """
    Prints a summary and returns the dataframe.
    This function provides a preview which helps to decide whether the enlisted resources should be downloaded

    Args:
        df: a pandas.DataFrame as provided by unnest_and_filter

    Returns:
        Prints preview
    """
    if pandas is None:
        raise ImportError(
            "This function requires pandas. Install it with 'pip install pandas'."
        )

    # Source column -> display name in the printed preview.
    column_map = {
        "id": "id",
        "title": "title",
        "type.label": "resource_type",
        "description": "resource_description",
    }

    # Only pick columns that actually exist in the incoming frame.
    available = [name for name in column_map if name in df.columns]
    summary = df[available].rename(columns=column_map)

    print(summary)
    print(f"\nFound {len(summary)} resources")
    # Return the original frame unchanged so the call can be piped.
    return df

unnest_and_filter(df, formats=None, description_regex=None)

Unnests the resources dataframe and optionally filters the resources by output format and description.

Parameters:

Name Type Description Default
df DataFrame

A DataFrame as provided by fetch_by_url or fetch_by_query, containing a resources column with nested resource dicts.

required
formats list | None

List of accepted output formats. Defaults to CSV, ZIP, JSON, JSON-LD, GeoJSON, TSV, PDF, and Microsoft Excel Spreadsheet. Possible values: run 'fetch_facet_values("resource_type")' to get a list of existing formats

None
description_regex str | None

Regex string to filter only resources whose description contains a match. Defaults to None.

None

Returns:

Type Description
DataFrame

pandas.DataFrame: Unnested and filtered DataFrame with one row per resource.

Source code in src/umwelt_apy/download_resources.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
def unnest_and_filter(
    df: pandas.DataFrame,
    formats: list | None = None,
    description_regex: str | None = None,
) -> pandas.DataFrame:
    """Unnests the resources dataframe and optionally filters the resources by output format and description.

    Args:
        df (pandas.DataFrame): A DataFrame as provided by fetch_by_url or fetch_by_query,
            containing a `resources` column with nested resource dicts.
        formats (list | None, optional): List of accepted output formats.
            Defaults to CSV, ZIP, JSON, JSON-LD, GeoJSON, TSV, PDF,
            and Microsoft Excel Spreadsheet.
            Possible values: run 'fetch_facet_values("resource_type")' to get a list of existing formats
        description_regex (str | None, optional): Regex string to filter
            only resources whose description contains a match.
            Defaults to None.

    Returns:
        pandas.DataFrame: Unnested and filtered DataFrame with one row per resource.
    """
    if pandas is None:
        raise ImportError(
            "This function requires pandas. Install it with 'pip install pandas'."
        )

    if formats is None:
        formats = [
            "CSV",
            "ZIP",
            "JSON",
            "JSON-LD",
            "GeoJSON",
            "TSV",
            "PDF",
            "Microsoft Excel Spreadsheet",
        ]

    res = df.explode(column="resources").reset_index(drop=False)

    res_meta = pandas.json_normalize(res["resources"])
    res = pandas.concat([res.drop(columns=["resources"]), res_meta], axis=1)

    # Filter by format and direct link
    mask = res["direct_link"] & (res["type.label"].isin(formats))
    res = res[mask]

    res = res.drop_duplicates(subset="url")

    # Optional Regex filter
    if description_regex:
        if "description" in res.columns:
            res = res[
                res["description"].str.contains(description_regex, case=False, na=False)
            ]
        else:
            print(
                "Warning: description_regex was provided, but column 'description' not found. Filtering skipped."
            )
    return res