From 3093f898dddfaaf7d07fc61432745054b91b4cfd Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Wed, 25 Sep 2024 14:14:34 +0100 Subject: [PATCH 01/15] simplify listed buildings functions --- .../prepare_features/listed_buildings.py | 34 +++++++++++++++++-- .../pipeline/run_scripts/run_add_features.py | 16 ++------- .../suitability/calculate_suitability.py | 8 ----- 3 files changed, 34 insertions(+), 24 deletions(-) diff --git a/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py b/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py index 3ec4385..d806df3 100644 --- a/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py +++ b/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py @@ -25,7 +25,30 @@ def transform_gdf_listed_buildings(nation: str) -> gpd.GeoDataFrame: return gdf -def sjoin_df_epc_with_listed_buildings( +def generate_df_epc_listed_buildings( + epc_df: pl.DataFrame, nations: list = ["England", "Scotland", "Wales"] +): + """ + Generate dataframe of listed buildings in EPC data in specified nation(s). 
+ + Args: + epc_df (pl.DataFrame): + nations (list): + + Returns: + pl.DataFrame: + """ + dfs = [] + for nation in nations: + logging.info(f"Loading listed building data for {nation}") + gdf = transform_gdf_listed_buildings(nation) + df = sjoin_df_epc_with_nation_listed_buildings(epc_df, gdf) + dfs.append(df) + + return pl.concat(dfs, how="vertical") + + +def sjoin_df_epc_with_nation_listed_buildings( epc_df: pl.DataFrame, listed_buildings_gdf: gpd.GeoDataFrame, chunk_size: int = 100000, @@ -60,5 +83,10 @@ def sjoin_df_epc_with_listed_buildings( dfs.append(df) df = pl.from_pandas(pd.concat(dfs)) - - return df + df = df.with_columns( + pl.when(pl.col("listed_building_grade").is_null()) + .then(False) + .otherwise(True) + .alias("listed_building"), + ) + return df.select(["PUPRN", "listed_building"]) diff --git a/asf_heat_pump_suitability/pipeline/run_scripts/run_add_features.py b/asf_heat_pump_suitability/pipeline/run_scripts/run_add_features.py index f45fe89..2170a11 100644 --- a/asf_heat_pump_suitability/pipeline/run_scripts/run_add_features.py +++ b/asf_heat_pump_suitability/pipeline/run_scripts/run_add_features.py @@ -145,21 +145,11 @@ def parse_arguments() -> argparse.Namespace: epc_df = off_gas.add_off_gas_feature(epc_df, off_gas_postcodes) # Add feature: listed buildings data - logging.info("Loading listed buildings for England") - e_listed_buildings_df = listed_buildings.transform_gdf_listed_buildings("England") - e_listed_buildings_df = listed_buildings.sjoin_df_epc_with_listed_buildings( - epc_df, e_listed_buildings_df + logging.info("Adding listed buildings to EPC") + listed_buildings_df = listed_buildings.generate_df_epc_listed_buildings( + epc_df=epc_df ) - logging.info("Loading listed buildings for Wales") - w_listed_buildings_df = listed_buildings.transform_gdf_listed_buildings("Wales") - w_listed_buildings_df = listed_buildings.sjoin_df_epc_with_listed_buildings( - epc_df, w_listed_buildings_df - ) - - listed_buildings_df = pl.concat( - 
[e_listed_buildings_df, w_listed_buildings_df], how="vertical" - ) epc_df = epc_df.join(listed_buildings_df, how="left", on="UPRN") # Save to S3 diff --git a/asf_heat_pump_suitability/pipeline/suitability/calculate_suitability.py b/asf_heat_pump_suitability/pipeline/suitability/calculate_suitability.py index f08b1fb..82ded7c 100644 --- a/asf_heat_pump_suitability/pipeline/suitability/calculate_suitability.py +++ b/asf_heat_pump_suitability/pipeline/suitability/calculate_suitability.py @@ -190,14 +190,6 @@ def get_enhanced_epc(path) -> pl.DataFrame: df = df.filter( ~pl.col("UPRN").str.contains("dummy"), pl.col("COUNTRY") != "Scotland" ) - - df = df.with_columns( - pl.when(pl.col("listed_building_grade").is_null()) - .then(False) - .otherwise(True) - .alias("listed_building"), - ) - return df From 2c7ae3016aee4444e45c5dbe369a1e6466991bf8 Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Wed, 25 Sep 2024 14:15:02 +0100 Subject: [PATCH 02/15] add Scotland listed buildings dataset to config yaml --- asf_heat_pump_suitability/config/base.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/asf_heat_pump_suitability/config/base.yaml b/asf_heat_pump_suitability/config/base.yaml index d68cff1..dbc794a 100644 --- a/asf_heat_pump_suitability/config/base.yaml +++ b/asf_heat_pump_suitability/config/base.yaml @@ -19,6 +19,7 @@ data_source: UK_spa_offgasgrid: "s3://asf-heat-pump-suitability/source_data/2024_vMar2024_SPA_offgaspostcode_UK.xlsx" E_historicengland_listed_buildings: "s3://asf-heat-pump-suitability/source_data/Jun2024_vJul2024_HistoricEngland_listedbuilding_E.gpkg" W_cadw_listed_buildings: "s3://asf-heat-pump-suitability/source_data/May2024_vMay2024_Cadw_listedbuilding_W.gpkg" + S_scottish_gov_listed_buildings: "s3://asf-heat-pump-suitability/source_data/lb_scotland/Listed_Buildings_boundaries.shp" EW_ons_lsoa_lad_lookup: "s3://asf-heat-pump-suitability/source_data/2021_vApr2023_ons_lsoa_to_lad_lookup_EW.csv" usecols: 
epc: From 6f9f289e15251518c22eb8992cb0a21889e42e3b Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Thu, 26 Sep 2024 12:02:05 +0100 Subject: [PATCH 03/15] update config base.yaml data source for Scotland listed buildings to use points instead of boundaries. this is due to boundaries dataset not being complete --- asf_heat_pump_suitability/config/base.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asf_heat_pump_suitability/config/base.yaml b/asf_heat_pump_suitability/config/base.yaml index dbc794a..5706d0e 100644 --- a/asf_heat_pump_suitability/config/base.yaml +++ b/asf_heat_pump_suitability/config/base.yaml @@ -19,7 +19,7 @@ data_source: UK_spa_offgasgrid: "s3://asf-heat-pump-suitability/source_data/2024_vMar2024_SPA_offgaspostcode_UK.xlsx" E_historicengland_listed_buildings: "s3://asf-heat-pump-suitability/source_data/Jun2024_vJul2024_HistoricEngland_listedbuilding_E.gpkg" W_cadw_listed_buildings: "s3://asf-heat-pump-suitability/source_data/May2024_vMay2024_Cadw_listedbuilding_W.gpkg" - S_scottish_gov_listed_buildings: "s3://asf-heat-pump-suitability/source_data/lb_scotland/Listed_Buildings_boundaries.shp" + S_scottish_gov_listed_buildings: "s3://asf-heat-pump-suitability/source_data/lb_scotland/Listed_Buildings.shp" EW_ons_lsoa_lad_lookup: "s3://asf-heat-pump-suitability/source_data/2021_vApr2023_ons_lsoa_to_lad_lookup_EW.csv" usecols: epc: From b77104e50708de3f47681ae5b895eee56a71140a Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Thu, 26 Sep 2024 12:03:48 +0100 Subject: [PATCH 04/15] update listed buildings getter to load Scotland data optionally in get_datasets.py --- asf_heat_pump_suitability/getters/get_datasets.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/asf_heat_pump_suitability/getters/get_datasets.py b/asf_heat_pump_suitability/getters/get_datasets.py index 5e53caf..bceaa72 100644 --- 
a/asf_heat_pump_suitability/getters/get_datasets.py +++ b/asf_heat_pump_suitability/getters/get_datasets.py @@ -248,7 +248,7 @@ def load_gdf_listed_buildings(nation: str, **kwargs) -> gpd.GeoDataFrame: Get raw Listed Buildings polygons dataset for specified nation. CRS EPSG:27700, British National Grid. Args: - nation (str): UK nation to load listed buildings data for. Options: "England"; "Wales". + nation (str): nation to load listed buildings data for. Options: "England"; "Scotland", "Wales". **kwargs for `gpd.read_file()` Returns: @@ -260,6 +260,12 @@ def load_gdf_listed_buildings(nation: str, **kwargs) -> gpd.GeoDataFrame: ) elif nation.lower() == "wales": gdf = gpd.read_file(config["data_source"]["W_cadw_listed_buildings"], **kwargs) + elif nation.lower() == "scotland": + gdf = gpd.read_file( + config["data_source"]["S_scottish_gov_listed_buildings"], **kwargs + ) else: - raise ValueError("Please set `nation` to either 'England' or 'Wales'.") + raise ValueError( + "Please set `nation` to either 'England', 'Scotland', or 'Wales'." + ) return gdf From 0f0be13ca7bfbcf31666e7b3d16930380282d7fe Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Thu, 26 Sep 2024 12:05:21 +0100 Subject: [PATCH 05/15] update listed_buildings.py functions to transform Scotland data as well as England and Wales - fix bug in Wales data processing. 
Wales listed buildings dataset is shared as point geometries, not polygons so update from sjoin to sjoin_nearest - refactor the EPC chunking function slightly to be more succinct --- .../prepare_features/listed_buildings.py | 115 ++++++++++++------ 1 file changed, 75 insertions(+), 40 deletions(-) diff --git a/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py b/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py index d806df3..88306e2 100644 --- a/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py +++ b/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py @@ -7,61 +7,61 @@ from asf_heat_pump_suitability.pipeline.prepare_features import lat_lon -def transform_gdf_listed_buildings(nation: str) -> gpd.GeoDataFrame: - """ - Load and transform listed buildings polygons dataset for specified nation. - - Args: - nation (str): UK nation to load listed buildings data for. Options: "England"; "Wales". - - Returns: - gpd.GeoDataFrame: listed buildings dataset for specified nation with grade and geometry columns. - """ - gdf = get_datasets.load_gdf_listed_buildings(nation, columns=["Grade", "geometry"]) - gdf = gdf.drop_duplicates(subset="geometry").rename( - columns={"Grade": "listed_building_grade"} - ) - - return gdf - - def generate_df_epc_listed_buildings( epc_df: pl.DataFrame, nations: list = ["England", "Scotland", "Wales"] -): +) -> pl.DataFrame: """ Generate dataframe of listed buildings in EPC data in specified nation(s). Args: - epc_df (pl.DataFrame): - nations (list): + epc_df (pl.DataFrame): EPC dataset with x, y coordinates per UPRN + nations (list): nation(s) to load listed buildings data for. Options: "England"; "Scotland"; "Wales". 
Returns: - pl.DataFrame: + pl.DataFrame: listed buildings in EPC data """ dfs = [] for nation in nations: logging.info(f"Loading listed building data for {nation}") gdf = transform_gdf_listed_buildings(nation) - df = sjoin_df_epc_with_nation_listed_buildings(epc_df, gdf) + df = chunk_sjoin_df_epc_listed_buildings(epc_df, gdf, nation) dfs.append(df) return pl.concat(dfs, how="vertical") -def sjoin_df_epc_with_nation_listed_buildings( +def transform_gdf_listed_buildings(nation: str) -> gpd.GeoDataFrame: + """ + Load and transform listed buildings polygons dataset for specified nation. + + Args: + nation (str): nation to load listed buildings data for. Options: "England"; "Scotland"; "Wales". + + Returns: + gpd.GeoDataFrame: listed buildings dataset for specified nation with grade and geometry columns. + """ + gdf = get_datasets.load_gdf_listed_buildings(nation, columns=["geometry"]) + gdf = gdf.drop_duplicates(subset="geometry") + gdf["listed_building"] = True + + return gdf + + +def chunk_sjoin_df_epc_listed_buildings( epc_df: pl.DataFrame, listed_buildings_gdf: gpd.GeoDataFrame, chunk_size: int = 100000, ) -> pl.DataFrame: """ - Spatial join EPC UPRNs with listed buildings polygons to return dataframe of EPC UPRNs which are located in listed - buildings, along with the building grade. + Chunk EPC data and spatial join EPC UPRNs with listed buildings. + Args: epc_df (pl.DataFrame): Enhanced EPC dataset with X and Y coordinates - listed_buildings_gdf (gpd.GeoDataFrame): listed buildings polygons dataset + listed_buildings_gdf (gpd.GeoDataFrame): listed buildings GeoDataFrame. Can be points / polygons. chunk_size (int): number of EPC rows in each partition. 
Default 100,000 + Returns: - pl.DataFrame: EPC UPRNs with listed buildings grade + pl.DataFrame: EPC UPRNs in listed buildings """ partitions = ( epc_df.select(["UPRN", "X_COORDINATE", "Y_COORDINATE"]) @@ -70,23 +70,58 @@ def sjoin_df_epc_with_nation_listed_buildings( ) dfs = [] - # Group based on the created index data_partitioned = epc_df.with_columns(partitions).partition_by("chunk_id") logging.info(f"Adding listed buildings to EPC in {len(data_partitioned)} chunks") for epc_chunk in tqdm(data_partitioned): - epc_gdf = lat_lon.generate_gdf_uprn_coords(df=epc_chunk)[["UPRN", "geometry"]] - df = epc_gdf.sjoin(listed_buildings_gdf, how="inner", predicate="intersects")[ - ["UPRN", "listed_building_grade"] - ].drop_duplicates(subset="UPRN") - + df = sjoin_df_epc_listed_buildings(epc_chunk, listed_buildings_gdf) dfs.append(df) - df = pl.from_pandas(pd.concat(dfs)) - df = df.with_columns( - pl.when(pl.col("listed_building_grade").is_null()) - .then(False) - .otherwise(True) - .alias("listed_building"), + df = pl.from_pandas(pd.concat(dfs)).with_columns( + pl.col("listed_building").fill_null(False) ) + return df.select(["PUPRN", "listed_building"]) + + +def sjoin_df_epc_listed_buildings( + epc_df: pl.DataFrame, listed_buildings_gdf: gpd.GeoDataFrame, distance: float = 5 +) -> pd.DataFrame: + """ + Spatial join EPC UPRNs with listed buildings using `geopandas.GeoDataFrame.sjoin_nearest` where Point or MultiPoint + geometries detected, and `geopandas.GeoDataFrame.sjoin` where Polygons or MultiPolygons detected. + + Args: + epc_df (pl.DataFrame): EPC dataset with X and Y coordinates per UPRN + listed_buildings_gdf (gpd.GeoDataFrame): listed buildings data + distance (float): maximum distance (m) within which to query for nearest geometry where `sjoin_nearest` used. Must be greater than 0. Default 5. 
+ + Returns: + pd.DataFrame: EPC UPRNs in listed buildings + """ + epc_gdf = lat_lon.generate_gdf_uprn_coords(df=epc_df, usecols=["UPRN", "geometry"]) + if any( + [ + expr in listed_buildings_gdf.geom_type.unique() + for expr in ["Point", "MultiPoint"] + ] + ): + df = epc_gdf.sjoin_nearest( + listed_buildings_gdf, how="inner", max_distance=distance + )["UPRN", "listed_building"].drop_duplicates(subset="UPRN") + elif any( + [ + expr in listed_buildings_gdf.geom_type.unique() + for expr in ["Polygon", "MultiPolygon"] + ] + ): + df = epc_gdf.sjoin(listed_buildings_gdf, how="inner", predicate="intersects")[ + ["UPRN", "listed_building"] + ].drop_duplicates(subset="UPRN") + else: + raise ValueError( + f"Listed buildings GeoDataFrame does not have appropriate geometries for sjoin. " + f"Geometries required: [Multi]Point or [Multi]Polygon. " + f"Geometries found: {listed_buildings_gdf.geom_type.unique()}" + ) + return df From 08324fc97053276982995b5eac1ac50dcaef9c1c Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Thu, 26 Sep 2024 12:09:54 +0100 Subject: [PATCH 06/15] fix small bug introduced in listed_buildings.py - delete unused `nation` param --- .../pipeline/prepare_features/listed_buildings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py b/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py index 88306e2..456c105 100644 --- a/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py +++ b/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py @@ -24,7 +24,7 @@ def generate_df_epc_listed_buildings( for nation in nations: logging.info(f"Loading listed building data for {nation}") gdf = transform_gdf_listed_buildings(nation) - df = chunk_sjoin_df_epc_listed_buildings(epc_df, gdf, nation) + df = chunk_sjoin_df_epc_listed_buildings(epc_df, gdf) dfs.append(df) return pl.concat(dfs, 
how="vertical") From eca12df338a438ec8fe5ced113be3d7cc55e3e9f Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Thu, 26 Sep 2024 12:12:52 +0100 Subject: [PATCH 07/15] add notebook to explore listed building data in Scotland and determine nearest neighbour distance threshold --- ...40926_explore_scotland_listed_buildings.py | 111 ++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 asf_heat_pump_suitability/notebooks/20240926_explore_scotland_listed_buildings.py diff --git a/asf_heat_pump_suitability/notebooks/20240926_explore_scotland_listed_buildings.py b/asf_heat_pump_suitability/notebooks/20240926_explore_scotland_listed_buildings.py new file mode 100644 index 0000000..540ab5a --- /dev/null +++ b/asf_heat_pump_suitability/notebooks/20240926_explore_scotland_listed_buildings.py @@ -0,0 +1,111 @@ +# %% [markdown] +# # Calculate threshold for nearest neighbour search +# +# Scotland and Wales listed buildings data is available as point geometries rather than polygons. We also have point geometries for UPRNs in the EPC dataset. In order to join EPC UPRNs to listed buildings, we need to use a nearest neighbour search. This notebook uses ground-truth listed building polygon data for Scotland to identify the threshold of distance in metres for determining whether a UPRN is located within a listed building. 
+ +# %% +import geopandas as gpd +import polars as pl +import matplotlib.pyplot as plt + +from asf_heat_pump_suitability.pipeline.prepare_features import lat_lon + +# %% +# Listed building point geoms Scotland +points_gdf = gpd.read_file( + "s3://asf-heat-pump-suitability/source_data/lb_scotland/Listed_Buildings.shp" +) +points_gdf = points_gdf[["ENT_REF", "ENT_TITLE", "geometry"]] + +# %% +epc = pl.read_parquet( + "s3://asf-heat-pump-suitability/outputs/2023Q4/20240904_2023_Q4_EPC_weighted_features_gardens.parquet" +) + +# %% +epc_gdf = epc.filter(pl.col("COUNTRY") == "Scotland") +epc_gdf = lat_lon.generate_gdf_uprn_coords(epc_gdf, usecols=["UPRN", "COUNTRY"]) + +# %% [markdown] +# ## Compare to ground truth + +# %% +# Listed building boundaries for Scotland (limited dataset) +boundaries = gpd.read_file( + "s3://asf-heat-pump-suitability/source_data/lb_scotland/Listed_Buildings_boundaries.shp" +) + +# %% +# Join listed building polygons to listed building points +boundaries = boundaries[["DES_REF", "DES_TITLE", "geometry"]] +ground_truth = boundaries.sjoin(points_gdf, how="inner", predicate="intersects").drop( + columns=["index_right"] +) + +# %% +# Join listed buildings to EPC using polygons to identify true matches +gdf = epc_gdf.sjoin( + boundaries[boundaries["DES_REF"].isin(ground_truth["DES_REF"])], + how="left", + predicate="intersects", +).drop(columns=["index_right"]) + +# Calculate distance from nearest listed building for each EPC record +gdf = gdf.sjoin_nearest( + points_gdf[points_gdf["ENT_REF"].isin(ground_truth["ENT_REF"])], + how="left", + max_distance=500, + distance_col="distance_from_nearest_listed_m", +) +df = gdf.drop(columns=["geometry"]) + +df = pl.from_pandas(df) +df = df.with_columns( + pl.when(pl.col("DES_REF").is_not_null()) + .then(True) + .otherwise(False) + .alias("true_match") +) +df = df.filter(pl.col("distance_from_nearest_listed_m").is_not_null()) +df.head() + +# %% +# Visualise distance for true matches vs non-matches +fig, axs = 
plt.subplots(1, 2, figsize=(10, 5)) + +axs[0].boxplot(df.filter(pl.col("true_match"))["distance_from_nearest_listed_m"]) +axs[0].set_title("True matches") +axs[0].set_ylabel("Distance from nearest listed building (m)") + +axs[1].boxplot(df.filter(~pl.col("true_match"))["distance_from_nearest_listed_m"]) +axs[1].set_title("Not matches") +plt.suptitle("Distance from nearest listed building point geom (m)") + + +# %% +fig, axs = plt.subplots(1, 1, figsize=(10, 5)) + +axs.boxplot( + df.filter(pl.col("true_match"), pl.col("distance_from_nearest_listed_m") <= 20)[ + "distance_from_nearest_listed_m" + ] +) +axs.set_title("True matches") +axs.set_ylabel("Distance from nearest listed building (m)") + +# %% +df.filter(pl.col("true_match"))["distance_from_nearest_listed_m"].describe() + +# %% [markdown] +# ## Test threshold distance + +# %% +test = epc_gdf.sjoin_nearest( + points_gdf, + how="inner", + max_distance=5, + distance_col="distance_from_nearest_listed_m", +) + +# %% +test.shape From 80ce9f8560108436cb844679c57f417d03bfa901 Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Thu, 26 Sep 2024 12:16:23 +0100 Subject: [PATCH 08/15] fix usecols bug in listed_buildings.py --- .../pipeline/prepare_features/listed_buildings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py b/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py index 456c105..68ea85c 100644 --- a/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py +++ b/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py @@ -99,7 +99,7 @@ def sjoin_df_epc_listed_buildings( Returns: pd.DataFrame: EPC UPRNs in listed buildings """ - epc_gdf = lat_lon.generate_gdf_uprn_coords(df=epc_df, usecols=["UPRN", "geometry"]) + epc_gdf = lat_lon.generate_gdf_uprn_coords(df=epc_df, usecols=["UPRN"]) if any( [ expr in 
listed_buildings_gdf.geom_type.unique() From 072c2b0051c5a14d7e48c71c4affa18520ca269b Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Thu, 26 Sep 2024 13:39:35 +0100 Subject: [PATCH 09/15] fix minor bug introduced in listed_buildings.py --- .../pipeline/prepare_features/listed_buildings.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py b/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py index 68ea85c..cceafbb 100644 --- a/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py +++ b/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py @@ -22,7 +22,7 @@ def generate_df_epc_listed_buildings( """ dfs = [] for nation in nations: - logging.info(f"Loading listed building data for {nation}") + logging.info(f"Loading listed buildings data for {nation}") gdf = transform_gdf_listed_buildings(nation) df = chunk_sjoin_df_epc_listed_buildings(epc_df, gdf) dfs.append(df) @@ -81,7 +81,7 @@ def chunk_sjoin_df_epc_listed_buildings( pl.col("listed_building").fill_null(False) ) - return df.select(["PUPRN", "listed_building"]) + return df.select(["UPRN", "listed_building"]) def sjoin_df_epc_listed_buildings( @@ -108,7 +108,7 @@ def sjoin_df_epc_listed_buildings( ): df = epc_gdf.sjoin_nearest( listed_buildings_gdf, how="inner", max_distance=distance - )["UPRN", "listed_building"].drop_duplicates(subset="UPRN") + )[["UPRN", "listed_building"]].drop_duplicates(subset="UPRN") elif any( [ expr in listed_buildings_gdf.geom_type.unique() From 6e9c38b385448d00e276fb6d2daf4822591feb04 Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Thu, 26 Sep 2024 13:46:07 +0100 Subject: [PATCH 10/15] left join listed buildings to EPC instead of inner join in listed_buildings.py --- .../pipeline/prepare_features/listed_buildings.py | 13 +++++++------ 1 file changed, 
7 insertions(+), 6 deletions(-) diff --git a/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py b/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py index cceafbb..e44ca1a 100644 --- a/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py +++ b/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py @@ -18,7 +18,7 @@ def generate_df_epc_listed_buildings( nations (list): nation(s) to load listed buildings data for. Options: "England"; "Scotland"; "Wales". Returns: - pl.DataFrame: listed buildings in EPC data + pl.DataFrame: EPC UPRNs with listed buildings flag """ dfs = [] for nation in nations: @@ -61,7 +61,7 @@ def chunk_sjoin_df_epc_listed_buildings( chunk_size (int): number of EPC rows in each partition. Default 100,000 Returns: - pl.DataFrame: EPC UPRNs in listed buildings + pl.DataFrame: EPC UPRNs with listed building flag """ partitions = ( epc_df.select(["UPRN", "X_COORDINATE", "Y_COORDINATE"]) @@ -94,10 +94,11 @@ def sjoin_df_epc_listed_buildings( Args: epc_df (pl.DataFrame): EPC dataset with X and Y coordinates per UPRN listed_buildings_gdf (gpd.GeoDataFrame): listed buildings data - distance (float): maximum distance (m) within which to query for nearest geometry where `sjoin_nearest` used. Must be greater than 0. Default 5. + distance (float): maximum distance (m) within which to query for nearest geometry where `sjoin_nearest` used. + Must be greater than 0. Default 5. 
Returns: - pd.DataFrame: EPC UPRNs in listed buildings + pd.DataFrame: EPC UPRNs with listed buildings flag """ epc_gdf = lat_lon.generate_gdf_uprn_coords(df=epc_df, usecols=["UPRN"]) if any( @@ -107,7 +108,7 @@ def sjoin_df_epc_listed_buildings( ] ): df = epc_gdf.sjoin_nearest( - listed_buildings_gdf, how="inner", max_distance=distance + listed_buildings_gdf, how="left", max_distance=distance )[["UPRN", "listed_building"]].drop_duplicates(subset="UPRN") elif any( [ @@ -115,7 +116,7 @@ def sjoin_df_epc_listed_buildings( for expr in ["Polygon", "MultiPolygon"] ] ): - df = epc_gdf.sjoin(listed_buildings_gdf, how="inner", predicate="intersects")[ + df = epc_gdf.sjoin(listed_buildings_gdf, how="left", predicate="intersects")[ ["UPRN", "listed_building"] ].drop_duplicates(subset="UPRN") else: From 6bb7d83ff27738f301e308def664c326325c9606 Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Fri, 27 Sep 2024 13:32:27 +0100 Subject: [PATCH 11/15] add world heritage sites flag for scotland - add WHS dataset to config base.yaml - rename conservation_areas.py to protected_areas.py - add functions to protected_areas.py to transform WHS data - update cons area functions in protected_areas.py to return properties in cons areas only - update listed_buildings.py functions to return properties in listed buildings only - update geospatial functions that join data to EPC to use epc geodataframe where possible instead of generating new gdf every time - changes to listed_buildings.py; output_areas.py; protected_areas.py - update run_add_features.py script to use new WHS functions and to use EPC gdf instead of EPC df --- asf_heat_pump_suitability/config/base.yaml | 1 + .../prepare_features/conservation_areas.py | 88 -------------- .../prepare_features/listed_buildings.py | 17 ++- .../pipeline/prepare_features/output_areas.py | 10 +- .../prepare_features/protected_areas.py | 110 ++++++++++++++++++ .../pipeline/run_scripts/run_add_features.py | 21 
++-- 6 files changed, 134 insertions(+), 113 deletions(-) delete mode 100644 asf_heat_pump_suitability/pipeline/prepare_features/conservation_areas.py create mode 100644 asf_heat_pump_suitability/pipeline/prepare_features/protected_areas.py diff --git a/asf_heat_pump_suitability/config/base.yaml b/asf_heat_pump_suitability/config/base.yaml index 5706d0e..4824d20 100644 --- a/asf_heat_pump_suitability/config/base.yaml +++ b/asf_heat_pump_suitability/config/base.yaml @@ -21,6 +21,7 @@ data_source: W_cadw_listed_buildings: "s3://asf-heat-pump-suitability/source_data/May2024_vMay2024_Cadw_listedbuilding_W.gpkg" S_scottish_gov_listed_buildings: "s3://asf-heat-pump-suitability/source_data/lb_scotland/Listed_Buildings.shp" EW_ons_lsoa_lad_lookup: "s3://asf-heat-pump-suitability/source_data/2021_vApr2023_ons_lsoa_to_lad_lookup_EW.csv" + S_historic_environment_scotland_world_heritage_sites: "s3://asf-heat-pump-suitability/source_data/WHS/World_Heritage_Sites.shp" usecols: epc: - COUNTRY diff --git a/asf_heat_pump_suitability/pipeline/prepare_features/conservation_areas.py b/asf_heat_pump_suitability/pipeline/prepare_features/conservation_areas.py deleted file mode 100644 index f44cf2e..0000000 --- a/asf_heat_pump_suitability/pipeline/prepare_features/conservation_areas.py +++ /dev/null @@ -1,88 +0,0 @@ -import geopandas as gpd -import pandas as pd -import numpy as np -import polars as pl -from asf_heat_pump_suitability.getters import get_datasets -from asf_heat_pump_suitability.pipeline.prepare_features import lat_lon - - -def transform_gdf_building_cons_areas() -> gpd.GeoDataFrame: - """ - Load, transform, and concatenate building conservation areas from England and Wales. Resulting GeoDataFrame is in - CRS EPSG:27700 British National Grid. 
- - Returns: - gpd.GeoDataFrame: building conservation areas in England and Wales - """ - e_gdf = ( - get_datasets.load_gdf_historic_england_conservation_areas( - columns=["name", "geometry"] - ) - .to_crs("EPSG:27700") - .rename(columns={"name": "sitename"}) - ) - w_gdf = get_datasets.load_gdf_welsh_gov_conservation_areas( - columns=["sitename", "geometry"] - ) - - gdf = pd.concat([e_gdf, w_gdf]) - - return gdf - - -def generate_df_conservation_area_data_availability( - ladcd_col: str = "LAD23CD", -) -> pl.DataFrame: - """ - Generate dataframe of UK local authority districts (LADs) with indicator of building conservation area data - availability. - - Args: - ladcd_col (str): name of column in local authority district (LAD) boundaries file with LAD codes - - Returns: - pl.DataFrame: building conservation area data availability per LAD in the UK - """ - cons_areas_gdf = transform_gdf_building_cons_areas() - council_bounds = get_datasets.load_gdf_ons_council_bounds() - - # Join conservation areas to their councils - df = council_bounds.sjoin(cons_areas_gdf, how="left", predicate="intersects")[ - [ladcd_col, "sitename"] - ].replace("No data available for publication by HE", np.nan) - - df = df.groupby(ladcd_col).agg({"sitename": "count"}) - df["lad_conservation_area_data_available"] = df["sitename"].astype(bool) - df = df.drop(columns=["sitename"]).reset_index() - - return pl.from_pandas(df) - - -def generate_df_uprn_to_cons_area(df: pl.DataFrame) -> pl.DataFrame: - """ - Generate dataframe of UPRNs located within building conservation areas in England and Wales. 
- - Args: - df (pl.DataFrame): EPC dataset with UPRN column and X and Y coordinate columns in BNG - - Returns: - pl.DataFrame: dataframe of EPC UPRNs in building conservation areas in England and Wales - """ - # Convert BNG x, y coordinates to point geometries - df = lat_lon.generate_gdf_uprn_coords(df)[["UPRN", "geometry"]] - - # Load and transform conservation areas in England and Wales - cons_areas_gdf = transform_gdf_building_cons_areas() - - # Join EPC UPRNs within or on boundaries of conservation areas - df = ( - df.sjoin(cons_areas_gdf, how="inner", predicate="intersects") - .drop(columns=["index_right", "geometry"]) - .drop_duplicates(subset="UPRN") - ) - - # Set column as boolean - df["sitename"] = df["sitename"].astype(bool) - df = df.rename(columns={"sitename": "in_conservation_area"}) - - return pl.from_pandas(df) diff --git a/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py b/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py index e44ca1a..26614b1 100644 --- a/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py +++ b/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py @@ -18,14 +18,13 @@ def generate_df_epc_listed_buildings( nations (list): nation(s) to load listed buildings data for. Options: "England"; "Scotland"; "Wales". Returns: - pl.DataFrame: EPC UPRNs with listed buildings flag + pl.DataFrame: EPC UPRNs in listed buildings """ dfs = [] for nation in nations: logging.info(f"Loading listed buildings data for {nation}") gdf = transform_gdf_listed_buildings(nation) - df = chunk_sjoin_df_epc_listed_buildings(epc_df, gdf) - dfs.append(df) + dfs.append(chunk_sjoin_df_epc_listed_buildings(epc_df, gdf)) return pl.concat(dfs, how="vertical") @@ -61,7 +60,7 @@ def chunk_sjoin_df_epc_listed_buildings( chunk_size (int): number of EPC rows in each partition. 
Default 100,000 Returns: - pl.DataFrame: EPC UPRNs with listed building flag + pl.DataFrame: EPC UPRNs in listed buildings """ partitions = ( epc_df.select(["UPRN", "X_COORDINATE", "Y_COORDINATE"]) @@ -77,9 +76,7 @@ def chunk_sjoin_df_epc_listed_buildings( df = sjoin_df_epc_listed_buildings(epc_chunk, listed_buildings_gdf) dfs.append(df) - df = pl.from_pandas(pd.concat(dfs)).with_columns( - pl.col("listed_building").fill_null(False) - ) + df = pl.from_pandas(pd.concat(dfs)) return df.select(["UPRN", "listed_building"]) @@ -98,7 +95,7 @@ def sjoin_df_epc_listed_buildings( Must be greater than 0. Default 5. Returns: - pd.DataFrame: EPC UPRNs with listed buildings flag + pd.DataFrame: EPC UPRNs in listed buildings """ epc_gdf = lat_lon.generate_gdf_uprn_coords(df=epc_df, usecols=["UPRN"]) if any( @@ -108,7 +105,7 @@ def sjoin_df_epc_listed_buildings( ] ): df = epc_gdf.sjoin_nearest( - listed_buildings_gdf, how="left", max_distance=distance + listed_buildings_gdf, how="inner", max_distance=distance )[["UPRN", "listed_building"]].drop_duplicates(subset="UPRN") elif any( [ @@ -116,7 +113,7 @@ def sjoin_df_epc_listed_buildings( for expr in ["Polygon", "MultiPolygon"] ] ): - df = epc_gdf.sjoin(listed_buildings_gdf, how="left", predicate="intersects")[ + df = epc_gdf.sjoin(listed_buildings_gdf, how="inner", predicate="intersects")[ ["UPRN", "listed_building"] ].drop_duplicates(subset="UPRN") else: diff --git a/asf_heat_pump_suitability/pipeline/prepare_features/output_areas.py b/asf_heat_pump_suitability/pipeline/prepare_features/output_areas.py index d1fb320..97ade72 100644 --- a/asf_heat_pump_suitability/pipeline/prepare_features/output_areas.py +++ b/asf_heat_pump_suitability/pipeline/prepare_features/output_areas.py @@ -1,28 +1,26 @@ import polars as pl +import geopandas as gpd from asf_heat_pump_suitability import config from asf_heat_pump_suitability.getters import get_datasets -from asf_heat_pump_suitability.pipeline.prepare_features import lat_lon -def 
sjoin_df_uprn_lad_code(df: pl.DataFrame) -> pl.DataFrame: +def sjoin_df_uprn_lad_code(gdf: gpd.GeoDataFrame) -> pl.DataFrame: """ Geospatial join between UPRNs with x,y coordinates and local authority (LAD) boundaries to match UPRNs with the code for the local authority they are located in. Null LAD codes are filled with LAD codes matched to UPRN on postcode. Args: - df (pl.DataFrame): dataframe with UPRNs; x,y coordinates; and LAD code from postcode + gdf (gpd.GeoDataFrame): dataframe with point geometries per UPRN in BNG, and LAD code from postcode Returns: pl.DataFrame: UPRNs with matched local authority code """ - gdf = lat_lon.generate_gdf_uprn_coords( - df, usecols=["UPRN", "lad_code", "X_COORDINATE", "Y_COORDINATE"] - ) lad_bounds_gdf = get_datasets.load_gdf_ons_council_bounds( columns=["LAD23CD", "geometry"] ) gdf = gdf.sjoin(lad_bounds_gdf, how="left", predicate="intersects") gdf["lad_code"] = gdf["LAD23CD"].fillna(gdf["lad_code"]) + return pl.from_pandas(gdf[["UPRN", "lad_code"]]) diff --git a/asf_heat_pump_suitability/pipeline/prepare_features/protected_areas.py b/asf_heat_pump_suitability/pipeline/prepare_features/protected_areas.py new file mode 100644 index 0000000..1ad7b4f --- /dev/null +++ b/asf_heat_pump_suitability/pipeline/prepare_features/protected_areas.py @@ -0,0 +1,110 @@ +import geopandas as gpd +import pandas as pd +import numpy as np +import polars as pl +from asf_heat_pump_suitability import config +from asf_heat_pump_suitability.getters import get_datasets + + +def generate_df_uprn_in_cons_area(gdf: gpd.GeoDataFrame) -> pl.DataFrame: + """ + Generate dataframe of UPRNs located within building conservation areas in England and Wales. 
+ + Args: + gdf (gpd.GeoDataFrame): dataframe with point geometries per UPRN in BNG + + Returns: + pl.DataFrame: EPC UPRNs in building conservation areas in England and Wales + """ + cons_areas_gdf = transform_gdf_building_cons_areas() + + gdf = gdf.sjoin( + cons_areas_gdf, how="inner", predicate="intersects" + ).drop_duplicates(subset="UPRN") + + return pl.from_pandas(gdf[["UPRN", "in_conservation_area_ew"]]) + + +def transform_gdf_building_cons_areas() -> gpd.GeoDataFrame: + """ + Load, transform, and concatenate building conservation areas from England and Wales. Resulting GeoDataFrame is in + CRS EPSG:27700 British National Grid. + + Returns: + gpd.GeoDataFrame: building conservation areas in England and Wales + """ + e_gdf = get_datasets.load_gdf_historic_england_conservation_areas( + columns=["geometry"] + ).to_crs("EPSG:27700") + w_gdf = get_datasets.load_gdf_welsh_gov_conservation_areas(columns=["geometry"]) + + gdf = pd.concat([e_gdf, w_gdf]) + gdf["in_conservation_area_ew"] = True + + return gdf + + +def generate_df_uprn_in_whs(gdf: gpd.GeoDataFrame) -> pl.DataFrame: + """ + Generate dataframe to flag UPRNs located within World Heritage Sites in Scotland. + + Args: + gdf (gpd.GeoDataFrame): dataframe with point geometries per UPRN in BNG + + Returns: + pl.DataFrame: EPC UPRNs with flag for World Heritage Sites in Scotland + """ + whs_gdf = load_transform_gdf_scottish_world_heritage_sites() + + gdf = gdf.sjoin(whs_gdf, how="left", predicate="intersects").drop_duplicates( + subset="UPRN" + ) + gdf["in_world_heritage_site_s"] = gdf["in_world_heritage_site_s"].fillna(False) + + return pl.from_pandas(gdf[["UPRN", "in_world_heritage_site_s"]]) + + +def load_transform_gdf_scottish_world_heritage_sites() -> gpd.GeoDataFrame: + """ + Load and transform Scottish World Heritage Sites geospatial data. CRS EPSG:27700, British National Grid.
+ + Returns: + gpd.GeoDataFrame: Scottish World Heritage Sites + """ + gdf = gpd.read_file( + config["data_source"]["S_historic_environment_scotland_world_heritage_sites"], + columns=["geometry"], + ) + gdf["in_world_heritage_site_s"] = True + + return gdf + + +def generate_df_conservation_area_data_availability( + ladcd_col: str = "LAD23CD", +) -> pl.DataFrame: + """ + Generate dataframe of UK local authority districts (LADs) with indicator of building conservation area data + availability. + + Args: + ladcd_col (str): name of column in local authority district (LAD) boundaries file with LAD codes + + Returns: + pl.DataFrame: building conservation area data availability per LAD in the UK + """ + cons_areas_gdf = transform_gdf_building_cons_areas() + council_bounds = get_datasets.load_gdf_ons_council_bounds() + + # Join conservation areas to their councils + df = council_bounds.sjoin(cons_areas_gdf, how="left", predicate="intersects")[ + [ladcd_col, "in_conservation_area_ew"] + ].replace("No data available for publication by HE", np.nan) + + df = df.groupby(ladcd_col).agg({"in_conservation_area_ew": "count"}) + df["lad_conservation_area_data_available_ew"] = df[ + "in_conservation_area_ew" + ].astype(bool) + df = df.drop(columns=["in_conservation_area_ew"]).reset_index() + + return pl.from_pandas(df) diff --git a/asf_heat_pump_suitability/pipeline/run_scripts/run_add_features.py b/asf_heat_pump_suitability/pipeline/run_scripts/run_add_features.py index 2170a11..5c119a1 100644 --- a/asf_heat_pump_suitability/pipeline/run_scripts/run_add_features.py +++ b/asf_heat_pump_suitability/pipeline/run_scripts/run_add_features.py @@ -17,7 +17,7 @@ import argparse from datetime import datetime from asf_heat_pump_suitability.pipeline.prepare_features import ( - conservation_areas, + protected_areas, epc, garden_space_avg, lat_lon, @@ -85,10 +85,11 @@ def parse_arguments() -> argparse.Namespace: logging.info("Adding lat/lon data to EPC") uprn_latlon_df = 
lat_lon.transform_df_osopen_uprn_latlon() epc_df = epc_df.join(uprn_latlon_df, how="left", on="UPRN") + epc_gdf = lat_lon.generate_gdf_uprn_coords(epc_df, usecols=["UPRN", "lad_code"]) # Replace `lad_code` from postcode with `lad_code` from geospatial join and postcode logging.info("Adding LAD code with geospatial join") - uprn_lad_df = output_areas.sjoin_df_uprn_lad_code(epc_df) + uprn_lad_df = output_areas.sjoin_df_uprn_lad_code(epc_gdf) epc_df = epc_df.drop("lad_code").join(uprn_lad_df, how="left", on="UPRN") # Join new features to EPC dataset @@ -104,21 +105,24 @@ def parse_arguments() -> argparse.Namespace: ) # Add feature: building conservation area flag - logging.info("Adding building conservation area flag") + logging.info("Adding building conservation area England and Wales flag") # Get UPRNs in building conservation areas - uprns_in_cons_area_df = conservation_areas.generate_df_uprn_to_cons_area(epc_df) + uprns_in_cons_area_df = protected_areas.generate_df_uprn_in_cons_area(epc_gdf) epc_df = epc_df.join(uprns_in_cons_area_df, how="left", on="UPRN") # Label local authorities with missing building conservation area data - lad_cons_areas_df = ( - conservation_areas.generate_df_conservation_area_data_availability( - ladcd_col="LAD23CD" - ) + lad_cons_areas_df = protected_areas.generate_df_conservation_area_data_availability( + ladcd_col="LAD23CD" ) epc_df = epc_df.join( lad_cons_areas_df, how="left", left_on="lad_code", right_on="LAD23CD" ) + # Add feature: World Heritage Site flag + logging.info("Adding World Heritage Site Scotland flag") + uprns_in_whs_df = protected_areas.generate_df_uprn_in_whs(epc_gdf) + epc_df = epc_df.join(uprns_in_whs_df, how="left", on="UPRN") + # Add feature: property density logging.info("Adding number of households data to EPC") lsoa_number_of_households_df = ( @@ -149,7 +153,6 @@ def parse_arguments() -> argparse.Namespace: listed_buildings_df = listed_buildings.generate_df_epc_listed_buildings( epc_df=epc_df ) - epc_df = 
epc_df.join(listed_buildings_df, how="left", on="UPRN") # Save to S3 From 27a7cfff7fd9ea554aafaaec3322287699debc60 Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Fri, 27 Sep 2024 17:35:59 +0100 Subject: [PATCH 12/15] add new run script to download INSPIRE files for Scotland to S3 - add INSPIRE scotland data source to S3 --- asf_heat_pump_suitability/config/base.yaml | 1 + .../pipeline/run_scripts/run_download_inspire.py | 0 2 files changed, 1 insertion(+) create mode 100644 asf_heat_pump_suitability/pipeline/run_scripts/run_download_inspire.py diff --git a/asf_heat_pump_suitability/config/base.yaml b/asf_heat_pump_suitability/config/base.yaml index 4824d20..baf81fc 100644 --- a/asf_heat_pump_suitability/config/base.yaml +++ b/asf_heat_pump_suitability/config/base.yaml @@ -22,6 +22,7 @@ data_source: S_scottish_gov_listed_buildings: "s3://asf-heat-pump-suitability/source_data/lb_scotland/Listed_Buildings.shp" EW_ons_lsoa_lad_lookup: "s3://asf-heat-pump-suitability/source_data/2021_vApr2023_ons_lsoa_to_lad_lookup_EW.csv" S_historic_environment_scotland_world_heritage_sites: "s3://asf-heat-pump-suitability/source_data/WHS/World_Heritage_Sites.shp" + S_ros_inspire_url: "https://ros.locationcentre.co.uk/inspire/" usecols: epc: - COUNTRY diff --git a/asf_heat_pump_suitability/pipeline/run_scripts/run_download_inspire.py b/asf_heat_pump_suitability/pipeline/run_scripts/run_download_inspire.py new file mode 100644 index 0000000..e69de29 From 2b14140bffa3ae028e724aa596adba77a88a9e3f Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Fri, 27 Sep 2024 17:37:35 +0100 Subject: [PATCH 13/15] populate script to download INSPIRE files for Scotland --- .../run_scripts/run_download_inspire.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/asf_heat_pump_suitability/pipeline/run_scripts/run_download_inspire.py 
b/asf_heat_pump_suitability/pipeline/run_scripts/run_download_inspire.py index e69de29..a5b822b 100644 --- a/asf_heat_pump_suitability/pipeline/run_scripts/run_download_inspire.py +++ b/asf_heat_pump_suitability/pipeline/run_scripts/run_download_inspire.py @@ -0,0 +1,30 @@ +""" +Script to download INSPIRE files for Scotland from ROS webpage and save to S3 asf-heat-pump-suitability bucket. +""" + +from bs4 import BeautifulSoup +import requests +import regex as re +import boto3 +from asf_heat_pump_suitability import config +from asf_heat_pump_suitability.getters import base_getters + + +if __name__ == "__main__": + ros_url = config["data_source"]["S_ros_inspire_url"] + bucket = "asf-heat-pump-suitability" + + page = requests.get(ros_url) + soup = BeautifulSoup(page.content, "html.parser") + + pattern = re.compile(f'https:.+?(?=")') + url = pattern.search(soup.find("script", string=pattern).text).group(0) + ids = {v.contents[0]: v["value"] for v in soup.find_all("option") if v["value"]} + download_urls = {k: url + v for k, v in ids.items()} + + s3 = boto3.client("s3") + for area, url in download_urls.items(): + content = base_getters.get_content_from_url(url) + s3.upload_fileobj( + content, bucket, f"source_data/inspire_zips_scotland/{area}.zip" + ) From 2f3f35ac098c04bbf9e63abfc1637245b9313e96 Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Fri, 27 Sep 2024 17:37:59 +0100 Subject: [PATCH 14/15] update base_getters.py function to make it public --- asf_heat_pump_suitability/getters/base_getters.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/asf_heat_pump_suitability/getters/base_getters.py b/asf_heat_pump_suitability/getters/base_getters.py index ef101b4..f46b4da 100644 --- a/asf_heat_pump_suitability/getters/base_getters.py +++ b/asf_heat_pump_suitability/getters/base_getters.py @@ -19,7 +19,7 @@ def get_df_from_excel_url(url: str, **kwargs) -> pl.DataFrame: Returns pl.DataFrame: dataframe
from Excel file """ - content = _get_content_from_url(url) + content = get_content_from_url(url) df = pl.read_excel(content, **kwargs) return df @@ -37,7 +37,7 @@ def get_df_from_zip_url(url: str, extract_file: str, **kwargs) -> pl.DataFrame: Returns: pl.DataFrame: dataset from ZIP file """ - content = _get_content_from_url(url) + content = get_content_from_url(url) df = pl.read_csv(ZipFile(content).open(name=extract_file), **kwargs) return df @@ -107,7 +107,7 @@ def get_content_from_s3_path(path: str) -> bytes: return content -def _get_content_from_url(url: str) -> BytesIO: +def get_content_from_url(url: str) -> BytesIO: """ Get BytesIO stream from URL. Args From 9b9f6a4ba19c9a0034b11109f5625c5794785d8a Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Fri, 27 Sep 2024 17:38:31 +0100 Subject: [PATCH 15/15] add bs4 to requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 8cbe8f1..9cb31b9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,3 +23,4 @@ boto3==1.21.21 tenacity numpy fiona +bs4