diff --git a/asf_heat_pump_suitability/config/base.yaml b/asf_heat_pump_suitability/config/base.yaml index d68cff1..baf81fc 100644 --- a/asf_heat_pump_suitability/config/base.yaml +++ b/asf_heat_pump_suitability/config/base.yaml @@ -19,7 +19,10 @@ data_source: UK_spa_offgasgrid: "s3://asf-heat-pump-suitability/source_data/2024_vMar2024_SPA_offgaspostcode_UK.xlsx" E_historicengland_listed_buildings: "s3://asf-heat-pump-suitability/source_data/Jun2024_vJul2024_HistoricEngland_listedbuilding_E.gpkg" W_cadw_listed_buildings: "s3://asf-heat-pump-suitability/source_data/May2024_vMay2024_Cadw_listedbuilding_W.gpkg" + S_scottish_gov_listed_buildings: "s3://asf-heat-pump-suitability/source_data/lb_scotland/Listed_Buildings.shp" EW_ons_lsoa_lad_lookup: "s3://asf-heat-pump-suitability/source_data/2021_vApr2023_ons_lsoa_to_lad_lookup_EW.csv" + S_historic_environment_scotland_world_heritage_sites: "s3://asf-heat-pump-suitability/source_data/WHS/World_Heritage_Sites.shp" + S_ros_inspire_url: "https://ros.locationcentre.co.uk/inspire/" usecols: epc: - COUNTRY diff --git a/asf_heat_pump_suitability/getters/base_getters.py b/asf_heat_pump_suitability/getters/base_getters.py index ef101b4..f46b4da 100644 --- a/asf_heat_pump_suitability/getters/base_getters.py +++ b/asf_heat_pump_suitability/getters/base_getters.py @@ -19,7 +19,7 @@ def get_df_from_excel_url(url: str, **kwargs) -> pl.DataFrame: Returns pl.DataFrame: dataframe from Excel file """ - content = _get_content_from_url(url) + content = get_content_from_url(url) df = pl.read_excel(content, **kwargs) return df @@ -37,7 +37,7 @@ def get_df_from_zip_url(url: str, extract_file: str, **kwargs) -> pl.DataFrame: Returns: pl.DataFrame: dataset from ZIP file """ - content = _get_content_from_url(url) + content = get_content_from_url(url) df = pl.read_csv(ZipFile(content).open(name=extract_file), **kwargs) return df @@ -107,7 +107,7 @@ def get_content_from_s3_path(path: str) -> bytes: return content -def _get_content_from_url(url: 
str) -> BytesIO: +def get_content_from_url(url: str) -> BytesIO: """ Get BytesIO stream from URL. Args diff --git a/asf_heat_pump_suitability/getters/get_datasets.py b/asf_heat_pump_suitability/getters/get_datasets.py index 5e53caf..bceaa72 100644 --- a/asf_heat_pump_suitability/getters/get_datasets.py +++ b/asf_heat_pump_suitability/getters/get_datasets.py @@ -248,7 +248,7 @@ def load_gdf_listed_buildings(nation: str, **kwargs) -> gpd.GeoDataFrame: Get raw Listed Buildings polygons dataset for specified nation. CRS EPSG:27700, British National Grid. Args: - nation (str): UK nation to load listed buildings data for. Options: "England"; "Wales". + nation (str): nation to load listed buildings data for. Options: "England"; "Scotland"; "Wales". **kwargs for `gpd.read_file()` Returns: @@ -260,6 +260,12 @@ def load_gdf_listed_buildings(nation: str, **kwargs) -> gpd.GeoDataFrame: ) elif nation.lower() == "wales": gdf = gpd.read_file(config["data_source"]["W_cadw_listed_buildings"], **kwargs) + elif nation.lower() == "scotland": + gdf = gpd.read_file( + config["data_source"]["S_scottish_gov_listed_buildings"], **kwargs + ) else: - raise ValueError("Please set `nation` to either 'England' or 'Wales'.") + raise ValueError( + "Please set `nation` to either 'England', 'Scotland', or 'Wales'." + ) return gdf diff --git a/asf_heat_pump_suitability/notebooks/20240926_explore_scotland_listed_buildings.py b/asf_heat_pump_suitability/notebooks/20240926_explore_scotland_listed_buildings.py new file mode 100644 index 0000000..540ab5a --- /dev/null +++ b/asf_heat_pump_suitability/notebooks/20240926_explore_scotland_listed_buildings.py @@ -0,0 +1,111 @@ +# %% [markdown] +# # Calculate threshold for nearest neighbour search +# +# Scotland and Wales listed buildings data is available as point geometries rather than polygons. We also have point geometries for UPRNs in the EPC dataset. In order to join EPC UPRNs to listed buildings, we need to use a nearest neighbour search.
This notebook uses ground-truth listed building polygon data for Scotland to identify the threshold of distance in metres for determining whether a UPRN is located within a listed building. + +# %% +import geopandas as gpd +import polars as pl +import matplotlib.pyplot as plt + +from asf_heat_pump_suitability.pipeline.prepare_features import lat_lon + +# %% +# Listed building point geoms Scotland +points_gdf = gpd.read_file( + "s3://asf-heat-pump-suitability/source_data/lb_scotland/Listed_Buildings.shp" +) +points_gdf = points_gdf[["ENT_REF", "ENT_TITLE", "geometry"]] + +# %% +epc = pl.read_parquet( + "s3://asf-heat-pump-suitability/outputs/2023Q4/20240904_2023_Q4_EPC_weighted_features_gardens.parquet" +) + +# %% +epc_gdf = epc.filter(pl.col("COUNTRY") == "Scotland") +epc_gdf = lat_lon.generate_gdf_uprn_coords(epc_gdf, usecols=["UPRN", "COUNTRY"]) + +# %% [markdown] +# ## Compare to ground truth + +# %% +# Listed building boundaries for Scotland (limited dataset) +boundaries = gpd.read_file( + "s3://asf-heat-pump-suitability/source_data/lb_scotland/Listed_Buildings_boundaries.shp" +) + +# %% +# Join listed building polygons to listed building points +boundaries = boundaries[["DES_REF", "DES_TITLE", "geometry"]] +ground_truth = boundaries.sjoin(points_gdf, how="inner", predicate="intersects").drop( + columns=["index_right"] +) + +# %% +# Join listed buildings to EPC using polygons to identify true matches +gdf = epc_gdf.sjoin( + boundaries[boundaries["DES_REF"].isin(ground_truth["DES_REF"])], + how="left", + predicate="intersects", +).drop(columns=["index_right"]) + +# Calculate distance from nearest listed building for each EPC record +gdf = gdf.sjoin_nearest( + points_gdf[points_gdf["ENT_REF"].isin(ground_truth["ENT_REF"])], + how="left", + max_distance=500, + distance_col="distance_from_nearest_listed_m", +) +df = gdf.drop(columns=["geometry"]) + +df = pl.from_pandas(df) +df = df.with_columns( + pl.when(pl.col("DES_REF").is_not_null()) + .then(True) + 
.otherwise(False) + .alias("true_match") +) +df = df.filter(pl.col("distance_from_nearest_listed_m").is_not_null()) +df.head() + +# %% +# Visualise distance for true matches vs non-matches +fig, axs = plt.subplots(1, 2, figsize=(10, 5)) + +axs[0].boxplot(df.filter(pl.col("true_match"))["distance_from_nearest_listed_m"]) +axs[0].set_title("True matches") +axs[0].set_ylabel("Distance from nearest listed building (m)") + +axs[1].boxplot(df.filter(~pl.col("true_match"))["distance_from_nearest_listed_m"]) +axs[1].set_title("Not matches") +plt.suptitle("Distance from nearest listed building point geom (m)") + + +# %% +fig, axs = plt.subplots(1, 1, figsize=(10, 5)) + +axs.boxplot( + df.filter(pl.col("true_match"), pl.col("distance_from_nearest_listed_m") <= 20)[ + "distance_from_nearest_listed_m" + ] +) +axs.set_title("True matches") +axs.set_ylabel("Distance from nearest listed building (m)") + +# %% +df.filter(pl.col("true_match"))["distance_from_nearest_listed_m"].describe() + +# %% [markdown] +# ## Test threshold distance + +# %% +test = epc_gdf.sjoin_nearest( + points_gdf, + how="inner", + max_distance=5, + distance_col="distance_from_nearest_listed_m", +) + +# %% +test.shape diff --git a/asf_heat_pump_suitability/pipeline/prepare_features/conservation_areas.py b/asf_heat_pump_suitability/pipeline/prepare_features/conservation_areas.py deleted file mode 100644 index f44cf2e..0000000 --- a/asf_heat_pump_suitability/pipeline/prepare_features/conservation_areas.py +++ /dev/null @@ -1,88 +0,0 @@ -import geopandas as gpd -import pandas as pd -import numpy as np -import polars as pl -from asf_heat_pump_suitability.getters import get_datasets -from asf_heat_pump_suitability.pipeline.prepare_features import lat_lon - - -def transform_gdf_building_cons_areas() -> gpd.GeoDataFrame: - """ - Load, transform, and concatenate building conservation areas from England and Wales. Resulting GeoDataFrame is in - CRS EPSG:27700 British National Grid. 
- - Returns: - gpd.GeoDataFrame: building conservation areas in England and Wales - """ - e_gdf = ( - get_datasets.load_gdf_historic_england_conservation_areas( - columns=["name", "geometry"] - ) - .to_crs("EPSG:27700") - .rename(columns={"name": "sitename"}) - ) - w_gdf = get_datasets.load_gdf_welsh_gov_conservation_areas( - columns=["sitename", "geometry"] - ) - - gdf = pd.concat([e_gdf, w_gdf]) - - return gdf - - -def generate_df_conservation_area_data_availability( - ladcd_col: str = "LAD23CD", -) -> pl.DataFrame: - """ - Generate dataframe of UK local authority districts (LADs) with indicator of building conservation area data - availability. - - Args: - ladcd_col (str): name of column in local authority district (LAD) boundaries file with LAD codes - - Returns: - pl.DataFrame: building conservation area data availability per LAD in the UK - """ - cons_areas_gdf = transform_gdf_building_cons_areas() - council_bounds = get_datasets.load_gdf_ons_council_bounds() - - # Join conservation areas to their councils - df = council_bounds.sjoin(cons_areas_gdf, how="left", predicate="intersects")[ - [ladcd_col, "sitename"] - ].replace("No data available for publication by HE", np.nan) - - df = df.groupby(ladcd_col).agg({"sitename": "count"}) - df["lad_conservation_area_data_available"] = df["sitename"].astype(bool) - df = df.drop(columns=["sitename"]).reset_index() - - return pl.from_pandas(df) - - -def generate_df_uprn_to_cons_area(df: pl.DataFrame) -> pl.DataFrame: - """ - Generate dataframe of UPRNs located within building conservation areas in England and Wales. 
- - Args: - df (pl.DataFrame): EPC dataset with UPRN column and X and Y coordinate columns in BNG - - Returns: - pl.DataFrame: dataframe of EPC UPRNs in building conservation areas in England and Wales - """ - # Convert BNG x, y coordinates to point geometries - df = lat_lon.generate_gdf_uprn_coords(df)[["UPRN", "geometry"]] - - # Load and transform conservation areas in England and Wales - cons_areas_gdf = transform_gdf_building_cons_areas() - - # Join EPC UPRNs within or on boundaries of conservation areas - df = ( - df.sjoin(cons_areas_gdf, how="inner", predicate="intersects") - .drop(columns=["index_right", "geometry"]) - .drop_duplicates(subset="UPRN") - ) - - # Set column as boolean - df["sitename"] = df["sitename"].astype(bool) - df = df.rename(columns={"sitename": "in_conservation_area"}) - - return pl.from_pandas(df) diff --git a/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py b/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py index 3ec4385..26614b1 100644 --- a/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py +++ b/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py @@ -7,38 +7,60 @@ from asf_heat_pump_suitability.pipeline.prepare_features import lat_lon +def generate_df_epc_listed_buildings( + epc_df: pl.DataFrame, nations: list = ["England", "Scotland", "Wales"] +) -> pl.DataFrame: + """ + Generate dataframe of listed buildings in EPC data in specified nation(s). + + Args: + epc_df (pl.DataFrame): EPC dataset with x, y coordinates per UPRN + nations (list): nation(s) to load listed buildings data for. Options: "England"; "Scotland"; "Wales". 
+ + Returns: + pl.DataFrame: EPC UPRNs in listed buildings + """ + dfs = [] + for nation in nations: + logging.info(f"Loading listed buildings data for {nation}") + gdf = transform_gdf_listed_buildings(nation) + dfs.append(chunk_sjoin_df_epc_listed_buildings(epc_df, gdf)) + + return pl.concat(dfs, how="vertical") + + def transform_gdf_listed_buildings(nation: str) -> gpd.GeoDataFrame: """ Load and transform listed buildings polygons dataset for specified nation. Args: - nation (str): UK nation to load listed buildings data for. Options: "England"; "Wales". + nation (str): nation to load listed buildings data for. Options: "England"; "Scotland"; "Wales". Returns: gpd.GeoDataFrame: listed buildings dataset for specified nation with grade and geometry columns. """ - gdf = get_datasets.load_gdf_listed_buildings(nation, columns=["Grade", "geometry"]) - gdf = gdf.drop_duplicates(subset="geometry").rename( - columns={"Grade": "listed_building_grade"} - ) + gdf = get_datasets.load_gdf_listed_buildings(nation, columns=["geometry"]) + gdf = gdf.drop_duplicates(subset="geometry") + gdf["listed_building"] = True return gdf -def sjoin_df_epc_with_listed_buildings( +def chunk_sjoin_df_epc_listed_buildings( epc_df: pl.DataFrame, listed_buildings_gdf: gpd.GeoDataFrame, chunk_size: int = 100000, ) -> pl.DataFrame: """ - Spatial join EPC UPRNs with listed buildings polygons to return dataframe of EPC UPRNs which are located in listed - buildings, along with the building grade. + Chunk EPC data and spatial join EPC UPRNs with listed buildings. + Args: epc_df (pl.DataFrame): Enhanced EPC dataset with X and Y coordinates - listed_buildings_gdf (gpd.GeoDataFrame): listed buildings polygons dataset + listed_buildings_gdf (gpd.GeoDataFrame): listed buildings GeoDataFrame. Can be points / polygons. chunk_size (int): number of EPC rows in each partition. 
Default 100,000 + Returns: - pl.DataFrame: EPC UPRNs with listed buildings grade + pl.DataFrame: EPC UPRNs in listed buildings """ partitions = ( epc_df.select(["UPRN", "X_COORDINATE", "Y_COORDINATE"]) @@ -47,18 +69,57 @@ def sjoin_df_epc_with_listed_buildings( ) dfs = [] - # Group based on the created index data_partitioned = epc_df.with_columns(partitions).partition_by("chunk_id") logging.info(f"Adding listed buildings to EPC in {len(data_partitioned)} chunks") for epc_chunk in tqdm(data_partitioned): - epc_gdf = lat_lon.generate_gdf_uprn_coords(df=epc_chunk)[["UPRN", "geometry"]] - df = epc_gdf.sjoin(listed_buildings_gdf, how="inner", predicate="intersects")[ - ["UPRN", "listed_building_grade"] - ].drop_duplicates(subset="UPRN") - + df = sjoin_df_epc_listed_buildings(epc_chunk, listed_buildings_gdf) dfs.append(df) df = pl.from_pandas(pd.concat(dfs)) + return df.select(["UPRN", "listed_building"]) + + +def sjoin_df_epc_listed_buildings( + epc_df: pl.DataFrame, listed_buildings_gdf: gpd.GeoDataFrame, distance: float = 5 +) -> pd.DataFrame: + """ + Spatial join EPC UPRNs with listed buildings using `geopandas.GeoDataFrame.sjoin_nearest` where Point or MultiPoint + geometries detected, and `geopandas.GeoDataFrame.sjoin` where Polygons or MultiPolygons detected. + + Args: + epc_df (pl.DataFrame): EPC dataset with X and Y coordinates per UPRN + listed_buildings_gdf (gpd.GeoDataFrame): listed buildings data + distance (float): maximum distance (m) within which to query for nearest geometry where `sjoin_nearest` used. + Must be greater than 0. Default 5. 
+ + Returns: + pd.DataFrame: EPC UPRNs in listed buildings + """ + epc_gdf = lat_lon.generate_gdf_uprn_coords(df=epc_df, usecols=["UPRN"]) + if any( + [ + expr in listed_buildings_gdf.geom_type.unique() + for expr in ["Point", "MultiPoint"] + ] + ): + df = epc_gdf.sjoin_nearest( + listed_buildings_gdf, how="inner", max_distance=distance + )[["UPRN", "listed_building"]].drop_duplicates(subset="UPRN") + elif any( + [ + expr in listed_buildings_gdf.geom_type.unique() + for expr in ["Polygon", "MultiPolygon"] + ] + ): + df = epc_gdf.sjoin(listed_buildings_gdf, how="inner", predicate="intersects")[ + ["UPRN", "listed_building"] + ].drop_duplicates(subset="UPRN") + else: + raise ValueError( + f"Listed buildings GeoDataFrame does not have appropriate geometries for sjoin. " + f"Geometries required: [Multi]Point or [Multi]Polygon. " + f"Geometries found: {listed_buildings_gdf.geom_type.unique()}" + ) return df diff --git a/asf_heat_pump_suitability/pipeline/prepare_features/output_areas.py b/asf_heat_pump_suitability/pipeline/prepare_features/output_areas.py index d1fb320..97ade72 100644 --- a/asf_heat_pump_suitability/pipeline/prepare_features/output_areas.py +++ b/asf_heat_pump_suitability/pipeline/prepare_features/output_areas.py @@ -1,28 +1,26 @@ import polars as pl +import geopandas as gpd from asf_heat_pump_suitability import config from asf_heat_pump_suitability.getters import get_datasets -from asf_heat_pump_suitability.pipeline.prepare_features import lat_lon -def sjoin_df_uprn_lad_code(df: pl.DataFrame) -> pl.DataFrame: +def sjoin_df_uprn_lad_code(gdf: gpd.GeoDataFrame) -> pl.DataFrame: """ Geospatial join between UPRNs with x,y coordinates and local authority (LAD) boundaries to match UPRNs with the code for the local authority they are located in. Null LAD codes are filled with LAD codes matched to UPRN on postcode. 
Args: - df (pl.DataFrame): dataframe with UPRNs; x,y coordinates; and LAD code from postcode + gdf (gpd.GeoDataFrame): dataframe with point geometries per UPRN in BNG, and LAD code from postcode Returns: pl.DataFrame: UPRNs with matched local authority code """ - gdf = lat_lon.generate_gdf_uprn_coords( - df, usecols=["UPRN", "lad_code", "X_COORDINATE", "Y_COORDINATE"] - ) lad_bounds_gdf = get_datasets.load_gdf_ons_council_bounds( columns=["LAD23CD", "geometry"] ) gdf = gdf.sjoin(lad_bounds_gdf, how="left", predicate="intersects") gdf["lad_code"] = gdf["LAD23CD"].fillna(gdf["lad_code"]) + return pl.from_pandas(gdf[["UPRN", "lad_code"]]) diff --git a/asf_heat_pump_suitability/pipeline/prepare_features/protected_areas.py b/asf_heat_pump_suitability/pipeline/prepare_features/protected_areas.py new file mode 100644 index 0000000..1ad7b4f --- /dev/null +++ b/asf_heat_pump_suitability/pipeline/prepare_features/protected_areas.py @@ -0,0 +1,110 @@ +import geopandas as gpd +import pandas as pd +import numpy as np +import polars as pl +from asf_heat_pump_suitability import config +from asf_heat_pump_suitability.getters import get_datasets + + +def generate_df_uprn_in_cons_area(gdf: gpd.GeoDataFrame) -> pl.DataFrame: + """ + Generate dataframe of UPRNs located within building conservation areas in England and Wales. + + Args: + gdf (gpd.GeoDataFrame): dataframe with point geometries per UPRN in BNG + + Returns: + pl.DataFrame: EPC UPRNs in building conservation areas in England and Wales + """ + cons_areas_gdf = transform_gdf_building_cons_areas() + + gdf = gdf.sjoin( + cons_areas_gdf, how="inner", predicate="intersects" + ).drop_duplicates(subset="UPRN") + + return pl.from_pandas(gdf[["UPRN", "in_conservation_area_ew"]]) + + +def transform_gdf_building_cons_areas() -> gpd.GeoDataFrame: + """ + Load, transform, and concatenate building conservation areas from England and Wales. Resulting GeoDataFrame is in + CRS EPSG:27700 British National Grid.
+ + Returns: + gpd.GeoDataFrame: building conservation areas in England and Wales + """ + e_gdf = get_datasets.load_gdf_historic_england_conservation_areas( + columns=["geometry"] + ).to_crs("EPSG:27700") + w_gdf = get_datasets.load_gdf_welsh_gov_conservation_areas(columns=["geometry"]) + + gdf = pd.concat([e_gdf, w_gdf]) + gdf["in_conservation_area_ew"] = True + + return gdf + + +def generate_df_uprn_in_whs(gdf: gpd.GeoDataFrame) -> pl.DataFrame: + """ + Generate dataframe to flag UPRNs located within World Heritage Sites in Scotland. + + Args: + gdf (gpd.GeoDataFrame): dataframe with point geometries per UPRN in BNG + + Returns: + pl.DataFrame: EPC UPRNs with flag for World Heritage Sites in Scotland + """ + whs_gdf = load_transform_gdf_scottish_world_heritage_sites() + + gdf = gdf.sjoin(whs_gdf, how="left", predicate="intersects").drop_duplicates( + subset="UPRN" + ) + gdf["in_world_heritage_site_s"] = gdf["in_world_heritage_site_s"].fillna(False) + + return pl.from_pandas(gdf[["UPRN", "in_world_heritage_site_s"]]) + + +def load_transform_gdf_scottish_world_heritage_sites() -> gpd.GeoDataFrame: + """ + Load and transform Scottish World Heritage Sites geospatial data. CRS EPSG:27700, British National Grid. + + Returns: + gpd.GeoDataFrame: Scottish World Heritage Sites + """ + gdf = gpd.read_file( + config["data_source"]["S_historic_environment_scotland_world_heritage_sites"], + columns=["geometry"], + ) + gdf["in_world_heritage_site_s"] = True + + return gdf + + +def generate_df_conservation_area_data_availability( + ladcd_col: str = "LAD23CD", +) -> pl.DataFrame: + """ + Generate dataframe of UK local authority districts (LADs) with indicator of building conservation area data + availability.
+ + Args: + ladcd_col (str): name of column in local authority district (LAD) boundaries file with LAD codes + + Returns: + pl.DataFrame: building conservation area data availability per LAD in the UK + """ + cons_areas_gdf = transform_gdf_building_cons_areas() + council_bounds = get_datasets.load_gdf_ons_council_bounds() + + # Join conservation areas to their councils + df = council_bounds.sjoin(cons_areas_gdf, how="left", predicate="intersects")[ + [ladcd_col, "in_conservation_area_ew"] + ].replace("No data available for publication by HE", np.nan) + + df = df.groupby(ladcd_col).agg({"in_conservation_area_ew": "count"}) + df["lad_conservation_area_data_available_ew"] = df[ + "in_conservation_area_ew" + ].astype(bool) + df = df.drop(columns=["in_conservation_area_ew"]).reset_index() + + return pl.from_pandas(df) diff --git a/asf_heat_pump_suitability/pipeline/run_scripts/run_add_features.py b/asf_heat_pump_suitability/pipeline/run_scripts/run_add_features.py index f45fe89..5c119a1 100644 --- a/asf_heat_pump_suitability/pipeline/run_scripts/run_add_features.py +++ b/asf_heat_pump_suitability/pipeline/run_scripts/run_add_features.py @@ -17,7 +17,7 @@ import argparse from datetime import datetime from asf_heat_pump_suitability.pipeline.prepare_features import ( - conservation_areas, + protected_areas, epc, garden_space_avg, lat_lon, @@ -85,10 +85,11 @@ def parse_arguments() -> argparse.Namespace: logging.info("Adding lat/lon data to EPC") uprn_latlon_df = lat_lon.transform_df_osopen_uprn_latlon() epc_df = epc_df.join(uprn_latlon_df, how="left", on="UPRN") + epc_gdf = lat_lon.generate_gdf_uprn_coords(epc_df, usecols=["UPRN", "lad_code"]) # Replace `lad_code` from postcode with `lad_code` from geospatial join and postcode logging.info("Adding LAD code with geospatial join") - uprn_lad_df = output_areas.sjoin_df_uprn_lad_code(epc_df) + uprn_lad_df = output_areas.sjoin_df_uprn_lad_code(epc_gdf) epc_df = epc_df.drop("lad_code").join(uprn_lad_df, how="left", on="UPRN") # 
Join new features to EPC dataset @@ -104,21 +105,24 @@ def parse_arguments() -> argparse.Namespace: ) # Add feature: building conservation area flag - logging.info("Adding building conservation area flag") + logging.info("Adding building conservation area England and Wales flag") # Get UPRNs in building conservation areas - uprns_in_cons_area_df = conservation_areas.generate_df_uprn_to_cons_area(epc_df) + uprns_in_cons_area_df = protected_areas.generate_df_uprn_in_cons_area(epc_gdf) epc_df = epc_df.join(uprns_in_cons_area_df, how="left", on="UPRN") # Label local authorities with missing building conservation area data - lad_cons_areas_df = ( - conservation_areas.generate_df_conservation_area_data_availability( - ladcd_col="LAD23CD" - ) + lad_cons_areas_df = protected_areas.generate_df_conservation_area_data_availability( + ladcd_col="LAD23CD" ) epc_df = epc_df.join( lad_cons_areas_df, how="left", left_on="lad_code", right_on="LAD23CD" ) + # Add feature: World Heritage Site flag + logging.info("Adding World Heritage Site Scotland flag") + uprns_in_whs_df = protected_areas.generate_df_uprn_in_whs(epc_gdf) + epc_df = epc_df.join(uprns_in_whs_df, how="left", on="UPRN") + # Add feature: property density logging.info("Adding number of households data to EPC") lsoa_number_of_households_df = ( @@ -145,20 +149,9 @@ def parse_arguments() -> argparse.Namespace: epc_df = off_gas.add_off_gas_feature(epc_df, off_gas_postcodes) # Add feature: listed buildings data - logging.info("Loading listed buildings for England") - e_listed_buildings_df = listed_buildings.transform_gdf_listed_buildings("England") - e_listed_buildings_df = listed_buildings.sjoin_df_epc_with_listed_buildings( - epc_df, e_listed_buildings_df - ) - - logging.info("Loading listed buildings for Wales") - w_listed_buildings_df = listed_buildings.transform_gdf_listed_buildings("Wales") - w_listed_buildings_df = listed_buildings.sjoin_df_epc_with_listed_buildings( - epc_df, w_listed_buildings_df - ) - - 
listed_buildings_df = pl.concat( - [e_listed_buildings_df, w_listed_buildings_df], how="vertical" + logging.info("Adding listed buildings to EPC") + listed_buildings_df = listed_buildings.generate_df_epc_listed_buildings( + epc_df=epc_df ) epc_df = epc_df.join(listed_buildings_df, how="left", on="UPRN") diff --git a/asf_heat_pump_suitability/pipeline/run_scripts/run_download_inspire.py b/asf_heat_pump_suitability/pipeline/run_scripts/run_download_inspire.py new file mode 100644 index 0000000..a5b822b --- /dev/null +++ b/asf_heat_pump_suitability/pipeline/run_scripts/run_download_inspire.py @@ -0,0 +1,30 @@ +""" +Script to download INSPIRE files for Scotland from ROS webpage and save to S3 asf-heat-pump-suitability bucket. +""" + +from bs4 import BeautifulSoup +import requests +import regex as re +import boto3 +from asf_heat_pump_suitability import config +from asf_heat_pump_suitability.getters import base_getters + + +if __name__ == "__main__": + ros_url = config["data_source"]["S_ros_inspire_url"] + bucket = "asf-heat-pump-suitability" + + page = requests.get(ros_url) + soup = BeautifulSoup(page.content, "html.parser") + + pattern = re.compile(f'https:.+?(?=")') + url = pattern.search(soup.find("script", string=pattern).text).group(0) + ids = {v.contents[0]: v["value"] for v in soup.find_all("option") if v["value"]} + download_urls = {k: url + v for k, v in ids.items()} + + s3 = boto3.client("s3") + for area, url in download_urls.items(): + content = base_getters.get_content_from_url(url) + s3.upload_fileobj( + content, bucket, f"source_data/inspire_zips_scotland/{area}.zip" + ) diff --git a/asf_heat_pump_suitability/pipeline/suitability/calculate_suitability.py b/asf_heat_pump_suitability/pipeline/suitability/calculate_suitability.py index f08b1fb..82ded7c 100644 --- a/asf_heat_pump_suitability/pipeline/suitability/calculate_suitability.py +++ b/asf_heat_pump_suitability/pipeline/suitability/calculate_suitability.py @@ -190,14 +190,6 @@ def get_enhanced_epc(path) 
-> pl.DataFrame: df = df.filter( ~pl.col("UPRN").str.contains("dummy"), pl.col("COUNTRY") != "Scotland" ) - - df = df.with_columns( - pl.when(pl.col("listed_building_grade").is_null()) - .then(False) - .otherwise(True) - .alias("listed_building"), - ) - return df diff --git a/requirements.txt b/requirements.txt index 8cbe8f1..9cb31b9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,3 +23,4 @@ boto3==1.21.21 tenacity numpy fiona +bs4