nestauk · crispy-wonton · Sep 25, 2024 · Sep 25, 2024 · Sep 26, 2024 · Sep 26, 2024
diff --git a/asf_heat_pump_suitability/config/base.yaml b/asf_heat_pump_suitability/config/base.yaml
@@ -19,7 +19,10 @@ data_source:
   UK_spa_offgasgrid: "s3://asf-heat-pump-suitability/source_data/2024_vMar2024_SPA_offgaspostcode_UK.xlsx"
   E_historicengland_listed_buildings: "s3://asf-heat-pump-suitability/source_data/Jun2024_vJul2024_HistoricEngland_listedbuilding_E.gpkg"
   W_cadw_listed_buildings: "s3://asf-heat-pump-suitability/source_data/May2024_vMay2024_Cadw_listedbuilding_W.gpkg"
+  S_scottish_gov_listed_buildings: "s3://asf-heat-pump-suitability/source_data/lb_scotland/Listed_Buildings.shp"
   EW_ons_lsoa_lad_lookup: "s3://asf-heat-pump-suitability/source_data/2021_vApr2023_ons_lsoa_to_lad_lookup_EW.csv"
+  S_historic_environment_scotland_world_heritage_sites: "s3://asf-heat-pump-suitability/source_data/WHS/World_Heritage_Sites.shp"
+  S_ros_inspire_url: "https://ros.locationcentre.co.uk/inspire/"
 usecols:
   epc:
     - COUNTRY

diff --git a/asf_heat_pump_suitability/getters/base_getters.py b/asf_heat_pump_suitability/getters/base_getters.py
@@ -19,7 +19,7 @@ def get_df_from_excel_url(url: str, **kwargs) -> pl.DataFrame:
     Returns
         pl.DataFrame: dataframe from Excel file
     """
-    content = _get_content_from_url(url)
+    content = get_content_from_url(url)
     df = pl.read_excel(content, **kwargs)
 
     return df
@@ -37,7 +37,7 @@ def get_df_from_zip_url(url: str, extract_file: str, **kwargs) -> pl.DataFrame:
     Returns:
         pl.DataFrame: dataset from ZIP file
     """
-    content = _get_content_from_url(url)
+    content = get_content_from_url(url)
     df = pl.read_csv(ZipFile(content).open(name=extract_file), **kwargs)
 
     return df
@@ -107,7 +107,7 @@ def get_content_from_s3_path(path: str) -> bytes:
     return content
 
 
-def _get_content_from_url(url: str) -> BytesIO:
+def get_content_from_url(url: str) -> BytesIO:
     """
     Get BytesIO stream from URL.
     Args

diff --git a/asf_heat_pump_suitability/getters/get_datasets.py b/asf_heat_pump_suitability/getters/get_datasets.py
@@ -248,7 +248,7 @@ def load_gdf_listed_buildings(nation: str, **kwargs) -> gpd.GeoDataFrame:
     Get raw Listed Buildings polygons dataset for specified nation. CRS EPSG:27700, British National Grid.
 
     Args:
-        nation (str): UK nation to load listed buildings data for. Options: "England"; "Wales".
+        nation (str): nation to load listed buildings data for. Options: "England"; "Scotland", "Wales".
         **kwargs for `gpd.read_file()`
 
     Returns:
@@ -260,6 +260,12 @@ def load_gdf_listed_buildings(nation: str, **kwargs) -> gpd.GeoDataFrame:
         )
     elif nation.lower() == "wales":
         gdf = gpd.read_file(config["data_source"]["W_cadw_listed_buildings"], **kwargs)
+    elif nation.lower() == "scotland":
+        gdf = gpd.read_file(
+            config["data_source"]["S_scottish_gov_listed_buildings"], **kwargs
+        )
     else:
-        raise ValueError("Please set `nation` to either 'England' or 'Wales'.")
+        raise ValueError(
+            "Please set `nation` to either 'England', 'Scotland', or 'Wales'."
+        )
     return gdf
diff --git a/asf_heat_pump_suitability/notebooks/20240926_explore_scotland_listed_buildings.py b/asf_heat_pump_suitability/notebooks/20240926_explore_scotland_listed_buildings.py
@@ -0,0 +1,111 @@
+# %% [markdown]
+# # Calculate threshold for nearest neighbour search
+#
+# Scotland and Wales listed buildings data is available as point geometries rather than polygons. We also have point geometries for UPRNs in the EPC dataset. In order to join EPC UPRNs to listed buildings, we need to use a nearest neighbour search. This notebook uses ground-truth listed building polygon data for Scotland to identify the threshold of distance in metres for determining whether a UPRN is located within a listed building.
+
+# %%
+import geopandas as gpd
+import polars as pl
+import matplotlib.pyplot as plt
+
+from asf_heat_pump_suitability.pipeline.prepare_features import lat_lon
+
+# %%
+# Listed building point geoms Scotland
+points_gdf = gpd.read_file(
+    "s3://asf-heat-pump-suitability/source_data/lb_scotland/Listed_Buildings.shp"
+)
+points_gdf = points_gdf[["ENT_REF", "ENT_TITLE", "geometry"]]
+
+# %%
+epc = pl.read_parquet(
+    "s3://asf-heat-pump-suitability/outputs/2023Q4/20240904_2023_Q4_EPC_weighted_features_gardens.parquet"
+)
+
+# %%
+epc_gdf = epc.filter(pl.col("COUNTRY") == "Scotland")
+epc_gdf = lat_lon.generate_gdf_uprn_coords(epc_gdf, usecols=["UPRN", "COUNTRY"])
+
+# %% [markdown]
+# ## Compare to ground truth
+
+# %%
+# Listed building boundaries for Scotland (limited dataset)
+boundaries = gpd.read_file(
+    "s3://asf-heat-pump-suitability/source_data/lb_scotland/Listed_Buildings_boundaries.shp"
+)
+
+# %%
+# Join listed building polygons to listed building points
+boundaries = boundaries[["DES_REF", "DES_TITLE", "geometry"]]
+ground_truth = boundaries.sjoin(points_gdf, how="inner", predicate="intersects").drop(
+    columns=["index_right"]
+)
+
+# %%
+# Join listed buildings to EPC using polygons to identify true matches
+gdf = epc_gdf.sjoin(
+    boundaries[boundaries["DES_REF"].isin(ground_truth["DES_REF"])],
+    how="left",
+    predicate="intersects",
+).drop(columns=["index_right"])
+
+# Calculate distance from nearest listed building for each EPC record
+gdf = gdf.sjoin_nearest(
+    points_gdf[points_gdf["ENT_REF"].isin(ground_truth["ENT_REF"])],
+    how="left",
+    max_distance=500,
+    distance_col="distance_from_nearest_listed_m",
+)
+df = gdf.drop(columns=["geometry"])
+
+df = pl.from_pandas(df)
+df = df.with_columns(
+    pl.when(pl.col("DES_REF").is_not_null())
+    .then(True)
+    .otherwise(False)
+    .alias("true_match")
+)
+df = df.filter(pl.col("distance_from_nearest_listed_m").is_not_null())
+df.head()
+
+# %%
+# Visualise distance for true matches vs non-matches
+fig, axs = plt.subplots(1, 2, figsize=(10, 5))
+
+axs[0].boxplot(df.filter(pl.col("true_match"))["distance_from_nearest_listed_m"])
+axs[0].set_title("True matches")
+axs[0].set_ylabel("Distance from nearest listed building (m)")
+
+axs[1].boxplot(df.filter(~pl.col("true_match"))["distance_from_nearest_listed_m"])
+axs[1].set_title("Not matches")
+plt.suptitle("Distance from nearest listed building point geom (m)")
+
+
+# %%
+fig, axs = plt.subplots(1, 1, figsize=(10, 5))
+
+axs.boxplot(
+    df.filter(pl.col("true_match"), pl.col("distance_from_nearest_listed_m") <= 20)[
+        "distance_from_nearest_listed_m"
+    ]
+)
+axs.set_title("True matches")
+axs.set_ylabel("Distance from nearest listed building (m)")
+
+# %%
+df.filter(pl.col("true_match"))["distance_from_nearest_listed_m"].describe()
+
+# %% [markdown]
+# ## Test threshold distance
+
+# %%
+test = epc_gdf.sjoin_nearest(
+    points_gdf,
+    how="inner",
+    max_distance=5,
+    distance_col="distance_from_nearest_listed_m",
+)
+
+# %%
+test.shape
diff --git a/asf_heat_pump_suitability/pipeline/prepare_features/conservation_areas.py b/asf_heat_pump_suitability/pipeline/prepare_features/conservation_areas.py
diff --git a/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py b/asf_heat_pump_suitability/pipeline/prepare_features/listed_buildings.py
@@ -7,38 +7,60 @@
 from asf_heat_pump_suitability.pipeline.prepare_features import lat_lon
 
 
+def generate_df_epc_listed_buildings(
+    epc_df: pl.DataFrame, nations: list = ["England", "Scotland", "Wales"]
+) -> pl.DataFrame:
+    """
+    Generate dataframe of listed buildings in EPC data in specified nation(s).
+
+    Args:
+        epc_df (pl.DataFrame): EPC dataset with x, y coordinates per UPRN
+        nations (list): nation(s) to load listed buildings data for. Options: "England"; "Scotland"; "Wales".
+
+    Returns:
+        pl.DataFrame: EPC UPRNs in listed buildings
+    """
+    dfs = []
+    for nation in nations:
+        logging.info(f"Loading listed buildings data for {nation}")
+        gdf = transform_gdf_listed_buildings(nation)
+        dfs.append(chunk_sjoin_df_epc_listed_buildings(epc_df, gdf))
+
+    return pl.concat(dfs, how="vertical")
+
+
 def transform_gdf_listed_buildings(nation: str) -> gpd.GeoDataFrame:
     """
     Load and transform listed buildings polygons dataset for specified nation.
 
     Args:
-        nation (str): UK nation to load listed buildings data for. Options: "England"; "Wales".
+        nation (str): nation to load listed buildings data for. Options: "England"; "Scotland"; "Wales".
 
     Returns:
         gpd.GeoDataFrame: listed buildings dataset for specified nation with grade and geometry columns.
     """
-    gdf = get_datasets.load_gdf_listed_buildings(nation, columns=["Grade", "geometry"])
-    gdf = gdf.drop_duplicates(subset="geometry").rename(
-        columns={"Grade": "listed_building_grade"}
-    )
+    gdf = get_datasets.load_gdf_listed_buildings(nation, columns=["geometry"])
+    gdf = gdf.drop_duplicates(subset="geometry")
+    gdf["listed_building"] = True
 
     return gdf
 
 
-def sjoin_df_epc_with_listed_buildings(
+def chunk_sjoin_df_epc_listed_buildings(
     epc_df: pl.DataFrame,
     listed_buildings_gdf: gpd.GeoDataFrame,
     chunk_size: int = 100000,
 ) -> pl.DataFrame:
     """
-    Spatial join EPC UPRNs with listed buildings polygons to return dataframe of EPC UPRNs which are located in listed
-    buildings, along with the building grade.
+    Chunk EPC data and spatial join EPC UPRNs with listed buildings.
+
     Args:
         epc_df (pl.DataFrame): Enhanced EPC dataset with X and Y coordinates
-        listed_buildings_gdf (gpd.GeoDataFrame): listed buildings polygons dataset
+        listed_buildings_gdf (gpd.GeoDataFrame): listed buildings GeoDataFrame. Can be points / polygons.
         chunk_size (int): number of EPC rows in each partition. Default 100,000
+
     Returns:
-        pl.DataFrame: EPC UPRNs with listed buildings grade
+        pl.DataFrame: EPC UPRNs in listed buildings
     """
     partitions = (
         epc_df.select(["UPRN", "X_COORDINATE", "Y_COORDINATE"])
@@ -47,18 +69,57 @@ def sjoin_df_epc_with_listed_buildings(
     )
 
     dfs = []
-
     # Group based on the created index
     data_partitioned = epc_df.with_columns(partitions).partition_by("chunk_id")
     logging.info(f"Adding listed buildings to EPC in {len(data_partitioned)} chunks")
     for epc_chunk in tqdm(data_partitioned):
-        epc_gdf = lat_lon.generate_gdf_uprn_coords(df=epc_chunk)[["UPRN", "geometry"]]
-        df = epc_gdf.sjoin(listed_buildings_gdf, how="inner", predicate="intersects")[
-            ["UPRN", "listed_building_grade"]
-        ].drop_duplicates(subset="UPRN")
-
+        df = sjoin_df_epc_listed_buildings(epc_chunk, listed_buildings_gdf)
         dfs.append(df)
 
     df = pl.from_pandas(pd.concat(dfs))
 
+    return df.select(["UPRN", "listed_building"])
+
+
+def sjoin_df_epc_listed_buildings(
+    epc_df: pl.DataFrame, listed_buildings_gdf: gpd.GeoDataFrame, distance: float = 5
+) -> pd.DataFrame:
+    """
+    Spatial join EPC UPRNs with listed buildings using `geopandas.GeoDataFrame.sjoin_nearest` where Point or MultiPoint
+    geometries detected, and `geopandas.GeoDataFrame.sjoin` where Polygons or MultiPolygons detected.
+
+    Args:
+        epc_df (pl.DataFrame): EPC dataset with X and Y coordinates per UPRN
+        listed_buildings_gdf (gpd.GeoDataFrame): listed buildings data
+        distance (float): maximum distance (m) within which to query for nearest geometry where `sjoin_nearest` used.
+                          Must be greater than 0. Default 5.
+
+    Returns:
+        pd.DataFrame: EPC UPRNs in listed buildings
+    """
+    epc_gdf = lat_lon.generate_gdf_uprn_coords(df=epc_df, usecols=["UPRN"])
+    if any(
+        [
+            expr in listed_buildings_gdf.geom_type.unique()
+            for expr in ["Point", "MultiPoint"]
+        ]
+    ):
+        df = epc_gdf.sjoin_nearest(
+            listed_buildings_gdf, how="inner", max_distance=distance
+        )[["UPRN", "listed_building"]].drop_duplicates(subset="UPRN")
+    elif any(
+        [
+            expr in listed_buildings_gdf.geom_type.unique()
+            for expr in ["Polygon", "MultiPolygon"]
+        ]
+    ):
+        df = epc_gdf.sjoin(listed_buildings_gdf, how="inner", predicate="intersects")[
+            ["UPRN", "listed_building"]
+        ].drop_duplicates(subset="UPRN")
+    else:
+        raise ValueError(
+            f"Listed buildings GeoDataFrame does not have appropriate geometries for sjoin. "
+            f"Geometries required: [Multi]Point or [Multi]Polygon. "
+            f"Geometries found: {listed_buildings_gdf.geom_type.unique()}"
+        )
     return df