Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

66 scotland features #70

Draft
wants to merge 15 commits into
base: dev
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions asf_heat_pump_suitability/config/base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@ data_source:
UK_spa_offgasgrid: "s3://asf-heat-pump-suitability/source_data/2024_vMar2024_SPA_offgaspostcode_UK.xlsx"
E_historicengland_listed_buildings: "s3://asf-heat-pump-suitability/source_data/Jun2024_vJul2024_HistoricEngland_listedbuilding_E.gpkg"
W_cadw_listed_buildings: "s3://asf-heat-pump-suitability/source_data/May2024_vMay2024_Cadw_listedbuilding_W.gpkg"
S_scottish_gov_listed_buildings: "s3://asf-heat-pump-suitability/source_data/lb_scotland/Listed_Buildings.shp"
EW_ons_lsoa_lad_lookup: "s3://asf-heat-pump-suitability/source_data/2021_vApr2023_ons_lsoa_to_lad_lookup_EW.csv"
S_historic_environment_scotland_world_heritage_sites: "s3://asf-heat-pump-suitability/source_data/WHS/World_Heritage_Sites.shp"
S_ros_inspire_url: "https://ros.locationcentre.co.uk/inspire/"
usecols:
epc:
- COUNTRY
Expand Down
6 changes: 3 additions & 3 deletions asf_heat_pump_suitability/getters/base_getters.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def get_df_from_excel_url(url: str, **kwargs) -> pl.DataFrame:
Returns
pl.DataFrame: dataframe from Excel file
"""
content = _get_content_from_url(url)
content = get_content_from_url(url)
df = pl.read_excel(content, **kwargs)

return df
Expand All @@ -37,7 +37,7 @@ def get_df_from_zip_url(url: str, extract_file: str, **kwargs) -> pl.DataFrame:
Returns:
pl.DataFrame: dataset from ZIP file
"""
content = _get_content_from_url(url)
content = get_content_from_url(url)
df = pl.read_csv(ZipFile(content).open(name=extract_file), **kwargs)

return df
Expand Down Expand Up @@ -107,7 +107,7 @@ def get_content_from_s3_path(path: str) -> bytes:
return content


def _get_content_from_url(url: str) -> BytesIO:
def get_content_from_url(url: str) -> BytesIO:
"""
Get BytesIO stream from URL.
Args
Expand Down
10 changes: 8 additions & 2 deletions asf_heat_pump_suitability/getters/get_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ def load_gdf_listed_buildings(nation: str, **kwargs) -> gpd.GeoDataFrame:
Get raw Listed Buildings polygons dataset for specified nation. CRS EPSG:27700, British National Grid.

Args:
nation (str): UK nation to load listed buildings data for. Options: "England"; "Wales".
nation (str): nation to load listed buildings data for. Options: "England"; "Scotland", "Wales".
**kwargs for `gpd.read_file()`

Returns:
Expand All @@ -260,6 +260,12 @@ def load_gdf_listed_buildings(nation: str, **kwargs) -> gpd.GeoDataFrame:
)
elif nation.lower() == "wales":
gdf = gpd.read_file(config["data_source"]["W_cadw_listed_buildings"], **kwargs)
elif nation.lower() == "scotland":
gdf = gpd.read_file(
config["data_source"]["S_scottish_gov_listed_buildings"], **kwargs
)
else:
raise ValueError("Please set `nation` to either 'England' or 'Wales'.")
raise ValueError(
"Please set `nation` to either 'England', 'Scotland', or 'Wales'."
)
return gdf
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# %% [markdown]
# # Calculate threshold for nearest neighbour search
#
# Scotland and Wales listed buildings data is available as point geometries rather than polygons. We also have point geometries for UPRNs in the EPC dataset. In order to join EPC UPRNs to listed buildings, we need to use a nearest neighbour search. This notebook uses ground-truth listed building polygon data for Scotland to identify the threshold of distance in metres for determining whether a UPRN is located within a listed building.

# %%
import geopandas as gpd
import polars as pl
import matplotlib.pyplot as plt

from asf_heat_pump_suitability.pipeline.prepare_features import lat_lon

# %%
# Listed building point geoms Scotland
points_gdf = gpd.read_file(
"s3://asf-heat-pump-suitability/source_data/lb_scotland/Listed_Buildings.shp"
)
points_gdf = points_gdf[["ENT_REF", "ENT_TITLE", "geometry"]]

# %%
epc = pl.read_parquet(
"s3://asf-heat-pump-suitability/outputs/2023Q4/20240904_2023_Q4_EPC_weighted_features_gardens.parquet"
)

# %%
epc_gdf = epc.filter(pl.col("COUNTRY") == "Scotland")
epc_gdf = lat_lon.generate_gdf_uprn_coords(epc_gdf, usecols=["UPRN", "COUNTRY"])

# %% [markdown]
# ## Compare to ground truth

# %%
# Listed building boundaries for Scotland (limited dataset)
boundaries = gpd.read_file(
"s3://asf-heat-pump-suitability/source_data/lb_scotland/Listed_Buildings_boundaries.shp"
)

# %%
# Join listed building polygons to listed building points
boundaries = boundaries[["DES_REF", "DES_TITLE", "geometry"]]
ground_truth = boundaries.sjoin(points_gdf, how="inner", predicate="intersects").drop(
columns=["index_right"]
)

# %%
# Join listed buildings to EPC using polygons to identify true matches
gdf = epc_gdf.sjoin(
boundaries[boundaries["DES_REF"].isin(ground_truth["DES_REF"])],
how="left",
predicate="intersects",
).drop(columns=["index_right"])

# Calculate distance from nearest listed building for each EPC record
gdf = gdf.sjoin_nearest(
points_gdf[points_gdf["ENT_REF"].isin(ground_truth["ENT_REF"])],
how="left",
max_distance=500,
distance_col="distance_from_nearest_listed_m",
)
df = gdf.drop(columns=["geometry"])

df = pl.from_pandas(df)
df = df.with_columns(
pl.when(pl.col("DES_REF").is_not_null())
.then(True)
.otherwise(False)
.alias("true_match")
)
df = df.filter(pl.col("distance_from_nearest_listed_m").is_not_null())
df.head()

# %%
# Visualise distance for true matches vs non-matches
fig, axs = plt.subplots(1, 2, figsize=(10, 5))

axs[0].boxplot(df.filter(pl.col("true_match"))["distance_from_nearest_listed_m"])
axs[0].set_title("True matches")
axs[0].set_ylabel("Distance from nearest listed building (m)")

axs[1].boxplot(df.filter(~pl.col("true_match"))["distance_from_nearest_listed_m"])
axs[1].set_title("Not matches")
plt.suptitle("Distance from nearest listed building point geom (m)")


# %%
fig, axs = plt.subplots(1, 1, figsize=(10, 5))

axs.boxplot(
df.filter(pl.col("true_match"), pl.col("distance_from_nearest_listed_m") <= 20)[
"distance_from_nearest_listed_m"
]
)
axs.set_title("True matches")
axs.set_ylabel("Distance from nearest listed building (m)")

# %%
df.filter(pl.col("true_match"))["distance_from_nearest_listed_m"].describe()

# %% [markdown]
# ## Test threshold distance

# %%
test = epc_gdf.sjoin_nearest(
points_gdf,
how="inner",
max_distance=5,
distance_col="distance_from_nearest_listed_m",
)

# %%
test.shape

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -7,38 +7,60 @@
from asf_heat_pump_suitability.pipeline.prepare_features import lat_lon


def generate_df_epc_listed_buildings(
epc_df: pl.DataFrame, nations: list = ["England", "Scotland", "Wales"]
) -> pl.DataFrame:
"""
Generate dataframe of listed buildings in EPC data in specified nation(s).

Args:
epc_df (pl.DataFrame): EPC dataset with x, y coordinates per UPRN
nations (list): nation(s) to load listed buildings data for. Options: "England"; "Scotland"; "Wales".

Returns:
pl.DataFrame: EPC UPRNs in listed buildings
"""
dfs = []
for nation in nations:
logging.info(f"Loading listed buildings data for {nation}")
gdf = transform_gdf_listed_buildings(nation)
dfs.append(chunk_sjoin_df_epc_listed_buildings(epc_df, gdf))

return pl.concat(dfs, how="vertical")


def transform_gdf_listed_buildings(nation: str) -> gpd.GeoDataFrame:
"""
Load and transform listed buildings polygons dataset for specified nation.

Args:
nation (str): UK nation to load listed buildings data for. Options: "England"; "Wales".
nation (str): nation to load listed buildings data for. Options: "England"; "Scotland"; "Wales".

Returns:
gpd.GeoDataFrame: listed buildings dataset for specified nation with grade and geometry columns.
"""
gdf = get_datasets.load_gdf_listed_buildings(nation, columns=["Grade", "geometry"])
gdf = gdf.drop_duplicates(subset="geometry").rename(
columns={"Grade": "listed_building_grade"}
)
gdf = get_datasets.load_gdf_listed_buildings(nation, columns=["geometry"])
gdf = gdf.drop_duplicates(subset="geometry")
gdf["listed_building"] = True

return gdf


def sjoin_df_epc_with_listed_buildings(
def chunk_sjoin_df_epc_listed_buildings(
epc_df: pl.DataFrame,
listed_buildings_gdf: gpd.GeoDataFrame,
chunk_size: int = 100000,
) -> pl.DataFrame:
"""
Spatial join EPC UPRNs with listed buildings polygons to return dataframe of EPC UPRNs which are located in listed
buildings, along with the building grade.
Chunk EPC data and spatial join EPC UPRNs with listed buildings.

Args:
epc_df (pl.DataFrame): Enhanced EPC dataset with X and Y coordinates
listed_buildings_gdf (gpd.GeoDataFrame): listed buildings polygons dataset
listed_buildings_gdf (gpd.GeoDataFrame): listed buildings GeoDataFrame. Can be points / polygons.
chunk_size (int): number of EPC rows in each partition. Default 100,000

Returns:
pl.DataFrame: EPC UPRNs with listed buildings grade
pl.DataFrame: EPC UPRNs in listed buildings
"""
partitions = (
epc_df.select(["UPRN", "X_COORDINATE", "Y_COORDINATE"])
Expand All @@ -47,18 +69,57 @@ def sjoin_df_epc_with_listed_buildings(
)

dfs = []

# Group based on the created index
data_partitioned = epc_df.with_columns(partitions).partition_by("chunk_id")
logging.info(f"Adding listed buildings to EPC in {len(data_partitioned)} chunks")
for epc_chunk in tqdm(data_partitioned):
epc_gdf = lat_lon.generate_gdf_uprn_coords(df=epc_chunk)[["UPRN", "geometry"]]
df = epc_gdf.sjoin(listed_buildings_gdf, how="inner", predicate="intersects")[
["UPRN", "listed_building_grade"]
].drop_duplicates(subset="UPRN")

df = sjoin_df_epc_listed_buildings(epc_chunk, listed_buildings_gdf)
dfs.append(df)

df = pl.from_pandas(pd.concat(dfs))

return df.select(["UPRN", "listed_building"])


def sjoin_df_epc_listed_buildings(
epc_df: pl.DataFrame, listed_buildings_gdf: gpd.GeoDataFrame, distance: float = 5
) -> pd.DataFrame:
"""
Spatial join EPC UPRNs with listed buildings using `geopandas.GeoDataFrame.sjoin_nearest` where Point or MultiPoint
geometries detected, and `geopandas.GeoDataFrame.sjoin` where Polygons or MultiPolygons detected.

Args:
epc_df (pl.DataFrame): EPC dataset with X and Y coordinates per UPRN
listed_buildings_gdf (gpd.GeoDataFrame): listed buildings data
distance (float): maximum distance (m) within which to query for nearest geometry where `sjoin_nearest` used.
Must be greater than 0. Default 5.

Returns:
pd.DataFrame: EPC UPRNs in listed buildings
"""
epc_gdf = lat_lon.generate_gdf_uprn_coords(df=epc_df, usecols=["UPRN"])
if any(
[
expr in listed_buildings_gdf.geom_type.unique()
for expr in ["Point", "MultiPoint"]
]
):
df = epc_gdf.sjoin_nearest(
listed_buildings_gdf, how="inner", max_distance=distance
)[["UPRN", "listed_building"]].drop_duplicates(subset="UPRN")
Comment on lines +107 to +109
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this looks right.

Thinking aloud: So you find all the points or geometries of listed properties within a certain distance of each UPRN, then drop duplicates (since you don't care about which listed property they match to, just that they match to at least one).

elif any(
[
expr in listed_buildings_gdf.geom_type.unique()
for expr in ["Polygon", "MultiPolygon"]
]
):
df = epc_gdf.sjoin(listed_buildings_gdf, how="inner", predicate="intersects")[
["UPRN", "listed_building"]
].drop_duplicates(subset="UPRN")
Comment on lines +110 to +118
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume sjoin can be used for both points and polygons, so why use sjoin_nearest at all?

else:
raise ValueError(
f"Listed buildings GeoDataFrame does not have appropriate geometries for sjoin. "
f"Geometries required: [Multi]Point or [Multi]Polygon. "
f"Geometries found: {listed_buildings_gdf.geom_type.unique()}"
)
return df
Loading