Skip to content

Commit

Permalink
Add reusable subsetting API (#55)
Browse files Browse the repository at this point in the history
* Migrate to selector API

* Add reusable subset examples

* Add an example for saving and loading selector with pickle

* Add docstrings
  • Loading branch information
mpiannucci authored Jul 31, 2024
1 parent 3f0b2d9 commit 2de3f6f
Show file tree
Hide file tree
Showing 10 changed files with 6,608 additions and 2,438 deletions.
282 changes: 141 additions & 141 deletions examples/fvcom.ipynb

Large diffs are not rendered by default.

2,366 changes: 1,184 additions & 1,182 deletions examples/roms.ipynb

Large diffs are not rendered by default.

1,941 changes: 975 additions & 966 deletions examples/rtofs.ipynb

Large diffs are not rendered by default.

4,286 changes: 4,198 additions & 88 deletions examples/stofs_3d.ipynb

Large diffs are not rendered by default.

66 changes: 55 additions & 11 deletions xarray_subset_grid/grid.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import numpy as np
import xarray as xr

from xarray_subset_grid.selector import Selector

FLOAT_MAX = np.finfo(np.float32).max
FLOAT_MIN = np.finfo(np.float32).min

Expand Down Expand Up @@ -130,29 +132,71 @@ def subset_vertical_levels(
return ds.sel(selection, method=method)

@abstractmethod
def compute_polygon_subset_selector(
    self, ds: xr.Dataset, polygon: list[tuple[float, float]] | np.ndarray
) -> Selector:
    """Compute the subset selector for the polygon.

    This method returns a Selector that can be used to subset the dataset
    to the polygon. The selector contains all the logic needed to subset a
    dataset with the same grid type to the polygon. Once returned, the user
    can call the select method on the selector to subset the dataset as many
    times as needed without recomputing the selector.

    :param ds: The dataset to compute the selector from
    :param polygon: The polygon to subset to, either a list of coordinate
        pairs or an array (the bounding box helper builds an array and
        ``subset_polygon`` forwards either form)
    :return: A selector that can be applied to datasets via ``select``
    :raises NotImplementedError: Always in this base implementation; each
        grid implementation must override it
    """
    raise NotImplementedError()

def compute_bbox_subset_selector(
    self,
    ds: xr.Dataset,
    bbox: tuple[float, float, float, float],
) -> Selector:
    """Build a reusable selector for a bounding box.

    The box is expanded into a closed five-vertex ring (the first corner is
    repeated at the end) and handed to ``compute_polygon_subset_selector``,
    so the returned selector can be applied to any number of datasets of the
    same grid type without recomputing it.

    :param ds: The dataset to compute the selector from
    :param bbox: The bounding box to subset to; elements 0 and 2 are used as
        the first coordinate of each corner and elements 1 and 3 as the
        second (presumably ``(min_x, min_y, max_x, max_y)`` - confirm
        against callers)
    :return: The selector produced by ``compute_polygon_subset_selector``
    """
    first_lo, second_lo, first_hi, second_hi = bbox[0], bbox[1], bbox[2], bbox[3]
    ring = [
        [first_lo, second_hi],
        [first_lo, second_lo],
        [first_hi, second_lo],
        [first_hi, second_hi],
        [first_lo, second_hi],  # repeat the starting corner to close the ring
    ]
    return self.compute_polygon_subset_selector(ds, np.array(ring))

def subset_polygon(
    self, ds: xr.Dataset, polygon: list[tuple[float, float]] | np.ndarray
) -> xr.Dataset:
    """Subset the dataset to the polygon.

    This is a convenience method that computes the subset selector for the
    polygon and then applies it to the dataset. It is useful for one-off
    subsetting operations where the user does not want to keep the selector
    around for later use.

    :param ds: The dataset to subset
    :param polygon: The polygon to subset to
    :return: The subsetted dataset
    """
    # No early return here: the dataset must actually go through the selector.
    selector = self.compute_polygon_subset_selector(ds, polygon)
    return selector.select(ds)

def subset_bbox(self, ds: xr.Dataset, bbox: tuple[float, float, float, float]) -> xr.Dataset:
    """Subset the dataset to the bounding box.

    This is a convenience method that computes the subset selector for the
    bounding box and then applies it to the dataset. It is useful for one-off
    subsetting operations where the user does not want to keep the selector
    around for later use.

    :param ds: The dataset to subset
    :param bbox: The bounding box to subset to
    :return: The subsetted dataset
    """
    # Go through compute_bbox_subset_selector (not a hand-built polygon) so
    # grid types that override the bbox selector are honoured.
    selector = self.compute_bbox_subset_selector(ds, bbox)
    return selector.select(ds)
26 changes: 10 additions & 16 deletions xarray_subset_grid/grids/regular_grid.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,14 +95,9 @@ def data_vars(self, ds: xr.Dataset) -> set[str]:
and "longitude" in var.cf.coordinates
}

def subset_polygon(
self, ds: xr.Dataset, polygon: list[tuple[float, float]] | np.ndarray
) -> xr.Dataset:
"""Subset the dataset to the grid
:param ds: The dataset to subset
:param polygon: The polygon to subset to
:return: The subsetted dataset
"""
def compute_polygon_subset_selector(
self, ds: xr.Dataset, polygon: list[tuple[float, float]]
) -> Selector:
lat = ds.cf["latitude"]
lon = ds.cf["longitude"]

Expand All @@ -111,14 +106,13 @@ def subset_polygon(
polygon_mask = ray_tracing_numpy(x, lat.flat, polygon).reshape(lon.shape)

selector = RegularGridPolygonSelector(polygon, polygon_mask)
return selector.select(ds)
return selector

def subset_bbox(self, ds: xr.Dataset, bbox: tuple[float, float, float, float]) -> xr.Dataset:
"""Subset the dataset to the bounding box
:param ds: The dataset to subset
:param bbox: The bounding box to subset to
:return: The subsetted dataset
"""
def compute_bbox_subset_selector(
    self,
    ds: xr.Dataset,
    bbox: tuple[float, float, float, float],
) -> Selector:
    """Compute the subset selector for the bounding box.

    The bounding box is passed through ``normalize_bbox_x_coords`` together
    with the dataset's longitude values, and the selector is built from the
    normalized box. No subsetting happens here; the caller applies the
    returned selector with ``select``.

    :param ds: The dataset whose longitude values are used to normalize the box
    :param bbox: The bounding box to subset to
    :return: A selector for the normalized bounding box
    """
    # NOTE(review): presumably this aligns the box's x-range with the
    # dataset's longitude convention - confirm in normalize_bbox_x_coords.
    bbox = normalize_bbox_x_coords(ds.cf["longitude"].values, bbox)
    # Return the selector itself, not the result of applying it.
    return RegularGridBBoxSelector(bbox)
14 changes: 4 additions & 10 deletions xarray_subset_grid/grids/regular_grid_2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,17 +77,11 @@ def data_vars(self, ds: xr.Dataset) -> set[str]:
and "longitude" in var.cf.coordinates
}

def subset_polygon(
self, ds: xr.Dataset, polygon: list[tuple[float, float]] | np.ndarray
) -> xr.Dataset:
"""Subset the dataset to the grid
:param ds: The dataset to subset
:param polygon: The polygon to subset to
:return: The subsetted dataset
"""
def compute_polygon_subset_selector(
    self, ds: xr.Dataset, polygon: list[tuple[float, float]]
) -> Selector:
    """Compute the subset selector for the polygon.

    The dataset's latitude and longitude coordinates are looked up through
    the ``cf`` accessor and passed to ``compute_2d_subset_mask`` together
    with the polygon. The resulting mask is stored in the returned selector
    so it can be reused across datasets.

    :param ds: The dataset to compute the mask from
    :param polygon: The polygon to subset to
    :return: A selector holding the polygon and its subset mask
    """
    lat = ds.cf["latitude"]
    lon = ds.cf["longitude"]
    subset_mask = compute_2d_subset_mask(lat=lat, lon=lon, polygon=polygon)

    # Return the selector itself, not the result of applying it.
    return RegularGrid2dSelector(polygon=polygon, subset_mask=subset_mask)
24 changes: 13 additions & 11 deletions xarray_subset_grid/grids/sgrid.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,14 @@ class SGridSelector(Selector):
def __init__(
self,
polygon: list[tuple[float, float]] | np.ndarray,
grid_topology_key: str,
grid_topology: xr.DataArray,
subset_masks: list[tuple[list[str], xr.DataArray]],
):
"""Store the precomputed state used to subset an SGRID dataset
:param polygon: The polygon the selector was built for
:param grid_topology_key: The name of the grid topology variable in the dataset
:param grid_topology: The grid topology variable itself
:param subset_masks: Pairs of (variable names, mask); select applies each mask
to the variables listed alongside it
"""
super().__init__()
self.polygon = polygon
self._grid_topology_key = grid_topology_key
self._grid_topology = grid_topology
self._subset_masks = subset_masks

def select(self, ds: xr.Dataset) -> xr.Dataset:
Expand All @@ -33,7 +37,6 @@ def select(self, ds: xr.Dataset) -> xr.Dataset:

# Now we can use the mask to subset the data
ds_subset = ds_subset[mask[0]].where(ds_subset.subset_mask, drop=True).drop_encoding()
ds_subset = ds_subset.drop_vars("subset_mask")

# Add the subsetted dataset to the list for merging
ds_out.append(ds_subset)
Expand Down Expand Up @@ -94,14 +97,9 @@ def data_vars(self, ds: xr.Dataset) -> set[str]:

return {var for var in ds.data_vars if not set(ds[var].dims).isdisjoint(dims)}

def subset_polygon(
self, ds: xr.Dataset, polygon: list[tuple[float, float]] | np.ndarray
) -> xr.Dataset:
"""Subset the dataset to the grid
:param ds: The dataset to subset
:param polygon: The polygon to subset to
:return: The subsetted dataset
"""
def compute_polygon_subset_selector(
self, ds: xr.Dataset, polygon: list[tuple[float, float]]
) -> Selector:
grid_topology_key = ds.cf.cf_roles["grid_topology"][0]
grid_topology = ds[grid_topology_key]
dims = _get_sgrid_dim_coord_names(grid_topology)
Expand Down Expand Up @@ -133,8 +131,12 @@ def subset_polygon(

subset_masks.append((vars, subset_mask))

selector = SGridSelector(polygon=polygon, subset_masks=subset_masks)
return selector.select(ds)
return SGridSelector(
polygon=polygon,
grid_topology_key=grid_topology_key,
grid_topology=grid_topology,
subset_masks=subset_masks,
)


def _get_sgrid_dim_coord_names(
Expand Down
15 changes: 4 additions & 11 deletions xarray_subset_grid/grids/ugrid.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,14 +164,9 @@ def data_vars(self, ds: xr.Dataset) -> set[str]:

return data_vars

def subset_polygon(
self, ds: xr.Dataset, polygon: list[tuple[float, float]] | np.ndarray
) -> xr.Dataset:
"""Subset the dataset to the grid
:param ds: The dataset to subset
:param polygon: The polygon to subset to
:return: The subsetted dataset
"""
def compute_polygon_subset_selector(
self, ds: xr.Dataset, polygon: list[tuple[float, float]]
) -> Selector:
# For this grid type, we find all nodes that are connected to elements that are inside
# the polygon. To do this, we first find all nodes that are inside the polygon and then
# find all elements that are connected to those nodes.
Expand Down Expand Up @@ -259,7 +254,7 @@ def subset_polygon(
if transpose_face_face_connectivity:
face_face_new = face_face_new.T

selector = UGridSelector(
return UGridSelector(
polygon=polygon,
node_dimension=node_dimension,
selected_nodes=selected_nodes,
Expand All @@ -273,8 +268,6 @@ def subset_polygon(
face_face_connectivity=face_face_new if has_face_face_connectivity else None,
)

return selector.select(ds)


def assign_ugrid_topology(
ds: xr.Dataset,
Expand Down
26 changes: 24 additions & 2 deletions xarray_subset_grid/selector.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,29 @@
from abc import abstractmethod

import xarray as xr


class Selector:
    """Selector class to subset datasets.

    This is a base class that should be subclassed to perform selection on a
    given xarray dataset with whatever context or logic is desired by the
    implementation. ``select`` should return a new xarray dataset that is a
    subset of the input dataset and must be implemented by the subclass.

    Note: this class does not derive from ``abc.ABC``, so ``@abstractmethod``
    is not enforced when instantiating; the ``NotImplementedError`` raised by
    the base ``select`` is what signals a missing override.
    """

    @abstractmethod
    def select(self, ds: xr.Dataset) -> xr.Dataset:
        """Perform the selection on the dataset.

        For example, a selector could hold predefined masks to apply to the
        dataset, and the select method here would apply those masks to the
        dataset and return the result. This workflow is useful because
        computing the masks can be expensive, and we want to avoid
        recomputing them for every dataset that needs to be subsetted. It
        also allows datasets that are non-standard to be subset using
        information from manually or otherwise standardized datasets.

        :param ds: The dataset to subset
        :return: The subsetted dataset
        :raises NotImplementedError: Always in this base implementation
        """
        # Fail loudly rather than handing the input back unsubsetted.
        raise NotImplementedError()

0 comments on commit 2de3f6f

Please sign in to comment.