From f7b956b2644a0525efd70cf769b6472e6c3461df Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 10 May 2024 22:52:41 +0800 Subject: [PATCH 01/22] WIP: Add geometry logical type --- src/main/thrift/parquet.thrift | 98 +++++++++++++++++++++++++++++++++- 1 file changed, 96 insertions(+), 2 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index c928ad66..3911878e 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -270,8 +270,11 @@ struct Statistics { * may set min_value="B", max_value="C". Such more compact values must still be * valid values within the column's logical type. * - * Values are encoded using PLAIN encoding, except that variable-length byte - * arrays do not include a length prefix. + * Values are encoded using PLAIN encoding, except that: + * 1) variable-length byte arrays do not include a length prefix. + * 2) geometry logical type with BoundingBoxOrder uses max_value/min_value pair + * to store the bounding box for the column. Please refer to the definition + * of BoundingBoxOrder for detail. */ 5: optional binary max_value; 6: optional binary min_value; @@ -373,6 +376,69 @@ struct JsonType { struct BsonType { } +/** + * A geometry can be any of the following subtypes. + * The list of geospatial subtypes is taken from the OGC (Open Geospatial Consortium) + * SFA (Simple Feature Access) Part 1- Common Architecture. + */ +enum GeometrySubType { + POINT = 0; + LINESTRING = 1; + POLYGON = 2; + MULTIPOINT = 3; + MULTILINESTRING = 4; + MULTIPOLYGON = 5; + GEOMETRY_COLLECTION = 6; +} + +/** + * Interpretation for edges, i.e. whether the edge between points + * represent a straight cartesian line or the shortest line on the sphere + */ +enum Edges { + PLANAR = 0; + // SPHERICAL = 1; // not supported yet +} + +/** + * Well-Known Binary. This is a well-known and popular binary representation regulated + * by the Open Geospatial Consortium (OGC). 
+ */ +struct WKB {} +/** + * Encoding for geospatial data. + */ +union GeospatialEncoding { + 1: WKB WKB +} + +/** + * Geometry logical type annotation + * + * Allowed for physical types: BINARY (added in 2.11.0) + */ +struct GeometryType { + /** + * The subtype of the geometry. + * If set, all values in the column must be of the same subtype. + * If not set, the column may contain values of any subtype. + */ + 1: optional GeometrySubType subtype; + /** + * The dimension of the geometry. + * For now only 2D geometry is supported and the value must be 2 if set. + */ + 2: optional byte dimension; + /** + * Coordinate Reference System, i.e. mapping of how coordinates refer to + * precise locations on earth. + * For now only OGC:CRS84 is supported. + */ + 3: optional string crs; + 4: required Edges edges; + 5: required GeospatialEncoding encoding; +} + /** * LogicalType annotations to replace ConvertedType. * @@ -403,6 +469,7 @@ union LogicalType { 13: BsonType BSON // use ConvertedType BSON 14: UUIDType UUID // no compatible ConvertedType 15: Float16Type FLOAT16 // no compatible ConvertedType + 16: GeometryType GEOMETRY // no compatible ConvertedType } /** @@ -916,6 +983,8 @@ struct RowGroup { /** Empty struct to signal the order defined by the physical or logical type */ struct TypeDefinedOrder {} +/** Empty struct to signal the order of GEOMETRY logical type */ +struct BoundingBoxOrder {} /** * Union to specify the order used for the min_value and max_value fields for a @@ -925,6 +994,8 @@ struct TypeDefinedOrder {} * Possible values are: * * TypeDefinedOrder - the column uses the order defined by its logical or * physical type (if there is no logical type). + * * BoundingBoxOrder - the column uses the order to build bounding box + * (if the logical type is GEOMETRY). * * If the reader does not support the value of this union, min and max stats * for this column should be ignored. 
@@ -954,6 +1025,7 @@ union ColumnOrder { * ENUM - unsigned byte-wise comparison * LIST - undefined * MAP - undefined + * GEOMETRY - undefined, as geometry objects cannot be compared directly * * In the absence of logical types, the sort order is determined by the physical type: * BOOLEAN - false, true @@ -982,6 +1054,23 @@ union ColumnOrder { * `-0.0` should be written into the min statistics field. */ 1: TypeDefinedOrder TYPE_ORDER; + + /** + * The order only applies to GEOMETRY logical type. + * + * Please note that geometry objects cannot be compared directly. This order aims to + * provide an approach to build a bounding box for geometry objects in the same page + * or column chunk. + * + * In this order, all 2D geometries are regarded as a collection of coordinate (x, y). + * For example, POINT has one coordinate, LINESTRING has two coordinates, and POLYGON + * might have three or more coordinates. A bounding box is the combination of x_min, + * x_max, y_min, and y_max of all coordinates from all geometry values. For simplexty, + * min_value field in the Statistics/ColumnIndex is encoded as the concatenation of + * PLAIN-encoded DOUBLE-typed x_min and y_min values. Similarly, max_value field is + * encoded as the concatenation of PLAIN-encoded DOUBLE-typed x_max and y_max values. + */ + 2: BoundingBoxOrder BBOX_ORDER; } struct PageLocation { @@ -1052,6 +1141,9 @@ struct ColumnIndex { * Such more compact values must still be valid values within the column's * logical type. Readers must make sure that list entries are populated before * using them by inspecting null_pages. + * + * For GEOMETRY logical type, these values are the bounding box of the column. + * Please refer to the definition of BoundingBoxOrder for detail. */ 2: required list min_values 3: required list max_values @@ -1061,6 +1153,8 @@ struct ColumnIndex { * which direction. This allows readers to perform binary searches in both * lists. 
Readers cannot assume that max_values[i] <= min_values[i+1], even * if the lists are ordered. + * + * For GEOMETRY type, UNORDERED is used at all times. */ 4: required BoundaryOrder boundary_order From a1472f0a69bab843c7cde247fae350a2e13095c8 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sat, 25 May 2024 23:14:15 +0800 Subject: [PATCH 02/22] address various comments --- src/main/thrift/parquet.thrift | 174 ++++++++++++++++++--------------- 1 file changed, 93 insertions(+), 81 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 3911878e..5f7f647b 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -237,6 +237,38 @@ struct SizeStatistics { 3: optional list definition_level_histogram; } +/** + * Bounding box of geometries in the representation of min/max value pair of + * coordinates from each axis. Values of Z and M are omitted for 2D geometries. + */ +struct BoundingBox { + 1: optional double x_min; + 2: optional double x_max; + 3: optional double y_min; + 4: optional double y_max; + 5: optional double z_min; + 6: optional double z_max; + 7: optional double m_min; + 8: optional double m_max; +} + +/** Statistics specific to GEOMETRY logical type */ +struct GeometryStatistics { + /** Bounding box of geometries */ + 1: optional BoundingBox bbox; + /** Covering of geometries as a list of Google S2 cell ids */ + 2: list s2_cell_ids; + /** Covering of geometries as a list of Uber H3 indices */ + 3: list h3_indices; + /** + * The geometry types of all geometries, or an empty array if they are not + * known. It follows the same rule of `geometry_types` column metadata of + * GeoParquet. Accepted geometry types are: "Point", "LineString", "Polygon", + * "MultiPoint", "MultiLineString", "MultiPolygon", "GeometryCollection". + */ + 4: list geometry_types; +} + /** * Statistics per row group and per page * All fields are optional. 
@@ -270,11 +302,8 @@ struct Statistics { * may set min_value="B", max_value="C". Such more compact values must still be * valid values within the column's logical type. * - * Values are encoded using PLAIN encoding, except that: - * 1) variable-length byte arrays do not include a length prefix. - * 2) geometry logical type with BoundingBoxOrder uses max_value/min_value pair - * to store the bounding box for the column. Please refer to the definition - * of BoundingBoxOrder for detail. + * Values are encoded using PLAIN encoding, except that variable-length byte + * arrays do not include a length prefix. */ 5: optional binary max_value; 6: optional binary min_value; @@ -282,6 +311,9 @@ struct Statistics { 7: optional bool is_max_value_exact; /** If true, min_value is the actual minimum value for a column */ 8: optional bool is_min_value_exact; + + /** statistics specific to geometry logical type */ + 9: optional GeometryStatistics geometry_stats; } /** Empty structs to use as logical type annotations */ @@ -377,66 +409,69 @@ struct BsonType { } /** - * A geometry can be any of the following subtypes. - * The list of geospatial subtypes is taken from the OGC (Open Geospatial Consortium) - * SFA (Simple Feature Access) Part 1- Common Architecture. + * Phyiscal type and encoding for the geometry type. */ -enum GeometrySubType { - POINT = 0; - LINESTRING = 1; - POLYGON = 2; - MULTIPOINT = 3; - MULTILINESTRING = 4; - MULTIPOLYGON = 5; - GEOMETRY_COLLECTION = 6; -} - -/** - * Interpretation for edges, i.e. whether the edge between points - * represent a straight cartesian line or the shortest line on the sphere - */ -enum Edges { - PLANAR = 0; - // SPHERICAL = 1; // not supported yet -} +enum GeometryEncoding { + /** + * Allowed for phyiscal type: BYTE_ARRAY. + * + * Well-known binary (WKB) representations of geometries. 
It supports 2D or + * 3D geometries of the standard geometry types (Point, LineString, Polygon, + * MultiPoint, MultiLineString, MultiPolygon, and GeometryCollection). This + * is the preferred option for maximum portability. + * + * This encoding enables GeometryStatistics to be set in the column chunk + * and page index. + */ + WKB = 0; -/** - * Well-Known Binary. This is a well-known and popular binary representation regulated - * by the Open Geospatial Consortium (OGC). - */ -struct WKB {} -/** - * Encoding for geospatial data. - */ -union GeospatialEncoding { - 1: WKB WKB + /** + * Encodings from POINT to MULTIPOLYGON below are specialized for single + * geometry type and inspired by GeoArrow (https://geoarrow.org/format.html) + * native encodings. It uses the separated (struct) representation of + * coordinates for single-geometry type encodings because this encoding + * results in useful column statistics when row groups and/or files contain + * related features. + * + * WARNING: GeometryStatistics cannot be enabled for these encodings because + * only leaf columns can have column statistics and page index. + * + * The actual coordinates of the geometries MUST be stored as native numbers, + * i.e. using the DOUBLE type in a (repeated) group of fields (exact + * repetition depending on the geometry type). 
+ * + * For the POINT encoding, this results in a struct of two fields for x and y + * coordinates (in case of 2D geometries): + * optional group geometry { + * required double x; + * required double y; + * } + * + * For more detail, please refer to link below: + * https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md#encoding + */ + POINT = 1; + LINESTRING = 2; + POLYGON = 3; + MULTIPOINT = 4; + MULTILINESTRING = 5; + MULTIPOLYGON = 6; } /** - * Geometry logical type annotation - * - * Allowed for physical types: BINARY (added in 2.11.0) + * Geometry logical type annotation (added in 2.11.0) */ struct GeometryType { /** - * The subtype of the geometry. - * If set, all values in the column must be of the same subtype. - * If not set, the column may contain values of any subtype. + * Phyiscal type and encoding for the geometry type. Please refer to the + * definition of GeometryEncoding for more detail. */ - 1: optional GeometrySubType subtype; + 1: required GeometryEncoding encoding; /** - * The dimension of the geometry. - * For now only 2D geometry is supported and the value must be 2 if set. + * Additional informative metadata. + * It can be used by GeoParquet to offload some of the column metadata. */ - 2: optional byte dimension; - /** - * Coordinate Reference System, i.e. mapping of how coordinates refer to - * precise locations on earth. - * For now only OGC:CRS84 is supported. 
- */ - 3: optional string crs; - 4: required Edges edges; - 5: required GeospatialEncoding encoding; + 2: optional string metadata; } /** @@ -983,8 +1018,6 @@ struct RowGroup { /** Empty struct to signal the order defined by the physical or logical type */ struct TypeDefinedOrder {} -/** Empty struct to signal the order of GEOMETRY logical type */ -struct BoundingBoxOrder {} /** * Union to specify the order used for the min_value and max_value fields for a @@ -994,8 +1027,6 @@ struct BoundingBoxOrder {} * Possible values are: * * TypeDefinedOrder - the column uses the order defined by its logical or * physical type (if there is no logical type). - * * BoundingBoxOrder - the column uses the order to build bounding box - * (if the logical type is GEOMETRY). * * If the reader does not support the value of this union, min and max stats * for this column should be ignored. @@ -1025,7 +1056,7 @@ union ColumnOrder { * ENUM - unsigned byte-wise comparison * LIST - undefined * MAP - undefined - * GEOMETRY - undefined, as geometry objects cannot be compared directly + * GEOMETRY - undefined, use GeometryStatistics instead. * * In the absence of logical types, the sort order is determined by the physical type: * BOOLEAN - false, true @@ -1054,23 +1085,6 @@ union ColumnOrder { * `-0.0` should be written into the min statistics field. */ 1: TypeDefinedOrder TYPE_ORDER; - - /** - * The order only applies to GEOMETRY logical type. - * - * Please note that geometry objects cannot be compared directly. This order aims to - * provide an approach to build a bounding box for geometry objects in the same page - * or column chunk. - * - * In this order, all 2D geometries are regarded as a collection of coordinate (x, y). - * For example, POINT has one coordinate, LINESTRING has two coordinates, and POLYGON - * might have three or more coordinates. A bounding box is the combination of x_min, - * x_max, y_min, and y_max of all coordinates from all geometry values. 
For simplexty, - * min_value field in the Statistics/ColumnIndex is encoded as the concatenation of - * PLAIN-encoded DOUBLE-typed x_min and y_min values. Similarly, max_value field is - * encoded as the concatenation of PLAIN-encoded DOUBLE-typed x_max and y_max values. - */ - 2: BoundingBoxOrder BBOX_ORDER; } struct PageLocation { @@ -1141,9 +1155,6 @@ struct ColumnIndex { * Such more compact values must still be valid values within the column's * logical type. Readers must make sure that list entries are populated before * using them by inspecting null_pages. - * - * For GEOMETRY logical type, these values are the bounding box of the column. - * Please refer to the definition of BoundingBoxOrder for detail. */ 2: required list min_values 3: required list max_values @@ -1153,8 +1164,6 @@ struct ColumnIndex { * which direction. This allows readers to perform binary searches in both * lists. Readers cannot assume that max_values[i] <= min_values[i+1], even * if the lists are ordered. - * - * For GEOMETRY type, UNORDERED is used at all times. */ 4: required BoundaryOrder boundary_order @@ -1178,6 +1187,9 @@ struct ColumnIndex { * Same as repetition_level_histograms except for definitions levels. **/ 7: optional list definition_level_histograms; + + /** A list containing statistics of GEOMETRY logical type for each page */ + 8: optional list geometry_stats; } struct AesGcmV1 { From 1d583d5d3624b939e262191d9e6d9e7ce9b09bb8 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Mon, 27 May 2024 09:32:37 +0800 Subject: [PATCH 03/22] add file level geo stats --- src/main/thrift/parquet.thrift | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 5f7f647b..40ca58da 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -472,6 +472,8 @@ struct GeometryType { * It can be used by GeoParquet to offload some of the column metadata. 
*/ 2: optional string metadata; + /** File-level statistics for geometries */ + 3: optional GeometryStatistics statistics; } /** From 3a23ba1eea2d0ae0207a105e1a078916c5ecee11 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 31 May 2024 11:30:15 +0800 Subject: [PATCH 04/22] address feedback: - remove file-level geo stats - add custom wkb-encoded geometry stats - comment out controversial items --- src/main/thrift/parquet.thrift | 126 ++++++++++++++++++++++++++------- 1 file changed, 99 insertions(+), 27 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 40ca58da..8e1f2b34 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -237,36 +237,97 @@ struct SizeStatistics { 3: optional list definition_level_histogram; } +/** + * Interpretation for edges of GEOMETRY logical type, i.e. whether the edge + * between points represent a straight cartesian line or the shortest line on + * the sphere. + */ +enum Edges { + PLANAR = 0; + SPHERICAL = 1; +} + +/** + * A custom WKB-encoded geometry data to be used in geometry statistics. + * The geometry may be a polygon to encode an s2 or h3 covering to provide + * vendor-agnostic coverings, or an evelope of geometries when a bounding + * box cannot be built (e.g. a geometry has spherical edges, or if an edge + * of geographic coordinates crosses the antimeridian). + */ +struct Geometry { + /** Bytes of a WKB-encoded geometry */ + 1: required binary geometry; + /** + * Edges of the geometry if it is a polygon. It may be different to the + * edges attribute from the GEOMETRY logical type. + */ + 2: optional Edges edges; +} + /** * Bounding box of geometries in the representation of min/max value pair of * coordinates from each axis. Values of Z and M are omitted for 2D geometries. 
*/ struct BoundingBox { - 1: optional double x_min; - 2: optional double x_max; - 3: optional double y_min; - 4: optional double y_max; - 5: optional double z_min; - 6: optional double z_max; - 7: optional double m_min; - 8: optional double m_max; + 1: required double xmin; + 2: required double xmax; + 3: required double ymin; + 4: required double ymax; + 5: optional double zmin; + 6: optional double zmax; + 7: optional double mmin; + 8: optional double mmax; } -/** Statistics specific to GEOMETRY logical type */ -struct GeometryStatistics { - /** Bounding box of geometries */ - 1: optional BoundingBox bbox; +union Envelope { + 1: BoundingBox bbox // A bounding box of geometries if it can be built. + 2: Geometry covering // A covering polygon of geometries if bbox is unavailable. +} + +/** S2 spatial index: http://s2geometry.io/ */ +struct S2Index { + /** Level of S2 cell ids. valid range is [0, 30] */ + 1: required i32 level; /** Covering of geometries as a list of Google S2 cell ids */ - 2: list s2_cell_ids; + 2: required list cell_ids; +} + +/** H3 spatial index: https://h3geo.org */ +struct H3Index { + /** Precision of H3 cell ids. valid range is [0, 15] */ + 1: required i32 precision; /** Covering of geometries as a list of Uber H3 indices */ - 3: list h3_indices; + 2: required list cell_ids; +} + +/** Statistics specific to GEOMETRY logical type */ +struct GeometryStatistics { + /** Envelope of geometries */ + 1: optional Envelope envelope; + /** * The geometry types of all geometries, or an empty array if they are not * known. It follows the same rule of `geometry_types` column metadata of * GeoParquet. Accepted geometry types are: "Point", "LineString", "Polygon", * "MultiPoint", "MultiLineString", "MultiPolygon", "GeometryCollection". + * + * In addition, the following rules are used: + * - In case of 3D geometries, a `" Z"` suffix gets added (e.g. `["Point Z"]`). + * - A list of multiple values indicates that multiple geometry types are + * present (e.g. 
`["Polygon", "MultiPolygon"]`). + * - An empty array explicitly signals that the geometry types are not known. + * - The geometry types in the list must be unique (e.g. `["Point", "Point"]` + * is not valid). + * + * Please refer to link below for more detail: + * https://github.com/opengeospatial/geoparquet/blob/v1.0.0/format-specs/geoparquet.md?plain=1#L91 */ - 4: list geometry_types; + 2: optional list geometry_types; + + // S2 and H3 are controversial from the discussion. Now they are commented + // out to show a possible approach for future extension. + // 3: optional S2Index s2; + // 4: optional H3Index h3; } /** @@ -433,9 +494,6 @@ enum GeometryEncoding { * results in useful column statistics when row groups and/or files contain * related features. * - * WARNING: GeometryStatistics cannot be enabled for these encodings because - * only leaf columns can have column statistics and page index. - * * The actual coordinates of the geometries MUST be stored as native numbers, * i.e. using the DOUBLE type in a (repeated) group of fields (exact * repetition depending on the geometry type). @@ -449,13 +507,20 @@ enum GeometryEncoding { * * For more detail, please refer to link below: * https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md#encoding + * + * WARNING: GeometryStatistics cannot be enabled for these encodings because + * only leaf columns can have column statistics and page index. In this case, + * the statistics for the leaf columns contain equivalent information to the + * bounding box. */ - POINT = 1; - LINESTRING = 2; - POLYGON = 3; - MULTIPOINT = 4; - MULTILINESTRING = 5; - MULTIPOLYGON = 6; + // Native encodings are controversial from the discussion. Now they are commented + // out to show a possible approach for future extension. 
+ // POINT = 1; + // LINESTRING = 2; + // POLYGON = 3; + // MULTIPOINT = 4; + // MULTILINESTRING = 5; + // MULTIPOLYGON = 6; } /** @@ -467,13 +532,20 @@ struct GeometryType { * definition of GeometryEncoding for more detail. */ 1: required GeometryEncoding encoding; + /** + * Coordinate Reference System, i.e. mapping of how coordinates refer to + * precise locations on earth, e.g. OGC:CRS84 + */ + 2: optional string crs; + /** + * Edges of polygon. + */ + 3: optional Edges edges; /** * Additional informative metadata. * It can be used by GeoParquet to offload some of the column metadata. */ - 2: optional string metadata; - /** File-level statistics for geometries */ - 3: optional GeometryStatistics statistics; + 4: optional string metadata; } /** From 85acff9d99a04061aefe88a3ca198138414c5ec4 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 13 Jun 2024 14:54:42 +0800 Subject: [PATCH 05/22] change naming and remove controversial items --- src/main/thrift/parquet.thrift | 67 +++------------------------------- 1 file changed, 6 insertions(+), 61 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 8e1f2b34..62f0c38e 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -279,31 +279,15 @@ struct BoundingBox { 8: optional double mmax; } -union Envelope { - 1: BoundingBox bbox // A bounding box of geometries if it can be built. - 2: Geometry covering // A covering polygon of geometries if bbox is unavailable. -} - -/** S2 spatial index: http://s2geometry.io/ */ -struct S2Index { - /** Level of S2 cell ids. valid range is [0, 30] */ - 1: required i32 level; - /** Covering of geometries as a list of Google S2 cell ids */ - 2: required list cell_ids; -} - -/** H3 spatial index: https://h3geo.org */ -struct H3Index { - /** Precision of H3 cell ids. 
valid range is [0, 15] */ - 1: required i32 precision; - /** Covering of geometries as a list of Uber H3 indices */ - 2: required list cell_ids; +struct Covering { + optional BoundingBox bbox // A bounding box of geometries if it can be built. + optional Geometry covering // A covering polygon of geometries if bbox is unavailable. } /** Statistics specific to GEOMETRY logical type */ struct GeometryStatistics { - /** Envelope of geometries */ - 1: optional Envelope envelope; + /** Covering of geometries */ + 1: optional Covering covering; /** * The geometry types of all geometries, or an empty array if they are not @@ -323,11 +307,6 @@ struct GeometryStatistics { * https://github.com/opengeospatial/geoparquet/blob/v1.0.0/format-specs/geoparquet.md?plain=1#L91 */ 2: optional list geometry_types; - - // S2 and H3 are controversial from the discussion. Now they are commented - // out to show a possible approach for future extension. - // 3: optional S2Index s2; - // 4: optional H3Index h3; } /** @@ -486,41 +465,7 @@ enum GeometryEncoding { */ WKB = 0; - /** - * Encodings from POINT to MULTIPOLYGON below are specialized for single - * geometry type and inspired by GeoArrow (https://geoarrow.org/format.html) - * native encodings. It uses the separated (struct) representation of - * coordinates for single-geometry type encodings because this encoding - * results in useful column statistics when row groups and/or files contain - * related features. - * - * The actual coordinates of the geometries MUST be stored as native numbers, - * i.e. using the DOUBLE type in a (repeated) group of fields (exact - * repetition depending on the geometry type). 
- * - * For the POINT encoding, this results in a struct of two fields for x and y - * coordinates (in case of 2D geometries): - * optional group geometry { - * required double x; - * required double y; - * } - * - * For more detail, please refer to link below: - * https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md#encoding - * - * WARNING: GeometryStatistics cannot be enabled for these encodings because - * only leaf columns can have column statistics and page index. In this case, - * the statistics for the leaf columns contain equivalent information to the - * bounding box. - */ - // Native encodings are controversial from the discussion. Now they are commented - // out to show a possible approach for future extension. - // POINT = 1; - // LINESTRING = 2; - // POLYGON = 3; - // MULTIPOINT = 4; - // MULTILINESTRING = 5; - // MULTIPOLYGON = 6; + // TODO: add native encoding from GeoParquet/GeoArrow } /** From 4b8e1f893d17f0ad44634a624fb08177584eda1b Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sun, 16 Jun 2024 15:30:17 +0800 Subject: [PATCH 06/22] address feedback --- src/main/thrift/parquet.thrift | 39 +++++++++++++++------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 62f0c38e..8729b943 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -240,7 +240,7 @@ struct SizeStatistics { /** * Interpretation for edges of GEOMETRY logical type, i.e. whether the edge * between points represent a straight cartesian line or the shortest line on - * the sphere. + * the sphere. Please note that it only applies to polygons. */ enum Edges { PLANAR = 0; @@ -248,20 +248,17 @@ enum Edges { } /** - * A custom WKB-encoded geometry data to be used in geometry statistics. 
- * The geometry may be a polygon to encode an s2 or h3 covering to provide - * vendor-agnostic coverings, or an evelope of geometries when a bounding - * box cannot be built (e.g. a geometry has spherical edges, or if an edge - * of geographic coordinates crosses the antimeridian). + * A custom WKB-encoded polygon or multi-polygon to represent a covering of + * geometries. For example, it may be a bounding box, or an evelope of geometries + * when a bounding box cannot be built (e.g. a geometry has spherical edges, or if + * an edge of geographic coordinates crosses the antimeridian). In addition, it can + * also be used to provide vendor-agnostic coverings like S2 or H3 grids. */ -struct Geometry { +struct Covering { /** Bytes of a WKB-encoded geometry */ 1: required binary geometry; - /** - * Edges of the geometry if it is a polygon. It may be different to the - * edges attribute from the GEOMETRY logical type. - */ - 2: optional Edges edges; + /** Edges of the geometry, which is independent of edges from the logical type */ + 2: required Edges edges; } /** @@ -279,15 +276,13 @@ struct BoundingBox { 8: optional double mmax; } -struct Covering { - optional BoundingBox bbox // A bounding box of geometries if it can be built. - optional Geometry covering // A covering polygon of geometries if bbox is unavailable. 
-} - /** Statistics specific to GEOMETRY logical type */ struct GeometryStatistics { - /** Covering of geometries */ - 1: optional Covering covering; + /** A bounding box of geometries */ + 1: optional BoundingBox bbox; + + /** A covering polygon of geometries */ + 2: optional Covering covering; /** * The geometry types of all geometries, or an empty array if they are not @@ -306,7 +301,7 @@ struct GeometryStatistics { * Please refer to link below for more detail: * https://github.com/opengeospatial/geoparquet/blob/v1.0.0/format-specs/geoparquet.md?plain=1#L91 */ - 2: optional list<string> geometry_types; + 3: optional list<string> geometry_types; } /** @@ -449,7 +444,7 @@ struct BsonType { } /** - * Phyiscal type and encoding for the geometry type. + * Physical type and encoding for the geometry type. */ enum GeometryEncoding { /** @@ -490,7 +485,7 @@ struct GeometryType { * Additional informative metadata. * It can be used by GeoParquet to offload some of the column metadata. */ - 4: optional string metadata; + 4: optional binary metadata; } /** From 69b59780f38e94aa892b4505f314d4e13c791ce5 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sun, 16 Jun 2024 15:31:20 +0800 Subject: [PATCH 07/22] fix typo --- src/main/thrift/parquet.thrift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 8729b943..2a2457c5 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -448,7 +448,7 @@ struct BsonType { */ enum GeometryEncoding { /** - * Allowed for phyiscal type: BYTE_ARRAY. + * Allowed for physical type: BYTE_ARRAY. * * Well-known binary (WKB) representations of geometries. It supports 2D or * 3D geometries of the standard geometry types (Point, LineString, Polygon, @@ -468,7 +468,7 @@ enum GeometryEncoding { */ struct GeometryType { /** - * Phyiscal type and encoding for the geometry type. + * Physical type and encoding for the geometry type.
Please refer to the * definition of GeometryEncoding for more detail. */ 1: required GeometryEncoding encoding; From ea12bd29bdbfc5a4f6f5a8e5a2c279c281f4d257 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 19 Jun 2024 22:17:03 +0800 Subject: [PATCH 08/22] use WKB type code --- src/main/thrift/parquet.thrift | 38 +++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 2a2457c5..801dde3b 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -286,22 +286,32 @@ struct GeometryStatistics { /** * The geometry types of all geometries, or an empty array if they are not - * known. It follows the same rule of `geometry_types` column metadata of - * GeoParquet. Accepted geometry types are: "Point", "LineString", "Polygon", - * "MultiPoint", "MultiLineString", "MultiPolygon", "GeometryCollection". + * known. This is borrowed from `geometry_types` column metadata of GeoParquet [1] + * except that values in the list are WKB (ISO variant) integer codes [2]. Table + * below shows the most common geometry types and their codes: + * + * | Type | XY | XYZ | XYM | XYZM | + * | :----------------- | :--- | :--- | :--- | :--: | + * | Point | 0001 | 1001 | 2001 | 3001 | + * | LineString | 0002 | 1002 | 2002 | 3002 | + * | Polygon | 0003 | 1003 | 2003 | 3003 | + * | MultiPoint | 0004 | 1004 | 2004 | 3004 | + * | MultiLineString | 0005 | 1005 | 2005 | 3005 | + * | MultiPolygon | 0006 | 1006 | 2006 | 3006 | + * | GeometryCollection | 0007 | 1007 | 2007 | 3007 | * * In addition, the following rules are used: - * - In case of 3D geometries, a `" Z"` suffix gets added (e.g. `["Point Z"]`). * - A list of multiple values indicates that multiple geometry types are - * present (e.g. `["Polygon", "MultiPolygon"]`). + * present (e.g. `[0003, 0006]`). * - An empty array explicitly signals that the geometry types are not known. 
- * The geometry types in the list must be unique (e.g. `["Point", "Point"]` + * The geometry types in the list must be unique (e.g. `[0001, 0001]` * is not valid). * - * Please refer to link below for more detail: - * https://github.com/opengeospatial/geoparquet/blob/v1.0.0/format-specs/geoparquet.md?plain=1#L91 + * Please refer to links below for more detail: + * [1] https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry#Well-known_binary + * [2] https://github.com/opengeospatial/geoparquet/blob/v1.0.0/format-specs/geoparquet.md?plain=1#L91 */ - 3: optional list<string> geometry_types; + 3: optional list<i32> geometry_types; } /** @@ -473,14 +483,14 @@ struct GeometryType { */ 1: required GeometryEncoding encoding; /** - * Coordinate Reference System, i.e. mapping of how coordinates refer to - * precise locations on earth, e.g. OGC:CRS84 + * Edges of polygon. */ - 2: optional string crs; + 2: required Edges edges; /** - * Edges of polygon. + * Coordinate Reference System, i.e. mapping of how coordinates refer to + * precise locations on earth, e.g. OGC:CRS84 */ - 3: optional Edges edges; + 3: optional string crs; /** * Additional informative metadata. * It can be used by GeoParquet to offload some of the column metadata. From 82ac23279f5c00dd6dcc27034e46690652794907 Mon Sep 17 00:00:00 2001 From: Feng Zhang Date: Wed, 7 Aug 2024 07:57:19 -0700 Subject: [PATCH 09/22] Update covering and geometry type protocol based on comments (#2) --- src/main/thrift/parquet.thrift | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 801dde3b..fdc94b27 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -249,16 +249,22 @@ enum Edges { /** * A custom WKB-encoded polygon or multi-polygon to represent a covering of - * geometries.
For example, it may be a bounding box, or an evelope of geometries - * when a bounding box cannot be built (e.g. a geometry has spherical edges, or if + * geometries. For example, it may be a bounding box or an envelope of geometries + * when a bounding box cannot be built (e.g., a geometry has spherical edges, or if * an edge of geographic coordinates crosses the antimeridian). In addition, it can * also be used to provide vendor-agnostic coverings like S2 or H3 grids. */ struct Covering { - /** Bytes of a WKB-encoded geometry */ - 1: required binary geometry; - /** Edges of the geometry, which is independent of edges from the logical type */ - 2: required Edges edges; + /** + * A type of covering. Currently accepted values: "WKB". + */ + 1: required string kind; + /** A payload specific to kind: + * - WKB: well-known binary of a POLYGON that completely covers the contents. + * This will be interpreted according to the same CRS and edges defined by + * the logical type. + */ + 2: required binary value; } /** @@ -281,8 +287,8 @@ struct GeometryStatistics { /** A bounding box of geometries */ 1: optional BoundingBox bbox; - /** A covering polygon of geometries */ - 2: optional Covering covering; + /** A list of coverings of geometries */ + 2: optional list<Covering> coverings; /** * The geometry types of all geometries, or an empty array if they are not @@ -488,14 +494,19 @@ struct GeometryType { 2: required Edges edges; /** * Coordinate Reference System, i.e. mapping of how coordinates refer to - * precise locations on earth, e.g. OGC:CRS84 + * precise locations on earth. */ 3: optional string crs; + /** + * Encoding used in the above crs field. + * Currently the only allowed value is "PROJJSON". + */ + 4: optional string crs_encoding; /** * Additional informative metadata. * It can be used by GeoParquet to offload some of the column metadata.
*/ - 4: optional binary metadata; + 5: optional binary metadata; } /** From 82be6d81ff2544e0657943e43ba8d1958a91d40a Mon Sep 17 00:00:00 2001 From: Jia Yu Date: Wed, 14 Aug 2024 22:25:16 -0700 Subject: [PATCH 10/22] Add the new suggestion according to the meeting with Snowflake (#3) --- src/main/thrift/parquet.thrift | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index fdc94b27..cac3fd31 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -270,6 +270,9 @@ struct Covering { /** * Bounding box of geometries in the representation of min/max value pair of * coordinates from each axis. Values of Z and M are omitted for 2D geometries. + * Filter pushdown on geometries are only safe for planar spatial predicate + * but it is recommended that the writer always generates bounding box statistics, + * regardless of whether the geometries are planar or spherical. */ struct BoundingBox { 1: required double xmin; From 336a4f287780c316c1518d01e779b82d481c1a18 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 21 Aug 2024 00:04:18 +0800 Subject: [PATCH 11/22] change metadata to string type and rewording WKB description --- src/main/thrift/parquet.thrift | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index cac3fd31..e20d0b4d 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -248,7 +248,7 @@ enum Edges { } /** - * A custom WKB-encoded polygon or multi-polygon to represent a covering of + * A custom binary-encoded polygon or multi-polygon to represent a covering of * geometries. For example, it may be a bounding box or an envelope of geometries * when a bounding box cannot be built (e.g., a geometry has spherical edges, or if * an edge of geographic coordinates crosses the antimeridian). 
In addition, it can @@ -259,10 +259,11 @@ struct Covering { * A type of covering. Currently accepted values: "WKB". */ 1: required string kind; - /** A payload specific to kind: - * - WKB: well-known binary of a POLYGON that completely covers the contents. - * This will be interpreted according to the same CRS and edges defined by - * the logical type. + /** + * A payload specific to kind: + * - WKB: well-known binary of a POLYGON or MULTI-POLYGON that completely + * covers the contents. This will be interpreted according to the same CRS + * and edges defined by the logical type. */ 2: required binary value; } @@ -318,7 +319,7 @@ struct GeometryStatistics { * * Please refer to links below for more detail: * [1] https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry#Well-known_binary - * [2] https://github.com/opengeospatial/geoparquet/blob/v1.0.0/format-specs/geoparquet.md?plain=1#L91 + * [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L159 */ 3: optional list<i32> geometry_types; } @@ -476,6 +477,11 @@ enum GeometryEncoding { * * This encoding enables GeometryStatistics to be set in the column chunk * and page index. + * + * Please note that we follow the same rule of WKB and coordinate axis order + * of GeoParquet, see detail below: + * [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L92 + * [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155 */ WKB = 0; @@ -507,9 +513,10 @@ struct GeometryType { 4: optional string crs_encoding; /** * Additional informative metadata. * It can be used by GeoParquet to offload some of the column metadata.
+ * GeoParquet could offload its column metadata in a JSON-encoded UTF-8 string: + * https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L46 */ - 5: optional binary metadata; + 5: optional string metadata; } /** From 7de7d79b9e692e67475c778efea09c757961ae03 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 21 Aug 2024 22:50:56 +0800 Subject: [PATCH 12/22] add example for crs --- src/main/thrift/parquet.thrift | 40 +++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index e20d0b4d..38c9f52e 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -504,10 +504,48 @@ struct GeometryType { /** * Coordinate Reference System, i.e. mapping of how coordinates refer to * precise locations on earth. + * + * For example, OGC:CRS84 encoded in PROJJSON is set as below: + * { + * "$schema": "https://proj.org/schemas/v0.5/projjson.schema.json", + * "type": "GeographicCRS", + * "name": "WGS 84 longitude-latitude", + * "datum": { + * "type": "GeodeticReferenceFrame", + * "name": "World Geodetic System 1984", + * "ellipsoid": { + * "name": "WGS 84", + * "semi_major_axis": 6378137, + * "inverse_flattening": 298.257223563 + * } + * }, + * "coordinate_system": { + * "subtype": "ellipsoidal", + * "axis": [ + * { + * "name": "Geodetic longitude", + * "abbreviation": "Lon", + * "direction": "east", + * "unit": "degree" + * }, + * { + * "name": "Geodetic latitude", + * "abbreviation": "Lat", + * "direction": "north", + * "unit": "degree" + * } + * ] + * }, + * "id": { + * "authority": "OGC", + * "code": "CRS84" + * } + * } */ 3: optional string crs; /** - * Encoding used in the above crs field. + * Encoding used in the above crs field. If MUST be set if crs is set. + * * Currently the only allowed value is "PROJJSON". 
*/ 4: optional string crs_encoding; From 04d4f944c0d543aad55e6c857ce1047c6018c9ee Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 21 Aug 2024 23:04:22 +0800 Subject: [PATCH 13/22] reword crs --- src/main/thrift/parquet.thrift | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 38c9f52e..ff683dc2 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -503,9 +503,9 @@ struct GeometryType { 2: required Edges edges; /** * Coordinate Reference System, i.e. mapping of how coordinates refer to - * precise locations on earth. - * - * For example, OGC:CRS84 encoded in PROJJSON is set as below: + * precise locations on earth. Writers are not required to set this field. + * Once crs is set, crs_encoding field below MUST be set together. + * For example, "OGC:CRS84" can be set in the form of PROJJSON as below: * { * "$schema": "https://proj.org/schemas/v0.5/projjson.schema.json", * "type": "GeographicCRS", @@ -544,8 +544,7 @@ struct GeometryType { */ 3: optional string crs; /** - * Encoding used in the above crs field. If MUST be set if crs is set. - * + * Encoding used in the above crs field. It MUST be set if crs field is set. * Currently the only allowed value is "PROJJSON". */ 4: optional string crs_encoding; From 68f061d557bc43d90176de6f5e3d30f84af694d3 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 22 Aug 2024 11:51:43 +0800 Subject: [PATCH 14/22] clarify WKB --- src/main/thrift/parquet.thrift | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index ff683dc2..91b15c99 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -240,7 +240,7 @@ struct SizeStatistics { /** * Interpretation for edges of GEOMETRY logical type, i.e. 
whether the edge * between points represent a straight cartesian line or the shortest line on - * the sphere. Please note that it only applies to polygons. + * the sphere. It applies to all non-point geometry objects. */ enum Edges { PLANAR = 0; @@ -260,7 +260,7 @@ struct Covering { */ 1: required string kind; /** - * A payload specific to kind: + * A payload specific to kind. Below are the supported values: * - WKB: well-known binary of a POLYGON or MULTI-POLYGON that completely * covers the contents. This will be interpreted according to the same CRS * and edges defined by the logical type. @@ -470,22 +470,20 @@ enum GeometryEncoding { /** * Allowed for physical type: BYTE_ARRAY. * - * Well-known binary (WKB) representations of geometries. It supports 2D or - * 3D geometries of the standard geometry types (Point, LineString, Polygon, - * MultiPoint, MultiLineString, MultiPolygon, and GeometryCollection). This - * is the preferred option for maximum portability. + * Well-known binary (WKB) representations of geometries. * - * This encoding enables GeometryStatistics to be set in the column chunk - * and page index. + * To be clear, we follow the same rule of WKB and coordinate axis order from + * GeoParquet [1][2]. It is the ISO WKB supporting XY, XYZ, XYM, XYZM and the + * standard geometry types (Point, LineString, Polygon, MultiPoint, + * MultiLineString, MultiPolygon, and GeometryCollection). + * + * This is the preferred encoding for maximum portability. It also supports + * GeometryStatistics to be set in the column chunk and page index. 
* - * Please note that we follow the same rule of WKB and coordinate axis order - * of GeoParquet, see detail below: * [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L92 * [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155 */ WKB = 0; - - // TODO: add native encoding from GeoParquet/GeoArrow } /** @@ -493,12 +491,13 @@ enum GeometryEncoding { */ struct GeometryType { /** - * Physical type and encoding for the geometry type. Please refer to the - * definition of GeometryEncoding for more detail. + * Physical type and encoding for the geometry type. + * Please refer to the definition of GeometryEncoding for more detail. */ 1: required GeometryEncoding encoding; /** - * Edges of polygon. + * Edges of geometry type. + * Please refer to the definition of Edges for more detail. */ 2: required Edges edges; /** From 3196377e278f8320788b464dbdccbf0230b3f11d Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sat, 24 Aug 2024 13:40:25 +0800 Subject: [PATCH 15/22] clarify coverings --- src/main/thrift/parquet.thrift | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 91b15c99..09829743 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -250,7 +250,7 @@ enum Edges { /** * A custom binary-encoded polygon or multi-polygon to represent a covering of * geometries. For example, it may be a bounding box or an envelope of geometries - * when a bounding box cannot be built (e.g., a geometry has spherical edges, or if + * when a bounding box cannot be built (e.g. a geometry has spherical edges, or if * an edge of geographic coordinates crosses the antimeridian). In addition, it can * also be used to provide vendor-agnostic coverings like S2 or H3 grids. 
*/ @@ -291,7 +291,12 @@ struct GeometryStatistics { /** A bounding box of geometries */ 1: optional BoundingBox bbox; - /** A list of coverings of geometries */ + /** + * A list of coverings of geometries. + * Note that it is allowed to have more than one covering of the same kind and + * an implementation is free to use any of them. It is recommended to have at most + * one covering for each kind. + */ 2: optional list<Covering> coverings; /** From 79aedb0152b2f2380d7794f2c79ebd81103b5c5e Mon Sep 17 00:00:00 2001 From: Jia Yu Date: Tue, 10 Sep 2024 22:12:31 -0700 Subject: [PATCH 16/22] Update the suggestion for bbox stats (#4) * Add the new suggestion according to the meeting with Snowflake * Refine the description according to the suggestion --- src/main/thrift/parquet.thrift | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 09829743..528e6323 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -271,9 +271,8 @@ struct Covering { /** * Bounding box of geometries in the representation of min/max value pair of * coordinates from each axis. Values of Z and M are omitted for 2D geometries. - * Filter pushdown on geometries are only safe for planar spatial predicate - * but it is recommended that the writer always generates bounding box statistics, - * regardless of whether the geometries are planar or spherical. + * Filter pushdown on geometries using this is only safe for planar spatial + * filters.
*/ struct BoundingBox { 1: required double xmin; From 89bfac7c340bbc680b541be4fbf2826996722c64 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 20 Sep 2024 23:08:47 +0800 Subject: [PATCH 17/22] Update src/main/thrift/parquet.thrift Co-authored-by: Dewey Dunnington --- src/main/thrift/parquet.thrift | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 528e6323..c04f2921 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -477,9 +477,13 @@ enum GeometryEncoding { * Well-known binary (WKB) representations of geometries. * * To be clear, we follow the same rule of WKB and coordinate axis order from - * GeoParquet [1][2]. It is the ISO WKB supporting XY, XYZ, XYM, XYZM and the - * standard geometry types (Point, LineString, Polygon, MultiPoint, - * MultiLineString, MultiPolygon, and GeometryCollection). + * GeoParquet [1][2]. Geometries SHOULD be encoded as ISO WKB [3][4] + * supporting XY, XYZ, XYM, XYZM and the standard geometry types + * Point, LineString, Polygon, MultiPoint, MultiLineString, MultiPolygon, + * and GeometryCollection). Coordinate order is always (x, y) where x is + * easting or longitude and y is northing or latitude. This ordering explicitly + * overrides the axis order as specified in the CRS following the GeoPackage + * specification [5]. * * This is the preferred encoding for maximum portability. It also supports * GeometryStatistics to be set in the column chunk and page index. 
From d98cf6169cbc8dca5ce64ba3333a3c8ff659360b Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 20 Sep 2024 23:08:58 +0800 Subject: [PATCH 18/22] Update src/main/thrift/parquet.thrift Co-authored-by: Dewey Dunnington --- src/main/thrift/parquet.thrift | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index c04f2921..51e6aca9 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -490,6 +490,9 @@ enum GeometryEncoding { * * [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L92 * [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155 + * [3] https://portal.ogc.org/files/?artifact_id=18241 + * [4] https://www.iso.org/standard/60343.html + * [5] https://www.geopackage.org/spec130/#gpb_spec */ WKB = 0; } From 2a3524f1327a894dabe2089a0f4a2287ee0f92af Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 20 Sep 2024 23:09:29 +0800 Subject: [PATCH 19/22] Update src/main/thrift/parquet.thrift Co-authored-by: Dewey Dunnington --- src/main/thrift/parquet.thrift | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 51e6aca9..a8fcbdd4 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -238,9 +238,16 @@ struct SizeStatistics { } /** - * Interpretation for edges of GEOMETRY logical type, i.e. whether the edge - * between points represent a straight cartesian line or the shortest line on - * the sphere. It applies to all non-point geometry objects. + * Interpretation for edges of elements of a GEOMETRY logical type. 
In other + * words, whether a point between two vertices should be interpolated in + * its XY dimensions as if it were a Cartesian line connecting the two + * vertices (planar) or the shortest spherical arc between the longitude + * and latitude represented by the two vertices (spherical). This value + * applies to all non-point geometry objects and is independent of the + * coordinate reference system. + * + * Because most systems currently assume planar edges and do not support + * spherical edges, planar should be used as the default value. */ enum Edges { PLANAR = 0; From 3e54c7ed5e2d127d22722375ca532cda09842351 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 20 Sep 2024 23:09:53 +0800 Subject: [PATCH 20/22] Update src/main/thrift/parquet.thrift Co-authored-by: Dewey Dunnington --- src/main/thrift/parquet.thrift | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index a8fcbdd4..90b0daab 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -258,8 +258,9 @@ enum Edges { * A custom binary-encoded polygon or multi-polygon to represent a covering of * geometries. For example, it may be a bounding box or an envelope of geometries * when a bounding box cannot be built (e.g. a geometry has spherical edges, or if - * an edge of geographic coordinates crosses the antimeridian). In addition, it can - * also be used to provide vendor-agnostic coverings like S2 or H3 grids. + * an edge of geographic coordinates crosses the antimeridian). It may be + * extended in future versions to provide vendor-agnostic coverings like + * vectors of cells on a discrete global grid (e.g., S2 or H3 cells). 
*/ struct Covering { /** From 3961cbfd41e6465e60d9acf591d9df9e2d0ede2d Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 20 Sep 2024 23:46:49 +0800 Subject: [PATCH 21/22] address feedback about edges and wkb --- src/main/thrift/parquet.thrift | 77 ++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 37 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 90b0daab..77af877b 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -237,6 +237,36 @@ struct SizeStatistics { 3: optional list<i64> definition_level_histogram; } +/** + * Physical type and encoding for the geometry type. + */ +enum GeometryEncoding { + /** + * Allowed for physical type: BYTE_ARRAY. + * + * Well-known binary (WKB) representations of geometries. + * + * To be clear, we follow the same rule of WKB and coordinate axis order from + * GeoParquet [1][2]. Geometries SHOULD be encoded as ISO WKB [3][4] + * supporting XY, XYZ, XYM, XYZM and the standard geometry types + * (Point, LineString, Polygon, MultiPoint, MultiLineString, MultiPolygon, + * and GeometryCollection). Coordinate order is always (x, y) where x is + * easting or longitude and y is northing or latitude. This ordering explicitly + * overrides the axis order as specified in the CRS following the GeoPackage + * specification [5]. + * + * This is the preferred encoding for maximum portability. It also supports + * GeometryStatistics to be set in the column chunk and page index. + * + * [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L92 + * [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155 + * [3] https://portal.ogc.org/files/?artifact_id=18241 + * [4] https://www.iso.org/standard/60343.html + * [5] https://www.geopackage.org/spec130/#gpb_spec + */ + WKB = 0; +} + /** * Interpretation for edges of elements of a GEOMETRY logical type.
In other * words, whether a point between two vertices should be interpolated in @@ -249,7 +279,7 @@ struct SizeStatistics { * Because most systems currently assume planar edges and do not support * spherical edges, planar should be used as the default value. */ -enum Edges { +enum EdgeInterpolation { PLANAR = 0; SPHERICAL = 1; } @@ -475,36 +505,6 @@ struct JsonType { struct BsonType { } -/** - * Physical type and encoding for the geometry type. - */ -enum GeometryEncoding { - /** - * Allowed for physical type: BYTE_ARRAY. - * - * Well-known binary (WKB) representations of geometries. - * - * To be clear, we follow the same rule of WKB and coordinate axis order from - * GeoParquet [1][2]. Geometries SHOULD be encoded as ISO WKB [3][4] - * supporting XY, XYZ, XYM, XYZM and the standard geometry types - * Point, LineString, Polygon, MultiPoint, MultiLineString, MultiPolygon, - * and GeometryCollection). Coordinate order is always (x, y) where x is - * easting or longitude and y is northing or latitude. This ordering explicitly - * overrides the axis order as specified in the CRS following the GeoPackage - * specification [5]. - * - * This is the preferred encoding for maximum portability. It also supports - * GeometryStatistics to be set in the column chunk and page index. - * - * [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L92 - * [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155 - * [3] https://portal.ogc.org/files/?artifact_id=18241 - * [4] https://www.iso.org/standard/60343.html - * [5] https://www.geopackage.org/spec130/#gpb_spec - */ - WKB = 0; -} - /** * Geometry logical type annotation (added in 2.11.0) */ @@ -515,10 +515,12 @@ struct GeometryType { */ 1: required GeometryEncoding encoding; /** - * Edges of geometry type. + * Interpretation for edges of elements of a GEOMETRY logical type, i.e. 
whether + * the interpolation between points along an edge represents a straight cartesian + * line or the shortest line on the sphere. - * Please refer to the definition of Edges for more detail. + * Please refer to the definition of EdgeInterpolation for more detail. */ - 2: required Edges edges; + 2: required EdgeInterpolation edges; /** * Coordinate Reference System, i.e. mapping of how coordinates refer to * precise locations on earth. Writers are not required to set this field. @@ -567,11 +569,12 @@ struct GeometryType { */ 4: optional string crs_encoding; /** - * Additional informative metadata. - * GeoParquet could offload its column metadata in a JSON-encoded UTF-8 string: - * https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L46 + * Additional informative metadata as a list of key-value pairs of UTF-8 strings. + * It is not strictly required by the low-level Parquet implementation for + * features like statistics or filter pushdown. Using a list of key-value pairs + * provides maximum flexibility for adding future informative metadata. */ - 5: optional string metadata; + 5: optional list<KeyValue> key_value_metadata; } /** From 98c358914efb4988ffc83cf8e47fda9342857683 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 27 Sep 2024 09:48:09 +0800 Subject: [PATCH 22/22] add geoparquet column metadata back --- src/main/thrift/parquet.thrift | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 77af877b..47a28433 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -570,9 +570,13 @@ struct GeometryType { 4: optional string crs_encoding; /** * Additional informative metadata as a list of key-value pairs of UTF-8 strings. + * * It is not strictly required by the low-level Parquet implementation for * features like statistics or filter pushdown. Using a list of key-value pairs * provides maximum flexibility for adding future informative metadata.
+ * + * GeoParquet could store its column metadata in this field: + * https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L46 */ 5: optional list<KeyValue> key_value_metadata; }