From fec464cacc9ae2572b285789f718ac3ee65b6708 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Sun, 19 May 2024 08:56:45 -0400
Subject: [PATCH 1/4] Refine parquet documentation on types and metadata

---
 parquet/src/file/metadata.rs         | 61 +++++++++++++++++++---------
 parquet/src/file/mod.rs              | 11 +++--
 parquet/src/file/page_index/index.rs |  9 ++--
 parquet/src/file/statistics.rs       |  1 +
 parquet/src/format.rs                |  1 +
 parquet/src/lib.rs                   | 23 +++++++----
 parquet/src/schema/mod.rs            | 14 +++++++
 parquet/src/schema/types.rs          | 52 +++++++++++++++---------
 8 files changed, 116 insertions(+), 56 deletions(-)

diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs
index c9232d83e80d..58096d58a138 100644
--- a/parquet/src/file/metadata.rs
+++ b/parquet/src/file/metadata.rs
@@ -15,23 +15,20 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Contains information about available Parquet metadata.
+//! Parquet metadata structures
 //!
-//! The hierarchy of metadata is as follows:
+//! * [`ParquetMetaData`]: Top level metadata container, read from the Parquet
+//!   file footer.
 //!
-//! [`ParquetMetaData`](struct.ParquetMetaData.html) contains
-//! [`FileMetaData`](struct.FileMetaData.html) and zero or more
-//! [`RowGroupMetaData`](struct.RowGroupMetaData.html) for each row group.
+//! * [`FileMetaData`]: File level metadata such as schema, row counts and
+//!   version.
 //!
-//! [`FileMetaData`](struct.FileMetaData.html) includes file version, application specific
-//! metadata.
+//! * [`RowGroupMetaData`]: Metadata for each Row Group within a File, such as
+//!   location and number of rows, and column chunks.
 //!
-//! Each [`RowGroupMetaData`](struct.RowGroupMetaData.html) contains information about row
-//! group and one or more [`ColumnChunkMetaData`](struct.ColumnChunkMetaData.html) for
-//! each column chunk.
-//!
-//! [`ColumnChunkMetaData`](struct.ColumnChunkMetaData.html) has information about column
-//! chunk (primitive leaf column), including encoding/compression, number of values, etc.
+//! * [`ColumnChunkMetaData`]: Metadata for each column chunk (primitive leaf)
+//!   within a Row Group, including encoding and compression information,
+//!   number of values, statistics, etc.
 
 use std::ops::Range;
 use std::sync::Arc;
@@ -61,7 +58,7 @@ use crate::schema::types::{
 /// column in the third row group of the parquet file.
 pub type ParquetColumnIndex = Vec<Vec<Index>>;
 
-/// [`PageLocation`] for each datapage of each row group of each column.
+/// [`PageLocation`] for each data page of each row group of each column.
 ///
 /// `offset_index[row_group_number][column_number][page_number]` holds
 /// the [`PageLocation`] corresponding to page `page_number` of column
@@ -72,10 +69,27 @@ pub type ParquetColumnIndex = Vec<Vec<Index>>;
 /// parquet file.
 pub type ParquetOffsetIndex = Vec<Vec<Vec<PageLocation>>>;
 
-/// Global Parquet metadata.
+/// Global Parquet metadata, including [`FileMetaData`] and [`RowGroupMetaData`].
+///
+/// This structure is stored in the footer of Parquet files, in the format
+/// defined by [`parquet.thrift`]. It contains:
+///
+/// * File level metadata: [`FileMetaData`]
+/// * Row Group level metadata: [`RowGroupMetaData`]
+/// * (Optional) "Page Index": [`ParquetColumnIndex`]
+/// * (Optional) Offset index: [`ParquetOffsetIndex`]
+///
+/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
+///
+/// This structure is read by the various readers in this crate or can be read
+/// directly from a file using the [`parse_metadata`] function.
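+///
+/// # Example
+///
+/// A minimal sketch of reading the metadata footer from an open file; the
+/// file name and `unwrap` error handling here are illustrative only:
+///
+/// ```no_run
+/// # use std::fs::File;
+/// # use parquet::file::footer::parse_metadata;
+/// // `File` implements `ChunkReader`, so the footer can be parsed directly
+/// let file = File::open("data.parquet").unwrap();
+/// let metadata = parse_metadata(&file).unwrap();
+/// println!("Read metadata for {} row groups", metadata.num_row_groups());
+/// ```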
+///
+/// [`parse_metadata`]: crate::file::footer::parse_metadata
 #[derive(Debug, Clone)]
 pub struct ParquetMetaData {
+    /// File level metadata
     file_metadata: FileMetaData,
+    /// Row group metadata
     row_groups: Vec<RowGroupMetaData>,
     /// Page index for all pages in each column chunk
     column_index: Option<ParquetColumnIndex>,
@@ -172,7 +186,9 @@ pub type KeyValue = crate::format::KeyValue;
 /// Reference counted pointer for [`FileMetaData`].
 pub type FileMetaDataPtr = Arc<FileMetaData>;
 
-/// Metadata for a Parquet file.
+/// File level metadata for a Parquet file.
+///
+/// Includes the version of the file, metadata, number of rows, schema, and column orders.
 #[derive(Debug, Clone)]
 pub struct FileMetaData {
     version: i32,
@@ -271,7 +287,10 @@ impl FileMetaData {
 /// Reference counted pointer for [`RowGroupMetaData`].
 pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;
 
-/// Metadata for a row group.
+/// Metadata for a row group
+///
+/// Includes [`ColumnChunkMetaData`] for each column in the row group, the number of rows,
+/// the total byte size of the row group, and the [`SchemaDescriptor`] for the row group.
 #[derive(Debug, Clone, PartialEq)]
 pub struct RowGroupMetaData {
     columns: Vec<ColumnChunkMetaData>,
@@ -279,8 +298,9 @@ pub struct RowGroupMetaData {
     sorting_columns: Option<Vec<SortingColumn>>,
     total_byte_size: i64,
     schema_descr: SchemaDescPtr,
-    // We can't infer from file offset of first column since there may empty columns in row group.
+    /// We can't infer this from the file offset of the first column since there may be empty columns in the row group.
     file_offset: Option<i64>,
+    /// Ordinal position of this row group in file
    ordinal: Option<i16>,
 }
 
@@ -335,7 +355,10 @@ impl RowGroupMetaData {
         self.schema_descr.clone()
     }
 
-    /// Returns ordinal of this row group in file
+    /// Returns the ordinal position of this row group in the file.
+    ///
+    /// For example, if this is the first row group in the file, this will return 0.
+    /// If this is the second row group in the file, this will return 1.
     #[inline(always)]
     pub fn ordinal(&self) -> Option<i16> {
         self.ordinal
diff --git a/parquet/src/file/mod.rs b/parquet/src/file/mod.rs
index 6589d2efaf8b..a1df33633fc7 100644
--- a/parquet/src/file/mod.rs
+++ b/parquet/src/file/mod.rs
@@ -19,10 +19,13 @@
 //!
 //! Provides access to file and row group readers and writers, record API, metadata, etc.
 //!
-//! See [`serialized_reader::SerializedFileReader`](serialized_reader/struct.SerializedFileReader.html) or
-//! [`writer::SerializedFileWriter`](writer/struct.SerializedFileWriter.html) for a
-//! starting reference, [`metadata::ParquetMetaData`](metadata/index.html) for file
-//! metadata, and [`statistics`](statistics/index.html) for working with statistics.
+//! # See Also:
+//! * [`SerializedFileReader`] and [`SerializedFileWriter`] for reading / writing parquet
+//! * [`metadata`]: for working with metadata such as schema
+//! * [`statistics`]: for working with statistics in metadata
+//!
+//! [`SerializedFileReader`]: serialized_reader::SerializedFileReader
+//! [`SerializedFileWriter`]: writer::SerializedFileWriter
 //!
 //! # Example of writing a new file
 //!
diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs
index f3a09046a63c..ab342d52b7f5 100644
--- a/parquet/src/file/page_index/index.rs
+++ b/parquet/src/file/page_index/index.rs
@@ -57,10 +57,11 @@ impl PageIndex {
 
 #[derive(Debug, Clone, PartialEq)]
 #[allow(non_camel_case_types)]
-/// Typed statistics for a data page in a column chunk. This structure
-/// is obtained from decoding the [ColumnIndex] in the parquet file
-/// and can be used to skip decoding pages while reading the file
-/// data.
+/// Typed statistics for a data page in a column chunk.
+///
+/// This structure is part of the "Page Index" and is optionally part of
+/// [ColumnIndex] in the parquet file and can be used to skip decoding pages
+/// while reading the file data.
 pub enum Index {
     /// Sometimes reading page index from parquet file
     /// will only return pageLocations without min_max index,
diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs
index d24b91741bef..7d704cc138fc 100644
--- a/parquet/src/file/statistics.rs
+++ b/parquet/src/file/statistics.rs
@@ -20,6 +20,7 @@
 //! Though some common methods are available on enum, use pattern match to extract
 //! actual min and max values from statistics, see below:
 //!
+//! # Examples
 //! ```rust
 //! use parquet::file::statistics::Statistics;
 //!
diff --git a/parquet/src/format.rs b/parquet/src/format.rs
index 9f4ddfe82855..5e091158ab46 100644
--- a/parquet/src/format.rs
+++ b/parquet/src/format.rs
@@ -1,3 +1,4 @@
+//! See [`crate::file`] for easier to use APIs.
 // Autogenerated by Thrift Compiler (0.19.0)
 // DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
 
diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs
index 1166703bb150..f8342453fec7 100644
--- a/parquet/src/lib.rs
+++ b/parquet/src/lib.rs
@@ -28,25 +28,30 @@
 //! # Format Overview
 //!
 //! Parquet is a columnar format, which means that unlike row formats like [CSV], values are
-//! iterated along columns instead of rows. Parquet is similar in spirit to [Arrow], with Parquet
-//! focusing on storage efficiency whereas Arrow prioritizes compute efficiency.
+//! iterated along columns instead of rows. Parquet is similar in spirit to [Arrow], but
+//! focuses on storage efficiency whereas Arrow prioritizes compute efficiency.
 //!
 //! Parquet files are partitioned for scalability. Each file contains metadata,
 //! along with zero or more "row groups", each row group containing one or
 //! more columns. The APIs in this crate reflect this structure.
 //!
-//! Parquet distinguishes between "logical" and "physical" data types.
-//! For instance, strings (logical type) are stored as byte arrays (physical type).
-//! Likewise, temporal types like dates, times, timestamps, etc. (logical type)
-//! are stored as integers (physical type). This crate exposes both kinds of types.
+//! Data in Parquet files is strongly typed and differentiates between logical
+//! and physical types (see [`schema`]). In addition, Parquet files may contain
+//! other metadata, such as statistics, which can be used to optimize reading
+//! (see [`file::metadata`]).
+//! For more details about the Parquet format itself, see the [Parquet spec].
 //!
-//! For more details about the Parquet format, see the
-//! [Parquet spec](https://github.com/apache/parquet-format/blob/master/README.md#file-format).
+//! [Parquet spec]: https://github.com/apache/parquet-format/blob/master/README.md#file-format
 //!
 //! # APIs
 //!
 //! This crate exposes a number of APIs for different use-cases.
 //!
+//! ## Metadata and Schema
+//!
+//! The [`schema`] module provides APIs to work with Parquet schemas. The
+//! [`file::metadata`] module provides APIs to work with Parquet metadata.
+//!
 //! ## Read/Write Arrow
 //!
 //! The [`arrow`] module allows reading and writing Parquet data to/from Arrow `RecordBatch`.
@@ -64,7 +69,7 @@
 //!
 //! ## Read/Write Parquet
 //!
-//! Workloads needing finer-grained control, or looking to not take a dependency on arrow,
+//! Workloads needing finer-grained control, or wishing to avoid a dependency on arrow,
 //! can use the lower-level APIs in [`mod@file`]. These APIs expose the underlying parquet
 //! data model, and therefore require knowledge of the underlying parquet format,
 //! including the details of [Dremel] record shredding and [Logical Types]. Most workloads
diff --git a/parquet/src/schema/mod.rs b/parquet/src/schema/mod.rs
index ead7f1d2c0f8..415802c990ed 100644
--- a/parquet/src/schema/mod.rs
+++ b/parquet/src/schema/mod.rs
@@ -17,6 +17,20 @@
 //! Parquet schema definitions and methods to print and parse schema.
 //!
+//! * [`SchemaDescriptor`]: Describes the data types of the columns stored in a file
+//! * [`ColumnDescriptor`]: Describes the schema of a single (leaf) column.
+//! * [`ColumnPath`]: Represents the location of a column in the schema (e.g. a nested field)
+//!
+//! Parquet distinguishes
+//! between "logical" and "physical" data types.
+//! For instance, strings (logical type) are stored as byte arrays (physical type).
+//! Likewise, temporal types like dates, times, timestamps, etc. (logical type)
+//! are stored as integers (physical type).
+//!
+//! [`SchemaDescriptor`]: types::SchemaDescriptor
+//! [`ColumnDescriptor`]: types::ColumnDescriptor
+//! [`ColumnPath`]: types::ColumnPath
+//!
 //! # Example
 //!
 //! ```rust
diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs
index dbf6e8dcb3bd..a0cbf506f7c7 100644
--- a/parquet/src/schema/types.rs
+++ b/parquet/src/schema/types.rs
@@ -37,8 +37,10 @@ pub type SchemaDescPtr = Arc<SchemaDescriptor>;
 pub type ColumnDescPtr = Arc<ColumnDescriptor>;
 
 /// Representation of a Parquet type.
+///
 /// Used to describe primitive leaf fields and structs, including top-level schema.
-/// Note that the top-level schema type is represented using `GroupType` whose
+///
+/// Note that the top-level schema is represented using [`Type::GroupType`] whose
 /// repetition is `None`.
 #[derive(Clone, Debug, PartialEq)]
 pub enum Type {
@@ -662,7 +664,7 @@ impl BasicTypeInfo {
 // ----------------------------------------------------------------------
 // Parquet descriptor definitions
 
-/// Represents a path in a nested schema
+/// Represents the location of a column in a Parquet schema
 #[derive(Clone, PartialEq, Debug, Eq, Hash)]
 pub struct ColumnPath {
     parts: Vec<String>,
@@ -737,21 +739,22 @@ impl AsRef<[String]> for ColumnPath {
     }
 }
 
-/// A descriptor for leaf-level primitive columns.
-/// This encapsulates information such as definition and repetition levels and is used to
+/// Physical type for leaf-level primitive columns.
+///
+/// Also includes the maximum definition and repetition levels required to
 /// re-assemble nested data.
 #[derive(Debug, PartialEq)]
 pub struct ColumnDescriptor {
-    // The "leaf" primitive type of this column
+    /// The "leaf" primitive type of this column
     primitive_type: TypePtr,
 
-    // The maximum definition level for this column
+    /// The maximum definition level for this column
     max_def_level: i16,
 
-    // The maximum repetition level for this column
+    /// The maximum repetition level for this column
     max_rep_level: i16,
 
-    // The path of this column. For instance, "a.b.c.d".
+    /// The path of this column. For instance, "a.b.c.d".
     path: ColumnPath,
 }
 
@@ -860,24 +863,33 @@ impl ColumnDescriptor {
     }
 }
 
-/// A schema descriptor. This encapsulates the top-level schemas for all the columns,
-/// as well as all descriptors for all the primitive columns.
+/// Schema of a Parquet file.
+///
+/// Encapsulates the file's schema ([`Type`]) and [`ColumnDescriptor`]s for
+/// each primitive (leaf) column.
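+///
+/// For example, a minimal sketch (the `schema_descr` reference is assumed to
+/// come from file metadata) of listing the leaf columns a schema describes:
+///
+/// ```no_run
+/// # use parquet::schema::types::SchemaDescriptor;
+/// # fn print_columns(schema_descr: &SchemaDescriptor) {
+/// for (i, column) in schema_descr.columns().iter().enumerate() {
+///     // Each leaf column has a dotted path and a physical type
+///     println!("column {}: {:?} {:?}", i, column.path(), column.physical_type());
+/// }
+/// # }
+/// ```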
 #[derive(PartialEq)]
 pub struct SchemaDescriptor {
-    // The top-level schema (the "message" type).
-    // This must be a `GroupType` where each field is a root column type in the schema.
+    /// The top-level logical schema (the "message" type).
+    ///
+    /// This must be a [`Type::GroupType`] where each field is a root
+    /// column type in the schema.
     schema: TypePtr,
 
-    // All the descriptors for primitive columns in this schema, constructed from
-    // `schema` in DFS order.
+    /// The descriptors for the physical type of each leaf column in this schema
+    ///
+    /// Constructed from `schema` in DFS order.
     leaves: Vec<ColumnDescPtr>,
 
-    // Mapping from a leaf column's index to the root column index that it
-    // comes from. For instance: the leaf `a.b.c.d` would have a link back to `a`:
-    // -- a <-----+
-    // -- -- b    |
-    // -- -- -- c |
-    // -- -- -- -- d
+    /// Mapping from a leaf column's index to the root column index that it
+    /// comes from.
+    ///
+    /// For instance: the leaf `a.b.c.d` would have a link back to `a`:
+    /// ```text
+    /// -- a <-----+
+    /// -- -- b    |
+    /// -- -- -- c |
+    /// -- -- -- -- d
+    /// ```
     leaf_to_base: Vec<usize>,
 }
 

From a895f7c008690fe85a25f4c2709b5b235481f141 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Mon, 20 May 2024 07:53:47 -0400
Subject: [PATCH 2/4] Update regen.sh and thrift.rs

---
 parquet/regen.sh      | 6 +++++-
 parquet/src/format.rs | 5 ++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/parquet/regen.sh b/parquet/regen.sh
index f2d8158765c4..d1b82108a018 100755
--- a/parquet/regen.sh
+++ b/parquet/regen.sh
@@ -21,7 +21,10 @@ REVISION=46cc3a0647d301bb9579ca8dd2cc356caf2a72d2
 
 SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)"
 
-docker run -v $SOURCE_DIR:/thrift -it archlinux /bin/bash -c "\
+COMMENT='//! See [`crate::file`] for easier to use APIs.'
+
+# Note: add argument --platform=linux/amd64 to run on mac
+docker run -v $SOURCE_DIR:/thrift -it archlinux /bin/bash -c "\
     pacman -Sy --noconfirm wget thrift && \
     wget https://raw.githubusercontent.com/apache/parquet-format/$REVISION/src/main/thrift/parquet.thrift -O /tmp/parquet.thrift && \
     thrift --gen rs /tmp/parquet.thrift && \
@@ -35,5 +38,6 @@ docker run -v $SOURCE_DIR:/thrift -it archlinux /bin/bash -c "\
     sed -i 's/fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol)/fn read_from_in_protocol(i_prot: \&mut T)/g' parquet.rs && \
     echo 'Rewriting return value expectations' && \
     sed -i 's/Ok(ret.expect(\"return value should have been constructed\"))/ret.ok_or_else(|| thrift::Error::Protocol(ProtocolError::new(ProtocolErrorKind::InvalidData, \"return value should have been constructed\")))/g' parquet.rs && \
+    sed -i '1i${COMMENT}' parquet.rs && \
     mv parquet.rs /thrift/src/format.rs
 "

diff --git a/parquet/src/format.rs b/parquet/src/format.rs
index 5e091158ab46..7c05e1a13116 100644
--- a/parquet/src/format.rs
+++ b/parquet/src/format.rs
@@ -1,13 +1,12 @@
 //! See [`crate::file`] for easier to use APIs.
-// Autogenerated by Thrift Compiler (0.19.0)
+// Autogenerated by Thrift Compiler (0.20.0)
 // DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
 
 #![allow(dead_code)]
 #![allow(unused_imports)]
 #![allow(unused_extern_crates)]
 #![allow(clippy::too_many_arguments, clippy::type_complexity, clippy::vec_box, clippy::wrong_self_convention)]
-// Fix unexpected `cfg` condition name: `rustfmt` https://github.com/apache/arrow-rs/issues/5725
-//#![cfg_attr(rustfmt, rustfmt_skip)]
+#![cfg_attr(rustfmt, rustfmt_skip)]
 
 use std::cell::RefCell;
 use std::collections::{BTreeMap, BTreeSet};

From 7ac688efdb35e27116f76ecc77798006e2048a30 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Mon, 20 May 2024 07:55:29 -0400
Subject: [PATCH 3/4] Clarify page index encompasses offset index and column index

---
 parquet/src/file/metadata.rs | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs
index 58096d58a138..853d5ffec8b0 100644
--- a/parquet/src/file/metadata.rs
+++ b/parquet/src/file/metadata.rs
@@ -76,8 +76,7 @@ pub type ParquetOffsetIndex = Vec<Vec<Vec<PageLocation>>>;
 ///
 /// * File level metadata: [`FileMetaData`]
 /// * Row Group level metadata: [`RowGroupMetaData`]
-/// * (Optional) "Page Index": [`ParquetColumnIndex`]
-/// * (Optional) Offset index: [`ParquetOffsetIndex`]
+/// * (Optional) "Page Index" structures: [`ParquetColumnIndex`] and [`ParquetOffsetIndex`]
 ///
 /// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
 ///
@@ -91,9 +90,9 @@ pub struct ParquetMetaData {
     file_metadata: FileMetaData,
     /// Row group metadata
     row_groups: Vec<RowGroupMetaData>,
-    /// Page index for all pages in each column chunk
+    /// Page level index for each page in each column chunk
     column_index: Option<ParquetColumnIndex>,
-    /// Offset index for all pages in each column chunk
+    /// Offset index for each page in each column chunk
     offset_index: Option<ParquetOffsetIndex>,
 }
 

From 59b0cc58b756cd1213578b98aa28871fc51b0c17 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Mon, 20 May 2024 07:58:03 -0400
Subject: [PATCH 4/4] revert unexpected diff

---
 parquet/src/format.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/parquet/src/format.rs b/parquet/src/format.rs
index 7c05e1a13116..b210d6ec1b7e 100644
--- a/parquet/src/format.rs
+++ b/parquet/src/format.rs
@@ -6,7 +6,8 @@
 #![allow(unused_imports)]
 #![allow(unused_extern_crates)]
 #![allow(clippy::too_many_arguments, clippy::type_complexity, clippy::vec_box, clippy::wrong_self_convention)]
-#![cfg_attr(rustfmt, rustfmt_skip)]
+// Fix unexpected `cfg` condition name: `rustfmt` https://github.com/apache/arrow-rs/issues/5725
+//#![cfg_attr(rustfmt, rustfmt_skip)]
 
 use std::cell::RefCell;
 use std::collections::{BTreeMap, BTreeSet};