Refine parquet documentation on types and metadata #5786

Merged · 5 commits · May 20, 2024
Changes from all commits
6 changes: 5 additions & 1 deletion parquet/regen.sh
@@ -21,7 +21,10 @@ REVISION=46cc3a0647d301bb9579ca8dd2cc356caf2a72d2

SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)"

docker run -v $SOURCE_DIR:/thrift -it archlinux /bin/bash -c "\
COMMENT='//! See [`crate::file`] for easier to use APIs.'

# Note: add argument --platform=linux/amd64 to run on mac
docker run -v $SOURCE_DIR:/thrift -it archlinux /bin/bash -c "\
pacman -Sy --noconfirm wget thrift && \
wget https://raw.githubusercontent.com/apache/parquet-format/$REVISION/src/main/thrift/parquet.thrift -O /tmp/parquet.thrift && \
thrift --gen rs /tmp/parquet.thrift && \
@@ -35,5 +38,6 @@ docker run -v $SOURCE_DIR:/thrift -it archlinux /bin/bash -c "\
sed -i 's/fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol)/fn read_from_in_protocol<T: TInputProtocol>(i_prot: \&mut T)/g' parquet.rs && \
echo 'Rewriting return value expectations' && \
sed -i 's/Ok(ret.expect(\"return value should have been constructed\"))/ret.ok_or_else(|| thrift::Error::Protocol(ProtocolError::new(ProtocolErrorKind::InvalidData, \"return value should have been constructed\")))/g' parquet.rs && \
sed -i '1i${COMMENT}' parquet.rs && \
mv parquet.rs /thrift/src/format.rs
"
64 changes: 43 additions & 21 deletions parquet/src/file/metadata.rs
@@ -15,23 +15,20 @@
// specific language governing permissions and limitations
// under the License.

//! Contains information about available Parquet metadata.
//! Parquet metadata structures
//!
//! The hierarchy of metadata is as follows:
//! * [`ParquetMetaData`]: Top level metadata container, read from the Parquet
//! file footer.
//!
//! [`ParquetMetaData`](struct.ParquetMetaData.html) contains
//! [`FileMetaData`](struct.FileMetaData.html) and zero or more
//! [`RowGroupMetaData`](struct.RowGroupMetaData.html) for each row group.
//! * [`FileMetaData`]: File level metadata such as schema, row counts and
//! version.
//!
//! [`FileMetaData`](struct.FileMetaData.html) includes file version, application specific
//! metadata.
//! * [`RowGroupMetaData`]: Metadata for each Row Group within a File, such as
//!   its location, number of rows, and column chunks.
//!
//! Each [`RowGroupMetaData`](struct.RowGroupMetaData.html) contains information about row
//! group and one or more [`ColumnChunkMetaData`](struct.ColumnChunkMetaData.html) for
//! each column chunk.
//!
//! [`ColumnChunkMetaData`](struct.ColumnChunkMetaData.html) has information about column
//! chunk (primitive leaf column), including encoding/compression, number of values, etc.
//! * [`ColumnChunkMetaData`]: Metadata for each column chunk (primitive leaf)
//! within a Row Group including encoding and compression information,
//! number of values, statistics, etc.

use std::ops::Range;
use std::sync::Arc;
@@ -61,7 +58,7 @@ use crate::schema::types::{
/// column in the third row group of the parquet file.
pub type ParquetColumnIndex = Vec<Vec<Index>>;

/// [`PageLocation`] for each datapage of each row group of each column.
/// [`PageLocation`] for each data page of each row group of each column.
///
/// `offset_index[row_group_number][column_number][page_number]` holds
/// the [`PageLocation`] corresponding to page `page_number` of column
@@ -72,14 +69,30 @@ pub type ParquetColumnIndex = Vec<Vec<Index>>;
/// parquet file.
pub type ParquetOffsetIndex = Vec<Vec<Vec<PageLocation>>>;

/// Global Parquet metadata.
/// Global Parquet metadata, including [`FileMetaData`] and [`RowGroupMetaData`].
///
/// This structure is stored in the footer of Parquet files, in the format
/// defined by [`parquet.thrift`]. It contains:
///
/// * File level metadata: [`FileMetaData`]
/// * Row Group level metadata: [`RowGroupMetaData`]
/// * (Optional) "Page Index" structures: [`ParquetColumnIndex`] and [`ParquetOffsetIndex`]
///
/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
///
/// This structure is read by the various readers in this crate or can be read
/// directly from a file using the [`parse_metadata`] function.
///
/// [`parse_metadata`]: crate::file::footer::parse_metadata
#[derive(Debug, Clone)]
pub struct ParquetMetaData {
/// File level metadata
file_metadata: FileMetaData,
/// Row group metadata
row_groups: Vec<RowGroupMetaData>,
/// Page index for all pages in each column chunk
/// Page level index for each page in each column chunk
column_index: Option<ParquetColumnIndex>,
/// Offset index for all pages in each column chunk
/// Offset index for each page in each column chunk
offset_index: Option<ParquetOffsetIndex>,
}
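For reference (not part of the diff): a minimal sketch of loading `ParquetMetaData` through `SerializedFileReader`, as the new doc text describes. The path `data.parquet` is illustrative; `metadata()` comes from the `FileReader` trait.

```rust
// Sketch: read ParquetMetaData from a file footer.
// Assumes an existing file at the illustrative path "data.parquet".
use std::fs::File;
use parquet::file::reader::{FileReader, SerializedFileReader};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let reader = SerializedFileReader::new(File::open("data.parquet")?)?;
    let metadata = reader.metadata(); // &ParquetMetaData, parsed from the footer
    println!("row groups: {}", metadata.num_row_groups());
    println!("total rows: {}", metadata.file_metadata().num_rows());
    Ok(())
}
```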

@@ -172,7 +185,9 @@ pub type KeyValue = crate::format::KeyValue;
/// Reference counted pointer for [`FileMetaData`].
pub type FileMetaDataPtr = Arc<FileMetaData>;

/// Metadata for a Parquet file.
/// File level metadata for a Parquet file.
///
/// Includes the version of the file, application specific metadata, number of rows, the schema, and column orders.
#[derive(Debug, Clone)]
pub struct FileMetaData {
version: i32,
@@ -271,16 +286,20 @@ impl FileMetaData {
/// Reference counted pointer for [`RowGroupMetaData`].
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;

/// Metadata for a row group.
/// Metadata for a row group
///
/// Includes [`ColumnChunkMetaData`] for each column in the row group, the number of rows,
/// the total byte size of the row group, and the [`SchemaDescriptor`] for the row group.
#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
columns: Vec<ColumnChunkMetaData>,
num_rows: i64,
sorting_columns: Option<Vec<SortingColumn>>,
total_byte_size: i64,
schema_descr: SchemaDescPtr,
// We can't infer from file offset of first column since there may empty columns in row group.
/// We can't infer this from the file offset of the first column since there may be empty columns in the row group.
file_offset: Option<i64>,
/// Ordinal position of this row group in file
ordinal: Option<i16>,
}

@@ -335,7 +354,10 @@ impl RowGroupMetaData {
self.schema_descr.clone()
}

/// Returns ordinal of this row group in file
/// Returns the ordinal position of this row group in the file.
///
/// For example, if this is the first row group in the file, this will return 0.
/// If this is the second row group in the file, this will return 1.
#[inline(always)]
pub fn ordinal(&self) -> Option<i16> {
self.ordinal
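For reference (not part of the diff): a sketch of walking `RowGroupMetaData` and its `ColumnChunkMetaData`, including the `ordinal` accessor documented above. The path is illustrative.

```rust
// Sketch: iterate row group and column chunk metadata.
use std::fs::File;
use parquet::file::reader::{FileReader, SerializedFileReader};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let reader = SerializedFileReader::new(File::open("data.parquet")?)?;
    for rg in reader.metadata().row_groups() {
        // `ordinal` is the row group's position in the file, when recorded
        println!(
            "ordinal {:?}: {} rows, {} bytes",
            rg.ordinal(),
            rg.num_rows(),
            rg.total_byte_size()
        );
        for col in rg.columns() {
            println!("  {}: {} values", col.column_path(), col.num_values());
        }
    }
    Ok(())
}
```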
11 changes: 7 additions & 4 deletions parquet/src/file/mod.rs
@@ -19,10 +19,13 @@
//!
//! Provides access to file and row group readers and writers, record API, metadata, etc.
//!
//! See [`serialized_reader::SerializedFileReader`](serialized_reader/struct.SerializedFileReader.html) or
//! [`writer::SerializedFileWriter`](writer/struct.SerializedFileWriter.html) for a
//! starting reference, [`metadata::ParquetMetaData`](metadata/index.html) for file
//! metadata, and [`statistics`](statistics/index.html) for working with statistics.
//! # See Also:
//! * [`SerializedFileReader`] and [`SerializedFileWriter`] for reading / writing parquet
//! * [`metadata`]: for working with metadata such as schema
//! * [`statistics`]: for working with statistics in metadata
//!
//! [`SerializedFileReader`]: serialized_reader::SerializedFileReader
//! [`SerializedFileWriter`]: writer::SerializedFileWriter
//!
//! # Example of writing a new file
//!
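The module's own "Example of writing a new file" is elided above. For reference, a minimal sketch of `SerializedFileWriter` (schema string, values, and output path are illustrative):

```rust
// Sketch: write a single-column parquet file.
use std::{fs::File, sync::Arc};
use parquet::{
    data_type::Int32Type,
    file::{properties::WriterProperties, writer::SerializedFileWriter},
    schema::parser::parse_message_type,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let schema = Arc::new(parse_message_type("message schema { REQUIRED INT32 id; }")?);
    let props = Arc::new(WriterProperties::builder().build());
    let mut writer = SerializedFileWriter::new(File::create("out.parquet")?, schema, props)?;
    let mut rg = writer.next_row_group()?;
    while let Some(mut col) = rg.next_column()? {
        // one required leaf column (`id`); no definition/repetition levels needed
        col.typed::<Int32Type>().write_batch(&[1, 2, 3], None, None)?;
        col.close()?;
    }
    rg.close()?;
    writer.close()?;
    Ok(())
}
```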
9 changes: 5 additions & 4 deletions parquet/src/file/page_index/index.rs
@@ -57,10 +57,11 @@ impl<T> PageIndex<T> {

#[derive(Debug, Clone, PartialEq)]
#[allow(non_camel_case_types)]
/// Typed statistics for a data page in a column chunk. This structure
/// is obtained from decoding the [ColumnIndex] in the parquet file
/// and can be used to skip decoding pages while reading the file
/// data.
/// Typed statistics for a data page in a column chunk.
///
/// This structure is part of the optional "Page Index" ([ColumnIndex]) in the
/// parquet file, and can be used to skip decoding pages while reading the
/// file data.
pub enum Index {
/// Sometimes reading the page index from a parquet file
/// will only return page locations without the min/max index,
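For reference (not part of the diff): reading the page index is opt-in. A sketch using `ReadOptionsBuilder::with_page_index` to populate the `Index` structures described above; the path is illustrative.

```rust
// Sketch: load the optional "Page Index" alongside the metadata.
use std::fs::File;
use parquet::file::{
    reader::FileReader,
    serialized_reader::{ReadOptionsBuilder, SerializedFileReader},
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let options = ReadOptionsBuilder::new().with_page_index().build();
    let reader =
        SerializedFileReader::new_with_options(File::open("data.parquet")?, options)?;
    // column_index[row_group][column] holds an `Index` of per-page statistics
    if let Some(column_index) = reader.metadata().column_index() {
        println!("row groups with a column index: {}", column_index.len());
    }
    Ok(())
}
```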
1 change: 1 addition & 0 deletions parquet/src/file/statistics.rs
@@ -20,6 +20,7 @@
//! Though some common methods are available on the enum, use pattern matching to extract
//! the actual min and max values from the statistics, as shown below:
//!
//! # Examples
//! ```rust
//! use parquet::file::statistics::Statistics;
//!
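The module's example code is elided above; for reference, a short sketch of the pattern-matching approach the doc text describes (accessor names per this crate's `statistics` module):

```rust
// Sketch: extract typed min/max from Statistics via pattern matching.
use parquet::file::statistics::Statistics;

fn print_int32_min_max(stats: &Statistics) {
    // min/max are only present if the writer recorded them
    if !stats.has_min_max_set() {
        return;
    }
    if let Statistics::Int32(typed) = stats {
        println!("min: {}, max: {}", typed.min(), typed.max());
    }
}
```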
3 changes: 2 additions & 1 deletion parquet/src/format.rs

Some generated files are not rendered by default.

23 changes: 14 additions & 9 deletions parquet/src/lib.rs
@@ -28,25 +28,30 @@
//! # Format Overview
//!
//! Parquet is a columnar format, which means that unlike row formats like [CSV], values are
//! iterated along columns instead of rows. Parquet is similar in spirit to [Arrow], with Parquet
//! focusing on storage efficiency whereas Arrow prioritizes compute efficiency.
//! iterated along columns instead of rows. Parquet is similar in spirit to [Arrow], but
//! focuses on storage efficiency whereas Arrow prioritizes compute efficiency.
//!
//! Parquet files are partitioned for scalability. Each file contains metadata,
//! along with zero or more "row groups", each row group containing one or
//! more columns. The APIs in this crate reflect this structure.
//!
//! Parquet distinguishes between "logical" and "physical" data types.
//! For instance, strings (logical type) are stored as byte arrays (physical type).
//! Likewise, temporal types like dates, times, timestamps, etc. (logical type)
//! are stored as integers (physical type). This crate exposes both kinds of types.
//! Data in Parquet files is strongly typed and differentiates between logical
//! and physical types (see [`schema`]). In addition, Parquet files may contain
//! other metadata, such as statistics, which can be used to optimize reading
//! (see [`file::metadata`]).
//! For more details about the Parquet format itself, see the [Parquet spec].
//!
//! For more details about the Parquet format, see the
//! [Parquet spec](https://github.com/apache/parquet-format/blob/master/README.md#file-format).
//! [Parquet spec]: https://github.com/apache/parquet-format/blob/master/README.md#file-format
//!
//! # APIs
//!
//! This crate exposes a number of APIs for different use-cases.
//!
//! ## Metadata and Schema
//!
//! The [`schema`] module provides APIs to work with Parquet schemas. The
//! [`file::metadata`] module provides APIs to work with Parquet metadata.
//!
//! ## Read/Write Arrow
//!
//! The [`arrow`] module allows reading and writing Parquet data to/from Arrow `RecordBatch`.
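For reference (not part of the diff): a minimal sketch of the `arrow` reader path mentioned here. It assumes the crate's `arrow` feature is enabled and an illustrative input path.

```rust
// Sketch: read Arrow RecordBatches from a parquet file.
use std::fs::File;
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let builder = ParquetRecordBatchReaderBuilder::try_new(File::open("data.parquet")?)?;
    println!("arrow schema: {:?}", builder.schema());
    for batch in builder.build()? {
        println!("read {} rows", batch?.num_rows());
    }
    Ok(())
}
```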
@@ -64,7 +69,7 @@
//!
//! ## Read/Write Parquet
//!
//! Workloads needing finer-grained control, or looking to not take a dependency on arrow,
//! Workloads needing finer-grained control, or wanting to avoid a dependency on arrow,
//! can use the lower-level APIs in [`mod@file`]. These APIs expose the underlying parquet
//! data model, and therefore require knowledge of the underlying parquet format,
//! including the details of [Dremel] record shredding and [Logical Types]. Most workloads
14 changes: 14 additions & 0 deletions parquet/src/schema/mod.rs
@@ -17,6 +17,20 @@

//! Parquet schema definitions and methods to print and parse schema.
//!
//! * [`SchemaDescriptor`]: Describes the data types of the columns stored in a file
//! * [`ColumnDescriptor`]: Describes the schema of a single (leaf) column.
//! * [`ColumnPath`]: Represents the location of a column in the schema (e.g. a nested field)
//!
//! Parquet distinguishes between "logical" and "physical" data types.
//! For instance, strings (logical type) are stored as byte arrays (physical type).
//! Likewise, temporal types like dates, times, timestamps, etc. (logical type)
//! are stored as integers (physical type).
//!
//! [`SchemaDescriptor`]: types::SchemaDescriptor
//! [`ColumnDescriptor`]: types::ColumnDescriptor
//! [`ColumnPath`]: types::ColumnPath
//!
//! # Example
//!
//! ```rust
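The module's own `# Example` block is elided above. For reference, a sketch of the logical/physical distinction on parsed schema leaves (the message type string is illustrative):

```rust
// Sketch: logical vs. physical types of each leaf column.
use std::sync::Arc;
use parquet::schema::{parser::parse_message_type, types::SchemaDescriptor};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let descr = SchemaDescriptor::new(Arc::new(parse_message_type(
        "message m { REQUIRED BYTE_ARRAY name (UTF8); REQUIRED INT32 date (DATE); }",
    )?));
    for leaf in descr.columns() {
        // e.g. the logical string `name` is stored as the physical BYTE_ARRAY
        println!(
            "{}: physical {:?}, logical {:?}",
            leaf.path(),
            leaf.physical_type(),
            leaf.logical_type()
        );
    }
    Ok(())
}
```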
52 changes: 32 additions & 20 deletions parquet/src/schema/types.rs
@@ -37,8 +37,10 @@ pub type SchemaDescPtr = Arc<SchemaDescriptor>;
pub type ColumnDescPtr = Arc<ColumnDescriptor>;

/// Representation of a Parquet type.
///
/// Used to describe primitive leaf fields and structs, including top-level schema.
/// Note that the top-level schema type is represented using `GroupType` whose
///
/// Note that the top-level schema is represented using [`Type::GroupType`] whose
/// repetition is `None`.
#[derive(Clone, Debug, PartialEq)]
pub enum Type {
@@ -662,7 +664,7 @@ impl BasicTypeInfo {
// ----------------------------------------------------------------------
// Parquet descriptor definitions

/// Represents a path in a nested schema
/// Represents the location of a column in a Parquet schema
#[derive(Clone, PartialEq, Debug, Eq, Hash)]
pub struct ColumnPath {
parts: Vec<String>,
@@ -737,21 +739,22 @@ impl AsRef<[String]> for ColumnPath {
}
}

/// A descriptor for leaf-level primitive columns.
/// This encapsulates information such as definition and repetition levels and is used to
/// Physical type for leaf-level primitive columns.
///
/// Also includes the maximum definition and repetition levels required to
/// re-assemble nested data.
#[derive(Debug, PartialEq)]
pub struct ColumnDescriptor {
// The "leaf" primitive type of this column
/// The "leaf" primitive type of this column
primitive_type: TypePtr,

// The maximum definition level for this column
/// The maximum definition level for this column
max_def_level: i16,

// The maximum repetition level for this column
/// The maximum repetition level for this column
max_rep_level: i16,

// The path of this column. For instance, "a.b.c.d".
/// The path of this column. For instance, "a.b.c.d".
path: ColumnPath,
}
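For reference (not part of the diff): a worked sketch of the maximum definition/repetition levels the doc text above refers to, for a nested leaf (illustrative schema; the levels follow the Dremel shredding rules):

```rust
// Sketch: max def/rep levels for the nested leaf `a.b`.
use std::sync::Arc;
use parquet::schema::{parser::parse_message_type, types::SchemaDescriptor};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let descr = SchemaDescriptor::new(Arc::new(parse_message_type(
        "message m { OPTIONAL group a { REPEATED INT32 b; } }",
    )?));
    let leaf = descr.column(0); // the leaf column `a.b`
    assert_eq!(leaf.max_def_level(), 2); // OPTIONAL `a` + REPEATED `b`
    assert_eq!(leaf.max_rep_level(), 1); // REPEATED `b`
    Ok(())
}
```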

@@ -860,24 +863,33 @@ impl ColumnDescriptor {
}
}

/// A schema descriptor. This encapsulates the top-level schemas for all the columns,
/// as well as all descriptors for all the primitive columns.
/// Schema of a Parquet file.
///
/// Encapsulates the file's schema ([`Type`]) and [`ColumnDescriptor`]s for
/// each primitive (leaf) column.
#[derive(PartialEq)]
pub struct SchemaDescriptor {
// The top-level schema (the "message" type).
// This must be a `GroupType` where each field is a root column type in the schema.
/// The top-level logical schema (the "message" type).
///
/// This must be a [`Type::GroupType`] where each field is a root
/// column type in the schema.
schema: TypePtr,

// All the descriptors for primitive columns in this schema, constructed from
// `schema` in DFS order.
/// The descriptors for the physical type of each leaf column in this schema.
///
/// Constructed from `schema` in DFS order.
leaves: Vec<ColumnDescPtr>,

// Mapping from a leaf column's index to the root column index that it
// comes from. For instance: the leaf `a.b.c.d` would have a link back to `a`:
// -- a <-----+
// -- -- b |
// -- -- -- c |
// -- -- -- -- d
/// Mapping from a leaf column's index to the root column index that it
/// comes from.
///
/// For instance: the leaf `a.b.c.d` would have a link back to `a`:
/// ```text
/// -- a <-----+
/// -- -- b |
/// -- -- -- c |
/// -- -- -- -- d
/// ```
leaf_to_base: Vec<usize>,
}
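For reference (not part of the diff): a sketch of the leaf-to-root mapping described above, showing that several leaves can share one root column (illustrative nested schema):

```rust
// Sketch: leaf columns vs. root fields in a SchemaDescriptor.
use std::sync::Arc;
use parquet::schema::{parser::parse_message_type, types::SchemaDescriptor};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let descr = SchemaDescriptor::new(Arc::new(parse_message_type(
        "message m { REQUIRED group a { REQUIRED INT64 b; REQUIRED INT64 c; } }",
    )?));
    // two leaves (`a.b`, `a.c`) map back to the single root column `a`
    assert_eq!(descr.num_columns(), 2);
    assert_eq!(descr.root_schema().get_fields().len(), 1);
    Ok(())
}
```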
