{-# LANGUAGE OverloadedStrings #-} module Hydra.Sources.Tier4.Langs.Parquet.Format where import Hydra.Sources.Tier3.All import Hydra.Dsl.Annotations import Hydra.Dsl.Bootstrap import Hydra.Dsl.Types as Types -- Note: deprecated and trivial/empty type definitions are excluded from this model parquetFormatModule :: Module Kv parquetFormatModule = Module ns elements [] [] $ Just ("A model for the Parquet format. Based on the Thrift-based specification at:\n" ++ " https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift") where ns = Namespace "hydra/langs/parquet/format" def = datatype ns parquet = typeref ns elements = [ -- /** -- * Types supported by Parquet. These types are intended to be used in combination -- * with the encodings to control the on disk storage format. -- * For example INT16 is not included as a type since a good encoding of INT32 -- * would handle this. -- */ -- enum Type { def "Type" $ doc ("Types supported by Parquet. These types are intended to be used in combination " ++ "with the encodings to control the on disk storage format. " ++ "For example INT16 is not included as a type since a good encoding of INT32 " ++ "would handle this.") $ enum [ -- BOOLEAN = 0; "boolean", -- INT32 = 1; "int32", -- INT64 = 2; "int64", -- INT96 = 3; // deprecated, only used by legacy implementations. -- FLOAT = 4; "float", -- DOUBLE = 5; "double", -- BYTE_ARRAY = 6; "byteArray", -- FIXED_LEN_BYTE_ARRAY = 7; "fixedLenByteArray"], -- } -- /** -- * DEPRECATED: Common types used by frameworks(e.g. hive, pig) using parquet. -- * ConvertedType is superseded by LogicalType. This enum should not be extended. -- * -- * See LogicalTypes.md for conversion between ConvertedType and LogicalType. -- */ -- enum ConvertedType { -- /** a BYTE_ARRAY actually contains UTF8 encoded chars */ -- UTF8 = 0; -- -- /** a map is converted as an optional field containing a repeated key/value pair */ -- MAP = 1; -- -- /** a key/value pair is converted into a group of two fields */ -- MAP_KEY_VALUE = 2; -- -- /** a list is converted into an optional field containing a repeated field for its -- * values */ -- LIST = 3; -- -- /** an enum is converted into a binary field */ -- ENUM = 4; -- -- /** -- * A decimal value. -- * -- * This may be used to annotate binary or fixed primitive types. The -- * underlying byte array stores the unscaled value encoded as two's -- * complement using big-endian byte order (the most significant byte is the -- * zeroth element). The value of the decimal is the value * 10^{-scale}. -- * -- * This must be accompanied by a (maximum) precision and a scale in the -- * SchemaElement. The precision specifies the number of digits in the decimal -- * and the scale stores the location of the decimal point. For example 1.23 -- * would have precision 3 (3 total digits) and scale 2 (the decimal point is -- * 2 digits over). -- */ -- DECIMAL = 5; -- -- /** -- * A Date -- * -- * Stored as days since Unix epoch, encoded as the INT32 physical type. -- * -- */ -- DATE = 6; -- -- /** -- * A time -- * -- * The total number of milliseconds since midnight. The value is stored -- * as an INT32 physical type. -- */ -- TIME_MILLIS = 7; -- -- /** -- * A time. -- * -- * The total number of microseconds since midnight. The value is stored as -- * an INT64 physical type. -- */ -- TIME_MICROS = 8; -- -- /** -- * A date/time combination -- * -- * Date and time recorded as milliseconds since the Unix epoch. Recorded as -- * a physical type of INT64. 
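-- Note: as an informal illustration of the DECIMAL rules quoted above (not part of this
-- model, and not an API of this module), the unscaled two's-complement, big-endian byte
-- array could be decoded roughly as follows; 'decodeUnscaled' and 'decimalValue' are
-- hypothetical helpers, assuming Data.ByteString and Data.Bits:
--
--   import qualified Data.ByteString as BS
--   import Data.Bits (testBit, shiftL, (.|.))
--
--   decodeUnscaled :: BS.ByteString -> Integer
--   decodeUnscaled bs
--     | BS.null bs = 0
--     | testBit (BS.head bs) 7 = magnitude - (1 `shiftL` (8 * BS.length bs))  -- sign bit set: two's complement
--     | otherwise = magnitude
--     where magnitude = BS.foldl' (\acc b -> (acc `shiftL` 8) .|. fromIntegral b) 0 bs
--
--   decimalValue :: Int -> BS.ByteString -> Rational
--   decimalValue scale bs = fromInteger (decodeUnscaled bs) / (10 ^ scale)
--
-- e.g. decimalValue 2 (BS.pack [0x7B]) == 123 % 100, i.e. 1.23 with precision 3 and scale 2.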
-- */ -- TIMESTAMP_MILLIS = 9; -- -- /** -- * A date/time combination -- * -- * Date and time recorded as microseconds since the Unix epoch. The value is -- * stored as an INT64 physical type. -- */ -- TIMESTAMP_MICROS = 10; -- -- -- /** -- * An unsigned integer value. -- * -- * The number describes the maximum number of meaningful data bits in -- * the stored value. 8, 16 and 32 bit values are stored using the -- * INT32 physical type. 64 bit values are stored using the INT64 -- * physical type. -- * -- */ -- UINT_8 = 11; -- UINT_16 = 12; -- UINT_32 = 13; -- UINT_64 = 14; -- -- /** -- * A signed integer value. -- * -- * The number describes the maximum number of meaningful data bits in -- * the stored value. 8, 16 and 32 bit values are stored using the -- * INT32 physical type. 64 bit values are stored using the INT64 -- * physical type. -- * -- */ -- INT_8 = 15; -- INT_16 = 16; -- INT_32 = 17; -- INT_64 = 18; -- -- /** -- * An embedded JSON document -- * -- * A JSON document embedded within a single UTF8 column. -- */ -- JSON = 19; -- -- /** -- * An embedded BSON document -- * -- * A BSON document embedded within a single BINARY column. -- */ -- BSON = 20; -- -- /** -- * An interval of time -- * -- * This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12 -- * This data is composed of three separate little endian unsigned -- * integers. Each stores a component of a duration of time. The first -- * integer identifies the number of months associated with the duration, -- * the second identifies the number of days associated with the duration -- * and the third identifies the number of milliseconds associated with -- * the provided duration. This duration of time is independent of any -- * particular timezone or date. -- */ -- INTERVAL = 21; --} -- /** -- * Representation of Schemas -- */ -- enum FieldRepetitionType { def "FieldRepetitionType" $ doc "Representation of Schemas" $ union [ -- /** This field is required (can not be null) and each record has exactly 1 value. */ -- REQUIRED = 0; "required">: doc "This field is required (can not be null) and each record has exactly 1 value." unit, -- -- /** The field is optional (can be null) and each record has 0 or 1 values. */ -- OPTIONAL = 1; "optional">: doc "The field is optional (can be null) and each record has 0 or 1 values." unit, -- -- /** The field is repeated and can contain 0 or more values */ -- REPEATED = 2; "repeated">: doc "The field is repeated and can contain 0 or more values" unit], -- } -- /** -- * Statistics per row group and per page -- * All fields are optional. -- */ -- struct Statistics { def "Statistics" $ doc "Statistics per row group and per page. All fields are optional." $ record [ -- /** -- * DEPRECATED: min and max value of the column. Use min_value and max_value. -- * -- * Values are encoded using PLAIN encoding, except that variable-length byte -- * arrays do not include a length prefix. -- * -- * These fields encode min and max values determined by signed comparison -- * only. New files should use the correct order for a column's logical type -- * and store the values in the min_value and max_value fields. -- * -- * To support older readers, these may be set when the column order is -- * signed. 
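-- Note: an informal sketch of the INTERVAL layout quoted in the ConvertedType notes above
-- (not part of this model); 'leWord32' and 'decodeInterval' are hypothetical helpers,
-- assuming Data.ByteString:
--
--   import qualified Data.ByteString as BS
--   import Data.Word (Word32)
--   import Data.Bits (shiftL, (.|.))
--
--   -- a 32-bit little-endian word from four bytes
--   leWord32 :: BS.ByteString -> Word32
--   leWord32 = BS.foldr (\b acc -> (acc `shiftL` 8) .|. fromIntegral b) 0
--
--   -- (months, days, milliseconds) from the 12-byte FIXED_LEN_BYTE_ARRAY
--   decodeInterval :: BS.ByteString -> (Word32, Word32, Word32)
--   decodeInterval bs =
--     ( leWord32 (BS.take 4 bs)
--     , leWord32 (BS.take 4 (BS.drop 4 bs))
--     , leWord32 (BS.take 4 (BS.drop 8 bs)) )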
-- */ -- 1: optional binary max; -- 2: optional binary min; -- /** count of null values in the column */ -- 3: optional i64 null_count; "nullCount">: doc "count of null values in the column" $ optional uint64, -- /** count of distinct values occurring */ -- 4: optional i64 distinct_count; "distinctCount">: doc "count of distinct values occurring" $ optional uint64, -- /** -- * Min and max values for the column, determined by its ColumnOrder. -- * -- * Values are encoded using PLAIN encoding, except that variable-length byte -- * arrays do not include a length prefix. -- */ -- 5: optional binary max_value; "maxValue">: doc ("Max value for the column, determined by its ColumnOrder. " ++ "Values are encoded using PLAIN encoding, except that variable-length byte " ++ "arrays do not include a length prefix.") $ optional binary, -- 6: optional binary min_value; "minValue">: doc ("Min value for the column, determined by its ColumnOrder. " ++ "Values are encoded using PLAIN encoding, except that variable-length byte " ++ "arrays do not include a length prefix.") $ optional binary], -- } -- /** Empty structs to use as logical type annotations */ -- struct StringType {} // allowed for BINARY, must be encoded with UTF-8 -- struct UUIDType {} // allowed for FIXED[16], must be encoded as raw UUID bytes -- struct MapType {} // see LogicalTypes.md -- struct ListType {} // see LogicalTypes.md -- struct EnumType {} // allowed for BINARY, must be encoded with UTF-8 -- struct DateType {} // allowed for INT32 -- /** -- * Logical type to annotate a column that is always null. -- * -- * Sometimes when discovering the schema of existing data, values are always -- * null and the physical type can't be determined. This annotation signals -- * the case where the physical type was guessed from all null values. -- */ -- struct NullType {} // allowed for any physical type, only null values stored -- /** -- * Decimal logical type annotation -- * -- * To maintain forward-compatibility in v1, implementations using this logical -- * type must also set scale and precision on the annotated SchemaElement. -- * -- * Allowed for physical types: INT32, INT64, FIXED, and BINARY -- */ -- struct DecimalType { def "DecimalType" $ doc ("Decimal logical type annotation. " ++ "To maintain forward-compatibility in v1, implementations using this logical " ++ "type must also set scale and precision on the annotated SchemaElement. " ++ "Allowed for physical types: INT32, INT64, FIXED, and BINARY") $ record [ -- 1: required i32 scale "scale">: int32, -- 2: required i32 precision "precision">: int32], -- } -- /** Time units for logical types */ -- struct MilliSeconds {} -- struct MicroSeconds {} -- struct NanoSeconds {} -- union TimeUnit { def "TimeUnit" $ enum [ -- 1: MilliSeconds MILLIS "millis", -- 2: MicroSeconds MICROS "micros", -- 3: NanoSeconds NANOS "nanos"], -- } -- /** -- * Timestamp logical type annotation -- * -- * Allowed for physical types: INT64 -- */ -- struct TimestampType { def "TimestampType" $ doc ("Timestamp logical type annotation. " ++ "Allowed for physical types: INT64") $ record [ -- 1: required bool isAdjustedToUTC "isAdjustedToUtc">: boolean, -- 2: required TimeUnit unit "unit">: parquet "TimeUnit"], -- } -- /** -- * Time logical type annotation -- * -- * Allowed for physical types: INT32 (millis), INT64 (micros, nanos) -- */ -- struct TimeType { def "TimeType" $ doc ("Time logical type annotation. 
" ++ "Allowed for physical types: INT32 (millis), INT64 (micros, nanos)") $ record [ -- 1: required bool isAdjustedToUTC "isAdjustedToUtc">: boolean, -- 2: required TimeUnit unit "unit">: parquet "TimeUnit"], -- } -- /** -- * Integer logical type annotation -- * -- * bitWidth must be 8, 16, 32, or 64. -- * -- * Allowed for physical types: INT32, INT64 -- */ -- struct IntType { def "IntType" $ doc ("Integer logical type annotation. " ++ "bitWidth must be 8, 16, 32, or 64. " ++ "Allowed for physical types: INT32, INT64") $ record [ -- 1: required i8 bitWidth "bitWidth">: uint8, -- 2: required bool isSigned "isSigned">: boolean], -- } -- /** -- * Embedded JSON logical type annotation -- * -- * Allowed for physical types: BINARY -- */ -- struct JsonType { -- } -- -- /** -- * Embedded BSON logical type annotation -- * -- * Allowed for physical types: BINARY -- */ -- struct BsonType { -- } -- -- /** -- * LogicalType annotations to replace ConvertedType. -- * -- * To maintain compatibility, implementations using LogicalType for a -- * SchemaElement aust also set the corresponding ConvertedType (if any) -- * from the following table. -- */ -- union LogicalType { def "LogicalType" $ doc ("LogicalType annotations to replace ConvertedType. " ++ "To maintain compatibility, implementations using LogicalType for a " ++ "SchemaElement aust also set the corresponding ConvertedType (if any) " ++ "from the following table.") $ union [ -- 1: StringType STRING // use ConvertedType UTF8 "string">: doc "use ConvertedType UTF8" unit, -- 2: MapType MAP // use ConvertedType MAP "map">: doc "use ConvertedType MAP" unit, -- 3: ListType LIST // use ConvertedType LIST "list">: doc "use ConvertedType LIST" unit, -- 4: EnumType ENUM // use ConvertedType ENUM "enum">: doc "use ConvertedType ENUM" unit, -- 5: DecimalType DECIMAL // use ConvertedType DECIMAL + SchemaElement.{scale, precision} "decimal">: doc "use ConvertedType DECIMAL + SchemaElement.{scale, precision}" $ parquet "DecimalType", -- 6: DateType DATE // use ConvertedType DATE "date">: doc "use ConvertedType DATE" unit, -- -- // use ConvertedType TIME_MICROS for TIME(isAdjustedToUTC = *, unit = MICROS) -- // use ConvertedType TIME_MILLIS for TIME(isAdjustedToUTC = *, unit = MILLIS) -- 7: TimeType TIME "time">: doc ("use ConvertedType TIME_MICROS for TIME(isAdjustedToUTC = *, unit = MICROS). " ++ "use ConvertedType TIME_MILLIS for TIME(isAdjustedToUTC = *, unit = MILLIS)") $ parquet "TimeType", -- -- // use ConvertedType TIMESTAMP_MICROS for TIMESTAMP(isAdjustedToUTC = *, unit = MICROS) -- // use ConvertedType TIMESTAMP_MILLIS for TIMESTAMP(isAdjustedToUTC = *, unit = MILLIS) -- 8: TimestampType TIMESTAMP "timestamp">: doc ("use ConvertedType TIMESTAMP_MICROS for TIMESTAMP(isAdjustedToUTC = *, unit = MICROS). " ++ "use ConvertedType TIMESTAMP_MILLIS for TIMESTAMP(isAdjustedToUTC = *, unit = MILLIS)") $ parquet "TimestampType", -- -- // 9: reserved for INTERVAL -- 10: IntType INTEGER // use ConvertedType INT_* or UINT_* "integer">: doc "use ConvertedType INT_* or UINT_*" $ parquet "IntType", -- 11: NullType UNKNOWN // no compatible ConvertedType "unknown">: doc "no compatible ConvertedType" unit, -- 12: JsonType JSON // use ConvertedType JSON "json">: doc "use ConvertedType JSON" unit, -- 13: BsonType BSON // use ConvertedType BSON "bson">: doc "use ConvertedType BSON" unit, -- 14: UUIDType UUID // no compatible ConvertedType "uuid">: doc "no compatible ConvertedType" unit], -- } -- /** -- * Represents a element inside a schema definition. 
-- * - if it is a group (inner node) then type is undefined and num_children is defined -- * - if it is a primitive type (leaf) then type is defined and num_children is undefined -- * the nodes are listed in depth first traversal order. -- */ -- struct SchemaElement { def "SchemaElement" $ doc ("Represents an element inside a schema definition.\n" ++ "- if it is a group (inner node) then type is undefined and num_children is defined\n" ++ "- if it is a primitive type (leaf) then type is defined and num_children is undefined\n" ++ "the nodes are listed in depth first traversal order.") $ record [ -- /** Data type for this field. Not set if the current element is a non-leaf node */ -- 1: optional Type type; "type">: doc "Data type for this field. Not set if the current element is a non-leaf node" $ optional $ parquet "Type", -- -- /** If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values. -- * Otherwise, if specified, this is the maximum bit length to store any of the values. -- * (e.g. a low cardinality INT col could have this set to 3). Note that this is -- * in the schema, and therefore fixed for the entire file. -- */ -- 2: optional i32 type_length; "typeLength">: doc ("If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values. " ++ "Otherwise, if specified, this is the maximum bit length to store any of the values. " ++ "(e.g. a low cardinality INT col could have this set to 3). Note that this is " ++ "in the schema, and therefore fixed for the entire file.") $ optional int32, -- -- /** repetition of the field. The root of the schema does not have a repetition_type. -- * All other nodes must have one */ -- 3: optional FieldRepetitionType repetition_type; "repetitionType">: doc ("repetition of the field. The root of the schema does not have a repetition_type. " ++ "All other nodes must have one") $ optional $ parquet "FieldRepetitionType", -- -- /** Name of the field in the schema */ -- 4: required string name; "name">: doc "Name of the field in the schema" string, -- -- /** Nested fields. Since thrift does not support nested fields, -- * the nesting is flattened to a single list by a depth-first traversal. -- * The children count is used to construct the nested relationship. -- * This field is not set when the element is a primitive type -- */ -- 5: optional i32 num_children; "numChildren">: doc ("Nested fields. Since thrift does not support nested fields, " ++ "the nesting is flattened to a single list by a depth-first traversal. " ++ "The children count is used to construct the nested relationship. " ++ "This field is not set when the element is a primitive type") $ optional int32, -- -- /** -- * DEPRECATED: When the schema is the result of a conversion from another model. -- * Used to record the original type to help with cross conversion. -- * -- * This is superseded by logicalType. -- */ -- 6: optional ConvertedType converted_type; -- -- /** -- * DEPRECATED: Used when this column contains decimal data. -- * See the DECIMAL converted type for more details. -- * -- * This is superseded by using the DecimalType annotation in logicalType. 
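-- Note: an informal sketch (not part of this model) of how the depth-first flattening
-- described above can be undone; 'SchemaTree' and the 'numChildren' accessor are
-- hypothetical stand-ins for whatever types a generated binding would provide:
--
--   data SchemaTree = SchemaTree SchemaElement [SchemaTree]
--
--   -- consume one element and, recursively, its children; assumes a well-formed list
--   unflatten :: [SchemaElement] -> (SchemaTree, [SchemaElement])
--   unflatten (e:rest) = (SchemaTree e children, rest')
--     where
--       n = maybe 0 fromIntegral (numChildren e)
--       (children, rest') = consume n rest
--       consume 0 es = ([], es)
--       consume k es = let (c, es') = unflatten es
--                          (cs, es'') = consume (k - 1) es'
--                      in (c : cs, es'')
--
-- The root of the schema is then 'fst (unflatten schemaElements)'.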
-- */ -- 7: optional i32 scale -- 8: optional i32 precision -- -- /** When the original schema supports field ids, this will save the -- * original field id in the parquet schema -- */ -- 9: optional i32 field_id; "fieldId">: doc ("When the original schema supports field ids, this will save the " ++ "original field id in the parquet schema") $ optional int32, -- -- /** -- * The logical type of this SchemaElement -- * -- * LogicalType replaces ConvertedType, but ConvertedType is still required -- * for some logical types to ensure forward-compatibility in format v1. -- */ -- 10: optional LogicalType logicalType "logicalType">: doc ("The logical type of this SchemaElement. " ++ "LogicalType replaces ConvertedType, but ConvertedType is still required " ++ "for some logical types to ensure forward-compatibility in format v1.") $ optional $ parquet "LogicalType"], -- } -- /** -- * Encodings supported by Parquet. Not all encodings are valid for all types. These -- * enums are also used to specify the encoding of definition and repetition levels. -- * See the accompanying doc for the details of the more complicated encodings. -- */ -- enum Encoding { def "Encoding" $ doc ("Encodings supported by Parquet. Not all encodings are valid for all types. These " ++ "enums are also used to specify the encoding of definition and repetition levels. " ++ "See the accompanying doc for the details of the more complicated encodings.") $ union [ -- /** Default encoding. -- * BOOLEAN - 1 bit per value. 0 is false; 1 is true. -- * INT32 - 4 bytes per value. Stored as little-endian. -- * INT64 - 8 bytes per value. Stored as little-endian. -- * FLOAT - 4 bytes per value. IEEE. Stored as little-endian. -- * DOUBLE - 8 bytes per value. IEEE. Stored as little-endian. -- * BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes. -- * FIXED_LEN_BYTE_ARRAY - Just the bytes. -- */ -- PLAIN = 0; "plain">: doc ("Default encoding.\n" ++ "BOOLEAN - 1 bit per value. 0 is false; 1 is true.\n" ++ "INT32 - 4 bytes per value. Stored as little-endian.\n" ++ "INT64 - 8 bytes per value. Stored as little-endian.\n" ++ "FLOAT - 4 bytes per value. IEEE. Stored as little-endian.\n" ++ "DOUBLE - 8 bytes per value. IEEE. Stored as little-endian.\n" ++ "BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes.\n" ++ "FIXED_LEN_BYTE_ARRAY - Just the bytes.") $ unit, -- -- /** Group VarInt encoding for INT32/INT64. -- * This encoding is deprecated. It was never used -- */ -- // GROUP_VAR_INT = 1; -- -- /** -- * Deprecated: Dictionary encoding. The values in the dictionary are encoded in the -- * plain type. -- * in a data page use RLE_DICTIONARY instead. -- * in a Dictionary page use PLAIN instead -- */ -- PLAIN_DICTIONARY = 2; -- -- /** Group packed run length encoding. Usable for definition/repetition levels -- * encoding and Booleans (on one bit: 0 is false; 1 is true.) -- */ -- RLE = 3; "rle">: doc ("Group packed run length encoding. Usable for definition/repetition levels " ++ "encoding and Booleans (on one bit: 0 is false; 1 is true.)") unit, -- -- /** Bit packed encoding. This can only be used if the data has a known max -- * width. Usable for definition/repetition levels encoding. -- */ -- BIT_PACKED = 4; "bitPacked">: doc ("Bit packed encoding. This can only be used if the data has a known max " ++ "width. Usable for definition/repetition levels encoding.") unit, -- -- /** Delta encoding for integers. 
This can be used for int columns and works best -- * on sorted data -- */ -- DELTA_BINARY_PACKED = 5; "deltaBinaryPacked">: doc ("Delta encoding for integers. This can be used for int columns and works best " ++ "on sorted data") unit, -- -- /** Encoding for byte arrays to separate the length values and the data. The lengths -- * are encoded using DELTA_BINARY_PACKED -- */ -- DELTA_LENGTH_BYTE_ARRAY = 6; "deltaLengthByteArray">: doc ("Encoding for byte arrays to separate the length values and the data. The lengths " ++ "are encoded using DELTA_BINARY_PACKED") unit, -- -- /** Incremental-encoded byte array. Prefix lengths are encoded using DELTA_BINARY_PACKED. -- * Suffixes are stored as delta length byte arrays. -- */ -- DELTA_BYTE_ARRAY = 7; "deltaByteArray">: doc ("Incremental-encoded byte array. Prefix lengths are encoded using DELTA_BINARY_PACKED. " ++ "Suffixes are stored as delta length byte arrays.") unit, -- -- /** Dictionary encoding: the ids are encoded using the RLE encoding -- */ -- RLE_DICTIONARY = 8; "rleDictionary">: doc ("Dictionary encoding: the ids are encoded using the RLE encoding") unit, -- -- /** Encoding for floating-point data. -- K byte-streams are created where K is the size in bytes of the data type. -- The individual bytes of an FP value are scattered to the corresponding stream and -- the streams are concatenated. -- This itself does not reduce the size of the data but can lead to better compression -- afterwards. -- */ -- BYTE_STREAM_SPLIT = 9; "byteStreamSplit">: doc ("Encoding for floating-point data. " ++ "K byte-streams are created where K is the size in bytes of the data type. " ++ "The individual bytes of an FP value are scattered to the corresponding stream and " ++ "the streams are concatenated. " ++ "This itself does not reduce the size of the data but can lead to better compression " ++ "afterwards.") unit], -- } -- /** -- * Supported compression algorithms. -- * -- * Codecs added in format version X.Y can be read by readers based on X.Y and later. -- * Codec support may vary between readers based on the format version and -- * libraries available at runtime. -- * -- * See Compression.md for a detailed specification of these algorithms. -- */ -- enum CompressionCodec { def "CompressionCodec" $ doc ("Supported compression algorithms. " ++ "Codecs added in format version X.Y can be read by readers based on X.Y and later. " ++ "Codec support may vary between readers based on the format version and " ++ "libraries available at runtime. " ++ "See Compression.md for a detailed specification of these algorithms.") $ union [ -- UNCOMPRESSED = 0; "uncompressed">: unit, -- SNAPPY = 1; "snappy">: unit, -- GZIP = 2; "gzip">: unit, -- LZO = 3; "lzo">: unit, -- BROTLI = 4; // Added in 2.4 "brotli">: doc "Added in 2.4" unit, -- LZ4 = 5; // DEPRECATED (Added in 2.4) -- ZSTD = 6; // Added in 2.4 "zstd">: doc "Added in 2.4" unit, -- LZ4_RAW = 7; // Added in 2.9 "lz4Raw">: doc "Added in 2.9" unit], -- } -- enum PageType { def "PageType" $ enum [ -- DATA_PAGE = 0; "dataPage", -- INDEX_PAGE = 1; "indexPage", -- DICTIONARY_PAGE = 2; "dictionaryPage", -- DATA_PAGE_V2 = 3; "dataPageV2"], -- } -- /** -- * Enum to annotate whether lists of min/max elements inside ColumnIndex -- * are ordered and if so, in which direction. 
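-- Note: an informal sketch of the BYTE_STREAM_SPLIT scatter described above (not part of
-- this model); 'byteStreamSplit' is a hypothetical helper, assuming Data.ByteString, with
-- k = 4 for FLOAT and k = 8 for DOUBLE:
--
--   import qualified Data.ByteString as BS
--
--   byteStreamSplit :: Int -> BS.ByteString -> BS.ByteString
--   byteStreamSplit k bs = BS.concat [ stream i | i <- [0 .. k - 1] ]
--     where
--       -- stream i holds byte i of every k-byte value, in value order
--       stream i = BS.pack [ BS.index bs j | j <- [i, i + k .. BS.length bs - 1] ]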
-- */ -- enum BoundaryOrder { def "BoundaryOrder" $ doc ("Enum to annotate whether lists of min/max elements inside ColumnIndex " ++ "are ordered and if so, in which direction.") $ enum [ -- UNORDERED = 0; "unordered", -- ASCENDING = 1; "ascending", -- DESCENDING = 2; "descending"], -- } -- /** Data page header */ -- struct DataPageHeader { def "DataPageHeader" $ doc "Data page header" $ record [ -- /** Number of values, including NULLs, in this data page. **/ -- 1: required i32 num_values "numValues">: doc "Number of values, including NULLs, in this data page." int32, -- -- /** Encoding used for this data page **/ -- 2: required Encoding encoding "encoding">: doc "Encoding used for this data page" $ parquet "Encoding", -- -- /** Encoding used for definition levels **/ -- 3: required Encoding definition_level_encoding; "definitionLevelEncoding">: doc "Encoding used for definition levels" $ parquet "Encoding", -- -- /** Encoding used for repetition levels **/ -- 4: required Encoding repetition_level_encoding; "repetitionLevelEncoding">: doc "Encoding used for repetition levels" $ parquet "Encoding", -- -- /** Optional statistics for the data in this page**/ -- 5: optional Statistics statistics; "statistics">: doc "Optional statistics for the data in this page" $ optional $ parquet "Statistics"], -- } -- -- struct IndexPageHeader { def "IndexPageHeader" $ record [], -- // TODO -- } -- /** -- * The dictionary page must be placed at the first position of the column chunk -- * if it is partly or completely dictionary encoded. At most one dictionary page -- * can be placed in a column chunk. -- **/ -- struct DictionaryPageHeader { def "DictionaryPageHeader" $ doc ("The dictionary page must be placed at the first position of the column chunk " ++ "if it is partly or completely dictionary encoded. At most one dictionary page " ++ "can be placed in a column chunk.") $ record [ -- /** Number of values in the dictionary **/ -- 1: required i32 num_values; "numValues">: doc "Number of values in the dictionary" $ int32, -- -- /** Encoding using this dictionary page **/ -- 2: required Encoding encoding "encoding">: doc "Encoding using this dictionary page" $ parquet "Encoding", -- -- /** If true, the entries in the dictionary are sorted in ascending order **/ -- 3: optional bool is_sorted; "isSorted">: doc "If true, the entries in the dictionary are sorted in ascending order" $ optional boolean], -- } -- /** -- * New page format allowing reading levels without decompressing the data -- * Repetition and definition levels are uncompressed -- * The remaining section containing the data is compressed if is_compressed is true -- **/ -- struct DataPageHeaderV2 { def "DataPageHeaderV2" $ doc ("New page format allowing reading levels without decompressing the data. " ++ "Repetition and definition levels are uncompressed. " ++ "The remaining section containing the data is compressed if is_compressed is true") $ record [ -- /** Number of values, including NULLs, in this data page. **/ -- 1: required i32 num_values "numValues">: doc "Number of values, including NULLs, in this data page." $ int32, -- /** Number of NULL values, in this data page. -- Number of non-null = num_values - num_nulls which is also the number of values in the data section **/ -- 2: required i32 num_nulls "numNulls">: doc ("Number of NULL values in this data page. " ++ "Number of non-null = num_values - num_nulls, which is also the number of values in the data section") $ int32, -- /** Number of rows in this data page. 
which means pages change on record boundaries (r = 0) **/ -- 3: required i32 num_rows "numRows">: doc "Number of rows in this data page, which means pages change on record boundaries (r = 0)" $ int32, -- /** Encoding used for data in this page **/ -- 4: required Encoding encoding "encoding">: doc "Encoding used for data in this page" $ parquet "Encoding", -- -- // repetition levels and definition levels are always using RLE (without size in it) -- -- /** length of the definition levels */ -- 5: required i32 definition_levels_byte_length; "definitionLevelsByteLength">: doc "length of the definition levels" $ int32, -- /** length of the repetition levels */ -- 6: required i32 repetition_levels_byte_length; "repetitionLevelsByteLength">: doc "length of the repetition levels" $ int32, -- -- /** whether the values are compressed. -- Which means the section of the page between -- definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) -- is compressed with the compression_codec. -- If missing it is considered compressed */ -- 7: optional bool is_compressed = 1; "isCompressed">: doc ("whether the values are compressed. " ++ "This means the section of the page between " ++ "definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) " ++ "is compressed with the compression_codec. " ++ "If missing it is considered compressed") $ optional boolean, -- -- /** optional statistics for the data in this page **/ -- 8: optional Statistics statistics; "statistics">: doc "optional statistics for the data in this page" $ optional $ parquet "Statistics"], -- } -- /** Block-based algorithm type annotation. **/ -- struct SplitBlockAlgorithm {} -- /** The algorithm used in Bloom filter. **/ -- union BloomFilterAlgorithm { def "BloomFilterAlgorithm" $ doc "The algorithm used in Bloom filter." $ union [ -- /** Block-based Bloom filter. **/ -- 1: SplitBlockAlgorithm BLOCK; "block">: doc "Block-based Bloom filter." unit], -- } -- /** Hash strategy type annotation. xxHash is an extremely fast non-cryptographic hash -- * algorithm. It uses the 64-bit version of xxHash. -- **/ -- struct XxHash {} -- -- /** -- * The hash function used in Bloom filter. This function takes the hash of a column value -- * using plain encoding. -- **/ -- union BloomFilterHash { def "BloomFilterHash" $ doc ("The hash function used in Bloom filter. This function takes the hash of a column value " ++ "using plain encoding.") $ union [ -- /** xxHash Strategy. **/ -- 1: XxHash XXHASH; "xxhash">: doc "xxHash Strategy." unit], -- } -- /** -- * The compression used in the Bloom filter. -- **/ -- struct Uncompressed {} -- union BloomFilterCompression { def "BloomFilterCompression" $ doc "The compression used in the Bloom filter." $ enum [ -- 1: Uncompressed UNCOMPRESSED; "uncompressed"], -- } -- /** -- * Bloom filter header is stored at beginning of Bloom filter data of each column -- * and followed by its bitset. -- **/ -- struct BloomFilterHeader { def "BloomFilterHeader" $ doc ("Bloom filter header is stored at beginning of Bloom filter data of each column " ++ "and followed by its bitset.") $ record [ -- /** The size of bitset in bytes **/ -- 1: required i32 numBytes; "numBytes">: doc "The size of bitset in bytes" $ int32, -- /** The algorithm for setting bits. **/ -- 2: required BloomFilterAlgorithm algorithm; "algorithm">: doc "The algorithm for setting bits." $ parquet "BloomFilterAlgorithm", -- /** The hash function used for Bloom filter. 
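-- Note: an informal sketch (not part of this model) of how a reader would slice a v2 data
-- page body using the two byte lengths above, assuming the repetition levels precede the
-- definition levels and Data.ByteString is available; 'splitV2Page' is hypothetical:
--
--   import qualified Data.ByteString as BS
--
--   -- (repetition levels, definition levels, values section); the levels are never
--   -- compressed, and the values section is compressed when is_compressed is true
--   splitV2Page :: Int -> Int -> BS.ByteString -> (BS.ByteString, BS.ByteString, BS.ByteString)
--   splitV2Page repLen defLen body = (repLevels, defLevels, values)
--     where
--       (repLevels, rest) = BS.splitAt repLen body
--       (defLevels, values) = BS.splitAt defLen rest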
**/ -- 3: required BloomFilterHash hash; "hash">: doc "The hash function used for Bloom filter." $ parquet "BloomFilterHash", -- /** The compression used in the Bloom filter **/ -- 4: required BloomFilterCompression compression; "compression">: doc "The compression used in the Bloom filter" $ parquet "BloomFilterCompression"], -- } -- struct PageHeader { def "PageHeader" $ record [ -- /** the type of the page: indicates which of the *_header fields is set **/ -- 1: required PageType type "type">: doc "the type of the page: indicates which of the *_header fields is set" $ parquet "PageType", -- -- /** Uncompressed page size in bytes (not including this header) **/ -- 2: required i32 uncompressed_page_size "uncompressedPageSize">: doc "Uncompressed page size in bytes (not including this header)" $ int32, -- -- /** Compressed (and potentially encrypted) page size in bytes, not including this header **/ -- 3: required i32 compressed_page_size "compressedPageSize">: doc "Compressed (and potentially encrypted) page size in bytes, not including this header" $ int32, -- -- /** The 32bit CRC for the page, to be calculated as follows: -- * - Using the standard CRC32 algorithm -- * - On the data only, i.e. this header should not be included. 'Data' -- * hereby refers to the concatenation of the repetition levels, the -- * definition levels and the column value, in this exact order. -- * - On the encoded versions of the repetition levels, definition levels and -- * column values -- * - On the compressed versions of the repetition levels, definition levels -- * and column values where possible; -- * - For v1 data pages, the repetition levels, definition levels and column -- * values are always compressed together. If a compression scheme is -- * specified, the CRC shall be calculated on the compressed version of -- * this concatenation. If no compression scheme is specified, the CRC -- * shall be calculated on the uncompressed version of this concatenation. -- * - For v2 data pages, the repetition levels and definition levels are -- * handled separately from the data and are never compressed (only -- * encoded). If a compression scheme is specified, the CRC shall be -- * calculated on the concatenation of the uncompressed repetition levels, -- * uncompressed definition levels and the compressed column values. -- * If no compression scheme is specified, the CRC shall be calculated on -- * the uncompressed concatenation. -- * - In encrypted columns, CRC is calculated after page encryption; the -- * encryption itself is performed after page compression (if compressed) -- * If enabled, this allows for disabling checksumming in HDFS if only a few -- * pages need to be read. -- **/ -- 4: optional i32 crc "crc">: doc ("The 32bit CRC for the page, to be calculated as follows:\n" ++ "- Using the standard CRC32 algorithm\n" ++ "- On the data only, i.e. this header should not be included. 'Data'\n" ++ " hereby refers to the concatenation of the repetition levels, the\n" ++ " definition levels and the column value, in this exact order.\n" ++ "- On the encoded versions of the repetition levels, definition levels and\n" ++ " column values\n" ++ "- On the compressed versions of the repetition levels, definition levels\n" ++ " and column values where possible;\n" ++ " - For v1 data pages, the repetition levels, definition levels and column\n" ++ " values are always compressed together. 
If a compression scheme is\n" ++ " specified, the CRC shall be calculated on the compressed version of\n" ++ " this concatenation. If no compression scheme is specified, the CRC\n" ++ " shall be calculated on the uncompressed version of this concatenation.\n" ++ " - For v2 data pages, the repetition levels and definition levels are\n" ++ " handled separately from the data and are never compressed (only\n" ++ " encoded). If a compression scheme is specified, the CRC shall be\n" ++ " calculated on the concatenation of the uncompressed repetition levels,\n" ++ " uncompressed definition levels and the compressed column values.\n" ++ " If no compression scheme is specified, the CRC shall be calculated on\n" ++ " the uncompressed concatenation.\n" ++ "- In encrypted columns, CRC is calculated after page encryption; the\n" ++ " encryption itself is performed after page compression (if compressed)\n" ++ "If enabled, this allows for disabling checksumming in HDFS if only a few " ++ "pages need to be read. ") $ optional int32, -- -- // Headers for page specific data. One only will be set. -- 5: optional DataPageHeader data_page_header; "dataPageHeader">: optional $ parquet "DataPageHeader", -- 6: optional IndexPageHeader index_page_header; "indexPageHeader">: optional $ parquet "IndexPageHeader", -- 7: optional DictionaryPageHeader dictionary_page_header; "dictionaryPageHeader">: optional $ parquet "DictionaryPageHeader", -- 8: optional DataPageHeaderV2 data_page_header_v2; "dataPageHeaderV2">: optional $ parquet "DataPageHeaderV2"], -- } -- /** -- * Wrapper struct to store key values -- */ -- struct KeyValue { def "KeyValue" $ doc "Wrapper struct to store key values" $ record [ -- 1: required string key "key">: string, -- 2: optional string value "value">: optional string], -- } -- /** -- * Wrapper struct to specify sort order -- */ -- struct SortingColumn { def "SortingColumn" $ doc "Wrapper struct to specify sort order" $ record [ -- /** The column index (in this row group) **/ -- 1: required i32 column_idx "columnIdx">: doc "The column index (in this row group)" int32, -- -- /** If true, indicates this column is sorted in descending order. **/ -- 2: required bool descending "descending">: doc "If true, indicates this column is sorted in descending order." boolean, -- -- /** If true, nulls will come before non-null values, otherwise, -- * nulls go at the end. */ -- 3: required bool nulls_first "nullsFirst">: doc ("If true, nulls will come before non-null values, otherwise, " ++ "nulls go at the end.") boolean], -- } -- /** -- * statistics of a given page type and encoding -- */ -- struct PageEncodingStats { def "PageEncodingStats" $ doc "statistics of a given page type and encoding" $ record [ -- -- /** the page type (data/dic/...) **/ -- 1: required PageType page_type; "pageType">: doc "the page type (data/dic/...)" $ parquet "PageType", -- -- /** encoding of the page **/ -- 2: required Encoding encoding; "encoding">: doc "encoding of the page" $ parquet "Encoding", -- -- /** number of pages of this type with this encoding **/ -- 3: required i32 count; "count">: doc "number of pages of this type with this encoding" int32], -- -- } -- /** -- * Description for column metadata -- */ -- struct ColumnMetaData { def "ColumnMetaData" $ doc "Description for column metadata" $ record [ -- /** Type of this column **/ -- 1: required Type type "type">: doc "Type of this column" $ parquet "Type", -- -- /** Set of all encodings used for this column. 
The purpose is to validate -- * whether we can decode those pages. **/ -- 2: required list<Encoding> encodings "encodings">: doc ("Set of all encodings used for this column. The purpose is to validate " ++ "whether we can decode those pages.") $ list $ parquet "Encoding", -- -- /** Path in schema **/ -- 3: required list<string> path_in_schema "pathInSchema">: doc "Path in schema" $ list string, -- -- /** Compression codec **/ -- 4: required CompressionCodec codec "codec">: doc "Compression codec" $ parquet "CompressionCodec", -- -- /** Number of values in this column **/ -- 5: required i64 num_values "numValues">: doc "Number of values in this column" int64, -- -- /** total byte size of all uncompressed pages in this column chunk (including the headers) **/ -- 6: required i64 total_uncompressed_size "totalUncompressedSize">: doc "total byte size of all uncompressed pages in this column chunk (including the headers)" int64, -- -- /** total byte size of all compressed, and potentially encrypted, pages -- * in this column chunk (including the headers) **/ -- 7: required i64 total_compressed_size "totalCompressedSize">: doc ("total byte size of all compressed, and potentially encrypted, pages " ++ "in this column chunk (including the headers)") int64, -- -- /** Optional key/value metadata **/ -- 8: optional list<KeyValue> key_value_metadata "keyValueMetadata">: doc "Optional key/value metadata" $ optional $ list $ parquet "KeyValue", -- -- /** Byte offset from beginning of file to first data page **/ -- 9: required i64 data_page_offset "dataPageOffset">: doc "Byte offset from beginning of file to first data page" int64, -- -- /** Byte offset from beginning of file to root index page **/ -- 10: optional i64 index_page_offset "indexPageOffset">: doc "Byte offset from beginning of file to root index page" $ optional int64, -- -- /** Byte offset from the beginning of file to first (only) dictionary page **/ -- 11: optional i64 dictionary_page_offset "dictionaryPageOffset">: doc "Byte offset from the beginning of file to first (only) dictionary page" $ optional int64, -- -- /** optional statistics for this column chunk */ -- 12: optional Statistics statistics; "statistics">: doc "optional statistics for this column chunk" $ optional $ parquet "Statistics", -- -- /** Set of all encodings used for pages in this column chunk. -- * This information can be used to determine if all data pages are -- * dictionary encoded for example **/ -- 13: optional list<PageEncodingStats> encoding_stats; "encodingStats">: doc ("Set of all encodings used for pages in this column chunk. " ++ "This information can be used to determine if all data pages are " ++ "dictionary encoded for example") $ optional $ list $ parquet "PageEncodingStats", -- -- /** Byte offset from beginning of file to Bloom filter data. **/ -- 14: optional i64 bloom_filter_offset; "bloomFilterOffset">: doc "Byte offset from beginning of file to Bloom filter data." 
$ optional int64], -- } -- -- struct EncryptionWithFooterKey { def "EncryptionWithFooterKey" $ record [], -- } -- -- struct EncryptionWithColumnKey { def "EncryptionWithColumnKey" $ record [ -- /** Column path in schema **/ -- 1: required list<string> path_in_schema "pathInSchema">: doc "Column path in schema" $ list string, -- -- /** Retrieval metadata of column encryption key **/ -- 2: optional binary key_metadata "keyMetadata">: doc "Retrieval metadata of column encryption key" $ optional binary], -- } -- -- union ColumnCryptoMetaData { def "ColumnCryptoMetaData" $ union [ -- 1: EncryptionWithFooterKey ENCRYPTION_WITH_FOOTER_KEY "encryptionWithFooterKey">: parquet "EncryptionWithFooterKey", -- 2: EncryptionWithColumnKey ENCRYPTION_WITH_COLUMN_KEY "encryptionWithColumnKey">: parquet "EncryptionWithColumnKey"], -- } -- struct ColumnChunk { def "ColumnChunk" $ record [ -- /** File where column data is stored. If not set, assumed to be same file as -- * metadata. This path is relative to the current file. -- **/ -- 1: optional string file_path "filePath">: doc ("File where column data is stored. If not set, assumed to be same file as " ++ "metadata. This path is relative to the current file.") $ optional string, -- -- /** Byte offset in file_path to the ColumnMetaData **/ -- 2: required i64 file_offset "fileOffset">: doc "Byte offset in file_path to the ColumnMetaData" int64, -- -- /** Column metadata for this chunk. This is the same content as what is at -- * file_path/file_offset. Having it here has it replicated in the file -- * metadata. -- **/ -- 3: optional ColumnMetaData meta_data "metaData">: doc ("Column metadata for this chunk. This is the same content as what is at " ++ "file_path/file_offset. Having it here has it replicated in the file " ++ "metadata.") $ optional $ parquet "ColumnMetaData", -- -- /** File offset of ColumnChunk's OffsetIndex **/ -- 4: optional i64 offset_index_offset "offsetIndexOffset">: doc "File offset of ColumnChunk's OffsetIndex" $ optional int64, -- -- /** Size of ColumnChunk's OffsetIndex, in bytes **/ -- 5: optional i32 offset_index_length "offsetIndexLength">: doc "Size of ColumnChunk's OffsetIndex, in bytes" $ optional int32, -- -- /** File offset of ColumnChunk's ColumnIndex **/ -- 6: optional i64 column_index_offset "columnIndexOffset">: doc "File offset of ColumnChunk's ColumnIndex" $ optional int64, -- -- /** Size of ColumnChunk's ColumnIndex, in bytes **/ -- 7: optional i32 column_index_length "columnIndexLength">: doc "Size of ColumnChunk's ColumnIndex, in bytes" $ optional int32, -- -- /** Crypto metadata of encrypted columns **/ -- 8: optional ColumnCryptoMetaData crypto_metadata "cryptoMetadata">: doc "Crypto metadata of encrypted columns" $ optional $ parquet "ColumnCryptoMetaData", -- -- /** Encrypted column metadata for this chunk **/ -- 9: optional binary encrypted_column_metadata "encryptedColumnMetadata">: doc "Encrypted column metadata for this chunk" $ optional binary], -- } -- struct RowGroup { def "RowGroup" $ record [ -- /** Metadata for each column chunk in this row group. -- * This list must have the same order as the SchemaElement list in FileMetaData. -- **/ -- 1: required list<ColumnChunk> columns "columns">: doc ("Metadata for each column chunk in this row group. 
" ++ "This list must have the same order as the SchemaElement list in FileMetaData.") $ list $ parquet "ColumnChunk", -- -- /** Total byte size of all the uncompressed column data in this row group **/ -- 2: required i64 total_byte_size "totalByteSize">: doc "Total byte size of all the uncompressed column data in this row group" int64, -- -- /** Number of rows in this row group **/ -- 3: required i64 num_rows "numRows">: doc "Number of rows in this row group" int64, -- -- /** If set, specifies a sort ordering of the rows in this RowGroup. -- * The sorting columns can be a subset of all the columns. -- */ -- 4: optional list sorting_columns "sortingColumns">: doc ("If set, specifies a sort ordering of the rows in this RowGroup. " ++ "The sorting columns can be a subset of all the columns.") $ optional $ list $ parquet "SortingColumn", -- -- /** Byte offset from beginning of file to first page (data or dictionary) -- * in this row group **/ -- 5: optional i64 file_offset "fileOffset">: doc ("Byte offset from beginning of file to first page (data or dictionary) " ++ "in this row group") $ optional int64, -- -- /** Total byte size of all compressed (and potentially encrypted) column data -- * in this row group **/ -- 6: optional i64 total_compressed_size "totalCompressedSize">: doc ("Total byte size of all compressed (and potentially encrypted) column data " ++ "in this row group") $ optional int64, -- -- /** Row group ordinal in the file **/ -- 7: optional i16 ordinal "ordinal">: doc "Row group ordinal in the file" $ optional int16], -- } -- -- /** Empty struct to signal the order defined by the physical or logical type */ -- struct TypeDefinedOrder {} -- -- /** -- * Union to specify the order used for the min_value and max_value fields for a -- * column. This union takes the role of an enhanced enum that allows rich -- * elements (which will be needed for a collation-based ordering in the future). -- * -- * Possible values are: -- * * TypeDefinedOrder - the column uses the order defined by its logical or -- * physical type (if there is no logical type). -- * -- * If the reader does not support the value of this union, min and max stats -- * for this column should be ignored. -- */ -- union ColumnOrder { def "ColumnOrder" $ doc ("Union to specify the order used for the min_value and max_value fields for a " ++ "column. This union takes the role of an enhanced enum that allows rich " ++ "elements (which will be needed for a collation-based ordering in the future). " ++ "Possible values are:\n" ++ "* TypeDefinedOrder - the column uses the order defined by its logical or " ++ "physical type (if there is no logical type).\n" ++ "If the reader does not support the value of this union, min and max stats " ++ "for this column should be ignored. 
") $ union [ -- -- /** -- * The sort orders for logical types are: -- * UTF8 - unsigned byte-wise comparison -- * INT8 - signed comparison -- * INT16 - signed comparison -- * INT32 - signed comparison -- * INT64 - signed comparison -- * UINT8 - unsigned comparison -- * UINT16 - unsigned comparison -- * UINT32 - unsigned comparison -- * UINT64 - unsigned comparison -- * DECIMAL - signed comparison of the represented value -- * DATE - signed comparison -- * TIME_MILLIS - signed comparison -- * TIME_MICROS - signed comparison -- * TIMESTAMP_MILLIS - signed comparison -- * TIMESTAMP_MICROS - signed comparison -- * INTERVAL - unsigned comparison -- * JSON - unsigned byte-wise comparison -- * BSON - unsigned byte-wise comparison -- * ENUM - unsigned byte-wise comparison -- * LIST - undefined -- * MAP - undefined -- * -- * In the absence of logical types, the sort order is determined by the physical type: -- * BOOLEAN - false, true -- * INT32 - signed comparison -- * INT64 - signed comparison -- * INT96 (only used for legacy timestamps) - undefined -- * FLOAT - signed comparison of the represented value (*) -- * DOUBLE - signed comparison of the represented value (*) -- * BYTE_ARRAY - unsigned byte-wise comparison -- * FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison -- * -- * (*) Because the sorting order is not specified properly for floating -- * point values (relations vs. total ordering) the following -- * compatibility rules should be applied when reading statistics: -- * - If the min is a NaN, it should be ignored. -- * - If the max is a NaN, it should be ignored. -- * - If the min is +0, the row group may contain -0 values as well. -- * - If the max is -0, the row group may contain +0 values as well. -- * - When looking for NaN values, min and max should be ignored. -- */ -- 1: TypeDefinedOrder TYPE_ORDER; "typeOrder">: doc ("The sort orders for logical types are:\n" ++ " UTF8 - unsigned byte-wise comparison\n" ++ " INT8 - signed comparison\n" ++ " INT16 - signed comparison\n" ++ " INT32 - signed comparison\n" ++ " INT64 - signed comparison\n" ++ " UINT8 - unsigned comparison\n" ++ " UINT16 - unsigned comparison\n" ++ " UINT32 - unsigned comparison\n" ++ " UINT64 - unsigned comparison\n" ++ " DECIMAL - signed comparison of the represented value\n" ++ " DATE - signed comparison\n" ++ " TIME_MILLIS - signed comparison\n" ++ " TIME_MICROS - signed comparison\n" ++ " TIMESTAMP_MILLIS - signed comparison\n" ++ " TIMESTAMP_MICROS - signed comparison\n" ++ " INTERVAL - unsigned comparison\n" ++ " JSON - unsigned byte-wise comparison\n" ++ " BSON - unsigned byte-wise comparison\n" ++ " ENUM - unsigned byte-wise comparison\n" ++ " LIST - undefined\n" ++ " MAP - undefined\n" ++ "In the absence of logical types, the sort order is determined by the physical type:\n" ++ " BOOLEAN - false, true\n" ++ " INT32 - signed comparison\n" ++ " INT64 - signed comparison\n" ++ " INT96 (only used for legacy timestamps) - undefined\n" ++ " FLOAT - signed comparison of the represented value (*)\n" ++ " DOUBLE - signed comparison of the represented value (*)\n" ++ " BYTE_ARRAY - unsigned byte-wise comparison\n" ++ " FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison\n" ++ "(*) Because the sorting order is not specified properly for floating\n" ++ " point values (relations vs. 
total ordering) the following\n" ++ " compatibility rules should be applied when reading statistics:\n" ++ " - If the min is a NaN, it should be ignored.\n" ++ " - If the max is a NaN, it should be ignored.\n" ++ " - If the min is +0, the row group may contain -0 values as well.\n" ++ " - If the max is -0, the row group may contain +0 values as well.\n" ++ " - When looking for NaN values, min and max should be ignored.") unit], -- } -- struct PageLocation { def "PageLocation" $ record [ -- /** Offset of the page in the file **/ -- 1: required i64 offset "offset">: doc "Offset of the page in the file" int64, -- -- /** -- * Size of the page, including header. Sum of compressed_page_size and header -- * length -- */ -- 2: required i32 compressed_page_size "compressedPageSize">: doc ("Size of the page, including header. Sum of compressed_page_size and header " ++ "length") int32, -- -- /** -- * Index within the RowGroup of the first row of the page; this means pages -- * change on record boundaries (r = 0). -- */ -- 3: required i64 first_row_index "firstRowIndex">: doc ("Index within the RowGroup of the first row of the page; this means pages " ++ "change on record boundaries (r = 0).") int64], -- } -- -- struct OffsetIndex { def "OffsetIndex" $ record [ -- /** -- * PageLocations, ordered by increasing PageLocation.offset. It is required -- * that page_locations[i].first_row_index < page_locations[i+1].first_row_index. -- */ -- 1: required list<PageLocation> page_locations "pageLocations">: doc ("PageLocations, ordered by increasing PageLocation.offset. It is required " ++ "that page_locations[i].first_row_index < page_locations[i+1].first_row_index.") $ list $ parquet "PageLocation"], -- } -- -- /** -- * Description for ColumnIndex. -- * Each [i] refers to the page at OffsetIndex.page_locations[i] -- */ -- struct ColumnIndex { def "ColumnIndex" $ doc ("Description for ColumnIndex. " ++ "Each [i] refers to the page at OffsetIndex.page_locations[i]") $ record [ -- /** -- * A list of Boolean values to determine the validity of the corresponding -- * min and max values. If true, a page contains only null values, and writers -- * have to set the corresponding entries in min_values and max_values to -- * byte[0], so that all lists have the same length. If false, the -- * corresponding entries in min_values and max_values must be valid. -- */ -- 1: required list<boolean> null_pages "nullPages">: doc ("A list of Boolean values to determine the validity of the corresponding " ++ "min and max values. If true, a page contains only null values, and writers " ++ "have to set the corresponding entries in min_values and max_values to " ++ "byte[0], so that all lists have the same length. If false, the " ++ "corresponding entries in min_values and max_values must be valid.") $ list boolean, -- -- /** -- * Two lists containing lower and upper bounds for the values of each page -- * determined by the ColumnOrder of the column. These may be the actual -- * minimum and maximum values found on a page, but can also be (more compact) -- * values that do not exist on a page. For example, instead of storing "Blart -- * Versenwald III", a writer may set min_values[i]="B", max_values[i]="C". -- * Such more compact values must still be valid values within the column's -- * logical type. Readers must make sure that list entries are populated before -- * using them by inspecting null_pages. 
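-- Note: an informal sketch (not part of this model) of the page-pruning read pattern the
-- ColumnIndex comments describe; 'pageMayMatch' and 'cmp' (a comparison in the column's
-- ColumnOrder) are hypothetical, assuming Data.ByteString:
--
--   import qualified Data.ByteString as BS
--
--   -- may page i contain a value in the closed interval [lo, hi]?
--   pageMayMatch :: (BS.ByteString -> BS.ByteString -> Ordering)
--                -> BS.ByteString -> BS.ByteString
--                -> [Bool] -> [BS.ByteString] -> [BS.ByteString] -> Int -> Bool
--   pageMayMatch cmp lo hi nullPages minVals maxVals i
--     | nullPages !! i = False  -- only nulls; the min/max entries are placeholders
--     | otherwise = cmp (minVals !! i) hi /= GT && cmp (maxVals !! i) lo /= LT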
-- */ -- 2: required list<binary> min_values "minValues">: doc ("minValues and maxValues are lists containing lower and upper bounds for the values of each page " ++ "determined by the ColumnOrder of the column. These may be the actual " ++ "minimum and maximum values found on a page, but can also be (more compact) " ++ "values that do not exist on a page. For example, instead of storing \"Blart " ++ "Versenwald III\", a writer may set min_values[i]=\"B\", max_values[i]=\"C\". " ++ "Such more compact values must still be valid values within the column's " ++ "logical type. Readers must make sure that list entries are populated before " ++ "using them by inspecting null_pages.") $ list binary, -- 3: required list<binary> max_values "maxValues">: list binary, -- -- /** -- * Stores whether both min_values and max_values are ordered and if so, in -- * which direction. This allows readers to perform binary searches in both -- * lists. Readers cannot assume that max_values[i] <= min_values[i+1], even -- * if the lists are ordered. -- */ -- 4: required BoundaryOrder boundary_order "boundaryOrder">: doc ("Stores whether both min_values and max_values are ordered and if so, in " ++ "which direction. This allows readers to perform binary searches in both " ++ "lists. Readers cannot assume that max_values[i] <= min_values[i+1], even " ++ "if the lists are ordered.") $ parquet "BoundaryOrder", -- -- /** A list containing the number of null values for each page **/ -- 5: optional list<i64> null_counts "nullCounts">: doc "A list containing the number of null values for each page" $ optional $ list int64], -- } -- struct AesGcmV1 { def "AesGcmV1" $ record [ -- /** AAD prefix **/ -- 1: optional binary aad_prefix "aadPrefix">: doc "AAD prefix" $ optional binary, -- -- /** Unique file identifier part of AAD suffix **/ -- 2: optional binary aad_file_unique "aadFileUnique">: doc "Unique file identifier part of AAD suffix" $ optional binary, -- -- /** In files encrypted with AAD prefix without storing it, -- * readers must supply the prefix **/ -- 3: optional bool supply_aad_prefix "supplyAadPrefix">: doc ("In files encrypted with AAD prefix without storing it, " ++ "readers must supply the prefix") $ optional boolean], -- } -- struct AesGcmCtrV1 { def "AesGcmCtrV1" $ record [ -- /** AAD prefix **/ -- 1: optional binary aad_prefix "aadPrefix">: doc "AAD prefix" $ optional binary, -- -- /** Unique file identifier part of AAD suffix **/ -- 2: optional binary aad_file_unique "aadFileUnique">: doc "Unique file identifier part of AAD suffix" $ optional binary, -- -- /** In files encrypted with AAD prefix without storing it, -- * readers must supply the prefix **/ -- 3: optional bool supply_aad_prefix "supplyAadPrefix">: doc ("In files encrypted with AAD prefix without storing it, " ++ "readers must supply the prefix") $ optional boolean], -- } -- union EncryptionAlgorithm { def "EncryptionAlgorithm" $ union [ -- 1: AesGcmV1 AES_GCM_V1 "aesGcmV1">: parquet "AesGcmV1", -- 2: AesGcmCtrV1 AES_GCM_CTR_V1 "aesGcmCtrV1">: parquet "AesGcmCtrV1"], -- } -- /** -- * Description for file metadata -- */ -- struct FileMetaData { def "FileMetaData" $ doc "Description for file metadata" $ record [ -- /** Version of this file **/ -- 1: required i32 version "version">: doc "Version of this file" int32, -- -- /** Parquet schema for this file. This schema contains metadata for all the columns. -- * The schema is represented as a tree with a single root. The nodes of the tree -- * are flattened to a list by doing a depth-first traversal. 
-- * The column metadata contains the path in the schema for that column which can be -- * used to map columns to nodes in the schema. -- * The first element is the root **/ -- 2: required list<SchemaElement> schema; "schema">: doc ("Parquet schema for this file. This schema contains metadata for all the columns. " ++ "The schema is represented as a tree with a single root. The nodes of the tree " ++ "are flattened to a list by doing a depth-first traversal. " ++ "The column metadata contains the path in the schema for that column which can be " ++ "used to map columns to nodes in the schema. " ++ "The first element is the root") $ list $ parquet "SchemaElement", -- -- /** Number of rows in this file **/ -- 3: required i64 num_rows "numRows">: doc "Number of rows in this file" int64, -- -- /** Row groups in this file **/ -- 4: required list<RowGroup> row_groups "rowGroups">: doc "Row groups in this file" $ list $ parquet "RowGroup", -- -- /** Optional key/value metadata **/ -- 5: optional list<KeyValue> key_value_metadata "keyValueMetadata">: doc "Optional key/value metadata" $ optional $ list $ parquet "KeyValue", -- -- /** String for application that wrote this file. This should be in the format -- * <Application> version <App Version> (build <App Build Hash>). -- * e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55) -- **/ -- 6: optional string created_by "createdBy">: doc ("String for application that wrote this file. This should be in the format " ++ "<Application> version <App Version> (build <App Build Hash>). " ++ "e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)") $ optional string, -- -- /** -- * Sort order used for the min_value and max_value fields in the Statistics -- * objects and the min_values and max_values fields in the ColumnIndex -- * objects of each column in this file. Sort orders are listed in the order -- * matching the columns in the schema. The indexes are not necessarily the same -- * though, because only leaf nodes of the schema are represented in the list -- * of sort orders. -- * -- * Without column_orders, the meaning of the min_value and max_value fields -- * in the Statistics object and the ColumnIndex object is undefined. To ensure -- * well-defined behaviour, if these fields are written to a Parquet file, -- * column_orders must be written as well. -- * -- * The obsolete min and max fields in the Statistics object are always sorted -- * by signed comparison regardless of column_orders. -- */ -- 7: optional list<ColumnOrder> column_orders; "columnOrders">: doc ("Sort order used for the min_value and max_value fields in the Statistics " ++ "objects and the min_values and max_values fields in the ColumnIndex " ++ "objects of each column in this file. Sort orders are listed in the order " ++ "matching the columns in the schema. The indexes are not necessarily the same " ++ "though, because only leaf nodes of the schema are represented in the list " ++ "of sort orders.\n" ++ "Without column_orders, the meaning of the min_value and max_value fields " ++ "in the Statistics object and the ColumnIndex object is undefined. To ensure " ++ "well-defined behaviour, if these fields are written to a Parquet file, " ++ "column_orders must be written as well.\n" ++ "The obsolete min and max fields in the Statistics object are always sorted " ++ "by signed comparison regardless of column_orders.") $ optional $ list $ parquet "ColumnOrder", -- -- /** -- * Encryption algorithm. This field is set only in encrypted files -- * with plaintext footer. Files with encrypted footer store algorithm id -- * in FileCryptoMetaData structure. 
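-- Note: an informal sketch (not part of this model) of the two footer modes described
-- above; the record accessors are hypothetical stand-ins for generated bindings:
--
--   -- where a reader would find the algorithm id, per footer mode
--   algorithmOf :: Either FileCryptoMetaData FileMetaData -> Maybe EncryptionAlgorithm
--   algorithmOf (Left crypto) = Just (cryptoEncryptionAlgorithm crypto) -- encrypted footer
--   algorithmOf (Right meta) = metaEncryptionAlgorithm meta             -- plaintext footer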
-- */ -- 8: optional EncryptionAlgorithm encryption_algorithm "encryptionAlgorithm">: doc ("Encryption algorithm. This field is set only in encrypted files " ++ "with plaintext footer. Files with encrypted footer store algorithm id " ++ "in FileCryptoMetaData structure.") $ optional $ parquet "EncryptionAlgorithm", -- -- /** -- * Retrieval metadata of key used for signing the footer. -- * Used only in encrypted files with plaintext footer. -- */ -- 9: optional binary footer_signing_key_metadata "footerSigningKeyMetadata">: doc ("Retrieval metadata of key used for signing the footer. " ++ "Used only in encrypted files with plaintext footer.") $ optional binary], -- } -- /** Crypto metadata for files with encrypted footer **/ -- struct FileCryptoMetaData { def "FileCryptoMetaData" $ doc "Crypto metadata for files with encrypted footer" $ record [ -- /** -- * Encryption algorithm. This field is only used for files -- * with encrypted footer. Files with plaintext footer store algorithm id -- * inside footer (FileMetaData structure). -- */ -- 1: required EncryptionAlgorithm encryption_algorithm "encryptionAlgorithm">: doc ("Encryption algorithm. This field is only used for files " ++ "with encrypted footer. Files with plaintext footer store algorithm id " ++ "inside footer (FileMetaData structure).") $ parquet "EncryptionAlgorithm", -- -- /** Retrieval metadata of key used for encryption of footer, -- * and (possibly) columns **/ -- 2: optional binary key_metadata "keyMetadata">: doc ("Retrieval metadata of key used for encryption of footer, " ++ "and (possibly) columns") $ optional binary]] -- }
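-- Note on the DSL used throughout this module: 'def' (= 'datatype ns') introduces a new
-- type under the "hydra/langs/parquet/format" namespace, and 'parquet' (= 'typeref ns')
-- refers back to a type defined in the same namespace. As a purely illustrative example
-- (not part of the Parquet specification), one more element could be added to 'elements' as:
--
--   def "PageRange" $ doc "A hypothetical contiguous range of pages" $ record [
--     "first">: parquet "PageLocation",
--     "count">: int32]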