// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

// This module contains the logical parquet-cpp types (independent of Thrift
// structures), schema nodes, and related type tools

#pragma once

#include <cstdint>
#include <memory>
#include <ostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "parquet/platform.h"
#include "parquet/types.h"
#include "parquet/windows_fixup.h"  // for OPTIONAL

namespace parquet {

class SchemaDescriptor;

namespace schema {

class Node;

// List encodings: using the terminology from Impala to define different styles
// of representing logical lists (a.k.a. ARRAY types) in Parquet schemas. Since
// the converted type named in the Parquet metadata is ConvertedType::LIST we
// use that terminology here. It also helps distinguish from the *_ARRAY
// primitive types.
//
// One-level encoding: Only allows required lists with required cells
//   repeated value_type name
//
// Two-level encoding: Enables optional lists with only required cells
//   <required/optional> group list
//     repeated value_type item
//
// Three-level encoding: Enables optional lists with optional cells
//   <required/optional> group bag
//     repeated group list
//       <required/optional> value_type item
//
// 2- and 1-level encoding are respectively equivalent to 3-level encoding with
// the non-repeated nodes set to required.
// // The "official" encoding recommended in the Parquet spec is the 3-level, and // we use that as the default when creating list types. For semantic completeness // we allow the other two. Since all types of encodings will occur "in the // wild" we need to be able to interpret the associated definition levels in // the context of the actual encoding used in the file. // // NB: Some Parquet writers may not set ConvertedType::LIST on the repeated // SchemaElement, which could make things challenging if we are trying to infer // that a sequence of nodes semantically represents an array according to one // of these encodings (versus a struct containing an array). We should refuse // the temptation to guess, as they say. struct ListEncoding { enum type { ONE_LEVEL, TWO_LEVEL, THREE_LEVEL }; }; class PARQUET_EXPORT ColumnPath { public: ColumnPath() : path_() {} explicit ColumnPath(const std::vector& path) : path_(path) {} explicit ColumnPath(std::vector&& path) : path_(std::move(path)) {} static std::shared_ptr FromDotString(const std::string& dotstring); static std::shared_ptr FromNode(const Node& node); std::shared_ptr extend(const std::string& node_name) const; std::string ToDotString() const; const std::vector& ToDotVector() const; protected: std::vector path_; }; // Base class for logical schema types. 
A type has a name, repetition level, // and optionally a logical type (ConvertedType in Parquet metadata parlance) class PARQUET_EXPORT Node { public: enum type { PRIMITIVE, GROUP }; virtual ~Node() {} bool is_primitive() const { return type_ == Node::PRIMITIVE; } bool is_group() const { return type_ == Node::GROUP; } bool is_optional() const { return repetition_ == Repetition::OPTIONAL; } bool is_repeated() const { return repetition_ == Repetition::REPEATED; } bool is_required() const { return repetition_ == Repetition::REQUIRED; } virtual bool Equals(const Node* other) const = 0; const std::string& name() const { return name_; } Node::type node_type() const { return type_; } Repetition::type repetition() const { return repetition_; } ConvertedType::type converted_type() const { return converted_type_; } const std::shared_ptr& logical_type() const { return logical_type_; } /// \brief The field_id value for the serialized SchemaElement. If the /// field_id is less than 0 (e.g. -1), it will not be set when serialized to /// Thrift. 
int field_id() const { return field_id_; } const Node* parent() const { return parent_; } const std::shared_ptr path() const; virtual void ToParquet(void* element) const = 0; // Node::Visitor abstract class for walking schemas with the visitor pattern class Visitor { public: virtual ~Visitor() {} virtual void Visit(Node* node) = 0; }; class ConstVisitor { public: virtual ~ConstVisitor() {} virtual void Visit(const Node* node) = 0; }; virtual void Visit(Visitor* visitor) = 0; virtual void VisitConst(ConstVisitor* visitor) const = 0; protected: friend class GroupNode; Node(Node::type type, const std::string& name, Repetition::type repetition, ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1) : type_(type), name_(name), repetition_(repetition), converted_type_(converted_type), field_id_(field_id), parent_(NULLPTR) {} Node(Node::type type, const std::string& name, Repetition::type repetition, std::shared_ptr logical_type, int field_id = -1) : type_(type), name_(name), repetition_(repetition), logical_type_(std::move(logical_type)), field_id_(field_id), parent_(NULLPTR) {} Node::type type_; std::string name_; Repetition::type repetition_; ConvertedType::type converted_type_; std::shared_ptr logical_type_; int field_id_; // Nodes should not be shared, they have a single parent. const Node* parent_; bool EqualsInternal(const Node* other) const; void SetParent(const Node* p_parent); private: PARQUET_DISALLOW_COPY_AND_ASSIGN(Node); }; // Save our breath all over the place with these typedefs using NodePtr = std::shared_ptr; using NodeVector = std::vector; // A type that is one of the primitive Parquet storage types. 
In addition to // the other type metadata (name, repetition level, logical type), also has the // physical storage type and their type-specific metadata (byte width, decimal // parameters) class PARQUET_EXPORT PrimitiveNode : public Node { public: static std::unique_ptr FromParquet(const void* opaque_element); // A field_id -1 (or any negative value) will be serialized as null in Thrift static inline NodePtr Make(const std::string& name, Repetition::type repetition, Type::type type, ConvertedType::type converted_type = ConvertedType::NONE, int length = -1, int precision = -1, int scale = -1, int field_id = -1) { return NodePtr(new PrimitiveNode(name, repetition, type, converted_type, length, precision, scale, field_id)); } // If no logical type, pass LogicalType::None() or nullptr // A field_id -1 (or any negative value) will be serialized as null in Thrift static inline NodePtr Make(const std::string& name, Repetition::type repetition, std::shared_ptr logical_type, Type::type primitive_type, int primitive_length = -1, int field_id = -1) { return NodePtr(new PrimitiveNode(name, repetition, std::move(logical_type), primitive_type, primitive_length, field_id)); } bool Equals(const Node* other) const override; Type::type physical_type() const { return physical_type_; } ColumnOrder column_order() const { return column_order_; } void SetColumnOrder(ColumnOrder column_order) { column_order_ = column_order; } int32_t type_length() const { return type_length_; } const DecimalMetadata& decimal_metadata() const { return decimal_metadata_; } void ToParquet(void* element) const override; void Visit(Visitor* visitor) override; void VisitConst(ConstVisitor* visitor) const override; private: PrimitiveNode(const std::string& name, Repetition::type repetition, Type::type type, ConvertedType::type converted_type = ConvertedType::NONE, int length = -1, int precision = -1, int scale = -1, int field_id = -1); PrimitiveNode(const std::string& name, Repetition::type repetition, 
std::shared_ptr logical_type, Type::type primitive_type, int primitive_length = -1, int field_id = -1); Type::type physical_type_; int32_t type_length_; DecimalMetadata decimal_metadata_; ColumnOrder column_order_; // For FIXED_LEN_BYTE_ARRAY void SetTypeLength(int32_t length) { type_length_ = length; } bool EqualsInternal(const PrimitiveNode* other) const; FRIEND_TEST(TestPrimitiveNode, Attrs); FRIEND_TEST(TestPrimitiveNode, Equals); FRIEND_TEST(TestPrimitiveNode, PhysicalLogicalMapping); FRIEND_TEST(TestPrimitiveNode, FromParquet); }; class PARQUET_EXPORT GroupNode : public Node { public: static std::unique_ptr FromParquet(const void* opaque_element, NodeVector fields = {}); // A field_id -1 (or any negative value) will be serialized as null in Thrift static inline NodePtr Make(const std::string& name, Repetition::type repetition, const NodeVector& fields, ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1) { return NodePtr(new GroupNode(name, repetition, fields, converted_type, field_id)); } // If no logical type, pass nullptr // A field_id -1 (or any negative value) will be serialized as null in Thrift static inline NodePtr Make(const std::string& name, Repetition::type repetition, const NodeVector& fields, std::shared_ptr logical_type, int field_id = -1) { return NodePtr(new GroupNode(name, repetition, fields, logical_type, field_id)); } bool Equals(const Node* other) const override; const NodePtr& field(int i) const { return fields_[i]; } // Get the index of a field by its name, or negative value if not found. // If several fields share the same name, it is unspecified which one // is returned. int FieldIndex(const std::string& name) const; // Get the index of a field by its node, or negative value if not found. 
int FieldIndex(const Node& node) const; int field_count() const { return static_cast(fields_.size()); } void ToParquet(void* element) const override; void Visit(Visitor* visitor) override; void VisitConst(ConstVisitor* visitor) const override; /// \brief Return true if this node or any child node has REPEATED repetition /// type bool HasRepeatedFields() const; private: GroupNode(const std::string& name, Repetition::type repetition, const NodeVector& fields, ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1); GroupNode(const std::string& name, Repetition::type repetition, const NodeVector& fields, std::shared_ptr logical_type, int field_id = -1); NodeVector fields_; bool EqualsInternal(const GroupNode* other) const; // Mapping between field name to the field index std::unordered_multimap field_name_to_idx_; FRIEND_TEST(TestGroupNode, Attrs); FRIEND_TEST(TestGroupNode, Equals); FRIEND_TEST(TestGroupNode, FieldIndex); FRIEND_TEST(TestGroupNode, FieldIndexDuplicateName); }; // ---------------------------------------------------------------------- // Convenience primitive type factory functions #define PRIMITIVE_FACTORY(FuncName, TYPE) \ static inline NodePtr FuncName(const std::string& name, \ Repetition::type repetition = Repetition::OPTIONAL, \ int field_id = -1) { \ return PrimitiveNode::Make(name, repetition, Type::TYPE, ConvertedType::NONE, \ /*length=*/-1, /*precision=*/-1, /*scale=*/-1, field_id); \ } PRIMITIVE_FACTORY(Boolean, BOOLEAN) PRIMITIVE_FACTORY(Int32, INT32) PRIMITIVE_FACTORY(Int64, INT64) PRIMITIVE_FACTORY(Int96, INT96) PRIMITIVE_FACTORY(Float, FLOAT) PRIMITIVE_FACTORY(Double, DOUBLE) PRIMITIVE_FACTORY(ByteArray, BYTE_ARRAY) void PARQUET_EXPORT PrintSchema(const schema::Node* schema, std::ostream& stream, int indent_width = 2); } // namespace schema // The ColumnDescriptor encapsulates information necessary to interpret // primitive column data in the context of a particular schema. 
We have to // examine the node structure of a column's path to the root in the schema tree // to be able to reassemble the nested structure from the repetition and // definition levels. class PARQUET_EXPORT ColumnDescriptor { public: ColumnDescriptor(schema::NodePtr node, int16_t max_definition_level, int16_t max_repetition_level, const SchemaDescriptor* schema_descr = NULLPTR); bool Equals(const ColumnDescriptor& other) const; int16_t max_definition_level() const { return max_definition_level_; } int16_t max_repetition_level() const { return max_repetition_level_; } Type::type physical_type() const { return primitive_node_->physical_type(); } ConvertedType::type converted_type() const { return primitive_node_->converted_type(); } const std::shared_ptr& logical_type() const { return primitive_node_->logical_type(); } ColumnOrder column_order() const { return primitive_node_->column_order(); } SortOrder::type sort_order() const { auto la = logical_type(); auto pt = physical_type(); return la ? 
GetSortOrder(la, pt) : GetSortOrder(converted_type(), pt); } const std::string& name() const { return primitive_node_->name(); } const std::shared_ptr path() const; const schema::NodePtr& schema_node() const { return node_; } std::string ToString() const; int type_length() const; int type_precision() const; int type_scale() const; private: schema::NodePtr node_; const schema::PrimitiveNode* primitive_node_; int16_t max_definition_level_; int16_t max_repetition_level_; }; // Container for the converted Parquet schema with a computed information from // the schema analysis needed for file reading // // * Column index to Node // * Max repetition / definition levels for each primitive node // // The ColumnDescriptor objects produced by this class can be used to assist in // the reconstruction of fully materialized data structures from the // repetition-definition level encoding of nested data // // TODO(wesm): this object can be recomputed from a Schema class PARQUET_EXPORT SchemaDescriptor { public: SchemaDescriptor() {} ~SchemaDescriptor() {} // Analyze the schema void Init(std::unique_ptr schema); void Init(schema::NodePtr schema); const ColumnDescriptor* Column(int i) const; // Get the index of a column by its dotstring path, or negative value if not found. // If several columns share the same dotstring path, it is unspecified which one // is returned. int ColumnIndex(const std::string& node_path) const; // Get the index of a column by its node, or negative value if not found. 
int ColumnIndex(const schema::Node& node) const; bool Equals(const SchemaDescriptor& other, std::ostream* diff_output = NULLPTR) const; // The number of physical columns appearing in the file int num_columns() const { return static_cast(leaves_.size()); } const schema::NodePtr& schema_root() const { return schema_; } const schema::GroupNode* group_node() const { return group_node_; } // Returns the root (child of the schema root) node of the leaf(column) node const schema::Node* GetColumnRoot(int i) const; const std::string& name() const { return group_node_->name(); } std::string ToString() const; void updateColumnOrders(const std::vector& column_orders); /// \brief Return column index corresponding to a particular /// PrimitiveNode. Returns -1 if not found int GetColumnIndex(const schema::PrimitiveNode& node) const; /// \brief Return true if any field or their children have REPEATED repetition /// type bool HasRepeatedFields() const; private: friend class ColumnDescriptor; // Root Node schema::NodePtr schema_; // Root Node const schema::GroupNode* group_node_; void BuildTree(const schema::NodePtr& node, int16_t max_def_level, int16_t max_rep_level, const schema::NodePtr& base); // Result of leaf node / tree analysis std::vector leaves_; std::unordered_map node_to_leaf_index_; // Mapping between leaf nodes and root group of leaf (first node // below the schema's root group) // // For example, the leaf `a.b.c.d` would have a link back to `a` // // -- a <------ // -- -- b | // -- -- -- c | // -- -- -- -- d std::unordered_map leaf_to_base_; // Mapping between ColumnPath DotString to the leaf index std::unordered_multimap leaf_to_idx_; }; } // namespace parquet