// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. // This module defines an abstract interface for iterating through pages in a // Parquet column chunk within a row group. It could be extended in the future // to iterate through all data pages in all chunks in a file. #pragma once #include #include #include #include #include "parquet/statistics.h" #include "parquet/types.h" namespace parquet { // TODO: Parallel processing is not yet safe because of memory-ownership // semantics (the PageReader may or may not own the memory referenced by a // page) // // TODO(wesm): In the future Parquet implementations may store the crc code // in format::PageHeader. parquet-mr currently does not, so we also skip it // here, both on the read and write path class Page { public: Page(const std::shared_ptr& buffer, PageType::type type) : buffer_(buffer), type_(type) {} PageType::type type() const { return type_; } std::shared_ptr buffer() const { return buffer_; } // @returns: a pointer to the page's data const uint8_t* data() const { return buffer_->data(); } // @returns: the total size in bytes of the page's data buffer int32_t size() const { return static_cast(buffer_->size()); } private: std::shared_ptr buffer_; PageType::type type_; }; /// \brief Base type for DataPageV1 and DataPageV2 including common attributes class DataPage : public Page { public: int32_t num_values() const { return num_values_; } Encoding::type encoding() const { return encoding_; } int64_t uncompressed_size() const { return uncompressed_size_; } const EncodedStatistics& statistics() const { return statistics_; } /// Return the row ordinal within the row group to the first row in the data page. /// Currently it is only present from data pages created by ColumnWriter in order /// to collect page index. std::optional first_row_index() const { return first_row_index_; } virtual ~DataPage() = default; protected: DataPage(PageType::type type, const std::shared_ptr& buffer, int32_t num_values, Encoding::type encoding, int64_t uncompressed_size, const EncodedStatistics& statistics = EncodedStatistics(), std::optional first_row_index = std::nullopt) : Page(buffer, type), num_values_(num_values), encoding_(encoding), uncompressed_size_(uncompressed_size), statistics_(statistics), first_row_index_(std::move(first_row_index)) {} int32_t num_values_; Encoding::type encoding_; int64_t uncompressed_size_; EncodedStatistics statistics_; /// Row ordinal within the row group to the first row in the data page. std::optional first_row_index_; }; class DataPageV1 : public DataPage { public: DataPageV1(const std::shared_ptr& buffer, int32_t num_values, Encoding::type encoding, Encoding::type definition_level_encoding, Encoding::type repetition_level_encoding, int64_t uncompressed_size, const EncodedStatistics& statistics = EncodedStatistics(), std::optional first_row_index = std::nullopt) : DataPage(PageType::DATA_PAGE, buffer, num_values, encoding, uncompressed_size, statistics, std::move(first_row_index)), definition_level_encoding_(definition_level_encoding), repetition_level_encoding_(repetition_level_encoding) {} Encoding::type repetition_level_encoding() const { return repetition_level_encoding_; } Encoding::type definition_level_encoding() const { return definition_level_encoding_; } private: Encoding::type definition_level_encoding_; Encoding::type repetition_level_encoding_; }; class DataPageV2 : public DataPage { public: DataPageV2(const std::shared_ptr& buffer, int32_t num_values, int32_t num_nulls, int32_t num_rows, Encoding::type encoding, int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length, int64_t uncompressed_size, bool is_compressed = false, const EncodedStatistics& statistics = EncodedStatistics(), std::optional first_row_index = std::nullopt) : DataPage(PageType::DATA_PAGE_V2, buffer, num_values, encoding, uncompressed_size, statistics, std::move(first_row_index)), num_nulls_(num_nulls), num_rows_(num_rows), definition_levels_byte_length_(definition_levels_byte_length), repetition_levels_byte_length_(repetition_levels_byte_length), is_compressed_(is_compressed) {} int32_t num_nulls() const { return num_nulls_; } int32_t num_rows() const { return num_rows_; } int32_t definition_levels_byte_length() const { return definition_levels_byte_length_; } int32_t repetition_levels_byte_length() const { return repetition_levels_byte_length_; } bool is_compressed() const { return is_compressed_; } private: int32_t num_nulls_; int32_t num_rows_; int32_t definition_levels_byte_length_; int32_t repetition_levels_byte_length_; bool is_compressed_; }; class DictionaryPage : public Page { public: DictionaryPage(const std::shared_ptr& buffer, int32_t num_values, Encoding::type encoding, bool is_sorted = false) : Page(buffer, PageType::DICTIONARY_PAGE), num_values_(num_values), encoding_(encoding), is_sorted_(is_sorted) {} int32_t num_values() const { return num_values_; } Encoding::type encoding() const { return encoding_; } bool is_sorted() const { return is_sorted_; } private: int32_t num_values_; Encoding::type encoding_; bool is_sorted_; }; } // namespace parquet