// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #pragma once #include #include #include #include #include "arrow/io/caching.h" #include "arrow/util/type_fwd.h" #include "parquet/metadata.h" // IWYU pragma: keep #include "parquet/platform.h" #include "parquet/properties.h" namespace parquet { class ColumnReader; class FileMetaData; class PageIndexReader; class BloomFilterReader; class PageReader; class RowGroupMetaData; namespace internal { class RecordReader; } class PARQUET_EXPORT RowGroupReader { public: // Forward declare a virtual class 'Contents' to aid dependency injection and more // easily create test fixtures // An implementation of the Contents class is defined in the .cc file struct Contents { virtual ~Contents() {} virtual std::unique_ptr GetColumnPageReader(int i) = 0; virtual const RowGroupMetaData* metadata() const = 0; virtual const ReaderProperties* properties() const = 0; }; explicit RowGroupReader(std::unique_ptr contents); // Returns the rowgroup metadata const RowGroupMetaData* metadata() const; // Construct a ColumnReader for the indicated row group-relative // column. Ownership is shared with the RowGroupReader. std::shared_ptr Column(int i); // EXPERIMENTAL: Construct a RecordReader for the indicated column of the row group. // Ownership is shared with the RowGroupReader. std::shared_ptr RecordReader(int i, bool read_dictionary = false); // Construct a ColumnReader, trying to enable exposed encoding. // // For dictionary encoding, currently we only support column chunks that are fully // dictionary encoded, i.e., all data pages in the column chunk are dictionary encoded. // If a column chunk uses dictionary encoding but then falls back to plain encoding, the // encoding will not be exposed. // // The returned column reader provides an API GetExposedEncoding() for the // users to check the exposed encoding and determine how to read the batches. // // \note API EXPERIMENTAL std::shared_ptr ColumnWithExposeEncoding( int i, ExposedEncoding encoding_to_expose); // Construct a RecordReader, trying to enable exposed encoding. // // For dictionary encoding, currently we only support column chunks that are // fully dictionary encoded byte arrays. The caller should verify if the reader can read // and expose the dictionary by checking the reader's read_dictionary(). If a column // chunk uses dictionary encoding but then falls back to plain encoding, the returned // reader will read decoded data without exposing the dictionary. // // \note API EXPERIMENTAL std::shared_ptr RecordReaderWithExposeEncoding( int i, ExposedEncoding encoding_to_expose); std::unique_ptr GetColumnPageReader(int i); private: // Holds a pointer to an instance of Contents implementation std::unique_ptr contents_; }; class PARQUET_EXPORT ParquetFileReader { public: // Declare a virtual class 'Contents' to aid dependency injection and more // easily create test fixtures // An implementation of the Contents class is defined in the .cc file struct PARQUET_EXPORT Contents { static std::unique_ptr Open( std::shared_ptr<::arrow::io::RandomAccessFile> source, const ReaderProperties& props = default_reader_properties(), std::shared_ptr metadata = NULLPTR); static ::arrow::Future> OpenAsync( std::shared_ptr<::arrow::io::RandomAccessFile> source, const ReaderProperties& props = default_reader_properties(), std::shared_ptr metadata = NULLPTR); virtual ~Contents() = default; // Perform any cleanup associated with the file contents virtual void Close() = 0; virtual std::shared_ptr GetRowGroup(int i) = 0; virtual std::shared_ptr metadata() const = 0; virtual std::shared_ptr GetPageIndexReader() = 0; virtual BloomFilterReader& GetBloomFilterReader() = 0; }; ParquetFileReader(); ~ParquetFileReader(); // Create a file reader instance from an Arrow file object. Thread-safety is // the responsibility of the file implementation static std::unique_ptr Open( std::shared_ptr<::arrow::io::RandomAccessFile> source, const ReaderProperties& props = default_reader_properties(), std::shared_ptr metadata = NULLPTR); // API Convenience to open a serialized Parquet file on disk, using Arrow IO // interfaces. static std::unique_ptr OpenFile( const std::string& path, bool memory_map = false, const ReaderProperties& props = default_reader_properties(), std::shared_ptr metadata = NULLPTR); // Asynchronously open a file reader from an Arrow file object. // Does not throw - all errors are reported through the Future. static ::arrow::Future> OpenAsync( std::shared_ptr<::arrow::io::RandomAccessFile> source, const ReaderProperties& props = default_reader_properties(), std::shared_ptr metadata = NULLPTR); void Open(std::unique_ptr contents); void Close(); // The RowGroupReader is owned by the FileReader std::shared_ptr RowGroup(int i); // Returns the file metadata. Only one instance is ever created std::shared_ptr metadata() const; /// Returns the PageIndexReader. Only one instance is ever created. /// /// If the file does not have the page index, nullptr may be returned. /// Because it pays to check existence of page index in the file, it /// is possible to return a non null value even if page index does /// not exist. It is the caller's responsibility to check the return /// value and follow-up calls to PageIndexReader. /// /// WARNING: The returned PageIndexReader must not outlive the ParquetFileReader. /// Initialize GetPageIndexReader() is not thread-safety. std::shared_ptr GetPageIndexReader(); /// Returns the BloomFilterReader. Only one instance is ever created. /// /// WARNING: The returned BloomFilterReader must not outlive the ParquetFileReader. /// Initialize GetBloomFilterReader() is not thread-safety. BloomFilterReader& GetBloomFilterReader(); /// Pre-buffer the specified column indices in all row groups. /// /// Readers can optionally call this to cache the necessary slices /// of the file in-memory before deserialization. Arrow readers can /// automatically do this via an option. This is intended to /// increase performance when reading from high-latency filesystems /// (e.g. Amazon S3). /// /// After calling this, creating readers for row groups/column /// indices that were not buffered may fail. Creating multiple /// readers for the a subset of the buffered regions is /// acceptable. This may be called again to buffer a different set /// of row groups/columns. /// /// If memory usage is a concern, note that data will remain /// buffered in memory until either \a PreBuffer() is called again, /// or the reader itself is destructed. Reading - and buffering - /// only one row group at a time may be useful. /// /// This method may throw. void PreBuffer(const std::vector& row_groups, const std::vector& column_indices, const ::arrow::io::IOContext& ctx, const ::arrow::io::CacheOptions& options); /// Wait for the specified row groups and column indices to be pre-buffered. /// /// After the returned Future completes, reading the specified row /// groups/columns will not block. /// /// PreBuffer must be called first. This method does not throw. ::arrow::Future<> WhenBuffered(const std::vector& row_groups, const std::vector& column_indices) const; private: // Holds a pointer to an instance of Contents implementation std::unique_ptr contents_; }; // Read only Parquet file metadata std::shared_ptr PARQUET_EXPORT ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source); /// \brief Scan all values in file. Useful for performance testing /// \param[in] columns the column numbers to scan. If empty scans all /// \param[in] column_batch_size number of values to read at a time when scanning column /// \param[in] reader a ParquetFileReader instance /// \return number of semantic rows in file PARQUET_EXPORT int64_t ScanFileContents(std::vector columns, const int32_t column_batch_size, ParquetFileReader* reader); } // namespace parquet