// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. // This API is EXPERIMENTAL. #pragma once #include #include #include #include #include #include "arrow/compute/type_fwd.h" #include "arrow/dataset/type_fwd.h" #include "arrow/engine/substrait/options.h" #include "arrow/engine/substrait/relation.h" #include "arrow/engine/substrait/type_fwd.h" #include "arrow/engine/substrait/visibility.h" #include "arrow/result.h" #include "arrow/status.h" #include "arrow/type_fwd.h" #include "arrow/util/macros.h" namespace arrow { namespace engine { /// \brief Serialize an Acero Plan to a binary protobuf Substrait message /// /// \param[in] declaration the Acero declaration to serialize. /// This declaration is the sink relation of the Acero plan. /// \param[in,out] ext_set the extension mapping to use; may be updated to add /// \param[in] conversion_options options to control how the conversion is done /// /// \return a buffer containing the protobuf serialization of the Acero relation ARROW_ENGINE_EXPORT Result> SerializePlan( const acero::Declaration& declaration, ExtensionSet* ext_set, const ConversionOptions& conversion_options = {}); /// \brief Serialize expressions to a Substrait message /// /// \param[in] bound_expressions the expressions to serialize. /// \param[in] conversion_options options to control how the conversion is done /// \param[in,out] ext_set the extension mapping to use, optional, only needed /// if you want to control the value of function anchors /// to mirror a previous serialization / deserialization. /// Will be updated if new functions are encountered ARROW_ENGINE_EXPORT Result> SerializeExpressions( const BoundExpressions& bound_expressions, const ConversionOptions& conversion_options = {}, ExtensionSet* ext_set = NULLPTR); /// Factory function type for generating the node that consumes the batches produced by /// each toplevel Substrait relation when deserializing a Substrait Plan. using ConsumerFactory = std::function()>; /// \brief Deserializes a Substrait Plan message to a list of ExecNode declarations /// /// The output of each top-level Substrait relation will be sent to a caller supplied /// consumer function provided by consumer_factory /// /// \param[in] buf a buffer containing the protobuf serialization of a Substrait Plan /// message /// \param[in] consumer_factory factory function for generating the node that consumes /// the batches produced by each toplevel Substrait relation /// \param[in] registry an extension-id-registry to use, or null for the default one. /// \param[out] ext_set_out if non-null, the extension mapping used by the Substrait /// Plan is returned here. /// \param[in] conversion_options options to control how the conversion is to be done. /// \return a vector of ExecNode declarations, one for each toplevel relation in the /// Substrait Plan ARROW_ENGINE_EXPORT Result> DeserializePlans( const Buffer& buf, const ConsumerFactory& consumer_factory, const ExtensionIdRegistry* registry = NULLPTR, ExtensionSet* ext_set_out = NULLPTR, const ConversionOptions& conversion_options = {}); /// \brief Deserializes a single-relation Substrait Plan message to an execution plan /// /// The output of each top-level Substrait relation will be sent to a caller supplied /// consumer function provided by consumer_factory /// /// \param[in] buf a buffer containing the protobuf serialization of a Substrait Plan /// message /// \param[in] consumer node that consumes the batches produced by each toplevel Substrait /// relation /// \param[in] registry an extension-id-registry to use, or null for the default one. /// \param[out] ext_set_out if non-null, the extension mapping used by the Substrait /// \param[in] conversion_options options to control how the conversion is to be done. /// Plan is returned here. /// \return an ExecPlan for the Substrait Plan ARROW_ENGINE_EXPORT Result> DeserializePlan( const Buffer& buf, const std::shared_ptr& consumer, const ExtensionIdRegistry* registry = NULLPTR, ExtensionSet* ext_set_out = NULLPTR, const ConversionOptions& conversion_options = {}); /// Factory function type for generating the write options of a node consuming the batches /// produced by each toplevel Substrait relation when deserializing a Substrait Plan. using WriteOptionsFactory = std::function()>; /// \brief Deserializes a Substrait Plan message to a list of ExecNode declarations /// /// The output of each top-level Substrait relation will be written to a filesystem. /// `write_options_factory` can be used to control write behavior. /// /// \param[in] buf a buffer containing the protobuf serialization of a Substrait Plan /// message /// \param[in] write_options_factory factory function for generating the write options of /// a node consuming the batches produced by each toplevel Substrait relation /// \param[in] registry an extension-id-registry to use, or null for the default one. /// \param[out] ext_set_out if non-null, the extension mapping used by the Substrait /// Plan is returned here. /// \param[in] conversion_options options to control how the conversion is to be done. /// \return a vector of ExecNode declarations, one for each toplevel relation in the /// Substrait Plan ARROW_ENGINE_EXPORT Result> DeserializePlans( const Buffer& buf, const WriteOptionsFactory& write_options_factory, const ExtensionIdRegistry* registry = NULLPTR, ExtensionSet* ext_set_out = NULLPTR, const ConversionOptions& conversion_options = {}); /// \brief Deserializes a single-relation Substrait Plan message to an execution plan /// /// The output of the single Substrait relation will be written to a filesystem. /// `write_options_factory` can be used to control write behavior. /// /// \param[in] buf a buffer containing the protobuf serialization of a Substrait Plan /// message /// \param[in] write_options write options of a node consuming the batches produced by /// each toplevel Substrait relation /// \param[in] registry an extension-id-registry to use, or null for the default one. /// \param[out] ext_set_out if non-null, the extension mapping used by the Substrait /// Plan is returned here. /// \param[in] conversion_options options to control how the conversion is to be done. /// \return an ExecPlan for the Substrait Plan ARROW_ENGINE_EXPORT Result> DeserializePlan( const Buffer& buf, const std::shared_ptr& write_options, const ExtensionIdRegistry* registry = NULLPTR, ExtensionSet* ext_set_out = NULLPTR, const ConversionOptions& conversion_options = {}); /// \brief Deserializes a Substrait Plan message to a Declaration /// /// The plan will not contain any sink nodes and will be suitable for use in any /// of the arrow::compute::DeclarationToXyz methods. /// /// \param[in] buf a buffer containing the protobuf serialization of a Substrait Plan /// message /// \param[in] registry an extension-id-registry to use, or null for the default one. /// \param[out] ext_set_out if non-null, the extension mapping used by the Substrait /// Plan is returned here. /// \param[in] conversion_options options to control how the conversion is to be done. /// \return A declaration representing the Substrait plan ARROW_ENGINE_EXPORT Result DeserializePlan( const Buffer& buf, const ExtensionIdRegistry* registry = NULLPTR, ExtensionSet* ext_set_out = NULLPTR, const ConversionOptions& conversion_options = {}); /// \brief Deserialize a Substrait ExtendedExpression message to the corresponding Arrow /// type /// /// \param[in] buf a buffer containing the protobuf serialization of a collection of bound /// expressions /// \param[in] registry an extension-id-registry to use, or null for the default one /// \param[in] conversion_options options to control how the conversion is done /// \param[out] ext_set_out if non-null, the extension mapping used by the Substrait /// message is returned here. /// \return A collection of expressions and a common input schema they are bound to ARROW_ENGINE_EXPORT Result DeserializeExpressions( const Buffer& buf, const ExtensionIdRegistry* registry = NULLPTR, const ConversionOptions& conversion_options = {}, ExtensionSet* ext_set_out = NULLPTR); /// \brief Deserializes a Substrait Type message to the corresponding Arrow type /// /// \param[in] buf a buffer containing the protobuf serialization of a Substrait Type /// message /// \param[in] ext_set the extension mapping to use, normally provided by the /// surrounding Plan message /// \param[in] conversion_options options to control how the conversion is to be done. /// \return the corresponding Arrow data type ARROW_ENGINE_EXPORT Result> DeserializeType( const Buffer& buf, const ExtensionSet& ext_set, const ConversionOptions& conversion_options = {}); /// \brief Serializes an Arrow type to a Substrait Type message /// /// \param[in] type the Arrow data type to serialize /// \param[in,out] ext_set the extension mapping to use; may be updated to add a /// mapping for the given type /// \param[in] conversion_options options to control how the conversion is to be done. /// \return a buffer containing the protobuf serialization of the corresponding Substrait /// Type message ARROW_ENGINE_EXPORT Result> SerializeType( const DataType& type, ExtensionSet* ext_set, const ConversionOptions& conversion_options = {}); /// \brief Deserializes a Substrait NamedStruct message to an Arrow schema /// /// \param[in] buf a buffer containing the protobuf serialization of a Substrait /// NamedStruct message /// \param[in] ext_set the extension mapping to use, normally provided by the /// surrounding Plan message /// \param[in] conversion_options options to control how the conversion is to be done. /// \return the corresponding Arrow schema ARROW_ENGINE_EXPORT Result> DeserializeSchema( const Buffer& buf, const ExtensionSet& ext_set, const ConversionOptions& conversion_options = {}); /// \brief Serializes an Arrow schema to a Substrait NamedStruct message /// /// \param[in] schema the Arrow schema to serialize /// \param[in,out] ext_set the extension mapping to use; may be updated to add /// mappings for the types used in the schema /// \param[in] conversion_options options to control how the conversion is to be done. /// \return a buffer containing the protobuf serialization of the corresponding Substrait /// NamedStruct message ARROW_ENGINE_EXPORT Result> SerializeSchema( const Schema& schema, ExtensionSet* ext_set, const ConversionOptions& conversion_options = {}); /// \brief Deserializes a Substrait Expression message to a compute expression /// /// \param[in] buf a buffer containing the protobuf serialization of a Substrait /// Expression message /// \param[in] ext_set the extension mapping to use, normally provided by the /// surrounding Plan message /// \param[in] conversion_options options to control how the conversion is to be done. /// \return the corresponding Arrow compute expression ARROW_ENGINE_EXPORT Result DeserializeExpression( const Buffer& buf, const ExtensionSet& ext_set, const ConversionOptions& conversion_options = {}); /// \brief Serializes an Arrow compute expression to a Substrait Expression message /// /// \param[in] expr the Arrow compute expression to serialize /// \param[in,out] ext_set the extension mapping to use; may be updated to add /// mappings for the types used in the expression /// \param[in] conversion_options options to control how the conversion is to be done. /// \return a buffer containing the protobuf serialization of the corresponding Substrait /// Expression message ARROW_ENGINE_EXPORT Result> SerializeExpression( const compute::Expression& expr, ExtensionSet* ext_set, const ConversionOptions& conversion_options = {}); /// \brief Serialize an Acero Declaration to a binary protobuf Substrait message /// /// \param[in] declaration the Acero declaration to serialize /// \param[in,out] ext_set the extension mapping to use; may be updated to add /// \param[in] conversion_options options to control how the conversion is done /// /// \return a buffer containing the protobuf serialization of the Acero relation ARROW_ENGINE_EXPORT Result> SerializeRelation( const acero::Declaration& declaration, ExtensionSet* ext_set, const ConversionOptions& conversion_options = {}); /// \brief Deserializes a Substrait Rel (relation) message to an ExecNode declaration /// /// \param[in] buf a buffer containing the protobuf serialization of a Substrait /// Rel message /// \param[in] ext_set the extension mapping to use, normally provided by the /// surrounding Plan message /// \param[in] conversion_options options to control how the conversion is to be done. /// \return the corresponding ExecNode declaration ARROW_ENGINE_EXPORT Result DeserializeRelation( const Buffer& buf, const ExtensionSet& ext_set, const ConversionOptions& conversion_options = {}); namespace internal { /// \brief Checks whether two protobuf serializations of a particular Substrait message /// type are equivalent /// /// Note that a binary comparison of the two buffers is insufficient. One reason for this /// is that the fields of a message can be specified in any order in the serialization. /// /// \param[in] message_name the name of the Substrait message type to check /// \param[in] l_buf buffer containing the first protobuf serialization to compare /// \param[in] r_buf buffer containing the second protobuf serialization to compare /// \return success if equivalent, failure if not ARROW_ENGINE_EXPORT Status CheckMessagesEquivalent(std::string_view message_name, const Buffer& l_buf, const Buffer& r_buf); /// \brief Utility function to convert a JSON serialization of a Substrait message to /// its binary serialization /// /// \param[in] type_name the name of the Substrait message type to convert /// \param[in] json the JSON string to convert /// \param[in] ignore_unknown_fields if true then unknown fields will be ignored and /// will not cause an error /// /// This should generally be true to allow consumption of plans from newer /// producers but setting to false can be useful if you are testing /// conformance to a specific Substrait version /// \return a buffer filled with the binary protobuf serialization of message ARROW_ENGINE_EXPORT Result> SubstraitFromJSON(std::string_view type_name, std::string_view json, bool ignore_unknown_fields = true); /// \brief Utility function to convert a binary protobuf serialization of a Substrait /// message to JSON /// /// \param[in] type_name the name of the Substrait message type to convert /// \param[in] buf the buffer containing the binary protobuf serialization of the message /// \return a JSON string representing the message ARROW_ENGINE_EXPORT Result SubstraitToJSON(std::string_view type_name, const Buffer& buf); } // namespace internal } // namespace engine } // namespace arrow