pub trait ScalarUDFImpl:
Debug
+ Send
+ Sync {
Show 24 methods
// Required methods
fn as_any(&self) -> &dyn Any;
fn name(&self) -> &str;
fn signature(&self) -> &Signature;
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType>;
// Provided methods
fn display_name(&self, args: &[Expr]) -> Result<String> { ... }
fn schema_name(&self, args: &[Expr]) -> Result<String> { ... }
fn return_type_from_exprs(
&self,
_args: &[Expr],
_schema: &dyn ExprSchema,
arg_types: &[DataType],
) -> Result<DataType> { ... }
fn return_type_from_args(
&self,
args: ReturnTypeArgs<'_>,
) -> Result<ReturnInfo> { ... }
fn is_nullable(&self, _args: &[Expr], _schema: &dyn ExprSchema) -> bool { ... }
fn invoke(&self, _args: &[ColumnarValue]) -> Result<ColumnarValue> { ... }
fn invoke_batch(
&self,
args: &[ColumnarValue],
number_rows: usize,
) -> Result<ColumnarValue> { ... }
fn invoke_with_args(
&self,
args: ScalarFunctionArgs<'_>,
) -> Result<ColumnarValue> { ... }
fn invoke_no_args(&self, _number_rows: usize) -> Result<ColumnarValue> { ... }
fn aliases(&self) -> &[String] { ... }
fn simplify(
&self,
args: Vec<Expr>,
_info: &dyn SimplifyInfo,
) -> Result<ExprSimplifyResult> { ... }
fn short_circuits(&self) -> bool { ... }
fn evaluate_bounds(&self, _input: &[&Interval]) -> Result<Interval> { ... }
fn propagate_constraints(
&self,
_interval: &Interval,
_inputs: &[&Interval],
) -> Result<Option<Vec<Interval>>> { ... }
fn output_ordering(
&self,
inputs: &[ExprProperties],
) -> Result<SortProperties> { ... }
fn preserves_lex_ordering(&self, _inputs: &[ExprProperties]) -> Result<bool> { ... }
fn coerce_types(&self, _arg_types: &[DataType]) -> Result<Vec<DataType>> { ... }
fn equals(&self, other: &dyn ScalarUDFImpl) -> bool { ... }
fn hash_value(&self) -> u64 { ... }
fn documentation(&self) -> Option<&Documentation> { ... }
}
Expand description
Trait for implementing user defined scalar functions.
This trait exposes the full API for implementing user defined functions and can be used to implement any function.
See advanced_udf.rs
for a full example with complete implementation and
ScalarUDF
for other available options.
§Basic Example
/// A simple UDF that adds one to an int32.
#[derive(Debug)]
struct AddOne {
    /// Signature handed out by `ScalarUDFImpl::signature`.
    signature: Signature,
}

impl AddOne {
    /// Creates the UDF with an immutable, uniform signature that lists
    /// `DataType::Int32` as the only accepted type.
    fn new() -> Self {
        let signature =
            Signature::uniform(1, vec![DataType::Int32], Volatility::Immutable);
        Self { signature }
    }
}
/// Documentation for `add_one`, built at most once on first access.
static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();

/// Returns the shared `add_one` documentation, initializing it if needed.
fn get_doc() -> &'static Documentation {
    DOCUMENTATION.get_or_init(|| {
        let builder =
            Documentation::builder(DOC_SECTION_MATH, "Add one to an int32", "add_one(2)");
        builder
            .with_argument("arg1", "The int32 number to add one to")
            .build()
    })
}
/// `ScalarUDFImpl` implementation for `AddOne`.
impl ScalarUDFImpl for AddOne {
    /// Exposes the concrete type for downcasting.
    fn as_any(&self) -> &dyn Any {
        self
    }

    /// The function name: `add_one`.
    fn name(&self) -> &str {
        "add_one"
    }

    /// The signature stored at construction time.
    fn signature(&self) -> &Signature {
        &self.signature
    }

    /// Returns `Int32` when the first argument type is `Int32`, and a
    /// planning error otherwise (including when there are no arguments).
    fn return_type(&self, args: &[DataType]) -> Result<DataType> {
        match args.first() {
            Some(DataType::Int32) => Ok(DataType::Int32),
            _ => plan_err!("add_one only accepts Int32 arguments"),
        }
    }

    /// Placeholder: a real implementation would add one to the argument.
    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
        unimplemented!()
    }

    /// Returns the documentation produced by `get_doc`.
    fn documentation(&self) -> Option<&Documentation> {
        Some(get_doc())
    }
}
// Wrap the implementation in a ScalarUDF
let add_one: ScalarUDF = AddOne::new().into();
// Build the expression `add_one(a)` by calling the function on column "a"
let call_args = vec![col("a")];
let expr = add_one.call(call_args);
Required Methods§
Sourcefn signature(&self) -> &Signature
fn signature(&self) -> &Signature
Returns the function’s Signature
for information about what input
types are accepted and the function’s Volatility.
Sourcefn return_type(&self, arg_types: &[DataType]) -> Result<DataType>
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType>
What DataType
will be returned by this function, given the types of
the arguments.
§Notes
If you provide an implementation for Self::return_type_from_args
,
DataFusion will not call return_type
(this function). In such cases
it is recommended to return DataFusionError::Internal
.
Provided Methods§
Sourcefn display_name(&self, args: &[Expr]) -> Result<String>
fn display_name(&self, args: &[Expr]) -> Result<String>
Returns the user-defined display name of the function, given the arguments.
This can be used to customize the output column name generated by this function.
Defaults to name(args[0], args[1], ...)
Sourcefn schema_name(&self, args: &[Expr]) -> Result<String>
fn schema_name(&self, args: &[Expr]) -> Result<String>
Returns the name of the column this expression would create
See Expr::schema_name
for details
fn return_type_from_exprs( &self, _args: &[Expr], _schema: &dyn ExprSchema, arg_types: &[DataType], ) -> Result<DataType>
Deprecated: use return_type_from_args
instead.
Sourcefn return_type_from_args(&self, args: ReturnTypeArgs<'_>) -> Result<ReturnInfo>
fn return_type_from_args(&self, args: ReturnTypeArgs<'_>) -> Result<ReturnInfo>
What type will be returned by this function, given the arguments?
By default, this function calls Self::return_type
with the
types of each argument.
§Notes
Most UDFs should implement Self::return_type
and not this
function as the output type for most functions only depends on the types
of their inputs (e.g. sqrt(f32)
is always f32
).
This function can be used for more advanced cases such as:
- specifying nullability
- return types based on the values of the arguments (rather than their types).
§Output Type based on Values
For example, the following two function calls get the same argument
types (something and a Utf8
string) but return different types based
on the value of the second argument:
arrow_cast(x, 'Int16')
--> Int16
arrow_cast(x, 'Float32')
--> Float32
§Requirements
This function must consistently return the same type for the same
logical input even if the input is simplified (e.g. it must return the same
value for ('foo' | 'bar')
as it does for ('foobar')).
fn is_nullable(&self, _args: &[Expr], _schema: &dyn ExprSchema) -> bool
Deprecated: use return_type_from_args
instead. If you use is_nullable
to return non-nullable together with return_type
, you need to switch to return_type_from_args
; otherwise you might get an error.
Sourcefn invoke(&self, _args: &[ColumnarValue]) -> Result<ColumnarValue>
👎Deprecated since 42.1.0: Use invoke_with_args
instead
fn invoke(&self, _args: &[ColumnarValue]) -> Result<ColumnarValue>
Use invoke_with_args
instead.
Invoke the function on args
, returning the appropriate result
Note: This method is deprecated and will be removed in future releases.
User defined functions should implement Self::invoke_with_args
instead.
Sourcefn invoke_batch(
&self,
args: &[ColumnarValue],
number_rows: usize,
) -> Result<ColumnarValue>
fn invoke_batch( &self, args: &[ColumnarValue], number_rows: usize, ) -> Result<ColumnarValue>
Invoke the function with args
and the number of rows,
returning the appropriate result.
Note: See notes on Self::invoke_with_args
Note: This method is deprecated and will be removed in future releases.
User defined functions should implement Self::invoke_with_args
instead.
See https://github.com/apache/datafusion/issues/13515 for more details.
Sourcefn invoke_with_args(
&self,
args: ScalarFunctionArgs<'_>,
) -> Result<ColumnarValue>
fn invoke_with_args( &self, args: ScalarFunctionArgs<'_>, ) -> Result<ColumnarValue>
Invoke the function returning the appropriate result.
§Performance
For the best performance, the implementations should handle the common case
when one or more of their arguments are constant values (aka
ColumnarValue::Scalar
).
ColumnarValue::values_to_arrays
can be used to convert the arguments
to arrays, which will likely be simpler code, but be slower.
Sourcefn invoke_no_args(&self, _number_rows: usize) -> Result<ColumnarValue>
👎Deprecated since 42.1.0: Use invoke_with_args
instead
fn invoke_no_args(&self, _number_rows: usize) -> Result<ColumnarValue>
Use invoke_with_args
instead.
Invoke the function without args
; instead, the number of rows is provided,
returning the appropriate result.
Note: This method is deprecated and will be removed in future releases.
User defined functions should implement Self::invoke_with_args
instead.
Sourcefn aliases(&self) -> &[String]
fn aliases(&self) -> &[String]
Returns any aliases (alternate names) for this function.
Aliases can be used to invoke the same function using different names.
For example in some databases now()
and current_timestamp()
are
aliases for the same function. This behavior can be obtained by
returning current_timestamp
as an alias for the now
function.
Note: aliases
should only include names other than Self::name
.
Defaults to []
(no aliases)
Sourcefn simplify(
&self,
args: Vec<Expr>,
_info: &dyn SimplifyInfo,
) -> Result<ExprSimplifyResult>
fn simplify( &self, args: Vec<Expr>, _info: &dyn SimplifyInfo, ) -> Result<ExprSimplifyResult>
Optionally apply per-UDF simplification / rewrite rules.
This can be used to apply function specific simplification rules during
optimization (e.g. arrow_cast
--> Expr::Cast
). The default
implementation does nothing.
Note that DataFusion handles simplifying arguments and “constant
folding” (replacing a function call with constant arguments such as
my_add(1,2) --> 3
). Thus, there is no need to implement such
optimizations manually for specific UDFs.
§Arguments
args
: The arguments of the function
info
: The necessary information for simplification
§Returns
ExprSimplifyResult
indicating the result of the simplification.
NOTE: if the function cannot be simplified, the arguments MUST be returned
unmodified.
Sourcefn short_circuits(&self) -> bool
fn short_circuits(&self) -> bool
Returns true if some of this expr's
subexpressions may not be evaluated
and thus any side effects (like divide by zero) may not be encountered
Setting this to true prevents certain optimizations such as common subexpression elimination
Sourcefn evaluate_bounds(&self, _input: &[&Interval]) -> Result<Interval>
fn evaluate_bounds(&self, _input: &[&Interval]) -> Result<Interval>
Computes the output interval for a ScalarUDFImpl
, given the input
intervals.
§Parameters
_input
are the intervals for the children (inputs) of this function.
§Example
If the function is ABS(a)
, and the input interval is a: [-3, 2]
,
then the output interval would be [0, 3]
.
Sourcefn propagate_constraints(
&self,
_interval: &Interval,
_inputs: &[&Interval],
) -> Result<Option<Vec<Interval>>>
fn propagate_constraints( &self, _interval: &Interval, _inputs: &[&Interval], ) -> Result<Option<Vec<Interval>>>
Updates bounds for child expressions, given a known interval for this function. This is used to propagate constraints down through an expression tree.
§Parameters
interval
is the currently known interval for this function.
inputs
are the current intervals for the inputs (children) of this function.
§Returns
A Vec
of new intervals for the children, in order.
If constraint propagation reveals an infeasibility for any child, returns
None
. If none of the children intervals change as a result of
propagation, may return an empty vector instead of cloning children
.
This is the default (and conservative) return value.
§Example
If the function is ABS(a)
, the current interval
is [4, 5]
and the
input a
is given as [-7, 3]
, then propagation would return [-5, 3]
.
Sourcefn output_ordering(&self, inputs: &[ExprProperties]) -> Result<SortProperties>
fn output_ordering(&self, inputs: &[ExprProperties]) -> Result<SortProperties>
Calculates the SortProperties
of this function based on its children’s properties.
Sourcefn preserves_lex_ordering(&self, _inputs: &[ExprProperties]) -> Result<bool>
fn preserves_lex_ordering(&self, _inputs: &[ExprProperties]) -> Result<bool>
Whether the function preserves lexicographical ordering based on the input ordering
Sourcefn coerce_types(&self, _arg_types: &[DataType]) -> Result<Vec<DataType>>
fn coerce_types(&self, _arg_types: &[DataType]) -> Result<Vec<DataType>>
Coerce arguments of a function call to types that the function can evaluate.
This function is only called if ScalarUDFImpl::signature
returns crate::TypeSignature::UserDefined
. Most
UDFs should return one of the other variants of TypeSignature
which handle common
cases
See the type coercion module documentation for more details on type coercion
For example, if your function requires a floating point argument, but the user calls
it like my_func(1::int)
(i.e. with 1
as an integer), coerce_types can return [DataType::Float64]
to ensure the argument is converted to 1::double
§Parameters
arg_types
: The types of the arguments this function is called with
§Return value
A Vec the same length as arg_types
. DataFusion will CAST
the function call
arguments to these specific types.
Sourcefn equals(&self, other: &dyn ScalarUDFImpl) -> bool
fn equals(&self, other: &dyn ScalarUDFImpl) -> bool
Return true if this scalar UDF is equal to the other.
Allows customizing the equality of scalar UDFs.
Must be consistent with Self::hash_value
and follow the same rules as Eq
:
- reflexive:
a.equals(a)
;
- symmetric:
a.equals(b)
implies b.equals(a)
;
- transitive:
a.equals(b)
and b.equals(c)
implies a.equals(c)
.
By default, compares Self::name
and Self::signature
.
Sourcefn hash_value(&self) -> u64
fn hash_value(&self) -> u64
Returns a hash value for this scalar UDF.
Allows customizing the hash code of scalar UDFs. Similarly to Hash
and Eq
,
if Self::equals
returns true for two UDFs, their hash_value
s must be the same.
By default, hashes Self::name
and Self::signature
.
Sourcefn documentation(&self) -> Option<&Documentation>
fn documentation(&self) -> Option<&Documentation>
Returns the documentation for this Scalar UDF.
Documentation can be accessed programmatically and can also be used to generate public-facing documentation.