-
Notifications
You must be signed in to change notification settings - Fork 3.7k
[Feature](function) support function array_combinations #60192
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,163 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| #include "common/logging.h" | ||
| #include "common/status.h" | ||
| #include "runtime/define_primitive_type.h" | ||
| #include "runtime/primitive_type.h" | ||
| #include "vec/columns/column.h" | ||
| #include "vec/columns/column_array.h" | ||
| #include "vec/common/assert_cast.h" | ||
| #include "vec/core/field.h" | ||
| #include "vec/core/types.h" | ||
| #include "vec/data_types/data_type.h" | ||
| #include "vec/data_types/data_type_array.h" | ||
| #include "vec/data_types/data_type_decimal.h" | ||
| #include "vec/data_types/data_type_nullable.h" | ||
| #include "vec/functions/function.h" | ||
| #include "vec/functions/function_helpers.h" | ||
| #include "vec/functions/simple_function_factory.h" | ||
|
|
||
| namespace doris::vectorized { | ||
| // array_combinations([1, 2, 3],2) -> [[1,2], [1,3], [2,3]] | ||
| // array_combinations([1, NULL, 3, NULL, 5],4) -> [[1,NULL,3,NULL], [1,NULL,3,5], [NULL,3,NULL,5]] | ||
|
|
||
| class FunctionArrayCombinations : public IFunction { | ||
| public: | ||
| // SQL-visible function name; also what get_name() reports. | ||
| static constexpr auto name = "array_combinations"; | ||
| // Factory hook: builds a fresh shared instance of this function. | ||
| static FunctionPtr create() { return std::make_shared<FunctionArrayCombinations>(); } | ||
| // Fixed arity (see get_number_of_arguments), so not variadic. | ||
| bool is_variadic() const override { return false; } | ||
| String get_name() const override { return name; } | ||
|
|
||
| // Two arguments: the source array and the combination size k. | ||
| size_t get_number_of_arguments() const override { return 2; } | ||
|
|
||
| // Result type: Array<Nullable<Array<Nullable<T>>>>, where T is the element type of the | ||
| // array passed as the first argument. | ||
| DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | ||
| const auto* array_type = check_and_get_data_type<DataTypeArray>(arguments[0].get()); | ||
| // check_and_get_data_type yields nullptr when the argument is not an array; assert it in | ||
| // debug builds so a wrong plan shows a clear message instead of a null dereference. | ||
| DCHECK(array_type != nullptr) << "array_combinations first argument must be Array, got " | ||
| << arguments[0]->get_name(); | ||
| // make_nullable returns an already-nullable type unchanged, so stripping first is redundant. | ||
| auto elem_t = make_nullable(array_type->get_nested_type()); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why remove then make? directly make is ok? |
||
| auto res = std::make_shared<DataTypeArray>( | ||
| make_nullable(std::make_shared<DataTypeArray>(elem_t))); | ||
| return res; | ||
| } | ||
|
|
||
| // Computes array_combinations(arguments[0], arguments[1]) for the block and stores the | ||
| // resulting column at position `result`. | ||
| Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, | ||
| uint32_t result, size_t input_rows_count) const override { | ||
| // Expand a const array argument so that every row has its own offsets entry. | ||
| auto left = block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. dont directly use |
||
| // The source array is only read, so a const pointer is enough and no assume_mutable() | ||
| // round trip is needed. `array_holder` keeps the unwrapped column alive while it is used. | ||
| const ColumnPtr array_holder = remove_nullable(left); | ||
| const auto* src_arr = assert_cast<const ColumnArray*>(array_holder.get()); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why need |
||
|
|
||
| ColumnPtr k_col = | ||
| block.get_by_position(arguments[1]).column->convert_to_full_column_if_const(); | ||
|
|
||
| // NOTE: only row 0 of the second argument is read, i.e. k is treated as constant for the | ||
| // whole block. An empty block has no row 0, so fall back to k = 0 instead of reading | ||
| // past the end of the column. | ||
| const Int64 k = k_col->empty() ? 0 : k_col->get_int(0); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should this function restrict the second arg to be const? |
||
|
|
||
| const auto& offsets = | ||
| assert_cast<const ColumnArray::ColumnOffsets&>(src_arr->get_offsets_column()); | ||
|
|
||
| ColumnPtr res = _execute_combination(src_arr, input_rows_count, offsets, k); | ||
| block.replace_by_position(result, std::move(res)); | ||
| return Status::OK(); | ||
| } | ||
|
|
||
| private: | ||
| // Returns C(array_length, k): the number of ways to pick k elements out of array_length. | ||
| size_t _combination_count(size_t array_length, size_t k) const { | ||
| // Picking more elements than exist is impossible. Returning early also keeps the | ||
| // unsigned subtraction below from wrapping around. | ||
| if (k > array_length) { | ||
| return 0; | ||
| } | ||
| // C(n, k) == C(n, n - k); iterating over the smaller side keeps intermediates smaller. | ||
| const size_t picks = (k < array_length - k) ? k : array_length - k; | ||
| size_t combinations = 1; | ||
| // Multiplicative formula: after step i the accumulator equals C(n - picks + i, i), so the | ||
| // product is always divisible by i and the integer division is exact. | ||
| for (size_t i = 1; i <= picks; ++i) { | ||
| combinations = combinations * (array_length - picks + i) / i; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add comment to explain this |
||
| } | ||
| return combinations; | ||
| } | ||
|
|
||
| // Builds the starting index set for enumerating k-element combinations. | ||
| std::vector<size_t> _first_combination(Int64 k, size_t length) const { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe |
||
| // Layout: slots [0, k) hold the smallest pick 0, 1, ..., k-1; the extra slot k holds | ||
| // `length`, which _next_combination reads as the upper bound for the last real slot. | ||
| const auto picks = static_cast<size_t>(k); | ||
| std::vector<size_t> comb(picks + 1, length); | ||
| for (size_t slot = 0; slot != picks; ++slot) { | ||
| comb[slot] = slot; | ||
| } | ||
| return comb; | ||
| } | ||
|
|
||
| // Advances `comb` (k indices followed by one bound slot) to the next combination in place. | ||
| // Returns false, leaving `comb` untouched, when no slot can be advanced any further. | ||
| bool _next_combination(std::vector<size_t>& comb, Int64 k) const { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what's the meaning of I, j, k? dont use those meaningless identifier |
||
| const auto slot_count = static_cast<size_t>(k); | ||
| // Find the lowest slot whose index can grow by one without reaching its right neighbour. | ||
| size_t movable = 0; | ||
| while (movable < slot_count && comb[movable] + 1 >= comb[movable + 1]) { | ||
| ++movable; | ||
| } | ||
| if (movable == slot_count) { | ||
| return false; | ||
| } | ||
| ++comb[movable]; | ||
| // Everything below the advanced slot restarts from the smallest indices 0, 1, 2, ... | ||
| for (size_t lower = 0; lower < movable; ++lower) { | ||
| comb[lower] = lower; | ||
| } | ||
| return true; | ||
| } | ||
|
|
||
| // Builds the result column: for every input row, all k-element combinations of that row's | ||
| // array, each combination stored as one (non-null) inner array. | ||
| // nested: the source array column (read only) | ||
| // input_rows_count: number of rows to process | ||
| // offsets: offsets column of `nested` | ||
| // k: combination size; rows where k <= 0 or k exceeds the array length get an | ||
| // empty result array | ||
| ColumnPtr _execute_combination(const ColumnArray* nested, size_t input_rows_count, | ||
| const ColumnArray::ColumnOffsets& offsets, Int64 k) const { | ||
| const auto& data_col = nested->get_data(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. re-consider all your var names
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can reserve for result column |
||
| const auto& in_offs = offsets.get_data(); | ||
|
|
||
| // Inner level: one array per generated combination, appended back to back. | ||
| auto inner_data = data_col.clone_empty(); | ||
| auto inner_offsets = ColumnArray::ColumnOffsets::create(); | ||
| auto inner_arr = ColumnArray::create(std::move(inner_data), std::move(inner_offsets)); | ||
| auto& combo_values = inner_arr->get_data(); | ||
| auto& combo_offsets = inner_arr->get_offsets(); | ||
|
|
||
| // Outer level: per input row, the running total of combinations emitted so far. | ||
| auto outer_offsets = ColumnArray::ColumnOffsets::create(); | ||
| auto& outer_offs = outer_offsets->get_data(); | ||
| outer_offs.resize(input_rows_count); | ||
|
|
||
| // Only used after the k > 0 check below. | ||
| const auto picks = static_cast<size_t>(k); | ||
| size_t prev_off = 0; | ||
| size_t outer_off = 0; | ||
|
|
||
| for (size_t row = 0; row < input_rows_count; ++row) { | ||
| const size_t curr_off = in_offs[row]; | ||
| const size_t row_len = curr_off - prev_off; | ||
|
|
||
| // Rows that cannot supply k elements (or a non-positive k) contribute nothing. | ||
| if (k > 0 && picks <= row_len) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add reg case and make sure this behaviour is same with target system |
||
| std::vector<size_t> comb = _first_combination(k, row_len); | ||
|
|
||
| // Emit the current combination, then advance; do-while covers the first one too. | ||
| do { | ||
| for (size_t pos = 0; pos < picks; ++pos) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why put a single same for-loop outside the while-loop? maybe you need a do-while? |
||
| // Copy the element straight from the source column (no Field round trip). | ||
| combo_values.insert_from(data_col, prev_off + comb[pos]); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. directly use |
||
| } | ||
| combo_offsets.push_back(combo_values.size()); | ||
| } while (_next_combination(comb, k)); | ||
|
|
||
| outer_off += _combination_count(row_len, k); | ||
| } | ||
| outer_offs[row] = outer_off; | ||
| prev_off = curr_off; | ||
| } | ||
| // The computed count must match the number of inner arrays actually appended. | ||
| DCHECK_EQ(outer_off, inner_arr->size()); | ||
|
|
||
| // Read the size before inner_arr is moved from; the original read it inside the same | ||
| // call expression that performs the move. | ||
| const size_t combo_rows = inner_arr->size(); | ||
| auto nullable_arr = | ||
| ColumnNullable::create(std::move(inner_arr), ColumnUInt8::create(combo_rows, 0)); | ||
| return ColumnArray::create(std::move(nullable_arr), std::move(outer_offsets)); | ||
| } | ||
| }; | ||
|
|
||
| // Registers FunctionArrayCombinations with the given function factory. | ||
| void register_function_array_combinations(SimpleFunctionFactory& factory) { | ||
| factory.register_function<FunctionArrayCombinations>(); | ||
| } | ||
| } // namespace doris::vectorized | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,68 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| package org.apache.doris.nereids.trees.expressions.functions.scalar; | ||
|
|
||
| import org.apache.doris.catalog.FunctionSignature; | ||
| import org.apache.doris.nereids.trees.expressions.Expression; | ||
| import org.apache.doris.nereids.trees.expressions.functions.CustomSignature; | ||
| import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable; | ||
| import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; | ||
| import org.apache.doris.nereids.types.ArrayType; | ||
| import org.apache.doris.nereids.types.DataType; | ||
|
|
||
| import com.google.common.base.Preconditions; | ||
|
|
||
| import java.util.List; | ||
|
|
||
| /** | ||
| * ScalarFunction 'array_combinations'. | ||
| * Takes an array and a combination size and returns an array of arrays. | ||
| */ | ||
| public class ArrayCombinations extends ScalarFunction | ||
| implements CustomSignature, PropagateNullable { | ||
|
|
||
| /** | ||
| * constructor with 2 arguments. | ||
| * | ||
| * @param arg0 the source array | ||
| * @param arg1 the combination size | ||
| */ | ||
| public ArrayCombinations(Expression arg0, Expression arg1) { | ||
| super("array_combinations", arg0, arg1); | ||
| } | ||
|
|
||
| /** | ||
| * Signature: (ARRAY&lt;T&gt;, BIGINT) -&gt; ARRAY&lt;ARRAY&lt;T&gt;&gt;. | ||
| */ | ||
| @Override | ||
| public FunctionSignature customSignature() { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why not |
||
| DataType arg0Type = getArgument(0).getDataType(); | ||
| Preconditions.checkArgument(arg0Type instanceof ArrayType, | ||
| "array_combinations first argument must be Array"); | ||
| DataType itemType = ((ArrayType) arg0Type).getItemType(); | ||
| // Declare the count as BIGINT rather than echoing back whatever type the caller passed, | ||
| // so the backend, which reads it as an integer, never sees a non-integer count argument. | ||
| return FunctionSignature.ret(ArrayType.of(ArrayType.of(itemType))) | ||
| .args(arg0Type, org.apache.doris.nereids.types.BigIntType.INSTANCE); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what if arg1 is not number? |
||
| } | ||
|
|
||
| /** | ||
| * withChildren. | ||
| */ | ||
| @Override | ||
| public ArrayCombinations withChildren(List<Expression> children) { | ||
| Preconditions.checkArgument(children.size() == 2); | ||
| return new ArrayCombinations(children.get(0), children.get(1)); | ||
| } | ||
|
|
||
| @Override | ||
| public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) { | ||
| return visitor.visitArrayCombinations(this, context); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,4 @@ | ||
| -- This file is automatically generated. You should know what you did if you want to edit this | ||
| -- !test -- | ||
| 1 [["foo", "bar"], ["foo", "baz"], ["bar", "baz"]] [[1, 2], [1, 3], [2, 3]] [[1, 2], [1, 2], [2, 2]] [[[1, 1], [4, 5]], [[1, 1], [1, 4]], [[4, 5], [1, 4]]] | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,39 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| // Regression suite for array_combinations: one row with string, tinyint, largeint and | ||
| // nested-array columns, each queried with k = 2. | ||
| suite("array_combinations") { | ||
| // Start from a clean table so reruns are deterministic. | ||
| sql """DROP TABLE IF EXISTS t_array_combinations""" | ||
| sql """ | ||
| CREATE TABLE IF NOT EXISTS t_array_combinations ( | ||
| `k1` int(11) NULL COMMENT "", | ||
| `s1` array<string> NULL COMMENT "", | ||
| `a1` array<tinyint(4)> NULL COMMENT "", | ||
| `a2` array<largeint(40)> NULL COMMENT "", | ||
| `aa1` array<array<int(11)>> NOT NULL COMMENT "", | ||
| ) ENGINE=OLAP | ||
| DISTRIBUTED BY HASH(`k1`) BUCKETS 1 | ||
| PROPERTIES ( | ||
| "replication_allocation" = "tag.location.default: 1", | ||
| "storage_format" = "V2" | ||
| ) | ||
| """ | ||
| // a2 contains a duplicate value (2) on purpose: combinations are positional. | ||
| sql """ INSERT INTO t_array_combinations VALUES(1, ['foo','bar','baz'], [1,2,3], [1,2,2], [[1,1],[4,5],[1,4]]) """ | ||
|
|
||
| // Output is compared against the "-- !test --" section of the generated .out file. | ||
| qt_test """ | ||
| select k1, array_combinations(s1, 2), array_combinations(a1, 2), array_combinations(a2, 2), array_combinations(aa1, 2) from t_array_combinations order by k1; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. there's too few testcases. should add more to cover situations |
||
| """ | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Getting a pointer and not checking the result is wrong. Maybe use
`assert_cast` instead.