Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 163 additions & 0 deletions be/src/vec/functions/array/function_array_combinations.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <cstddef>
#include <memory>
#include <vector>

#include "common/exception.h"
#include "common/logging.h"
#include "common/status.h"
#include "runtime/define_primitive_type.h"
#include "runtime/primitive_type.h"
#include "vec/columns/column.h"
#include "vec/columns/column_array.h"
#include "vec/columns/column_nullable.h"
#include "vec/common/assert_cast.h"
#include "vec/core/field.h"
#include "vec/core/types.h"
#include "vec/data_types/data_type.h"
#include "vec/data_types/data_type_array.h"
#include "vec/data_types/data_type_decimal.h"
#include "vec/data_types/data_type_nullable.h"
#include "vec/functions/function.h"
#include "vec/functions/function_helpers.h"
#include "vec/functions/simple_function_factory.h"

namespace doris::vectorized {
// array_combinations([1, 2, 3],2) -> [[1,2], [1,3], [2,3]]
// array_combinations([1, NULL, 3, NULL, 5],4) -> [[1,NULL,3,NULL], [1,NULL,3,5], [1,NULL,NULL,5], [1,3,NULL,5], [NULL,3,NULL,5]]

class FunctionArrayCombinations : public IFunction {
public:
// SQL-visible function name; also returned by get_name().
static constexpr auto name = "array_combinations";
// Creates a new shared instance of this function.
static FunctionPtr create() { return std::make_shared<FunctionArrayCombinations>(); }
// Not variadic: the arity is fixed by get_number_of_arguments().
bool is_variadic() const override { return false; }
String get_name() const override { return name; }

// Two arguments: the input array and the combination size k.
size_t get_number_of_arguments() const override { return 2; }

/// Derives the result type from the first argument.
///
/// For an input Array(T) the result is Array(Nullable(Array(Nullable(T)))): an outer array
/// whose elements are the generated combinations, each combination being an array of
/// nullable elements.
///
/// @param arguments  argument types; arguments[0] must be an array type.
/// @return the result data type described above.
/// @throws doris::Exception (INVALID_ARGUMENT) when arguments[0] is not an array type.
DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
    const auto* array_type = check_and_get_data_type<DataTypeArray>(arguments[0].get());
    // check_and_get_data_type() yields nullptr on a type mismatch; fail with a clear error
    // instead of dereferencing a null pointer below.
    if (array_type == nullptr) {
        throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
                               "First argument for function {} must be an array, but got {}",
                               get_name(), arguments[0]->get_name());
    }

    auto element_type = make_nullable(remove_nullable(array_type->get_nested_type()));
    return std::make_shared<DataTypeArray>(
            make_nullable(std::make_shared<DataTypeArray>(element_type)));
}

/// Evaluates array_combinations for one block.
///
/// Reads the array column at arguments[0] and the combination size at arguments[1], builds
/// the combinations for every row via _execute_combination() and stores the resulting
/// column at position `result`.
///
/// NOTE(review): the combination size is taken from the first row only, so a non-constant
/// second argument is not honoured per row. Confirm whether the planner restricts it to a
/// constant, or extend _execute_combination() to accept a per-row size.
///
/// @return Status::OK() in all cases visible here.
Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
                    uint32_t result, size_t input_rows_count) const override {
    ColumnPtr array_col =
            block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
    // Hold the unwrapped column in a named ColumnPtr so the raw pointer below stays valid.
    // The input is only read, so a const view is enough (no assume_mutable()).
    ColumnPtr array_nested = remove_nullable(array_col);
    const auto* src_arr = assert_cast<const ColumnArray*>(array_nested.get());

    ColumnPtr k_col =
            block.get_by_position(arguments[1]).column->convert_to_full_column_if_const();
    // An empty block has no row 0 to read the size from; any value works because
    // _execute_combination() then produces an empty result column.
    const Int64 k = input_rows_count > 0 ? k_col->get_int(0) : 0;

    const auto& offsets =
            assert_cast<const ColumnArray::ColumnOffsets&>(src_arr->get_offsets_column());

    ColumnPtr res = _execute_combination(src_arr, input_rows_count, offsets, k);
    block.replace_by_position(result, std::move(res));
    return Status::OK();
}

private:
/// Returns the binomial coefficient C(array_length, k), i.e. the number of ways to choose
/// k elements out of array_length when order does not matter.
///
/// Returns 0 when k > array_length (no such combination exists) and 1 when k == 0.
size_t _combination_count(size_t array_length, size_t k) const {
    if (k > array_length) {
        // Guard against the unsigned underflow of (array_length - k) below.
        return 0;
    }
    // C(n, k) == C(n, n - k): iterate over the smaller of the two to keep the intermediate
    // products (and therefore the risk of overflow) as small as possible.
    const size_t picks = (k < array_length - k) ? k : (array_length - k);

    // Multiplicative formula: C(n, p) = prod_{i=1..p} (n - p + i) / i.
    // After step i the running value equals C(n - p + i, i), which is an integer, so the
    // division is always exact even though it is performed step by step.
    size_t combinations = 1;
    for (size_t i = 1; i <= picks; ++i) {
        combinations = combinations * (array_length - picks + i) / i;
    }
    return combinations;
}

/// Builds the starting state for combination enumeration: k + 1 entries where the first k
/// are the element indices 0, 1, ..., k - 1 and the last one holds `length`.
/// _next_combination() compares each index against its right-hand neighbour, so the
/// trailing `length` entry serves as the upper bound for the last index.
std::vector<size_t> _first_combination(Int64 k, size_t length) const {
    const auto picked = static_cast<size_t>(k);
    // Pre-fill every slot with `length`; only the trailing slot keeps that value.
    std::vector<size_t> indices(picked + 1, length);
    size_t position = 0;
    while (position < picked) {
        indices[position] = position;
        ++position;
    }
    return indices;
}

/// Advances `comb` (as produced by _first_combination()) to the next combination in place.
///
/// Finds the lowest slot whose index can grow by one without reaching the value stored in
/// the slot to its right, increments it, and resets every slot below it to 0, 1, 2, ...
///
/// @param comb  k element indices followed by one bound entry
/// @param k     number of element indices in `comb`
/// @return true when `comb` now holds a new combination, false when no slot could advance.
bool _next_combination(std::vector<size_t>& comb, Int64 k) const {
    const auto picked = static_cast<size_t>(k);

    size_t slot = 0;
    while (slot < picked && comb[slot] + 1 >= comb[slot + 1]) {
        ++slot;
    }
    if (slot == picked) {
        return false;
    }

    ++comb[slot];
    for (size_t lower = slot; lower-- > 0;) {
        comb[lower] = lower;
    }
    return true;
}

/// Builds the result column: for every input row, an array holding all k-element
/// combinations of that row's array, in the order produced by _first_combination() and
/// _next_combination().
///
/// @param nested            input array column
/// @param input_rows_count  number of rows to process
/// @param offsets           offsets column of `nested`
/// @param k                 combination size; a row yields an empty outer array when
///                          k <= 0 or k exceeds that row's array length
/// @return an array column whose elements are nullable arrays (null map all zero) of the
///         same element column type as `nested`.
ColumnPtr _execute_combination(const ColumnArray* nested, size_t input_rows_count,
                               const ColumnArray::ColumnOffsets& offsets, Int64 k) const {
    const auto& src_elements = nested->get_data();
    const auto& src_offsets = offsets.get_data();

    // Inner array column: one entry per generated combination.
    auto inner_data = src_elements.clone_empty();
    auto inner_offsets_col = ColumnArray::ColumnOffsets::create();
    auto inner_arr = ColumnArray::create(std::move(inner_data), std::move(inner_offsets_col));
    auto& inner_elements = inner_arr->get_data();
    auto& inner_offsets = inner_arr->get_offsets();

    // Outer offsets: running total of combinations emitted up to and including each row.
    auto outer_offsets = ColumnArray::ColumnOffsets::create();
    auto& outer_offs = outer_offsets->get_data();
    outer_offs.resize(input_rows_count);

    size_t row_begin = 0;
    size_t emitted = 0;
    for (size_t row = 0; row < input_rows_count; ++row) {
        const size_t row_end = src_offsets[row];
        const size_t row_len = row_end - row_begin;

        if (k > 0 && static_cast<size_t>(k) <= row_len) {
            const auto picked = static_cast<size_t>(k);
            std::vector<size_t> comb = _first_combination(k, row_len);
            // do-while: the first combination is emitted before asking for the next one.
            do {
                for (size_t pos = 0; pos < picked; ++pos) {
                    // Copy the element column-to-column; no intermediate Field needed.
                    inner_elements.insert_from(src_elements, row_begin + comb[pos]);
                }
                inner_offsets.push_back(inner_elements.size());
            } while (_next_combination(comb, k));

            emitted += _combination_count(row_len, picked);
        }

        outer_offs[row] = emitted;
        row_begin = row_end;
    }

    // Read the size and build the null map BEFORE moving inner_arr: doing both inside one
    // argument list would depend on an unspecified evaluation order.
    const size_t inner_rows = inner_arr->size();
    auto null_map = ColumnUInt8::create(inner_rows, 0);
    auto nullable_arr = ColumnNullable::create(std::move(inner_arr), std::move(null_map));
    return ColumnArray::create(std::move(nullable_arr), std::move(outer_offsets));
}
};

// Registers FunctionArrayCombinations with the given function factory.
void register_function_array_combinations(SimpleFunctionFactory& factory) {
factory.register_function<FunctionArrayCombinations>();
}
} // namespace doris::vectorized
2 changes: 2 additions & 0 deletions be/src/vec/functions/array/function_array_register.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ void register_function_array_filter_function(SimpleFunctionFactory&);
void register_function_array_splits(SimpleFunctionFactory&);
void register_function_array_contains_all(SimpleFunctionFactory&);
void register_function_array_match(SimpleFunctionFactory&);
void register_function_array_combinations(SimpleFunctionFactory&);

void register_function_array(SimpleFunctionFactory& factory) {
register_function_array_flatten(factory);
Expand Down Expand Up @@ -95,6 +96,7 @@ void register_function_array(SimpleFunctionFactory& factory) {
register_function_array_splits(factory);
register_function_array_contains_all(factory);
register_function_array_match(factory);
register_function_array_combinations(factory);
}

} // namespace doris::vectorized
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
import org.apache.doris.nereids.trees.expressions.functions.scalar.Array;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ArrayApply;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ArrayAvg;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ArrayCombinations;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ArrayCompact;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ArrayConcat;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ArrayContains;
Expand Down Expand Up @@ -605,6 +606,7 @@ public class BuiltinScalarFunctions implements FunctionHelper {
scalar(ArrayFirst.class, "array_first"),
scalar(ArrayFirstIndex.class, "array_first_index"),
scalar(ArrayFlatten.class, "array_flatten"),
scalar(ArrayCombinations.class, "array_combinations"),
scalar(ArrayIntersect.class, "array_intersect"),
scalar(ArrayJoin.class, "array_join"),
scalar(ArrayLast.class, "array_last"),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.doris.nereids.trees.expressions.functions.scalar;

import org.apache.doris.catalog.FunctionSignature;
import org.apache.doris.nereids.trees.expressions.Expression;
import org.apache.doris.nereids.trees.expressions.functions.CustomSignature;
import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
import org.apache.doris.nereids.types.ArrayType;
import org.apache.doris.nereids.types.DataType;

import com.google.common.base.Preconditions;

import java.util.List;

/**
* ScalarFunction 'array_combinations'.
*/
public class ArrayCombinations extends ScalarFunction
implements CustomSignature, PropagateNullable {

/**
* Constructor with 2 arguments.
*
* @param arg0 first argument; {@link #customSignature()} requires it to be of array type
* @param arg1 second argument
*/
public ArrayCombinations(Expression arg0, Expression arg1) {
super("array_combinations", arg0, arg1);
}

@Override
public FunctionSignature customSignature() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not getSignature? your way now will skip some passes in ComputeSignature presets.

DataType arg0Type = getArgument(0).getDataType();
Preconditions.checkArgument(arg0Type instanceof ArrayType,
"array_combinations first argument must be Array");
DataType itemType = ((ArrayType) arg0Type).getItemType();
return FunctionSignature.ret(ArrayType.of(ArrayType.of(itemType)))
.args(getArgument(0).getDataType(), getArgument(1).getDataType());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what if arg1 is not number?

}

/**
 * Creates a copy of this function over the given children.
 *
 * @param children exactly two expressions: the array and the combination size
 */
@Override
public ArrayCombinations withChildren(List<Expression> children) {
    Preconditions.checkArgument(children.size() == 2);
    Expression arrayArg = children.get(0);
    Expression sizeArg = children.get(1);
    return new ArrayCombinations(arrayArg, sizeArg);
}

/**
* Dispatches to {@code visitArrayCombinations} on the given visitor.
*/
@Override
public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
return visitor.visitArrayCombinations(this, context);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import org.apache.doris.nereids.trees.expressions.functions.scalar.Array;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ArrayApply;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ArrayAvg;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ArrayCombinations;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ArrayCompact;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ArrayConcat;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ArrayContains;
Expand Down Expand Up @@ -786,6 +787,10 @@ default R visitArrayFlatten(ArrayFlatten arrayFlatten, C context) {
return visitScalarFunction(arrayFlatten, context);
}

// Default handling for ArrayCombinations: delegate to the generic scalar-function visit.
default R visitArrayCombinations(ArrayCombinations arrayCombinations, C context) {
return visitScalarFunction(arrayCombinations, context);
}

default R visitArrayMap(ArrayMap arrayMap, C context) {
return visitScalarFunction(arrayMap, context);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16128,3 +16128,12 @@ false false
-- !sql --
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

-- !sql --
[["foo", "bar"], ["foo", "baz"], ["bar", "baz"]]

-- !sql --
[[1, 2], [1, 3], [2, 3]]

-- !sql --
[[1, 2], [1, 2], [2, 2]]

Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !test --
1 [["foo", "bar"], ["foo", "baz"], ["bar", "baz"]] [[1, 2], [1, 3], [2, 3]] [[1, 2], [1, 2], [2, 2]] [[[1, 1], [4, 5]], [[1, 1], [1, 4]], [[4, 5], [1, 4]]]

Original file line number Diff line number Diff line change
Expand Up @@ -1367,4 +1367,7 @@ suite("nereids_scalar_fn_Array") {
qt_sql """select array_flatten([ [[1,2,3,4,5]],[[6,7],[8,9]] ]);"""
qt_sql """select array_flatten([[[[[[1,2,3,4,5],[6,7],[8,9],[10,11],[12]]]]]]);"""

qt_sql """select array_combinations(['foo', 'bar', 'baz'], 2);"""
qt_sql """select array_combinations([1, 2, 3], 2);"""
qt_sql """select array_combinations([1, 2, 2], 2);"""
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Regression suite: runs array_combinations over table columns of several element types.
suite("array_combinations") {
// Start from a clean table on every run.
sql """DROP TABLE IF EXISTS t_array_combinations"""
// One array column per element type under test: string, tinyint, largeint and a nested
// array<array<int>>.
sql """
CREATE TABLE IF NOT EXISTS t_array_combinations (
`k1` int(11) NULL COMMENT "",
`s1` array<string> NULL COMMENT "",
`a1` array<tinyint(4)> NULL COMMENT "",
`a2` array<largeint(40)> NULL COMMENT "",
`aa1` array<array<int(11)>> NOT NULL COMMENT "",
) ENGINE=OLAP
DISTRIBUTED BY HASH(`k1`) BUCKETS 1
PROPERTIES (
"replication_allocation" = "tag.location.default: 1",
"storage_format" = "V2"
)
"""
// Single fixture row; a2 contains a duplicate value (2 appears twice).
sql """ INSERT INTO t_array_combinations VALUES(1, ['foo','bar','baz'], [1,2,3], [1,2,2], [[1,1],[4,5],[1,4]]) """

// Every column is queried with a combination size of 2; the result is checked against the
// recorded output under the tag "test".
// NOTE(review): the lines after the SELECT inside the query string below look like
// reviewer text picked up from the web page rather than SQL - confirm and remove them from
// the real test file.
qt_test """
select k1, array_combinations(s1, 2), array_combinations(a1, 2), array_combinations(a2, 2), array_combinations(aa1, 2) from t_array_combinations order by k1;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there's too few testcases. should add more to cover situations

"""
}