Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions cpp/src/arrow/array/array_dict_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1761,4 +1761,81 @@ TEST(TestDictionaryUnifier, TableZeroColumns) {
AssertTablesEqual(*table, *unified);
}

// GH-49689: Ordered dictionary tests

TEST(TestDictionaryBuilderOrdered, TypePreservesOrderedFlag) {
auto ordered_type = dictionary(int8(), utf8(), /*ordered=*/true);
std::unique_ptr<ArrayBuilder> boxed_builder;
ASSERT_OK(MakeBuilder(default_memory_pool(), ordered_type, &boxed_builder));

auto builder_type = boxed_builder->type();
ASSERT_TRUE(checked_cast<const DictionaryType&>(*builder_type).ordered());
}

TEST(TestDictionaryBuilderOrdered, UnorderedTypeStaysUnordered) {
auto unordered_type = dictionary(int8(), utf8(), /*ordered=*/false);
std::unique_ptr<ArrayBuilder> boxed_builder;
ASSERT_OK(MakeBuilder(default_memory_pool(), unordered_type, &boxed_builder));

auto builder_type = boxed_builder->type();
ASSERT_FALSE(checked_cast<const DictionaryType&>(*builder_type).ordered());
}

TEST(TestDictionaryBuilderOrdered, FinishPreservesOrderedFlag) {
auto ordered_type = dictionary(int8(), utf8(), /*ordered=*/true);
std::unique_ptr<ArrayBuilder> boxed_builder;
ASSERT_OK(MakeBuilder(default_memory_pool(), ordered_type, &boxed_builder));
auto& builder = checked_cast<DictionaryBuilder<StringType>&>(*boxed_builder);

ASSERT_OK(builder.Append("a"));
ASSERT_OK(builder.Append("b"));
ASSERT_OK(builder.Append("a"));

std::shared_ptr<Array> result;
ASSERT_OK(builder.Finish(&result));

const auto& result_type = checked_cast<const DictionaryType&>(*result->type());
ASSERT_TRUE(result_type.ordered());

auto ex_dict = ArrayFromJSON(utf8(), R"(["a", "b"])");
auto ex_indices = ArrayFromJSON(int8(), "[0, 1, 0]");
DictionaryArray expected(ordered_type, ex_indices, ex_dict);
AssertArraysEqual(expected, *result);
}

TEST(TestDictionaryBuilderOrdered, ListOfOrderedDictionary) {
auto ordered_dict_type = dictionary(int8(), utf8(), /*ordered=*/true);
auto list_type = list(field("item", ordered_dict_type));

std::unique_ptr<ArrayBuilder> boxed_builder;
ASSERT_OK(MakeBuilder(default_memory_pool(), list_type, &boxed_builder));
auto& list_builder = checked_cast<ListBuilder&>(*boxed_builder);
auto& dict_builder =
checked_cast<DictionaryBuilder<StringType>&>(*list_builder.value_builder());

ASSERT_OK(list_builder.Append());
ASSERT_OK(dict_builder.Append("a"));
ASSERT_OK(dict_builder.Append("b"));
ASSERT_OK(list_builder.Append());
ASSERT_OK(dict_builder.Append("a"));

std::shared_ptr<Array> result;
ASSERT_OK(list_builder.Finish(&result));

const auto& result_list_type = checked_cast<const ListType&>(*result->type());
const auto& result_dict_type =
checked_cast<const DictionaryType&>(*result_list_type.value_type());
ASSERT_TRUE(result_dict_type.ordered());
}

TEST(TestDictionaryBuilderOrdered, MakeDictionaryBuilderPreservesOrdered) {
auto ordered_type = dictionary(int8(), utf8(), /*ordered=*/true);
std::unique_ptr<ArrayBuilder> builder;
ASSERT_OK(MakeDictionaryBuilder(default_memory_pool(), ordered_type,
/*dictionary=*/nullptr, &builder));

auto builder_type = builder->type();
ASSERT_TRUE(checked_cast<const DictionaryType&>(*builder_type).ordered());
}

} // namespace arrow
33 changes: 23 additions & 10 deletions cpp/src/arrow/array/builder_dict.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,8 @@ class DictionaryBuilderBase : public ArrayBuilder {
delta_offset_(0),
byte_width_(-1),
indices_builder_(start_int_size, pool, alignment),
value_type_(value_type) {}
value_type_(value_type),
ordered_(false) {}

template <typename T1 = T>
explicit DictionaryBuilderBase(
Expand All @@ -173,7 +174,8 @@ class DictionaryBuilderBase : public ArrayBuilder {
delta_offset_(0),
byte_width_(-1),
indices_builder_(pool, alignment),
value_type_(value_type) {}
value_type_(value_type),
ordered_(false) {}

template <typename T1 = T>
explicit DictionaryBuilderBase(
Expand All @@ -187,7 +189,8 @@ class DictionaryBuilderBase : public ArrayBuilder {
delta_offset_(0),
byte_width_(-1),
indices_builder_(index_type, pool, alignment),
value_type_(value_type) {}
value_type_(value_type),
ordered_(false) {}

template <typename B = BuilderType, typename T1 = T>
DictionaryBuilderBase(uint8_t start_int_size,
Expand All @@ -202,7 +205,8 @@ class DictionaryBuilderBase : public ArrayBuilder {
delta_offset_(0),
byte_width_(static_cast<const T1&>(*value_type).byte_width()),
indices_builder_(start_int_size, pool, alignment),
value_type_(value_type) {}
value_type_(value_type),
ordered_(false) {}

template <typename T1 = T>
explicit DictionaryBuilderBase(
Expand All @@ -214,7 +218,8 @@ class DictionaryBuilderBase : public ArrayBuilder {
delta_offset_(0),
byte_width_(static_cast<const T1&>(*value_type).byte_width()),
indices_builder_(pool, alignment),
value_type_(value_type) {}
value_type_(value_type),
ordered_(false) {}

template <typename T1 = T>
explicit DictionaryBuilderBase(
Expand All @@ -227,7 +232,8 @@ class DictionaryBuilderBase : public ArrayBuilder {
delta_offset_(0),
byte_width_(static_cast<const T1&>(*value_type).byte_width()),
indices_builder_(index_type, pool, alignment),
value_type_(value_type) {}
value_type_(value_type),
ordered_(false) {}

template <typename T1 = T>
explicit DictionaryBuilderBase(
Expand All @@ -243,7 +249,8 @@ class DictionaryBuilderBase : public ArrayBuilder {
delta_offset_(0),
byte_width_(-1),
indices_builder_(pool, alignment),
value_type_(dictionary->type()) {}
value_type_(dictionary->type()),
ordered_(false) {}

~DictionaryBuilderBase() override = default;

Expand Down Expand Up @@ -490,9 +497,11 @@ class DictionaryBuilderBase : public ArrayBuilder {
Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); }

std::shared_ptr<DataType> type() const override {
return ::arrow::dictionary(indices_builder_.type(), value_type_);
return ::arrow::dictionary(indices_builder_.type(), value_type_, ordered_);
}

void set_ordered(bool ordered) { ordered_ = ordered; }

protected:
template <typename c_type>
Status AppendArraySliceImpl(const typename TypeTraits<T>::ArrayType& dict,
Expand Down Expand Up @@ -561,6 +570,7 @@ class DictionaryBuilderBase : public ArrayBuilder {

BuilderType indices_builder_;
std::shared_ptr<DataType> value_type_;
bool ordered_;
};

template <typename BuilderType>
Expand Down Expand Up @@ -647,7 +657,7 @@ class DictionaryBuilderBase<BuilderType, NullType> : public ArrayBuilder {

Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out));
(*out)->type = dictionary((*out)->type, null());
(*out)->type = dictionary((*out)->type, null(), ordered_);
(*out)->dictionary = NullArray(0).data();
return Status::OK();
}
Expand All @@ -659,11 +669,14 @@ class DictionaryBuilderBase<BuilderType, NullType> : public ArrayBuilder {
Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); }

std::shared_ptr<DataType> type() const override {
return ::arrow::dictionary(indices_builder_.type(), null());
return ::arrow::dictionary(indices_builder_.type(), null(), ordered_);
}

void set_ordered(bool ordered) { ordered_ = ordered; }

protected:
BuilderType indices_builder_;
bool ordered_ = false;
};

} // namespace internal
Expand Down
29 changes: 21 additions & 8 deletions cpp/src/arrow/builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -168,17 +168,24 @@ struct DictionaryBuilderCase {
template <typename ValueType>
Status CreateFor() {
using AdaptiveBuilderType = DictionaryBuilder<ValueType>;
using ExactBuilderType =
internal::DictionaryBuilderBase<TypeErasedIntBuilder, ValueType>;
if (dictionary != nullptr) {
out->reset(new AdaptiveBuilderType(dictionary, pool));
auto builder = new AdaptiveBuilderType(dictionary, pool);
builder->set_ordered(ordered);
out->reset(builder);
} else if (exact_index_type) {
if (!is_integer(index_type->id())) {
return Status::TypeError("MakeBuilder: invalid index type ", *index_type);
}
out->reset(new internal::DictionaryBuilderBase<TypeErasedIntBuilder, ValueType>(
index_type, value_type, pool));
auto builder = new ExactBuilderType(index_type, value_type, pool);
builder->set_ordered(ordered);
out->reset(builder);
} else {
auto start_int_size = index_type->byte_width();
out->reset(new AdaptiveBuilderType(start_int_size, value_type, pool));
auto builder = new AdaptiveBuilderType(start_int_size, value_type, pool);
builder->set_ordered(ordered);
out->reset(builder);
}
return Status::OK();
}
Expand All @@ -188,8 +195,9 @@ struct DictionaryBuilderCase {
MemoryPool* pool;
const std::shared_ptr<DataType>& index_type;
const std::shared_ptr<DataType>& value_type;
const std::shared_ptr<Array>& dictionary;
std::shared_ptr<Array> dictionary;
bool exact_index_type;
bool ordered;
std::unique_ptr<ArrayBuilder>* out;
};

Expand All @@ -206,6 +214,7 @@ struct MakeBuilderImpl {
dict_type.value_type(),
/*dictionary=*/nullptr,
exact_index_type,
dict_type.ordered(),
&out};
return visitor.Make();
}
Expand Down Expand Up @@ -332,9 +341,13 @@ Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>&
const std::shared_ptr<Array>& dictionary,
std::unique_ptr<ArrayBuilder>* out) {
const auto& dict_type = static_cast<const DictionaryType&>(*type);
DictionaryBuilderCase visitor = {
pool, dict_type.index_type(), dict_type.value_type(),
dictionary, /*exact_index_type=*/false, out};
DictionaryBuilderCase visitor = {pool,
dict_type.index_type(),
dict_type.value_type(),
dictionary,
/*exact_index_type=*/false,
dict_type.ordered(),
out};
return visitor.Make();
}

Expand Down
8 changes: 0 additions & 8 deletions r/src/r_to_arrow.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -996,14 +996,6 @@ class RDictionaryConverter<ValueType, enable_if_has_string_view<ValueType>>
Result<std::shared_ptr<ChunkedArray>> ToChunkedArray() override {
ARROW_ASSIGN_OR_RAISE(auto result, this->builder_->Finish());

auto result_type = checked_cast<DictionaryType*>(result->type().get());
if (this->dict_type_->ordered() && !result_type->ordered()) {
// TODO: we should not have to do that, there is probably something wrong
// in the DictionaryBuilder code
result->data()->type =
arrow::dictionary(result_type->index_type(), result_type->value_type(), true);
}

return std::make_shared<ChunkedArray>(
std::make_shared<DictionaryArray>(result->data()));
}
Expand Down
21 changes: 21 additions & 0 deletions r/tests/testthat/test-Array.R
Original file line number Diff line number Diff line change
Expand Up @@ -643,6 +643,13 @@ test_that("arrow_array() handles vector -> list arrays (ARROW-7662)", {
expect_array_roundtrip(list(factor(c("b", "a"), levels = c("a", "b"))), list_of(dictionary(int8(), utf8())))
expect_array_roundtrip(list(factor(NA, levels = c("a", "b"))), list_of(dictionary(int8(), utf8())))

# ordered factor (GH-49689)
expect_array_roundtrip(
list(ordered(c("b", "a"), levels = c("a", "b"))),
list_of(dictionary(int8(), utf8(), ordered = TRUE))
)
expect_array_roundtrip(list(ordered(NA, levels = c("a", "b"))), list_of(dictionary(int8(), utf8(), ordered = TRUE)))

# struct
expect_array_roundtrip(
list(tibble::tibble(a = integer(0), b = integer(0), c = character(0), d = logical(0))),
Expand Down Expand Up @@ -742,6 +749,13 @@ test_that("arrow_array() handles vector -> large list arrays", {
as = large_list_of(dictionary(int8(), utf8()))
)

# ordered factor (GH-49689)
expect_array_roundtrip(
list(ordered(c("b", "a"), levels = c("a", "b"))),
large_list_of(dictionary(int8(), utf8(), ordered = TRUE)),
as = large_list_of(dictionary(int8(), utf8(), ordered = TRUE))
)

# struct
expect_array_roundtrip(
list(tibble::tibble(a = integer(0), b = integer(0), c = character(0), d = logical(0))),
Expand Down Expand Up @@ -809,6 +823,13 @@ test_that("arrow_array() handles vector -> fixed size list arrays", {
as = fixed_size_list_of(dictionary(int8(), utf8()), 2L)
)

# ordered factor (GH-49689)
expect_array_roundtrip(
list(ordered(c("b", "a"), levels = c("a", "b"))),
fixed_size_list_of(dictionary(int8(), utf8(), ordered = TRUE), 2L),
as = fixed_size_list_of(dictionary(int8(), utf8(), ordered = TRUE), 2L)
)

# struct
expect_array_roundtrip(
list(tibble::tibble(a = 1L, b = 1L, c = "", d = TRUE)),
Expand Down
Loading