Skip to content

Commit aa0ecc9

Browse files
committed
add average_byte_width with relevant tests
average_byte_width average_byte_width_exact
1 parent e2a5b4e commit aa0ecc9

File tree

5 files changed

+124
-1
lines changed

5 files changed

+124
-1
lines changed

cpp/src/arrow/array/array_test.cc

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3861,6 +3861,7 @@ class TestArrayDataStatistics : public ::testing::Test {
38613861
values_ = {1, 0, 3, -4};
38623862
min_ = *std::min_element(values_.begin(), values_.end());
38633863
max_ = *std::max_element(values_.begin(), values_.end());
3864+
average_byte_width_ = 4.0;
38643865
values_buffer_ = Buffer::FromVector(values_);
38653866
data_ = ArrayData::Make(int32(), values_.size(), {null_buffer_, values_buffer_},
38663867
null_count_);
@@ -3870,6 +3871,8 @@ class TestArrayDataStatistics : public ::testing::Test {
38703871
data_->statistics->is_min_exact = true;
38713872
data_->statistics->max = max_;
38723873
data_->statistics->is_max_exact = true;
3874+
data_->statistics->average_byte_width = average_byte_width_;
3875+
data_->statistics->is_average_byte_width_exact = true;
38733876
}
38743877

38753878
protected:
@@ -3879,6 +3882,7 @@ class TestArrayDataStatistics : public ::testing::Test {
38793882
std::vector<int32_t> values_;
38803883
int64_t min_;
38813884
int64_t max_;
3885+
double average_byte_width_;
38823886
std::shared_ptr<Buffer> values_buffer_;
38833887
std::shared_ptr<ArrayData> data_;
38843888
};
@@ -3899,6 +3903,10 @@ TEST_F(TestArrayDataStatistics, MoveConstructor) {
38993903
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->max.value()));
39003904
ASSERT_EQ(max_, std::get<int64_t>(moved_data.statistics->max.value()));
39013905
ASSERT_TRUE(moved_data.statistics->is_max_exact);
3906+
3907+
ASSERT_TRUE(moved_data.statistics->average_byte_width.has_value());
3908+
ASSERT_EQ(average_byte_width_, moved_data.statistics->average_byte_width.value());
3909+
ASSERT_TRUE(moved_data.statistics->is_average_byte_width_exact);
39023910
}
39033911

39043912
TEST_F(TestArrayDataStatistics, CopyConstructor) {
@@ -3916,6 +3924,10 @@ TEST_F(TestArrayDataStatistics, CopyConstructor) {
39163924
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->max.value()));
39173925
ASSERT_EQ(max_, std::get<int64_t>(copied_data.statistics->max.value()));
39183926
ASSERT_TRUE(copied_data.statistics->is_max_exact);
3927+
3928+
ASSERT_TRUE(copied_data.statistics->average_byte_width.has_value());
3929+
ASSERT_EQ(average_byte_width_, copied_data.statistics->average_byte_width.value());
3930+
ASSERT_TRUE(copied_data.statistics->is_average_byte_width_exact);
39193931
}
39203932

39213933
TEST_F(TestArrayDataStatistics, MoveAssignment) {
@@ -3935,6 +3947,10 @@ TEST_F(TestArrayDataStatistics, MoveAssignment) {
39353947
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->max.value()));
39363948
ASSERT_EQ(max_, std::get<int64_t>(moved_data.statistics->max.value()));
39373949
ASSERT_TRUE(moved_data.statistics->is_max_exact);
3950+
3951+
ASSERT_TRUE(moved_data.statistics->average_byte_width.has_value());
3952+
ASSERT_EQ(average_byte_width_, moved_data.statistics->average_byte_width.value());
3953+
ASSERT_TRUE(moved_data.statistics->is_average_byte_width_exact);
39383954
}
39393955

39403956
TEST_F(TestArrayDataStatistics, CopyAssignment) {
@@ -3953,6 +3969,10 @@ TEST_F(TestArrayDataStatistics, CopyAssignment) {
39533969
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->max.value()));
39543970
ASSERT_EQ(max_, std::get<int64_t>(copied_data.statistics->max.value()));
39553971
ASSERT_TRUE(copied_data.statistics->is_max_exact);
3972+
3973+
ASSERT_TRUE(copied_data.statistics->average_byte_width.has_value());
3974+
ASSERT_EQ(average_byte_width_, copied_data.statistics->average_byte_width.value());
3975+
ASSERT_TRUE(copied_data.statistics->is_average_byte_width_exact);
39563976
}
39573977

39583978
TEST_F(TestArrayDataStatistics, CopyTo) {

cpp/src/arrow/array/statistics.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,12 @@ struct ARROW_EXPORT ArrayStatistics {
7777
/// \brief The number of distinct values, may not be set
7878
std::optional<int64_t> distinct_count = std::nullopt;
7979

80+
/// \brief The average size in bytes of a row in the array, may not be set
81+
std::optional<double> average_byte_width = std::nullopt;
82+
83+
/// \brief Whether the average byte width is exact or not
84+
bool is_average_byte_width_exact = false;
85+
8086
/// \brief The minimum value, may not be set
8187
std::optional<ValueType> min = std::nullopt;
8288

@@ -131,7 +137,9 @@ struct ARROW_EXPORT ArrayStatistics {
131137
bool Equals(const ArrayStatistics& other) const {
132138
return null_count == other.null_count && distinct_count == other.distinct_count &&
133139
min == other.min && is_min_exact == other.is_min_exact && max == other.max &&
134-
is_max_exact == other.is_max_exact;
140+
is_max_exact == other.is_max_exact &&
141+
average_byte_width == other.average_byte_width &&
142+
is_average_byte_width_exact == other.is_average_byte_width_exact;
135143
}
136144

137145
/// \brief Check two statistics for equality

cpp/src/arrow/array/statistics_test.cc

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,17 @@ TEST(ArrayStatisticsTest, TestMax) {
6161
ASSERT_FALSE(statistics.is_max_exact);
6262
}
6363

64+
TEST(ArrayStatisticsTest, average_byte_width) {
65+
ArrayStatistics statistics;
66+
ASSERT_FALSE(statistics.average_byte_width.has_value());
67+
ASSERT_FALSE(statistics.is_average_byte_width_exact);
68+
statistics.average_byte_width = 12.0;
69+
statistics.is_average_byte_width_exact = true;
70+
ASSERT_TRUE(statistics.average_byte_width.has_value());
71+
ASSERT_EQ(12.0, statistics.average_byte_width.value());
72+
ASSERT_TRUE(statistics.is_average_byte_width_exact);
73+
}
74+
6475
TEST(ArrayStatisticsTest, TestEquality) {
6576
ArrayStatistics statistics1;
6677
ArrayStatistics statistics2;
@@ -96,6 +107,16 @@ TEST(ArrayStatisticsTest, TestEquality) {
96107
ASSERT_NE(statistics1, statistics2);
97108
statistics2.is_max_exact = true;
98109
ASSERT_EQ(statistics1, statistics2);
110+
111+
statistics1.average_byte_width = 12.0;
112+
ASSERT_NE(statistics1, statistics2);
113+
statistics2.average_byte_width = 12.0;
114+
ASSERT_EQ(statistics1, statistics2);
115+
116+
statistics1.is_average_byte_width_exact = true;
117+
ASSERT_NE(statistics1, statistics2);
118+
statistics2.is_average_byte_width_exact = true;
119+
ASSERT_EQ(statistics1, statistics2);
99120
}
100121

101122
} // namespace arrow

cpp/src/arrow/record_batch.cc

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -555,6 +555,18 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat
555555
RETURN_NOT_OK(on_statistics(statistics));
556556
statistics.start_new_column = false;
557557
}
558+
if (column_statistics->average_byte_width.has_value()) {
559+
statistics.nth_statistics++;
560+
if (column_statistics->is_average_byte_width_exact) {
561+
statistics.key = ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT;
562+
} else {
563+
statistics.key = ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_APPROXIMATE;
564+
}
565+
statistics.type = float64();
566+
statistics.value = column_statistics->average_byte_width.value();
567+
RETURN_NOT_OK(on_statistics(statistics));
568+
statistics.start_new_column = false;
569+
}
558570
}
559571
return Status::OK();
560572
}

cpp/src/arrow/record_batch_test.cc

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1467,6 +1467,68 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayMaxApproximate) {
14671467
AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
14681468
}
14691469

1470+
TEST_F(TestRecordBatch, MakeStatisticsArrayAverageByteWidthApproximate) {
1471+
auto schema =
1472+
::arrow::schema({field("no-statistics", boolean()), field("utf8", utf8())});
1473+
auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
1474+
auto string_array = ArrayFromJSON(utf8(), R"(["aa", "bb", "ccc"])");
1475+
string_array->data()->statistics = std::make_shared<ArrayStatistics>();
1476+
string_array->data()->statistics->average_byte_width = 2.3;
1477+
auto batch = RecordBatch::Make(schema, string_array->length(),
1478+
{no_statistics_array, string_array});
1479+
1480+
ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
1481+
1482+
ASSERT_OK_AND_ASSIGN(
1483+
auto expected_statistics_array,
1484+
MakeStatisticsArray("[null, 1]",
1485+
{{
1486+
ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
1487+
},
1488+
{
1489+
ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_APPROXIMATE,
1490+
}},
1491+
{{
1492+
ArrayStatistics::ValueType{int64_t{3}},
1493+
},
1494+
{
1495+
ArrayStatistics::ValueType{2.3},
1496+
}}));
1497+
AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
1498+
}
1499+
1500+
TEST_F(TestRecordBatch, MakeStatisticsArrayAverageByteWidthExact) {
1501+
auto schema =
1502+
::arrow::schema({field("no-statistics", boolean()), field("float64", float64())});
1503+
auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
1504+
auto float_array = ArrayFromJSON(float64(), R"([1.0, 2.0, 3.0])");
1505+
float_array->data()->statistics = std::make_shared<ArrayStatistics>();
1506+
float_array->data()->statistics->average_byte_width = 8.0;
1507+
float_array->data()->statistics->is_average_byte_width_exact = true;
1508+
1509+
auto batch = RecordBatch::Make(schema, float_array->length(),
1510+
{no_statistics_array, float_array});
1511+
1512+
ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
1513+
1514+
ASSERT_OK_AND_ASSIGN(
1515+
auto expected_statistics_array,
1516+
MakeStatisticsArray("[null, 1]",
1517+
{{
1518+
ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
1519+
},
1520+
{
1521+
ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT,
1522+
}},
1523+
{{
1524+
ArrayStatistics::ValueType{int64_t{3}},
1525+
},
1526+
{
1527+
ArrayStatistics::ValueType{8.0},
1528+
}}));
1529+
AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
1530+
}
1531+
14701532
template <typename DataType>
14711533
class TestRecordBatchMakeStatisticsArrayBinary : public ::testing::Test {
14721534
public:

0 commit comments

Comments
 (0)