Skip to content

Commit

Permalink
Merge remote-tracking branch 'apache/main' into alamb/doc_extended
Browse files Browse the repository at this point in the history
  • Loading branch information
alamb committed Jan 28, 2025
2 parents de9e6e5 + a4917d4 commit dfab8d2
Show file tree
Hide file tree
Showing 40 changed files with 1,210 additions and 170 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ homepage = "https://datafusion.apache.org"
license = "Apache-2.0"
readme = "README.md"
repository = "https://github.com/apache/datafusion"
rust-version = "1.80.1"
rust-version = "1.81.0"
version = "44.0.0"

[workspace.dependencies]
Expand Down
2 changes: 1 addition & 1 deletion datafusion-cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ keywords = ["arrow", "datafusion", "query", "sql"]
license = "Apache-2.0"
homepage = "https://datafusion.apache.org"
repository = "https://github.com/apache/datafusion"
rust-version = "1.80.1"
rust-version = "1.81.0"
readme = "README.md"

[dependencies]
Expand Down
2 changes: 1 addition & 1 deletion datafusion-examples/examples/expr_api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ fn range_analysis_demo() -> Result<()> {
// In this case, we can see that, as expected, `analyze` has figured out
// that in this case, `date` must be in the range `['2020-09-01', '2020-10-01']`
let expected_range = Interval::try_new(september_1, october_1)?;
assert_eq!(analysis_result.boundaries[0].interval, expected_range);
assert_eq!(analysis_result.boundaries[0].interval, Some(expected_range));

Ok(())
}
Expand Down
11 changes: 5 additions & 6 deletions datafusion/common/src/scalar/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3963,7 +3963,7 @@ mod tests {
use arrow::error::ArrowError;
use arrow::util::pretty::pretty_format_columns;
use arrow_array::types::Float64Type;
use arrow_buffer::{Buffer, NullBuffer};
use arrow_buffer::{Buffer, NullBufferBuilder};
use arrow_schema::Fields;
use chrono::NaiveDate;
use rand::Rng;
Expand Down Expand Up @@ -6912,12 +6912,11 @@ mod tests {
let array_b = Arc::new(Int32Array::from_iter_values([2]));
let arrays: Vec<ArrayRef> = vec![array_a, array_b];

let mut not_nulls = BooleanBufferBuilder::new(1);
not_nulls.append(true);
let not_nulls = not_nulls.finish();
let not_nulls = Some(NullBuffer::new(not_nulls));
let mut not_nulls = NullBufferBuilder::new(1);

let ar = StructArray::new(fields, arrays, not_nulls);
not_nulls.append_non_null();

let ar = StructArray::new(fields, arrays, not_nulls.finish());
let s = ScalarValue::Struct(Arc::new(ar));

assert_eq!(s.to_string(), "{a:1,b:2}");
Expand Down
173 changes: 161 additions & 12 deletions datafusion/common/src/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ use std::fmt::{self, Debug, Display};

use crate::{Result, ScalarValue};

use arrow_schema::{Schema, SchemaRef};
use arrow_schema::{DataType, Schema, SchemaRef};

/// Represents a value with a degree of certainty. `Precision` is used to
/// propagate information the precision of statistical values.
Expand Down Expand Up @@ -170,24 +170,63 @@ impl Precision<ScalarValue> {
pub fn add(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> {
match (self, other) {
(Precision::Exact(a), Precision::Exact(b)) => {
if let Ok(result) = a.add(b) {
Precision::Exact(result)
} else {
Precision::Absent
}
a.add(b).map(Precision::Exact).unwrap_or(Precision::Absent)
}
(Precision::Inexact(a), Precision::Exact(b))
| (Precision::Exact(a), Precision::Inexact(b))
| (Precision::Inexact(a), Precision::Inexact(b)) => {
if let Ok(result) = a.add(b) {
Precision::Inexact(result)
} else {
Precision::Absent
}
| (Precision::Inexact(a), Precision::Inexact(b)) => a
.add(b)
.map(Precision::Inexact)
.unwrap_or(Precision::Absent),
(_, _) => Precision::Absent,
}
}

/// Calculates the difference of two (possibly inexact) [`ScalarValue`] values,
/// conservatively propagating exactness information. If one of the input
/// values is [`Precision::Absent`], the result is `Absent` too.
pub fn sub(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> {
match (self, other) {
(Precision::Exact(a), Precision::Exact(b)) => {
a.sub(b).map(Precision::Exact).unwrap_or(Precision::Absent)
}
(Precision::Inexact(a), Precision::Exact(b))
| (Precision::Exact(a), Precision::Inexact(b))
| (Precision::Inexact(a), Precision::Inexact(b)) => a
.sub(b)
.map(Precision::Inexact)
.unwrap_or(Precision::Absent),
(_, _) => Precision::Absent,
}
}

/// Calculates the multiplication of two (possibly inexact) [`ScalarValue`] values,
/// conservatively propagating exactness information. If one of the input
/// values is [`Precision::Absent`], the result is `Absent` too.
pub fn multiply(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> {
match (self, other) {
(Precision::Exact(a), Precision::Exact(b)) => a
.mul_checked(b)
.map(Precision::Exact)
.unwrap_or(Precision::Absent),
(Precision::Inexact(a), Precision::Exact(b))
| (Precision::Exact(a), Precision::Inexact(b))
| (Precision::Inexact(a), Precision::Inexact(b)) => a
.mul_checked(b)
.map(Precision::Inexact)
.unwrap_or(Precision::Absent),
(_, _) => Precision::Absent,
}
}

/// Casts the value to the given data type, propagating exactness information.
pub fn cast_to(&self, data_type: &DataType) -> Result<Precision<ScalarValue>> {
match self {
Precision::Exact(value) => value.cast_to(data_type).map(Precision::Exact),
Precision::Inexact(value) => value.cast_to(data_type).map(Precision::Inexact),
Precision::Absent => Ok(Precision::Absent),
}
}
}

impl<T: Debug + Clone + PartialEq + Eq + PartialOrd> Debug for Precision<T> {
Expand All @@ -210,6 +249,18 @@ impl<T: Debug + Clone + PartialEq + Eq + PartialOrd> Display for Precision<T> {
}
}

impl From<Precision<usize>> for Precision<ScalarValue> {
fn from(value: Precision<usize>) -> Self {
match value {
Precision::Exact(v) => Precision::Exact(ScalarValue::UInt64(Some(v as u64))),
Precision::Inexact(v) => {
Precision::Inexact(ScalarValue::UInt64(Some(v as u64)))
}
Precision::Absent => Precision::Absent,
}
}
}

/// Statistics for a relation
/// Fields are optional and can be inexact because the sources
/// sometimes provide approximate estimates for performance reasons
Expand Down Expand Up @@ -401,6 +452,11 @@ impl Display for Statistics {
} else {
s
};
let s = if cs.sum_value != Precision::Absent {
format!("{} Sum={}", s, cs.sum_value)
} else {
s
};
let s = if cs.null_count != Precision::Absent {
format!("{} Null={}", s, cs.null_count)
} else {
Expand Down Expand Up @@ -436,6 +492,8 @@ pub struct ColumnStatistics {
pub max_value: Precision<ScalarValue>,
/// Minimum value of column
pub min_value: Precision<ScalarValue>,
/// Sum value of a column
pub sum_value: Precision<ScalarValue>,
/// Number of distinct values
pub distinct_count: Precision<usize>,
}
Expand All @@ -458,6 +516,7 @@ impl ColumnStatistics {
null_count: Precision::Absent,
max_value: Precision::Absent,
min_value: Precision::Absent,
sum_value: Precision::Absent,
distinct_count: Precision::Absent,
}
}
Expand All @@ -469,6 +528,7 @@ impl ColumnStatistics {
self.null_count = self.null_count.to_inexact();
self.max_value = self.max_value.to_inexact();
self.min_value = self.min_value.to_inexact();
self.sum_value = self.sum_value.to_inexact();
self.distinct_count = self.distinct_count.to_inexact();
self
}
Expand Down Expand Up @@ -563,6 +623,26 @@ mod tests {
assert_eq!(precision1.add(&absent_precision), Precision::Absent);
}

#[test]
fn test_add_scalar() {
let precision = Precision::Exact(ScalarValue::Int32(Some(42)));

assert_eq!(
precision.add(&Precision::Exact(ScalarValue::Int32(Some(23)))),
Precision::Exact(ScalarValue::Int32(Some(65))),
);
assert_eq!(
precision.add(&Precision::Inexact(ScalarValue::Int32(Some(23)))),
Precision::Inexact(ScalarValue::Int32(Some(65))),
);
assert_eq!(
precision.add(&Precision::Exact(ScalarValue::Int32(None))),
// As per behavior of ScalarValue::add
Precision::Exact(ScalarValue::Int32(None)),
);
assert_eq!(precision.add(&Precision::Absent), Precision::Absent);
}

#[test]
fn test_sub() {
let precision1 = Precision::Exact(42);
Expand All @@ -575,6 +655,26 @@ mod tests {
assert_eq!(precision1.sub(&absent_precision), Precision::Absent);
}

#[test]
fn test_sub_scalar() {
let precision = Precision::Exact(ScalarValue::Int32(Some(42)));

assert_eq!(
precision.sub(&Precision::Exact(ScalarValue::Int32(Some(23)))),
Precision::Exact(ScalarValue::Int32(Some(19))),
);
assert_eq!(
precision.sub(&Precision::Inexact(ScalarValue::Int32(Some(23)))),
Precision::Inexact(ScalarValue::Int32(Some(19))),
);
assert_eq!(
precision.sub(&Precision::Exact(ScalarValue::Int32(None))),
// As per behavior of ScalarValue::sub
Precision::Exact(ScalarValue::Int32(None)),
);
assert_eq!(precision.sub(&Precision::Absent), Precision::Absent);
}

#[test]
fn test_multiply() {
let precision1 = Precision::Exact(6);
Expand All @@ -588,6 +688,54 @@ mod tests {
assert_eq!(precision1.multiply(&absent_precision), Precision::Absent);
}

#[test]
fn test_multiply_scalar() {
let precision = Precision::Exact(ScalarValue::Int32(Some(6)));

assert_eq!(
precision.multiply(&Precision::Exact(ScalarValue::Int32(Some(5)))),
Precision::Exact(ScalarValue::Int32(Some(30))),
);
assert_eq!(
precision.multiply(&Precision::Inexact(ScalarValue::Int32(Some(5)))),
Precision::Inexact(ScalarValue::Int32(Some(30))),
);
assert_eq!(
precision.multiply(&Precision::Exact(ScalarValue::Int32(None))),
// As per behavior of ScalarValue::mul_checked
Precision::Exact(ScalarValue::Int32(None)),
);
assert_eq!(precision.multiply(&Precision::Absent), Precision::Absent);
}

#[test]
fn test_cast_to() {
// Valid
assert_eq!(
Precision::Exact(ScalarValue::Int32(Some(42)))
.cast_to(&DataType::Int64)
.unwrap(),
Precision::Exact(ScalarValue::Int64(Some(42))),
);
assert_eq!(
Precision::Inexact(ScalarValue::Int32(Some(42)))
.cast_to(&DataType::Int64)
.unwrap(),
Precision::Inexact(ScalarValue::Int64(Some(42))),
);
// Null
assert_eq!(
Precision::Exact(ScalarValue::Int32(None))
.cast_to(&DataType::Int64)
.unwrap(),
Precision::Exact(ScalarValue::Int64(None)),
);
// Overflow returns error
assert!(Precision::Exact(ScalarValue::Int32(Some(256)))
.cast_to(&DataType::Int8)
.is_err());
}

#[test]
fn test_precision_cloning() {
// Precision<usize> is copy
Expand Down Expand Up @@ -646,6 +794,7 @@ mod tests {
null_count: Precision::Exact(null_count),
max_value: Precision::Exact(ScalarValue::Int64(Some(42))),
min_value: Precision::Exact(ScalarValue::Int64(Some(64))),
sum_value: Precision::Exact(ScalarValue::Int64(Some(4600))),
distinct_count: Precision::Exact(100),
}
}
Expand Down
6 changes: 5 additions & 1 deletion datafusion/core/src/datasource/statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ pub async fn get_statistics_with_limit(
col_stats_set[index].null_count = file_column.null_count;
col_stats_set[index].max_value = file_column.max_value;
col_stats_set[index].min_value = file_column.min_value;
col_stats_set[index].sum_value = file_column.sum_value;
}

// If the number of rows exceeds the limit, we can stop processing
Expand Down Expand Up @@ -113,12 +114,14 @@ pub async fn get_statistics_with_limit(
null_count: file_nc,
max_value: file_max,
min_value: file_min,
sum_value: file_sum,
distinct_count: _,
} = file_col_stats;

col_stats.null_count = add_row_stats(*file_nc, col_stats.null_count);
set_max_if_greater(file_max, &mut col_stats.max_value);
set_min_if_lesser(file_min, &mut col_stats.min_value)
set_min_if_lesser(file_min, &mut col_stats.min_value);
col_stats.sum_value = file_sum.add(&col_stats.sum_value);
}

// If the number of rows exceeds the limit, we can stop processing
Expand Down Expand Up @@ -204,6 +207,7 @@ pub(crate) fn get_col_stats(
null_count: null_counts[i],
max_value: max_value.map(Precision::Exact).unwrap_or(Precision::Absent),
min_value: min_value.map(Precision::Exact).unwrap_or(Precision::Absent),
sum_value: Precision::Absent,
distinct_count: Precision::Absent,
}
})
Expand Down
2 changes: 2 additions & 0 deletions datafusion/core/tests/custom_sources_cases/statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -200,12 +200,14 @@ fn fully_defined() -> (Statistics, Schema) {
distinct_count: Precision::Exact(2),
max_value: Precision::Exact(ScalarValue::Int32(Some(1023))),
min_value: Precision::Exact(ScalarValue::Int32(Some(-24))),
sum_value: Precision::Exact(ScalarValue::Int64(Some(10))),
null_count: Precision::Exact(0),
},
ColumnStatistics {
distinct_count: Precision::Exact(13),
max_value: Precision::Exact(ScalarValue::Int64(Some(5486))),
min_value: Precision::Exact(ScalarValue::Int64(Some(-6783))),
sum_value: Precision::Exact(ScalarValue::Int64(Some(10))),
null_count: Precision::Exact(5),
},
],
Expand Down
Loading

0 comments on commit dfab8d2

Please sign in to comment.