From f1a9cd7f4cfbfbb2f6456b07b5f016433440aa9f Mon Sep 17 00:00:00 2001 From: BohuTANG Date: Wed, 30 Apr 2025 17:43:35 +0800 Subject: [PATCH 1/4] chore(optimizer): add more error log for get_upper_bound --- src/common/storage/src/statistics.rs | 10 ++++++++++ .../sql/src/planner/optimizer/ir/stats/histogram.rs | 6 +++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/common/storage/src/statistics.rs b/src/common/storage/src/statistics.rs index c6f94ce88ead5..814d95dbdcb79 100644 --- a/src/common/storage/src/statistics.rs +++ b/src/common/storage/src/statistics.rs @@ -156,6 +156,16 @@ impl Datum { matches!(self, Datum::Int(_) | Datum::UInt(_) | Datum::Float(_)) } + pub fn type_name(&self) -> &'static str { + match self { + Datum::Bool(_) => "Boolean", + Datum::Int(_) => "Integer", + Datum::UInt(_) => "Unsigned Integer", + Datum::Float(_) => "Float", + Datum::Bytes(_) => "String", + } + } + pub fn compare(&self, other: &Self) -> Result { match (self, other) { (Datum::Bool(l), Datum::Bool(r)) => Ok(l.cmp(r)), diff --git a/src/query/sql/src/planner/optimizer/ir/stats/histogram.rs b/src/query/sql/src/planner/optimizer/ir/stats/histogram.rs index 31b4684397361..adeebae063bc3 100644 --- a/src/query/sql/src/planner/optimizer/ir/stats/histogram.rs +++ b/src/query/sql/src/planner/optimizer/ir/stats/histogram.rs @@ -244,9 +244,9 @@ impl SampleSet for UniformSampleSet { } _ => Err(format!( - "Unsupported datum type: {:?}, {:?}", - self.min, self.max - )), + "Unsupported datum type for histogram calculation: {} (type: {}), {} (type: {}). 
Only numeric types are supported.", + self.min, self.min.type_name(), self.max, self.max.type_name() + )) } } } From 5e845e748442db590182c4e3f65534973d9d6c70 Mon Sep 17 00:00:00 2001 From: BohuTANG Date: Wed, 30 Apr 2025 20:52:11 +0800 Subject: [PATCH 2/4] fix the string types for the histogram calc --- .../planner/optimizer/ir/stats/histogram.rs | 32 +++++++++++++ .../base/09_fuse_engine/09_0020_analyze.test | 45 +++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/src/query/sql/src/planner/optimizer/ir/stats/histogram.rs b/src/query/sql/src/planner/optimizer/ir/stats/histogram.rs index adeebae063bc3..2c57e8868a451 100644 --- a/src/query/sql/src/planner/optimizer/ir/stats/histogram.rs +++ b/src/query/sql/src/planner/optimizer/ir/stats/histogram.rs @@ -243,6 +243,38 @@ impl SampleSet for UniformSampleSet { Ok(Datum::Float(upper_bound)) } + // Handle Bytes type for histogram calculation by converting to strings first + (Datum::Bytes(min_bytes), Datum::Bytes(max_bytes)) => { + // Convert bytes to strings for comparison + let min_str = String::from_utf8_lossy(min_bytes); + let max_str = String::from_utf8_lossy(max_bytes); + + // For boundary cases, return the exact values + if min_str == max_str { + return Ok(Datum::Bytes(min_bytes.clone())); + } + + if bucket_index == 0 { + return Ok(Datum::Bytes(min_bytes.clone())); + } else if bucket_index >= num_buckets { + return Ok(Datum::Bytes(max_bytes.clone())); + } + + // For intermediate buckets, no interpolation between the two byte strings is attempted + // Every upper bound is snapped to one of the two endpoints instead + + // If bucket_index is in the first half (bucket_index <= num_buckets / 2), return min + // If bucket_index is in the second half, return max + // This keeps every bound within [min, max] and non-decreasing in bucket_index when min <= max + let mid_bucket = num_buckets / 2; + + if bucket_index <= mid_bucket { + Ok(Datum::Bytes(min_bytes.clone())) + } else { + Ok(Datum::Bytes(max_bytes.clone())) + } + } + _ => Err(format!( "Unsupported datum type for histogram calculation: {} (type: 
{}), {} (type: {}). Only numeric types are supported.", self.min, self.min.type_name(), self.max, self.max.type_name() diff --git a/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test b/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test index 73188c3027314..5b9ce575f41de 100644 --- a/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test +++ b/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test @@ -154,5 +154,50 @@ analyze table t1; statement ok DROP TABLE t +# Test case for string histogram functionality +statement ok +create or replace table t_string(id int, str_val varchar); + +statement ok +insert into t_string values + (1, '1.0'), + (2, '2.0'), + (3, '3.0'), + (4, '4.0'), + (5, '5.0'), + (6, '6.0'), + (7, '7.0'), + (8, '8.0'), + (9, '9.0'), + (10, '10.0'); + +statement ok +set enable_analyze_histogram=1; + +statement ok +analyze table t_string; + +# Verify that histogram was created for string column +query IIT +select * from fuse_statistic('db_09_0020', 't_string'); +---- +id 10 [bucket id: 0, min: "1", max: "1", ndv: 1.0, count: 1.0], [bucket id: 1, min: "2", max: "2", ndv: 1.0, count: 1.0], [bucket id: 2, min: "3", max: "3", ndv: 1.0, count: 1.0], [bucket id: 3, min: "4", max: "4", ndv: 1.0, count: 1.0], [bucket id: 4, min: "5", max: "5", ndv: 1.0, count: 1.0], [bucket id: 5, min: "6", max: "6", ndv: 1.0, count: 1.0], [bucket id: 6, min: "7", max: "7", ndv: 1.0, count: 1.0], [bucket id: 7, min: "8", max: "8", ndv: 1.0, count: 1.0], [bucket id: 8, min: "9", max: "9", ndv: 1.0, count: 1.0], [bucket id: 9, min: "10", max: "10", ndv: 1.0, count: 1.0] +str_val 10 [bucket id: 0, min: "1.0", max: "1.0", ndv: 1.0, count: 1.0], [bucket id: 1, min: "10.0", max: "10.0", ndv: 1.0, count: 1.0], [bucket id: 2, min: "2.0", max: "2.0", ndv: 1.0, count: 1.0], [bucket id: 3, min: "3.0", max: "3.0", ndv: 1.0, count: 1.0], [bucket id: 4, min: "4.0", max: "4.0", ndv: 1.0, count: 1.0], [bucket id: 5, min: "5.0", max: 
"5.0", ndv: 1.0, count: 1.0], [bucket id: 6, min: "6.0", max: "6.0", ndv: 1.0, count: 1.0], [bucket id: 7, min: "7.0", max: "7.0", ndv: 1.0, count: 1.0], [bucket id: 8, min: "8.0", max: "8.0", ndv: 1.0, count: 1.0], [bucket id: 9, min: "9.0", max: "9.0", ndv: 1.0, count: 1.0] + +# Test string comparison with histogram +query I +select count(*) from t_string where str_val > '5.0'; +---- +5 + +# Test string range query with histogram +query I +select count(*) from t_string where str_val between '3.0' and '7.0'; +---- +5 + +statement ok +DROP TABLE t_string + statement ok DROP DATABASE db_09_0020 From b477ed4da4dee1d5e60f91bf604de9dc3c491cc3 Mon Sep 17 00:00:00 2001 From: BohuTANG Date: Wed, 30 Apr 2025 21:29:26 +0800 Subject: [PATCH 3/4] fix t_string in analyze.test --- .../suites/base/09_fuse_engine/09_0020_analyze.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test b/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test index 5b9ce575f41de..c472c7ad3f691 100644 --- a/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test +++ b/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test @@ -188,7 +188,7 @@ str_val 10 [bucket id: 0, min: "1.0", max: "1.0", ndv: 1.0, count: 1.0], [bucket query I select count(*) from t_string where str_val > '5.0'; ---- -5 +4 # Test string range query with histogram query I From e466753cdb8f247bbd0639164cf314d5c9f43796 Mon Sep 17 00:00:00 2001 From: BohuTANG Date: Tue, 6 May 2025 09:57:44 +0800 Subject: [PATCH 4/4] fix fuse_statistic after enable_analyze_histogram --- .../suites/base/09_fuse_engine/09_0020_analyze.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test b/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test index c472c7ad3f691..f61b1d9359cb1 100644 --- 
a/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test +++ b/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test @@ -179,7 +179,7 @@ analyze table t_string; # Verify that histogram was created for string column query IIT -select * from fuse_statistic('db_09_0020', 't_string'); +select * from fuse_statistic('db_09_0020', 't_string') order by column_name asc; ---- id 10 [bucket id: 0, min: "1", max: "1", ndv: 1.0, count: 1.0], [bucket id: 1, min: "2", max: "2", ndv: 1.0, count: 1.0], [bucket id: 2, min: "3", max: "3", ndv: 1.0, count: 1.0], [bucket id: 3, min: "4", max: "4", ndv: 1.0, count: 1.0], [bucket id: 4, min: "5", max: "5", ndv: 1.0, count: 1.0], [bucket id: 5, min: "6", max: "6", ndv: 1.0, count: 1.0], [bucket id: 6, min: "7", max: "7", ndv: 1.0, count: 1.0], [bucket id: 7, min: "8", max: "8", ndv: 1.0, count: 1.0], [bucket id: 8, min: "9", max: "9", ndv: 1.0, count: 1.0], [bucket id: 9, min: "10", max: "10", ndv: 1.0, count: 1.0] str_val 10 [bucket id: 0, min: "1.0", max: "1.0", ndv: 1.0, count: 1.0], [bucket id: 1, min: "10.0", max: "10.0", ndv: 1.0, count: 1.0], [bucket id: 2, min: "2.0", max: "2.0", ndv: 1.0, count: 1.0], [bucket id: 3, min: "3.0", max: "3.0", ndv: 1.0, count: 1.0], [bucket id: 4, min: "4.0", max: "4.0", ndv: 1.0, count: 1.0], [bucket id: 5, min: "5.0", max: "5.0", ndv: 1.0, count: 1.0], [bucket id: 6, min: "6.0", max: "6.0", ndv: 1.0, count: 1.0], [bucket id: 7, min: "7.0", max: "7.0", ndv: 1.0, count: 1.0], [bucket id: 8, min: "8.0", max: "8.0", ndv: 1.0, count: 1.0], [bucket id: 9, min: "9.0", max: "9.0", ndv: 1.0, count: 1.0]