From f1a9cd7f4cfbfbb2f6456b07b5f016433440aa9f Mon Sep 17 00:00:00 2001
From: BohuTANG <overred.shuttler@gmail.com>
Date: Wed, 30 Apr 2025 17:43:35 +0800
Subject: [PATCH 1/4] chore(optimizer): make the get_upper_bound unsupported-type error more descriptive

---
 src/common/storage/src/statistics.rs                   | 10 ++++++++++
 .../sql/src/planner/optimizer/ir/stats/histogram.rs    |  6 +++---
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/common/storage/src/statistics.rs b/src/common/storage/src/statistics.rs
index c6f94ce88ead5..814d95dbdcb79 100644
--- a/src/common/storage/src/statistics.rs
+++ b/src/common/storage/src/statistics.rs
@@ -156,6 +156,16 @@ impl Datum {
         matches!(self, Datum::Int(_) | Datum::UInt(_) | Datum::Float(_))
     }
 
+    pub fn type_name(&self) -> &'static str {
+        match self {
+            Datum::Bool(_) => "Boolean",
+            Datum::Int(_) => "Integer",
+            Datum::UInt(_) => "Unsigned Integer",
+            Datum::Float(_) => "Float",
+            Datum::Bytes(_) => "String",
+        }
+    }
+
     pub fn compare(&self, other: &Self) -> Result<std::cmp::Ordering> {
         match (self, other) {
             (Datum::Bool(l), Datum::Bool(r)) => Ok(l.cmp(r)),
diff --git a/src/query/sql/src/planner/optimizer/ir/stats/histogram.rs b/src/query/sql/src/planner/optimizer/ir/stats/histogram.rs
index 31b4684397361..adeebae063bc3 100644
--- a/src/query/sql/src/planner/optimizer/ir/stats/histogram.rs
+++ b/src/query/sql/src/planner/optimizer/ir/stats/histogram.rs
@@ -244,9 +244,9 @@ impl SampleSet for UniformSampleSet {
             }
 
             _ => Err(format!(
-                "Unsupported datum type: {:?}, {:?}",
-                self.min, self.max
-            )),
+                "Unsupported datum type for histogram calculation: {} (type: {}), {} (type: {}). Only numeric types are supported.",
+                self.min, self.min.type_name(), self.max, self.max.type_name()
+            ))
         }
     }
 }

From 5e845e748442db590182c4e3f65534973d9d6c70 Mon Sep 17 00:00:00 2001
From: BohuTANG <overred.shuttler@gmail.com>
Date: Wed, 30 Apr 2025 20:52:11 +0800
Subject: [PATCH 2/4] fix histogram upper-bound calculation for string (Bytes) datums

---
 .../planner/optimizer/ir/stats/histogram.rs   | 32 +++++++++++++
 .../base/09_fuse_engine/09_0020_analyze.test  | 45 +++++++++++++++++++
 2 files changed, 77 insertions(+)

diff --git a/src/query/sql/src/planner/optimizer/ir/stats/histogram.rs b/src/query/sql/src/planner/optimizer/ir/stats/histogram.rs
index adeebae063bc3..2c57e8868a451 100644
--- a/src/query/sql/src/planner/optimizer/ir/stats/histogram.rs
+++ b/src/query/sql/src/planner/optimizer/ir/stats/histogram.rs
@@ -243,6 +243,38 @@ impl SampleSet for UniformSampleSet {
                 Ok(Datum::Float(upper_bound))
             }
 
+            // Handle Bytes (string) datums: the bound returned is always a copy of min or max, never a new value
+            (Datum::Bytes(min_bytes), Datum::Bytes(max_bytes)) => {
+                // Lossy UTF-8 views of the bytes, used only for the equality check below
+                let min_str = String::from_utf8_lossy(min_bytes);
+                let max_str = String::from_utf8_lossy(max_bytes);
+
+                // For boundary cases, return the exact values
+                if min_str == max_str {
+                    return Ok(Datum::Bytes(min_bytes.clone()));
+                }
+
+                if bucket_index == 0 {
+                    return Ok(Datum::Bytes(min_bytes.clone()));
+                } else if bucket_index >= num_buckets {
+                    return Ok(Datum::Bytes(max_bytes.clone()));
+                }
+
+                // For intermediate buckets no value between the two strings is computed;
+                // the bound snaps to one of the endpoints based on bucket_index.
+
+                // If bucket_index <= num_buckets / 2, return min
+                // Otherwise, return max
+                // Every bound is therefore one of the two original endpoints
+                let mid_bucket = num_buckets / 2;
+
+                if bucket_index <= mid_bucket {
+                    Ok(Datum::Bytes(min_bytes.clone()))
+                } else {
+                    Ok(Datum::Bytes(max_bytes.clone()))
+                }
+            }
+
             _ => Err(format!(
                 "Unsupported datum type for histogram calculation: {} (type: {}), {} (type: {}). Only numeric types are supported.",
                 self.min, self.min.type_name(), self.max, self.max.type_name()
diff --git a/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test b/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test
index 73188c3027314..5b9ce575f41de 100644
--- a/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test
+++ b/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test
@@ -154,5 +154,50 @@ analyze table t1;
 statement ok
 DROP TABLE t
 
+# Test case for string histogram functionality
+statement ok
+create or replace table t_string(id int, str_val varchar);
+
+statement ok
+insert into t_string values
+    (1, '1.0'),
+    (2, '2.0'),
+    (3, '3.0'),
+    (4, '4.0'),
+    (5, '5.0'),
+    (6, '6.0'),
+    (7, '7.0'),
+    (8, '8.0'),
+    (9, '9.0'),
+    (10, '10.0');
+
+statement ok
+set enable_analyze_histogram=1;
+
+statement ok
+analyze table t_string;
+
+# Verify that histogram was created for string column
+query IIT
+select * from fuse_statistic('db_09_0020', 't_string');
+----
+id 10 [bucket id: 0, min: "1", max: "1", ndv: 1.0, count: 1.0], [bucket id: 1, min: "2", max: "2", ndv: 1.0, count: 1.0], [bucket id: 2, min: "3", max: "3", ndv: 1.0, count: 1.0], [bucket id: 3, min: "4", max: "4", ndv: 1.0, count: 1.0], [bucket id: 4, min: "5", max: "5", ndv: 1.0, count: 1.0], [bucket id: 5, min: "6", max: "6", ndv: 1.0, count: 1.0], [bucket id: 6, min: "7", max: "7", ndv: 1.0, count: 1.0], [bucket id: 7, min: "8", max: "8", ndv: 1.0, count: 1.0], [bucket id: 8, min: "9", max: "9", ndv: 1.0, count: 1.0], [bucket id: 9, min: "10", max: "10", ndv: 1.0, count: 1.0]
+str_val 10 [bucket id: 0, min: "1.0", max: "1.0", ndv: 1.0, count: 1.0], [bucket id: 1, min: "10.0", max: "10.0", ndv: 1.0, count: 1.0], [bucket id: 2, min: "2.0", max: "2.0", ndv: 1.0, count: 1.0], [bucket id: 3, min: "3.0", max: "3.0", ndv: 1.0, count: 1.0], [bucket id: 4, min: "4.0", max: "4.0", ndv: 1.0, count: 1.0], [bucket id: 5, min: "5.0", max: "5.0", ndv: 1.0, count: 1.0], [bucket id: 6, min: "6.0", max: "6.0", ndv: 1.0, count: 1.0], [bucket id: 7, min: "7.0", max: "7.0", ndv: 1.0, count: 1.0], [bucket id: 8, min: "8.0", max: "8.0", ndv: 1.0, count: 1.0], [bucket id: 9, min: "9.0", max: "9.0", ndv: 1.0, count: 1.0]
+
+# String comparison is lexicographic: '10.0' sorts before '5.0', so only '6.0'..'9.0' match
+query I
+select count(*) from t_string where str_val > '5.0';
+----
+5
+
+# Test string range query with histogram
+query I
+select count(*) from t_string where str_val between '3.0' and '7.0';
+----
+5
+
+statement ok
+DROP TABLE t_string
+
 statement ok
 DROP DATABASE db_09_0020

From b477ed4da4dee1d5e60f91bf604de9dc3c491cc3 Mon Sep 17 00:00:00 2001
From: BohuTANG <overred.shuttler@gmail.com>
Date: Wed, 30 Apr 2025 21:29:26 +0800
Subject: [PATCH 3/4] fix t_string in analyze.test

---
 .../suites/base/09_fuse_engine/09_0020_analyze.test             | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test b/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test
index 5b9ce575f41de..c472c7ad3f691 100644
--- a/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test
+++ b/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test
@@ -188,7 +188,7 @@ str_val 10 [bucket id: 0, min: "1.0", max: "1.0", ndv: 1.0, count: 1.0], [bucket
 query I
 select count(*) from t_string where str_val > '5.0';
 ----
-5
+4
 
 # Test string range query with histogram
 query I

From e466753cdb8f247bbd0639164cf314d5c9f43796 Mon Sep 17 00:00:00 2001
From: BohuTANG <overred.shuttler@gmail.com>
Date: Tue, 6 May 2025 09:57:44 +0800
Subject: [PATCH 4/4] fix fuse_statistic after enable_analyze_histogram

---
 .../suites/base/09_fuse_engine/09_0020_analyze.test             | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test b/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test
index c472c7ad3f691..f61b1d9359cb1 100644
--- a/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test
+++ b/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test
@@ -179,7 +179,7 @@ analyze table t_string;
 
 # Verify that histogram was created for string column
 query IIT
-select * from fuse_statistic('db_09_0020', 't_string');
+select * from fuse_statistic('db_09_0020', 't_string') order by column_name asc;
 ----
 id 10 [bucket id: 0, min: "1", max: "1", ndv: 1.0, count: 1.0], [bucket id: 1, min: "2", max: "2", ndv: 1.0, count: 1.0], [bucket id: 2, min: "3", max: "3", ndv: 1.0, count: 1.0], [bucket id: 3, min: "4", max: "4", ndv: 1.0, count: 1.0], [bucket id: 4, min: "5", max: "5", ndv: 1.0, count: 1.0], [bucket id: 5, min: "6", max: "6", ndv: 1.0, count: 1.0], [bucket id: 6, min: "7", max: "7", ndv: 1.0, count: 1.0], [bucket id: 7, min: "8", max: "8", ndv: 1.0, count: 1.0], [bucket id: 8, min: "9", max: "9", ndv: 1.0, count: 1.0], [bucket id: 9, min: "10", max: "10", ndv: 1.0, count: 1.0]
 str_val 10 [bucket id: 0, min: "1.0", max: "1.0", ndv: 1.0, count: 1.0], [bucket id: 1, min: "10.0", max: "10.0", ndv: 1.0, count: 1.0], [bucket id: 2, min: "2.0", max: "2.0", ndv: 1.0, count: 1.0], [bucket id: 3, min: "3.0", max: "3.0", ndv: 1.0, count: 1.0], [bucket id: 4, min: "4.0", max: "4.0", ndv: 1.0, count: 1.0], [bucket id: 5, min: "5.0", max: "5.0", ndv: 1.0, count: 1.0], [bucket id: 6, min: "6.0", max: "6.0", ndv: 1.0, count: 1.0], [bucket id: 7, min: "7.0", max: "7.0", ndv: 1.0, count: 1.0], [bucket id: 8, min: "8.0", max: "8.0", ndv: 1.0, count: 1.0], [bucket id: 9, min: "9.0", max: "9.0", ndv: 1.0, count: 1.0]