Skip to content

feat: impl NgramIndex for FuseTable, improve like query performance #17852

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
2 changes: 1 addition & 1 deletion src/query/ee/tests/it/inverted_index/pruning.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ async fn apply_block_pruning(
let segment_locs = table_snapshot.segments.clone();
let segment_locs = create_segment_location_vector(segment_locs, None);

FusePruner::create(&ctx, dal, schema, push_down, bloom_index_cols, None)?
FusePruner::create(&ctx, dal, schema, push_down, bloom_index_cols, vec![], None)?
.read_pruning(segment_locs)
.await
}
Expand Down
4 changes: 2 additions & 2 deletions src/query/functions/src/scalars/hash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -250,13 +250,13 @@ where for<'a> T::ScalarRef<'a>: DFHash {
);
}

/// Stateful 64-bit hasher used by the scalar hash functions in this module.
/// Holds the caller-supplied seed and the running hash value.
/// NOTE(review): made `pub` (with `pub fn with_seed`) so it can be re-exported
/// from the crate root and reused by the ngram/bloom index code.
pub struct CityHasher64 {
    // Seed mixed into the hash computation; fixed at construction time.
    seed: u64,
    // Accumulated hash value; starts at zero for a fresh hasher.
    value: u64,
}

impl CityHasher64 {
    /// Creates a hasher initialized with seed `s` and a zeroed hash value.
    pub fn with_seed(s: u64) -> Self {
        Self { seed: s, value: 0 }
    }
}
Expand Down
2 changes: 2 additions & 0 deletions src/query/functions/src/scalars/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ mod vector;
pub use comparison::ALL_COMP_FUNC_NAMES;
use databend_functions_scalar_arithmetic::arithmetic;
use databend_functions_scalar_numeric_basic_arithmetic::register_numeric_basic_arithmetic;
pub use hash::CityHasher64;
pub use hash::DFHash;
pub use string::ALL_STRING_FUNC_NAMES;

pub fn register(registry: &mut FunctionRegistry) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ use databend_common_expression::ComputedExpr;
use databend_common_expression::DataBlock;
use databend_common_expression::Scalar;
use databend_common_expression::Value;
use databend_common_meta_app::schema::TableIndexType;
use databend_common_sql::plans::ShowCreateTablePlan;
use databend_common_storages_fuse::FUSE_OPT_KEY_ATTACH_COLUMN_IDS;
use databend_common_storages_stream::stream_table::StreamTable;
Expand Down Expand Up @@ -242,9 +243,14 @@ impl ShowCreateTableInterpreter {
let option = format!("{} = '{}'", key, value);
options.push(option);
}
let index_type = match index_field.index_type {
TableIndexType::Inverted => "INVERTED",
TableIndexType::Ngram => "NGRAM",
};
let mut index_str = format!(
" {} INVERTED INDEX {} ({})",
" {} {} INDEX {} ({})",
sync,
index_type,
display_ident(
&index_field.name,
force_quoted_ident,
Expand Down
3 changes: 2 additions & 1 deletion src/query/service/src/test_kits/block_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,8 @@ impl<'a> BlockWriter<'a> {
let bloom_index_cols = BloomIndexColumns::All;
let bloom_columns_map =
bloom_index_cols.bloom_index_fields(schema.clone(), BloomIndex::supported_type)?;
let mut builder = BloomIndexBuilder::create(FunctionContext::default(), bloom_columns_map);
let mut builder =
BloomIndexBuilder::create(FunctionContext::default(), bloom_columns_map, &[])?;
builder.add_block(block)?;
let maybe_bloom_index = builder.finalize()?;
if let Some(bloom_index) = maybe_bloom_index {
Expand Down
2 changes: 1 addition & 1 deletion src/query/service/tests/it/storages/fuse/pruning.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ async fn apply_block_pruning(
let ctx: Arc<dyn TableContext> = ctx;
let segment_locs = table_snapshot.segments.clone();
let segment_locs = create_segment_location_vector(segment_locs, None);
FusePruner::create(&ctx, op, schema, push_down, bloom_index_cols, None)?
FusePruner::create(&ctx, op, schema, push_down, bloom_index_cols, vec![], None)?
.read_pruning(segment_locs)
.await
.map(|v| v.into_iter().map(|(_, v)| v).collect())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ async fn apply_snapshot_pruning(
schema.clone(),
push_down,
bloom_index_cols,
vec![],
None,
)?);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ async fn apply_snapshot_pruning(
schema,
push_down,
bloom_index_cols,
vec![],
None,
)?);

Expand Down
47 changes: 43 additions & 4 deletions src/query/sql/src/planner/binder/ddl/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ use crate::MetadataRef;
use crate::RefreshAggregatingIndexRewriter;
use crate::SUPPORTED_AGGREGATING_INDEX_FUNCTIONS;

const MAXIMUM_BLOOM_SIZE: u64 = 10 * 1024 * 1024;
const MINIMUM_BLOOM_SIZE: u64 = 512;

// valid values for inverted index option tokenizer
static INDEX_TOKENIZER_VALUES: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
let mut r = HashSet::new();
Expand Down Expand Up @@ -580,13 +583,49 @@ impl Binder {
let value = val.to_lowercase();
match key.as_str() {
"gram_size" => {
if value.parse::<u32>().is_err() {
return Err(ErrorCode::IndexOptionInvalid(format!(
"value `{value}` is not a legal number",
)));
match value.parse::<usize>() {
Ok(num) => {
if num == 0 {
return Err(ErrorCode::IndexOptionInvalid(
"`gram_size` cannot be 0",
));
}
}
Err(_) => {
return Err(ErrorCode::IndexOptionInvalid(format!(
"value `{value}` is not a legal number",
)));
}
}
options.insert("gram_size".to_string(), value);
}
"bloom_size" => {
match value.parse::<u64>() {
Ok(num) => {
if num == 0 {
return Err(ErrorCode::IndexOptionInvalid(
"`bloom_size` cannot be 0",
));
}
if num < MINIMUM_BLOOM_SIZE {
return Err(ErrorCode::IndexOptionInvalid(format!(
"bloom_size: `{num}` is too small (bloom_size is minimum: {MINIMUM_BLOOM_SIZE})",
)));
}
if num > MAXIMUM_BLOOM_SIZE {
return Err(ErrorCode::IndexOptionInvalid(format!(
"bloom_size: `{num}` is too large (bloom_size is maximum: {MAXIMUM_BLOOM_SIZE})",
)));
}
}
Err(_) => {
return Err(ErrorCode::IndexOptionInvalid(format!(
"value `{value}` is not a legal number",
)));
}
}
options.insert("bloom_size".to_string(), value);
}
_ => {
return Err(ErrorCode::IndexOptionInvalid(format!(
"index option `{key}` is invalid key for create ngram index statement",
Expand Down
2 changes: 1 addition & 1 deletion src/query/storages/common/cache/src/cache_items.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ pub use databend_common_catalog::plan::PartStatistics;
pub use databend_common_catalog::plan::Partitions;
pub use databend_common_catalog::table::Table;
use databend_common_exception::ErrorCode;
pub use databend_storages_common_index::filters::Xor8Filter;
pub use databend_storages_common_index::filters::FilterImpl;
pub use databend_storages_common_index::BloomIndexMeta;
pub use databend_storages_common_index::InvertedIndexFile;
pub use databend_storages_common_index::InvertedIndexMeta;
Expand Down
10 changes: 5 additions & 5 deletions src/query/storages/common/cache/src/caches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ pub type TableSnapshotCache = InMemoryLruCache<TableSnapshot>;
pub type TableSnapshotStatisticCache = InMemoryLruCache<TableSnapshotStatistics>;
/// In memory object cache of bloom filter.
/// For each indexed data block, the bloom xor8 filter of column is cached individually
pub type BloomIndexFilterCache = HybridCache<Xor8Filter>;
pub type BloomIndexFilterCache = HybridCache<FilterImpl>;
/// In memory object cache of parquet FileMetaData of bloom index data
pub type BloomIndexMetaCache = HybridCache<BloomIndexMeta>;

Expand Down Expand Up @@ -123,7 +123,7 @@ impl CachedObject<(PartStatistics, Partitions)> for (PartStatistics, Partitions)
}
}

impl CachedObject<Xor8Filter> for Xor8Filter {
impl CachedObject<FilterImpl> for FilterImpl {
type Cache = BloomIndexFilterCache;
fn cache() -> Option<Self::Cache> {
CacheManager::instance().get_bloom_index_filter_cache()
Expand Down Expand Up @@ -235,10 +235,10 @@ impl From<TableSnapshotStatistics> for CacheValue<TableSnapshotStatistics> {
}
}

impl From<Xor8Filter> for CacheValue<Xor8Filter> {
fn from(value: Xor8Filter) -> Self {
impl From<FilterImpl> for CacheValue<FilterImpl> {
fn from(value: FilterImpl) -> Self {
CacheValue {
mem_bytes: std::mem::size_of::<Xor8Filter>() + value.filter.finger_prints.len(),
mem_bytes: value.mem_bytes(),
inner: Arc::new(value),
}
}
Expand Down
Loading
Loading