diff --git a/src/query/ee/tests/it/inverted_index/pruning.rs b/src/query/ee/tests/it/inverted_index/pruning.rs index d7b5e52dc34cc..98356ad4d7390 100644 --- a/src/query/ee/tests/it/inverted_index/pruning.rs +++ b/src/query/ee/tests/it/inverted_index/pruning.rs @@ -70,7 +70,7 @@ async fn apply_block_pruning( let segment_locs = table_snapshot.segments.clone(); let segment_locs = create_segment_location_vector(segment_locs, None); - FusePruner::create(&ctx, dal, schema, push_down, bloom_index_cols, None)? + FusePruner::create(&ctx, dal, schema, push_down, bloom_index_cols, vec![], None)? .read_pruning(segment_locs) .await } diff --git a/src/query/functions/src/scalars/hash.rs b/src/query/functions/src/scalars/hash.rs index cc008bd2647bd..ef94cd00c28a8 100644 --- a/src/query/functions/src/scalars/hash.rs +++ b/src/query/functions/src/scalars/hash.rs @@ -250,13 +250,13 @@ where for<'a> T::ScalarRef<'a>: DFHash { ); } -struct CityHasher64 { +pub struct CityHasher64 { seed: u64, value: u64, } impl CityHasher64 { - fn with_seed(s: u64) -> Self { + pub fn with_seed(s: u64) -> Self { Self { seed: s, value: 0 } } } diff --git a/src/query/functions/src/scalars/mod.rs b/src/query/functions/src/scalars/mod.rs index f096e80d16b24..3f6bc5fba3f63 100644 --- a/src/query/functions/src/scalars/mod.rs +++ b/src/query/functions/src/scalars/mod.rs @@ -48,6 +48,8 @@ mod vector; pub use comparison::ALL_COMP_FUNC_NAMES; use databend_functions_scalar_arithmetic::arithmetic; use databend_functions_scalar_numeric_basic_arithmetic::register_numeric_basic_arithmetic; +pub use hash::CityHasher64; +pub use hash::DFHash; pub use string::ALL_STRING_FUNC_NAMES; pub fn register(registry: &mut FunctionRegistry) { diff --git a/src/query/service/src/interpreters/interpreter_table_show_create.rs b/src/query/service/src/interpreters/interpreter_table_show_create.rs index e9609e23f2795..815bcda38e2b5 100644 --- a/src/query/service/src/interpreters/interpreter_table_show_create.rs +++ b/src/query/service/src/interpreters/interpreter_table_show_create.rs @@ -27,6 +27,7 @@ use databend_common_expression::ComputedExpr; use databend_common_expression::DataBlock; use databend_common_expression::Scalar; use databend_common_expression::Value; +use databend_common_meta_app::schema::TableIndexType; use databend_common_sql::plans::ShowCreateTablePlan; use databend_common_storages_fuse::FUSE_OPT_KEY_ATTACH_COLUMN_IDS; use databend_common_storages_stream::stream_table::StreamTable; @@ -242,9 +243,14 @@ impl ShowCreateTableInterpreter { let option = format!("{} = '{}'", key, value); options.push(option); } + let index_type = match index_field.index_type { + TableIndexType::Inverted => "INVERTED", + TableIndexType::Ngram => "NGRAM", + }; let mut index_str = format!( - " {} INVERTED INDEX {} ({})", + " {} {} INDEX {} ({})", sync, + index_type, display_ident( &index_field.name, force_quoted_ident, diff --git a/src/query/service/src/test_kits/block_writer.rs b/src/query/service/src/test_kits/block_writer.rs index 8831630b0a8c8..7bd9b7bed0729 100644 --- a/src/query/service/src/test_kits/block_writer.rs +++ b/src/query/service/src/test_kits/block_writer.rs @@ -130,7 +130,8 @@ impl<'a> BlockWriter<'a> { let bloom_index_cols = BloomIndexColumns::All; let bloom_columns_map = bloom_index_cols.bloom_index_fields(schema.clone(), BloomIndex::supported_type)?; - let mut builder = BloomIndexBuilder::create(FunctionContext::default(), bloom_columns_map); + let mut builder = + BloomIndexBuilder::create(FunctionContext::default(), bloom_columns_map, &[])?; builder.add_block(block)?; let maybe_bloom_index = builder.finalize()?; if let Some(bloom_index) = maybe_bloom_index { diff --git a/src/query/service/tests/it/storages/fuse/pruning.rs b/src/query/service/tests/it/storages/fuse/pruning.rs index a24fa8dbacc27..285a864670678 100644 --- a/src/query/service/tests/it/storages/fuse/pruning.rs +++ b/src/query/service/tests/it/storages/fuse/pruning.rs @@ -63,7 +63,7 @@ async fn apply_block_pruning( let ctx: Arc = ctx; let segment_locs = table_snapshot.segments.clone(); let segment_locs = create_segment_location_vector(segment_locs, None); - FusePruner::create(&ctx, op, schema, push_down, bloom_index_cols, None)? + FusePruner::create(&ctx, op, schema, push_down, bloom_index_cols, vec![], None)? .read_pruning(segment_locs) .await .map(|v| v.into_iter().map(|(_, v)| v).collect()) diff --git a/src/query/service/tests/it/storages/fuse/pruning_column_oriented_segment.rs b/src/query/service/tests/it/storages/fuse/pruning_column_oriented_segment.rs index eac7a0eb076fb..6ad0979b0dfd7 100644 --- a/src/query/service/tests/it/storages/fuse/pruning_column_oriented_segment.rs +++ b/src/query/service/tests/it/storages/fuse/pruning_column_oriented_segment.rs @@ -80,6 +80,7 @@ async fn apply_snapshot_pruning( schema.clone(), push_down, bloom_index_cols, + vec![], None, )?); diff --git a/src/query/service/tests/it/storages/fuse/pruning_pipeline.rs b/src/query/service/tests/it/storages/fuse/pruning_pipeline.rs index c58d946296c5b..5281b9acd59a4 100644 --- a/src/query/service/tests/it/storages/fuse/pruning_pipeline.rs +++ b/src/query/service/tests/it/storages/fuse/pruning_pipeline.rs @@ -80,6 +80,7 @@ async fn apply_snapshot_pruning( schema, push_down, bloom_index_cols, + vec![], None, )?); diff --git a/src/query/sql/src/planner/binder/ddl/index.rs b/src/query/sql/src/planner/binder/ddl/index.rs index 75daf7ddcf4b8..0a14313f856aa 100644 --- a/src/query/sql/src/planner/binder/ddl/index.rs +++ b/src/query/sql/src/planner/binder/ddl/index.rs @@ -65,6 +65,9 @@ use crate::MetadataRef; use crate::RefreshAggregatingIndexRewriter; use crate::SUPPORTED_AGGREGATING_INDEX_FUNCTIONS; +const MAXIMUM_BLOOM_SIZE: u64 = 10 * 1024 * 1024; +const MINIMUM_BLOOM_SIZE: u64 = 512; + // valid values for inverted index option tokenizer static INDEX_TOKENIZER_VALUES: LazyLock> = LazyLock::new(|| { let mut r = HashSet::new(); @@ -580,13 +583,49 @@ impl Binder { let value = val.to_lowercase(); match key.as_str() { "gram_size" => { - if value.parse::().is_err() { - return Err(ErrorCode::IndexOptionInvalid(format!( - "value `{value}` is not a legal number", - ))); + match value.parse::() { + Ok(num) => { + if num == 0 { + return Err(ErrorCode::IndexOptionInvalid( + "`gram_size` cannot be 0", + )); + } + } + Err(_) => { + return Err(ErrorCode::IndexOptionInvalid(format!( + "value `{value}` is not a legal number", + ))); + } } options.insert("gram_size".to_string(), value); } + "bloom_size" => { + match value.parse::() { + Ok(num) => { + if num == 0 { + return Err(ErrorCode::IndexOptionInvalid( + "`bloom_size` cannot be 0", + )); + } + if num < MINIMUM_BLOOM_SIZE { + return Err(ErrorCode::IndexOptionInvalid(format!( + "bloom_size: `{num}` is too small (bloom_size is minimum: {MINIMUM_BLOOM_SIZE})", + ))); + } + if num > MAXIMUM_BLOOM_SIZE { + return Err(ErrorCode::IndexOptionInvalid(format!( + "bloom_size: `{num}` is too large (bloom_size is maximum: {MAXIMUM_BLOOM_SIZE})", + ))); + } + } + Err(_) => { + return Err(ErrorCode::IndexOptionInvalid(format!( + "value `{value}` is not a legal number", + ))); + } + } + options.insert("bloom_size".to_string(), value); + } _ => { return Err(ErrorCode::IndexOptionInvalid(format!( "index option `{key}` is invalid key for create ngram index statement", diff --git a/src/query/storages/common/cache/src/cache_items.rs b/src/query/storages/common/cache/src/cache_items.rs index 7184b803be3fe..84bc4cb8dd13f 100644 --- a/src/query/storages/common/cache/src/cache_items.rs +++ b/src/query/storages/common/cache/src/cache_items.rs @@ -17,7 +17,7 @@ pub use databend_common_catalog::plan::PartStatistics; pub use databend_common_catalog::plan::Partitions; pub use databend_common_catalog::table::Table; use databend_common_exception::ErrorCode; -pub use databend_storages_common_index::filters::Xor8Filter; +pub use databend_storages_common_index::filters::FilterImpl; pub use databend_storages_common_index::BloomIndexMeta; pub use databend_storages_common_index::InvertedIndexFile; pub use databend_storages_common_index::InvertedIndexMeta; diff --git a/src/query/storages/common/cache/src/caches.rs b/src/query/storages/common/cache/src/caches.rs index 7e2ec9663e6d6..cfe565a00349b 100644 --- a/src/query/storages/common/cache/src/caches.rs +++ b/src/query/storages/common/cache/src/caches.rs @@ -45,7 +45,7 @@ pub type TableSnapshotCache = InMemoryLruCache; pub type TableSnapshotStatisticCache = InMemoryLruCache; /// In memory object cache of bloom filter. /// For each indexed data block, the bloom xor8 filter of column is cached individually -pub type BloomIndexFilterCache = HybridCache; +pub type BloomIndexFilterCache = HybridCache; /// In memory object cache of parquet FileMetaData of bloom index data pub type BloomIndexMetaCache = HybridCache; @@ -123,7 +123,7 @@ impl CachedObject<(PartStatistics, Partitions)> for (PartStatistics, Partitions) } } -impl CachedObject for Xor8Filter { +impl CachedObject for FilterImpl { type Cache = BloomIndexFilterCache; fn cache() -> Option { CacheManager::instance().get_bloom_index_filter_cache() @@ -235,10 +235,10 @@ impl From for CacheValue { } } -impl From for CacheValue { - fn from(value: Xor8Filter) -> Self { +impl From for CacheValue { + fn from(value: FilterImpl) -> Self { CacheValue { - mem_bytes: std::mem::size_of::() + value.filter.finger_prints.len(), + mem_bytes: value.mem_bytes(), inner: Arc::new(value), } } diff --git a/src/query/storages/common/index/src/bloom_index.rs b/src/query/storages/common/index/src/bloom_index.rs index eae1aad7edede..b0a7d90b75202 100644 --- a/src/query/storages/common/index/src/bloom_index.rs +++ b/src/query/storages/common/index/src/bloom_index.rs @@ -14,6 +14,7 @@ use std::collections::BTreeMap; use std::collections::HashMap; +use std::hash::Hasher; use std::ops::ControlFlow; use std::ops::Deref; use std::sync::Arc; @@ -25,6 +26,7 @@ use databend_common_exception::Result; use databend_common_expression::converts::datavalues::scalar_to_datavalue; use databend_common_expression::eval_function; use databend_common_expression::expr::*; +use databend_common_expression::generate_like_pattern; use databend_common_expression::types::boolean::BooleanDomain; use databend_common_expression::types::nullable::NullableDomain; use databend_common_expression::types::AnyType; @@ -49,6 +51,7 @@ use databend_common_expression::Expr; use databend_common_expression::ExprVisitor; use databend_common_expression::FieldIndex; use databend_common_expression::FunctionContext; +use databend_common_expression::LikePattern; use databend_common_expression::Scalar; use databend_common_expression::ScalarRef; use databend_common_expression::TableDataType; @@ -56,6 +59,8 @@ use databend_common_expression::TableField; use databend_common_expression::TableSchema; use databend_common_expression::TableSchemaRef; use databend_common_expression::Value; +use databend_common_functions::scalars::CityHasher64; +use databend_common_functions::scalars::DFHash; use databend_common_functions::BUILTIN_FUNCTIONS; use databend_storages_common_table_meta::meta::SingleColumnMeta; use databend_storages_common_table_meta::meta::StatisticsOfColumns; @@ -69,14 +74,19 @@ use super::eliminate_cast::is_injective_cast; use crate::eliminate_cast::cast_const; use crate::filters::BlockBloomFilterIndexVersion; use crate::filters::BlockFilter; +use crate::filters::BloomBuilder; use crate::filters::Filter; use crate::filters::FilterBuilder; +use crate::filters::FilterImpl; +use crate::filters::FilterImplBuilder; use crate::filters::V2BloomBlock; use crate::filters::Xor8Builder; use crate::filters::Xor8Filter; use crate::statistics_to_domain; use crate::Index; +const NGRAM_HASH_SEED: u64 = 1575457558; + #[derive(Clone, Serialize, Deserialize)] pub struct BloomIndexMeta { pub columns: Vec<(String, SingleColumnMeta)>, @@ -178,7 +188,7 @@ pub struct BloomIndex { pub version: u64, /// filters. - pub filters: Vec>, + pub filters: Vec>, /// Approximate distinct count of columns generated by xor hash function. pub column_distinct_count: HashMap, @@ -196,13 +206,20 @@ pub enum FilterEvalResult { Uncertain, } +pub struct BloomIndexResult { + pub bloom_fields: Vec, + pub bloom_scalars: Vec<(usize, Scalar, DataType)>, + pub ngram_fields: Vec, + pub ngram_scalars: Vec<(usize, Scalar)>, +} + impl BloomIndex { /// Load a filter directly from the source table's schema and the corresponding filter parquet file. #[fastrace::trace] pub fn from_filter_block( func_ctx: FunctionContext, filter_schema: TableSchemaRef, - filters: Vec>, + filters: Vec>, version: u64, ) -> Result { Ok(Self { @@ -234,11 +251,20 @@ impl BloomIndex { pub fn apply( &self, expr: Expr, - scalar_map: &HashMap, + eq_scalar_map: &HashMap, + like_scalar_map: &HashMap>, + ngram_args: &[NgramArgs], column_stats: &StatisticsOfColumns, data_schema: TableSchemaRef, ) -> Result { - let (expr, domains) = self.rewrite_expr(expr, scalar_map, column_stats, data_schema)?; + let (expr, domains) = self.rewrite_expr( + expr, + eq_scalar_map, + like_scalar_map, + ngram_args, + column_stats, + data_schema, + )?; match ConstantFolder::fold_with_domain(&expr, &domains, &self.func_ctx, &BUILTIN_FUNCTIONS) .0 { @@ -253,7 +279,9 @@ impl BloomIndex { pub fn rewrite_expr( &self, expr: Expr, - scalar_map: &HashMap, + eq_scalar_map: &HashMap, + like_scalar_map: &HashMap>, + ngram_args: &[NgramArgs], column_stats: &StatisticsOfColumns, data_schema: TableSchemaRef, ) -> Result<(Expr, HashMap)> { @@ -290,7 +318,9 @@ impl BloomIndex { new_col_id: 1, index: self, data_schema, - scalar_map, + eq_scalar_map, + like_scalar_map, + ngram_args, column_stats, domains: &mut domains, }; @@ -347,6 +377,52 @@ impl BloomIndex { }) } + pub fn calculate_ngram_nullable_column<'a, F, T>( + arg: Value, + gram_size: usize, + fn_call: F, + ) -> impl Iterator> + 'a + where + F: Fn(&str) -> T + 'a, + { + (0..arg.len()).filter_map(move |i| { + arg.index(i).and_then(|scalar| { + scalar.as_string().and_then(|text| { + let text = text.to_lowercase(); + if text.is_empty() || gram_size > text.chars().count() { + return None; + } + + let indices: Vec<_> = text.char_indices().map(|(i, _)| i).collect(); + let char_count = indices.len(); + + if gram_size > char_count { + return None; + } + + let times = char_count - gram_size + 1; + let mut words = Vec::with_capacity(times); + for j in 0..times { + let start = indices[j]; + let end = if j + gram_size < char_count { + indices[j + gram_size] + } else { + text.len() + }; + words.push(fn_call(&text[start..end])); + } + Some(words) + }) + }) + }) + } + + pub fn ngram_hash(s: &str) -> u64 { + let mut hasher = CityHasher64::with_seed(NGRAM_HASH_SEED); + DFHash::hash(s, &mut hasher); + hasher.finish() + } + /// calculate digest for constant scalar pub fn calculate_scalar_digest( func_ctx: &FunctionContext, @@ -367,26 +443,38 @@ impl BloomIndex { } /// Find all columns that can be use for index in the expression. - #[expect(clippy::type_complexity)] pub fn filter_index_field( expr: &Expr, - fields: &[TableField], - ) -> Result<(Vec, Vec<(Scalar, DataType)>)> { + bloom_fields: Vec, + ngram_fields: Vec, + ) -> Result { let mut visitor = Visitor(ShortListVisitor { - fields: fields.to_vec(), - founds: Vec::new(), - scalars: Vec::new(), + bloom_fields, + ngram_fields, + bloom_founds: Vec::new(), + ngram_founds: Vec::new(), + bloom_scalars: Vec::new(), + ngram_scalars: Vec::new(), }); visit_expr(expr, &mut visitor)?; let Visitor(ShortListVisitor { - founds, scalars, .. + bloom_founds, + ngram_founds, + bloom_scalars, + ngram_scalars, + .. }) = visitor; - Ok((founds, scalars)) + Ok(BloomIndexResult { + bloom_fields: bloom_founds, + bloom_scalars, + ngram_fields: ngram_founds, + ngram_scalars, + }) } /// For every applicable column, we will create a filter. /// The filter will be stored with field name 'Bloom(column_name)' - pub fn build_filter_column_name(version: u64, field: &TableField) -> Result { + pub fn build_filter_bloom_name(version: u64, field: &TableField) -> Result { let index_version = BlockBloomFilterIndexVersion::try_from(version)?; match index_version { BlockBloomFilterIndexVersion::V0(_) => Err(ErrorCode::DeprecatedIndexFormat( @@ -399,14 +487,31 @@ impl BloomIndex { } } + pub fn build_filter_ngram_name(field: &TableField, gram_size: usize) -> String { + format!("Ngram({})_{gram_size}", field.column_id()) + } + fn find( &self, - filter_column: &str, + table_field: &TableField, target: &Scalar, ty: &DataType, - scalar_map: &HashMap, + eq_scalar_map: &HashMap, + like_scalar_map: &HashMap>, + ngram_args: &[NgramArgs], + is_like: bool, ) -> Result { - if !self.filter_schema.has_field(filter_column) + let filter_column = if is_like { + let Some(ngram_arg) = ngram_args.iter().find(|arg| &arg.field == table_field) else { + // The column doesn't have a Ngram Arg. + return Ok(FilterEvalResult::Uncertain); + }; + BloomIndex::build_filter_ngram_name(table_field, ngram_arg.gram_size) + } else { + BloomIndex::build_filter_bloom_name(self.version, table_field)? + }; + + if !self.filter_schema.has_field(&filter_column) || !Xor8Filter::supported_type(ty) || target.is_null() { @@ -414,14 +519,18 @@ impl BloomIndex { return Ok(FilterEvalResult::Uncertain); } - let idx = self.filter_schema.index_of(filter_column)?; + let idx = self.filter_schema.index_of(&filter_column)?; let filter = &self.filters[idx]; let contains = if self.version == V2BloomBlock::VERSION { let data_value = scalar_to_datavalue(target); filter.contains(&data_value) + } else if is_like { + like_scalar_map + .get(target) + .is_none_or(|digests| digests.iter().all(|digest| filter.contains_digest(*digest))) } else { - scalar_map + eq_scalar_map .get(target) .is_none_or(|digest| filter.contains_digest(*digest)) }; @@ -453,31 +562,85 @@ impl BloomIndex { pub struct BloomIndexBuilder { func_ctx: FunctionContext, - columns: Vec, + bloom_columns: Vec, + ngram_columns: Vec, +} + +struct ColumnFilterBuilder { + index: FieldIndex, + field: TableField, + gram_size: usize, + builder: FilterImplBuilder, } -struct ColumnXor8Builder { +#[derive(Clone)] +pub struct NgramArgs { index: FieldIndex, field: TableField, - builder: Xor8Builder, + gram_size: usize, + bloom_size: u64, +} + +impl NgramArgs { + pub fn new(index: FieldIndex, field: TableField, gram_size: usize, bloom_size: u64) -> Self { + Self { + index, + field, + gram_size, + bloom_size, + } + } + + pub fn field(&self) -> &TableField { + &self.field + } + + pub fn gram_size(&self) -> usize { + self.gram_size + } + + pub fn bloom_size(&self) -> u64 { + self.bloom_size + } } impl BloomIndexBuilder { pub fn create( func_ctx: FunctionContext, bloom_columns_map: BTreeMap, - ) -> Self { - let columns = bloom_columns_map - .iter() - .map(|(&index, field)| ColumnXor8Builder { + ngram_args: &[NgramArgs], + ) -> Result { + let mut bloom_columns = Vec::with_capacity(bloom_columns_map.len()); + let mut ngram_columns = Vec::with_capacity(ngram_args.len()); + for (&index, field) in bloom_columns_map.iter() { + bloom_columns.push(ColumnFilterBuilder { index, field: field.clone(), - builder: Xor8Builder::create(), - }) - .collect(); - Self { func_ctx, columns } + gram_size: 0, + builder: FilterImplBuilder::Xor(Xor8Builder::create()), + }); + } + for arg in ngram_args.iter() { + ngram_columns.push(ColumnFilterBuilder { + index: arg.index, + field: arg.field.clone(), + gram_size: arg.gram_size, + builder: FilterImplBuilder::Ngram(BloomBuilder::create( + arg.bloom_size, + NGRAM_HASH_SEED, + )), + }); + } + + Ok(Self { + func_ctx, + bloom_columns, + ngram_columns, + }) } +} +impl BloomIndexBuilder { pub fn add_block(&mut self, block: &DataBlock) -> Result<()> { if block.is_empty() { return Err(ErrorCode::BadArguments("block is empty")); @@ -486,15 +649,16 @@ impl BloomIndexBuilder { return Ok(()); } - let mut keys_to_remove = Vec::with_capacity(self.columns.len()); - for (index, bloom_index_column) in self.columns.iter_mut().enumerate() { - let field_type = &block.get_by_offset(bloom_index_column.index).data_type; + let mut bloom_keys_to_remove = Vec::with_capacity(self.bloom_columns.len()); + + for (index, index_column) in self.bloom_columns.iter_mut().enumerate() { + let field_type = &block.get_by_offset(index_column.index).data_type; if !Xor8Filter::supported_type(field_type) { - keys_to_remove.push(index); + bloom_keys_to_remove.push(index); continue; } - let column = match &block.get_by_offset(bloom_index_column.index).value { + let column = match &block.get_by_offset(index_column.index).value { Value::Scalar(s) => { let builder = ColumnBuilder::repeat(&s.as_ref(), 1, field_type); builder.build() @@ -538,14 +702,14 @@ impl BloomIndexBuilder { } let str_column = builder.build(); if BloomIndex::check_large_string(&str_column) { - keys_to_remove.push(index); + bloom_keys_to_remove.push(index); continue; } let str_type = DataType::Nullable(Box::new(DataType::String)); (str_column, str_type) } else { if BloomIndex::check_large_string(&column) { - keys_to_remove.push(index); + bloom_keys_to_remove.push(index); continue; } (column, val_type) @@ -553,7 +717,7 @@ impl BloomIndexBuilder { } _ => { if BloomIndex::check_large_string(&column) { - keys_to_remove.push(index); + bloom_keys_to_remove.push(index); continue; } (column, field_type.clone()) @@ -562,7 +726,6 @@ impl BloomIndexBuilder { let (column, validity) = BloomIndex::calculate_nullable_column_digest(&self.func_ctx, &column, &data_type)?; - // create filter per column if validity.as_ref().map(|v| v.null_count()).unwrap_or(0) > 0 { let validity = validity.unwrap(); @@ -575,29 +738,50 @@ impl BloomIndexBuilder { } }, ); - bloom_index_column.builder.add_digests(it); + index_column.builder.add_digests(it); } else { - bloom_index_column.builder.add_digests(column.deref()); + index_column.builder.add_digests(column.deref()); } } - for k in keys_to_remove { - self.columns.remove(k); + for index_column in self.ngram_columns.iter_mut() { + let field_type = &block.get_by_offset(index_column.index).data_type; + let column = match &block.get_by_offset(index_column.index).value { + Value::Scalar(s) => { + let builder = ColumnBuilder::repeat(&s.as_ref(), 1, field_type); + builder.build() + } + Value::Column(c) => c.clone(), + }; + + for digests in BloomIndex::calculate_ngram_nullable_column( + Value::Column(column), + index_column.gram_size, + BloomIndex::ngram_hash, + ) { + if digests.is_empty() { + continue; + } + index_column.builder.add_digests(digests.iter()) + } + } + for k in bloom_keys_to_remove { + self.bloom_columns.remove(k); } Ok(()) } - pub fn finalize(mut self) -> Result> { - let mut column_distinct_count = HashMap::with_capacity(self.columns.len()); - let mut filters = Vec::with_capacity(self.columns.len()); - let mut filter_fields = Vec::with_capacity(self.columns.len()); - for column in self.columns.iter_mut() { - let filter = column.builder.build()?; + pub fn finalize(&mut self) -> Result> { + let mut column_distinct_count = HashMap::with_capacity(self.columns_len()); + let mut filters = Vec::with_capacity(self.columns_len()); + let mut filter_fields = Vec::with_capacity(self.columns_len()); + for bloom_column in self.bloom_columns.iter_mut() { + let filter = bloom_column.builder.build()?; if let Some(len) = filter.len() { if !matches!( - column.field.data_type().remove_nullable(), + bloom_column.field.data_type().remove_nullable(), TableDataType::Map(_) | TableDataType::Variant ) { - column_distinct_count.insert(column.field.column_id, len); + column_distinct_count.insert(bloom_column.field.column_id, len); // Not need to generate bloom index, // it will never be used since range index is checked first. if len < 2 { @@ -606,7 +790,14 @@ impl BloomIndexBuilder { } } let filter_name = - BloomIndex::build_filter_column_name(BlockFilter::VERSION, &column.field)?; + BloomIndex::build_filter_bloom_name(BlockFilter::VERSION, &bloom_column.field)?; + filter_fields.push(TableField::new(&filter_name, TableDataType::Binary)); + filters.push(Arc::new(filter)); + } + for ngram_column in self.ngram_columns.iter_mut() { + let filter = ngram_column.builder.build()?; + let filter_name = + BloomIndex::build_filter_ngram_name(&ngram_column.field, ngram_column.gram_size); filter_fields.push(TableField::new(&filter_name, TableDataType::Binary)); filters.push(Arc::new(filter)); } @@ -616,13 +807,17 @@ impl BloomIndexBuilder { } let filter_schema = Arc::new(TableSchema::new(filter_fields)); Ok(Some(BloomIndex { - func_ctx: self.func_ctx, + func_ctx: self.func_ctx.clone(), version: BlockFilter::VERSION, filter_schema, filters, column_distinct_count, })) } + + pub fn columns_len(&self) -> usize { + self.bloom_columns.len() + self.ngram_columns.len() + } } struct Visitor(T); @@ -641,129 +836,178 @@ where T: EqVisitor .. } = expr; - if id.name() != "eq" { + if id.name() != "eq" && id.name() != "like" { return Self::visit_function_call(expr, self); } + let mut result = ControlFlow::Continue(None); - match match args.as_slice() { - // patterns like `Column = `, ` = Column` - [Expr::ColumnRef(ColumnRef { + if id.name() == "like" { + // patterns like `Column like ` + if let [Expr::ColumnRef(ColumnRef { id, data_type: column_type, .. - }), Expr::Constant(Constant { - scalar, - data_type: scalar_type, - .. - })] - | [Expr::Constant(Constant { - scalar, - data_type: scalar_type, - .. - }), Expr::ColumnRef(ColumnRef { - id, - data_type: column_type, - .. - })] => { - // decimal don't respect datatype equal - // debug_assert_eq!(scalar_type, column_type); - // If the visitor returns a new expression, then replace with the current expression. - if scalar_type == column_type { - self.0 - .enter_target(*span, id, scalar, column_type, return_type)? - } else { - ControlFlow::Continue(None) - } - } - // patterns like `MapColumn[] = `, ` = MapColumn[]` - [Expr::FunctionCall(FunctionCall { id, args, .. }), Expr::Constant(Constant { - scalar, - data_type: scalar_type, - .. - })] - | [Expr::Constant(Constant { - scalar, - data_type: scalar_type, - .. - }), Expr::FunctionCall(FunctionCall { id, args, .. })] - if id.name() == "get" => + }), Expr::Constant(Constant { scalar, .. })] = args.as_slice() { - self.0 - .enter_map_column(*span, args, scalar, scalar_type, return_type)? + if let Some(pattern) = scalar.as_string() { + match generate_like_pattern(pattern.as_bytes(), 1) { + LikePattern::StartOfPercent(v) | LikePattern::EndOfPercent(v) => { + let string = String::from_utf8_lossy(v.as_ref()).to_string(); + + result = self.0.enter_target( + *span, + id, + &Scalar::String(string), + column_type, + return_type, + true, + )?; + } + LikePattern::SurroundByPercent(v) => { + let string = String::from_utf8_lossy(v.needle()).to_string(); + + result = self.0.enter_target( + *span, + id, + &Scalar::String(string), + column_type, + return_type, + true, + )?; + } + _ => (), + } + } } - // patterns like `CAST(MapColumn[] as X) = `, ` = CAST(MapColumn[] as X)` - [Expr::Cast(Cast { - expr: - box Expr::FunctionCall(FunctionCall { - id, - args, - return_type, - .. - }), - dest_type, - .. - }), Expr::Constant(Constant { - scalar, - data_type: scalar_type, - .. - })] - | [Expr::Constant(Constant { - scalar, - data_type: scalar_type, - .. - }), Expr::Cast(Cast { - expr: - box Expr::FunctionCall(FunctionCall { - id, - args, - return_type, - .. - }), - dest_type, - .. - })] if id.name() == "get" => { - // Only support cast variant value in map to string value - if return_type.remove_nullable() != DataType::Variant - || dest_type.remove_nullable() != DataType::String + } else { + result = match args.as_slice() { + // patterns like `Column = `, ` = Column` + [Expr::ColumnRef(ColumnRef { + id, + data_type: column_type, + .. + }), Expr::Constant(Constant { + scalar, + data_type: scalar_type, + .. + })] + | [Expr::Constant(Constant { + scalar, + data_type: scalar_type, + .. + }), Expr::ColumnRef(ColumnRef { + id, + data_type: column_type, + .. + })] => { + // decimal don't respect datatype equal + // debug_assert_eq!(scalar_type, column_type); + // If the visitor returns a new expression, then replace with the current expression. + if scalar_type == column_type { + self.0 + .enter_target(*span, id, scalar, column_type, return_type, false)? + } else { + ControlFlow::Continue(None) + } + } + // patterns like `MapColumn[] = `, ` = MapColumn[]` + [Expr::FunctionCall(FunctionCall { id, args, .. }), Expr::Constant(Constant { + scalar, + data_type: scalar_type, + .. + })] + | [Expr::Constant(Constant { + scalar, + data_type: scalar_type, + .. + }), Expr::FunctionCall(FunctionCall { id, args, .. })] + if id.name() == "get" => { - ControlFlow::Break(None) - } else { self.0 - .enter_map_column(*span, args, scalar, scalar_type, return_type)? + .enter_map_column(*span, args, scalar, scalar_type, return_type, false)? + } + // patterns like `CAST(MapColumn[] as X) = `, ` = CAST(MapColumn[] as X)` + [Expr::Cast(Cast { + expr: + box Expr::FunctionCall(FunctionCall { + id, + args, + return_type, + .. + }), + dest_type, + .. + }), Expr::Constant(Constant { + scalar, + data_type: scalar_type, + .. + })] + | [Expr::Constant(Constant { + scalar, + data_type: scalar_type, + .. + }), Expr::Cast(Cast { + expr: + box Expr::FunctionCall(FunctionCall { + id, + args, + return_type, + .. + }), + dest_type, + .. + })] if id.name() == "get" => { + // Only support cast variant value in map to string value + if return_type.remove_nullable() != DataType::Variant + || dest_type.remove_nullable() != DataType::String + { + ControlFlow::Break(None) + } else { + self.0.enter_map_column( + *span, + args, + scalar, + scalar_type, + return_type, + false, + )? + } + } + [cast @ Expr::Cast(_), Expr::Constant(constant)] + | [Expr::Constant(constant), cast @ Expr::Cast(_)] => { + self.0.enter_cast(cast, constant, false)? } - } - [cast @ Expr::Cast(_), Expr::Constant(constant)] - | [Expr::Constant(constant), cast @ Expr::Cast(_)] => { - self.0.enter_cast(cast, constant)? - } - [func @ Expr::FunctionCall(FunctionCall { - id, - args, - return_type: dest_type, - .. - }), Expr::Constant(constant)] - | [Expr::Constant(constant), func @ Expr::FunctionCall(FunctionCall { - id, - args, - return_type: dest_type, - .. - })] if id.name().starts_with("to_") - && args.len() == 1 - && func.contains_column_ref() => - { - self.0.enter_cast( - &Expr::Cast(Cast { - span: *span, - is_try: false, - expr: Box::new(args[0].clone()), - dest_type: dest_type.clone(), - }), - constant, - )? - } - _ => ControlFlow::Continue(None), - } { + [func @ Expr::FunctionCall(FunctionCall { + id, + args, + return_type: dest_type, + .. + }), Expr::Constant(constant)] + | [Expr::Constant(constant), func @ Expr::FunctionCall(FunctionCall { + id, + args, + return_type: dest_type, + .. + })] if id.name().starts_with("to_") + && args.len() == 1 + && func.contains_column_ref() => + { + self.0.enter_cast( + &Expr::Cast(Cast { + span: *span, + is_try: false, + expr: Box::new(args[0].clone()), + dest_type: dest_type.clone(), + }), + constant, + false, + )? + } + _ => ControlFlow::Continue(None), + }; + } + match result { ControlFlow::Continue(Some(expr)) => visit_expr(&expr, self), ControlFlow::Continue(None) => Self::visit_function_call(expr, self), ControlFlow::Break(expr) => Ok(expr), @@ -788,6 +1032,7 @@ trait EqVisitor { scalar: &Scalar, ty: &DataType, return_type: &DataType, + is_like: bool, ) -> ResultRewrite; fn enter_map_column( @@ -797,7 +1042,11 @@ trait EqVisitor { scalar: &Scalar, scalar_type: &DataType, return_type: &DataType, + is_like: bool, ) -> ResultRewrite { + if is_like { + return Ok(ControlFlow::Continue(None)); + } match &args[0] { Expr::ColumnRef(ColumnRef { id, data_type, .. }) | Expr::Cast(Cast { @@ -825,6 +1074,7 @@ trait EqVisitor { &new_scalar, &new_scalar_type, return_type, + is_like, ); } } @@ -835,7 +1085,7 @@ trait EqVisitor { } else if val_type.remove_nullable() != scalar_type.remove_nullable() { return Ok(ControlFlow::Continue(None)); } - return self.enter_target(span, id, scalar, scalar_type, return_type); + return self.enter_target(span, id, scalar, scalar_type, return_type, is_like); } } _ => {} @@ -843,7 +1093,7 @@ trait EqVisitor { Ok(ControlFlow::Continue(None)) } - fn enter_cast(&mut self, _cast: &Expr, _constant: &Constant) -> ResultRewrite { + fn enter_cast(&mut self, _cast: &Expr, _constant: &Constant, _: bool) -> ResultRewrite { Ok(ControlFlow::Continue(None)) } } @@ -852,7 +1102,9 @@ struct RewriteVisitor<'a> { new_col_id: usize, index: &'a BloomIndex, data_schema: TableSchemaRef, - scalar_map: &'a HashMap, + eq_scalar_map: &'a HashMap, + like_scalar_map: &'a HashMap>, + ngram_args: &'a [NgramArgs], column_stats: &'a StatisticsOfColumns, domains: &'a mut HashMap, } @@ -865,20 +1117,23 @@ impl EqVisitor for RewriteVisitor<'_> { scalar: &Scalar, ty: &DataType, return_type: &DataType, + is_like: bool, ) -> ResultRewrite { - let filter_column = &BloomIndex::build_filter_column_name( - self.index.version, - self.data_schema.field_with_name(col_name)?, - )?; + let table_field = self.data_schema.field_with_name(col_name)?; // If the column doesn't contain the constant, // we rewrite the expression to a new column with `false` domain. - if self - .index - .find(filter_column, scalar, ty, self.scalar_map)? - != FilterEvalResult::MustFalse + if self.index.find( + table_field, + scalar, + ty, + self.eq_scalar_map, + self.like_scalar_map, + self.ngram_args, + is_like, + )? != FilterEvalResult::MustFalse { - return Ok(ControlFlow::Break(None)); + return Ok(ControlFlow::Continue(None)); } let new_col_name = format!("__bloom_column_{}_{}", col_name, self.new_col_id); self.new_col_id += 1; @@ -916,7 +1171,15 @@ impl EqVisitor for RewriteVisitor<'_> { ))) } - fn enter_cast(&mut self, cast: &Expr, constant: &Constant) -> ResultRewrite { + fn enter_cast( + &mut self, + cast: &Expr, + constant: &Constant, + is_like: bool, + ) -> ResultRewrite { + if is_like { + return Ok(ControlFlow::Continue(None)); + } let Expr::Cast(Cast { span, is_try: false, @@ -973,19 +1236,26 @@ impl EqVisitor for RewriteVisitor<'_> { } else { DataType::Boolean }, + is_like, ) } } struct ShortListVisitor { - fields: Vec, - founds: Vec, - scalars: Vec<(Scalar, DataType)>, + bloom_fields: Vec, + ngram_fields: Vec, + bloom_founds: Vec, + ngram_founds: Vec, + bloom_scalars: Vec<(usize, Scalar, DataType)>, + ngram_scalars: Vec<(usize, Scalar)>, } impl ShortListVisitor { - fn found_field(&self, name: &str) -> Option<&TableField> { - self.fields.iter().find(|field| field.name == name) + fn found_field<'a>(fields: &'a [TableField], name: &str) -> Option<(usize, &'a TableField)> { + fields + .iter() + .enumerate() + .find_map(|(i, field)| (field.name == name).then_some((i, field))) } } @@ -997,17 +1267,33 @@ impl EqVisitor for ShortListVisitor { scalar: &Scalar, ty: &DataType, _: &DataType, + is_like: bool, ) -> ResultRewrite { - if let Some(v) = self.found_field(col_name) { + if is_like { + if let Some((i, v)) = Self::found_field(&self.ngram_fields, col_name) { + if !scalar.is_null() && Xor8Filter::supported_type(ty) { + self.ngram_founds.push(v.clone()); + self.ngram_scalars.push((i, scalar.clone())); + } + } + } else if let Some((i, v)) = Self::found_field(&self.bloom_fields, col_name) { if !scalar.is_null() && Xor8Filter::supported_type(ty) { - self.founds.push(v.clone()); - self.scalars.push((scalar.clone(), ty.clone())); + self.bloom_founds.push(v.clone()); + self.bloom_scalars.push((i, scalar.clone(), ty.clone())); } } Ok(ControlFlow::Break(None)) } - fn enter_cast(&mut self, cast: &Expr, constant: &Constant) -> ResultRewrite { + fn enter_cast( + &mut self, + cast: &Expr, + constant: &Constant, + is_like: bool, + ) -> ResultRewrite { + if is_like { + return Ok(ControlFlow::Continue(None)); + } let Expr::Cast(Cast { is_try: false, expr: @@ -1023,24 +1309,43 @@ impl EqVisitor for ShortListVisitor { return Ok(ControlFlow::Continue(None)); }; - let Some(field) = self.found_field(id) else { - return Ok(ControlFlow::Break(None)); - }; - if !Xor8Filter::supported_type(src_type) || !is_injective_cast(src_type, dest_type) { - return Ok(ControlFlow::Break(None)); - } + if is_like { + let Some((i, field)) = Self::found_field(&self.ngram_fields, id) else { + return Ok(ControlFlow::Break(None)); + }; - let Some(s) = cast_const( - &FunctionContext::default(), - src_type.to_owned(), - constant.clone(), - ) else { - return Ok(ControlFlow::Break(None)); - }; + let Some(s) = cast_const( + &FunctionContext::default(), + src_type.to_owned(), + constant.clone(), + ) else { + return Ok(ControlFlow::Break(None)); + }; + + if !s.is_null() { + self.ngram_founds.push(field.to_owned()); + self.ngram_scalars.push((i, s)); + } + } else { + let Some((i, field)) = Self::found_field(&self.bloom_fields, id) else { + return Ok(ControlFlow::Break(None)); + }; + if !Xor8Filter::supported_type(src_type) || !is_injective_cast(src_type, dest_type) { + return Ok(ControlFlow::Break(None)); + } - if !s.is_null() { - self.founds.push(field.to_owned()); - self.scalars.push((s, src_type.to_owned())); + let Some(s) = cast_const( + &FunctionContext::default(), + src_type.to_owned(), + constant.clone(), + ) else { + return Ok(ControlFlow::Break(None)); + }; + + if !s.is_null() { + self.bloom_founds.push(field.to_owned()); + self.bloom_scalars.push((i, s, src_type.to_owned())); + } } Ok(ControlFlow::Break(None)) diff --git a/src/query/storages/common/index/src/filters/bloom_filter.rs b/src/query/storages/common/index/src/filters/bloom_filter.rs new file mode 100644 index 0000000000000..01b2589b8f21e --- /dev/null +++ b/src/query/storages/common/index/src/filters/bloom_filter.rs @@ -0,0 +1,327 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::cmp::max; +use std::collections::HashSet; +use std::hash::Hash; +use std::hash::Hasher; +use std::mem; + +use anyerror::AnyError; +use databend_common_exception::ErrorCode; +use databend_common_expression::types::DataType; +use databend_common_functions::scalars::CityHasher64; + +use crate::filters::Filter; +use crate::filters::FilterBuilder; +use crate::Index; + +type UnderType = u64; + +pub struct BloomBuilder { + bloom_size: u64, + seed: u64, + inner: HashSet, +} + +impl BloomBuilder { + pub fn create(bloom_size: u64, seed: u64) -> Self { + Self { + bloom_size, + seed, + inner: Default::default(), + } + } +} + +#[derive(thiserror::Error, serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Eq)] +#[error("{msg}")] +pub struct BloomCodecError { + msg: String, +} + +#[derive(thiserror::Error, serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Eq)] +#[error("fail to build bloom filter; cause: {cause}")] +pub struct BloomBuildingError { + #[source] + cause: AnyError, +} + +impl FilterBuilder for BloomBuilder { + type Filter = BloomFilter; + type Error = BloomBuildingError; + + fn add_key(&mut self, key: &K) { + let mut hasher64 = CityHasher64::with_seed(self.seed); + key.hash(&mut hasher64); + self.inner.insert(hasher64.finish()); + } + + fn add_keys(&mut self, keys: &[K]) { + for key in keys { + self.add_key::(key); + } + } + + fn add_digests<'i, I: IntoIterator>(&mut self, digests: I) { + self.inner.extend(digests); + } + + fn build(&mut self) -> Result { + let item_count = self.inner.len(); + + let mut filter = BloomFilter::with_item_count(self.bloom_size, item_count, self.seed); + for hash in mem::take(&mut self.inner) { + filter.add(hash); + } + Ok(filter) + } +} + +#[derive(Clone)] +pub struct BloomFilter { + size: u64, + hashes: u64, + seed: u64, + words: u64, + filter: Vec, +} + +impl Index for BloomFilter { + fn supported_type(data_type: &DataType) -> bool { + matches!(data_type.remove_nullable(), DataType::String) + } +} + +impl Filter for BloomFilter { + type CodecError = BloomCodecError; + + fn len(&self) -> Option { + None + } + + fn contains(&self, key: &K) -> bool { + let mut hasher64 = CityHasher64::with_seed(self.seed); + key.hash(&mut hasher64); + self.find(hasher64.finish()) + } + + fn contains_digest(&self, digest: u64) -> bool { + self.find(digest) + } + + fn to_bytes(&self) -> Result, Self::CodecError> { + let mut bytes = Vec::new(); + + bytes.extend(&self.size.to_le_bytes()); + bytes.extend(&self.hashes.to_le_bytes()); + bytes.extend(&self.seed.to_le_bytes()); + bytes.extend(&self.words.to_le_bytes()); + + let len = self.filter.len(); + bytes.extend(&(len as u64).to_le_bytes()); + + for word in &self.filter { + bytes.extend(&word.to_le_bytes()); + } + Ok(bytes) + } + + fn from_bytes(buf: &[u8]) -> Result<(Self, usize), Self::CodecError> { + let mut offset = 0; + + fn read_u64(data: &[u8], offset: &mut usize) -> Result { + if *offset + 8 > data.len() { + return Err(BloomCodecError { + msg: "Unexpected end of data".into(), + }); + } + let value = u64::from_le_bytes(data[*offset..*offset + 8].try_into().unwrap()); + *offset += 8; + Ok(value) + } + + let size = read_u64(buf, &mut offset)?; + let hashes = read_u64(buf, &mut offset)?; + let seed = read_u64(buf, &mut offset)?; + let words = read_u64(buf, &mut offset)?; + let filter_len = read_u64(buf, &mut offset)? as usize; + + let mut filter = Vec::with_capacity(filter_len); + for _ in 0..filter_len { + filter.push(read_u64(buf, &mut offset)?); + } + + Ok(( + BloomFilter { + size, + hashes, + seed, + words, + filter, + }, + buf.len(), + )) + } +} + +impl BloomFilter { + pub fn with_item_count(filter_size: u64, mut item_count: usize, seed: u64) -> Self { + assert!(filter_size > 0, "filter_size must be > 0"); + item_count = max(item_count, 1); + + let k = Self::optimal_k(filter_size, item_count); + + Self::with_params(filter_size, k, seed) + } + + #[inline] + fn optimal_k(filter_size: u64, item_count: usize) -> u64 { + let ln2 = std::f64::consts::LN_2; + let k = ((filter_size as f64 / item_count as f64) * ln2).ceil() as u64; + k.max(1) + } + + pub fn with_params(size: u64, hashes: u64, seed: u64) -> Self { + assert_ne!(size, 0); + assert_ne!(hashes, 0); + let words = size.div_ceil(std::mem::size_of::() as u64); + Self { + size, + hashes, + seed, + words, + filter: vec![0; words as usize], + } + } + + pub fn resize(&mut self, size: u64) { + self.size = size; + self.words = size.div_ceil(std::mem::size_of::() as u64); + self.filter.resize(self.words as usize, 0); + } + + pub fn find(&self, hash: u64) -> bool { + for i in 0..self.hashes { + let pos = hash.wrapping_add(i).wrapping_add(i * i) % (8 * self.size); + let bit_pos = pos as usize % (8 * std::mem::size_of::()); + let word_index = pos as usize / (8 * std::mem::size_of::()); + if self.filter[word_index] & (1 << bit_pos) == 0 { + return false; + } + } + true + } + + pub fn add(&mut self, hash: u64) { + for i in 0..self.hashes { + let pos = hash.wrapping_add(i).wrapping_add(i * i) % (8 * self.size); + let bit_pos = pos as usize % (8 * std::mem::size_of::()); + let word_index = pos as usize / (8 * std::mem::size_of::()); + self.filter[word_index] |= 1 << bit_pos; + } + } + + pub fn clear(&mut self) { + self.filter.fill(0); + } + + pub fn is_empty(&self) -> bool { + self.filter.iter().all(|&x| x == 0) + } + + pub fn memory_usage_bytes(&self) -> usize { + self.filter.capacity() * std::mem::size_of::() + } +} + +impl From for ErrorCode { + fn from(e: BloomCodecError) -> Self { + ErrorCode::Internal(e.to_string()) + } +} + +impl BloomBuildingError { + pub fn new(cause: impl ToString) -> Self { + Self { + cause: AnyError::error(cause), + } + } +} + +impl From for ErrorCode { + fn from(e: BloomBuildingError) -> Self { + ErrorCode::Internal(e.to_string()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_block_insert_and_check() { + for i in 0..1_000_000 { + let mut filter = BloomFilter::with_params(10, 1, 0); + filter.add(i); + assert!(filter.find(i)); + } + } + + #[test] + fn test_sbbf_insert_and_check() { + let item_count = 1_000_000; + let mut filter = BloomFilter::with_item_count(10 * 1024, item_count, 0); + for i in 0..item_count as u64 { + filter.add(i); + assert!(filter.find(i)); + } + } + + #[test] + fn test_encode_and_decode() { + let mut hashes = Vec::new(); + for i in 0..500000 { + hashes.push(i); + } + let mut filter = BloomFilter::with_params(10 * 1024, 1, 0); + for hash in hashes.iter() { + filter.add(*hash); + } + assert!(hashes.iter().all(|hash| filter.find(*hash))); + let buf = filter.to_bytes().unwrap(); + let (decode_filter, _) = BloomFilter::from_bytes(&buf).unwrap(); + filter + .filter + .iter() + .zip(decode_filter.filter.iter()) + .for_each(|(a, b)| { + assert_eq!(a, b); + }); + assert!(hashes.iter().all(|hash| decode_filter.find(*hash))); + } + + #[test] + fn test_optimal_k() { + assert_eq!(BloomFilter::optimal_k(1000, 100), 7); // (1000/100)*ln(2) ≈ 6.93 → ceil → 7 + assert_eq!(BloomFilter::optimal_k(1024, 128), 6); // (1024/128)*ln(2) ≈ 5.545 → ceil → 6 + assert_eq!(BloomFilter::optimal_k(100, 1000), 1); // (100/1000)*ln(2) ≈ 0.069 → ceil → 1 + assert_eq!(BloomFilter::optimal_k(100, 100), 1); // (100/100)*ln(2) ≈ 0.693 → ceil → 1 + assert_eq!(BloomFilter::optimal_k(1, 1), 1); // (1/1)*ln(2) ≈ 0.693 → ceil → 1 + assert_eq!(BloomFilter::optimal_k(1, 1000), 1); // (1/1000)*ln(2) ≈ 0.0007 → ceil → 1 + assert_eq!(BloomFilter::optimal_k(100, 50), 2); // (100/50)*ln(2) ≈ 1.386 → ceil → 2 + assert_eq!(BloomFilter::optimal_k(101, 50), 2); // (101/50)*ln(2) ≈ 1.400 → ceil → 2 + assert_eq!(BloomFilter::optimal_k(1_000_000, 10_000), 70); // (1e6/1e4)*ln(2) ≈ 69.31 → ceil → 70 + } +} diff --git a/src/query/storages/common/index/src/filters/mod.rs b/src/query/storages/common/index/src/filters/mod.rs index 42f9ff264a263..75d6f428a1068 100644 --- a/src/query/storages/common/index/src/filters/mod.rs +++ b/src/query/storages/common/index/src/filters/mod.rs @@ -14,6 +14,7 @@ //! Probabilistic filters +mod bloom_filter; mod filter; mod xor8; @@ -21,8 +22,10 @@ pub use filter::Filter; pub use filter::FilterBuilder; pub use xor8::BlockBloomFilterIndexVersion; pub use xor8::BlockFilter; +pub use xor8::BloomBuilder; +pub use xor8::BloomFilter; +pub use xor8::FilterImpl; +pub use xor8::FilterImplBuilder; pub use xor8::V2BloomBlock; pub use xor8::Xor8Builder; -pub use xor8::Xor8BuildingError; -pub use xor8::Xor8CodecError; pub use xor8::Xor8Filter; diff --git a/src/query/storages/common/index/src/filters/xor8/block_filter.rs b/src/query/storages/common/index/src/filters/xor8/block_filter.rs index 637a53f3552b5..cb27dec275dfb 100644 --- a/src/query/storages/common/index/src/filters/xor8/block_filter.rs +++ b/src/query/storages/common/index/src/filters/xor8/block_filter.rs @@ -16,7 +16,7 @@ use std::sync::Arc; use databend_common_expression::TableSchemaRef; -use crate::filters::xor8::xor8_filter::Xor8Filter; +use crate::filters::FilterImpl; /// Filters of a given DataBlock /// `filter_schema.fields.len()` should equals `filters.len()` @@ -24,5 +24,5 @@ pub struct BlockFilter { // schema of index block, chosen columns only pub filter_schema: TableSchemaRef, // filters of index block, chosen columns only - pub filters: Vec>, + pub filters: Vec>, } diff --git a/src/query/storages/common/index/src/filters/xor8/mod.rs b/src/query/storages/common/index/src/filters/xor8/mod.rs index 31f25b9bc1fe7..da7ac2be56de0 100644 --- a/src/query/storages/common/index/src/filters/xor8/mod.rs +++ b/src/query/storages/common/index/src/filters/xor8/mod.rs @@ -16,10 +16,133 @@ mod block_filter; mod block_filter_versions; mod xor8_filter; +use std::hash::Hash; + pub use block_filter::BlockFilter; pub use block_filter_versions::BlockBloomFilterIndexVersion; pub use block_filter_versions::V2BloomBlock; +use bytes::Bytes; +use databend_common_exception::ErrorCode; pub use xor8_filter::Xor8Builder; -pub use xor8_filter::Xor8BuildingError; -pub use xor8_filter::Xor8CodecError; pub use xor8_filter::Xor8Filter; + +pub use crate::filters::bloom_filter::BloomBuilder; +pub use crate::filters::bloom_filter::BloomFilter; +use crate::filters::Filter; +use crate::filters::FilterBuilder; + +pub enum FilterImpl { + Xor(Xor8Filter), + Ngram(BloomFilter), +} + +pub enum FilterImplBuilder { + Xor(Xor8Builder), + Ngram(BloomBuilder), +} + +impl TryFrom<&FilterImpl> for Vec { + type Error = ErrorCode; + + fn try_from(value: &FilterImpl) -> std::result::Result, ErrorCode> { + value.to_bytes() + } +} + +impl TryFrom for FilterImpl { + type Error = ErrorCode; + + fn try_from(value: Bytes) -> std::result::Result { + Ok(Self::from_bytes(value.as_ref())?.0) + } +} + +impl FilterImpl { + pub fn mem_bytes(&self) -> usize { + match self { + FilterImpl::Xor(filter) => { + std::mem::size_of::() + filter.filter.finger_prints.len() + } + FilterImpl::Ngram(_) => std::mem::size_of::(), + } + } +} + +impl Filter for FilterImpl { + type CodecError = ErrorCode; + + fn len(&self) -> Option { + match self { + FilterImpl::Xor(filter) => filter.len(), + FilterImpl::Ngram(filter) => filter.len(), + } + } + + fn contains(&self, key: &K) -> bool { + match self { + FilterImpl::Xor(filter) => filter.contains(key), + FilterImpl::Ngram(filter) => filter.contains(key), + } + } + + fn contains_digest(&self, digest: u64) -> bool { + match self { + FilterImpl::Xor(filter) => filter.contains_digest(digest), + FilterImpl::Ngram(filter) => filter.contains_digest(digest), + } + } + + fn to_bytes(&self) -> Result, Self::CodecError> { + Ok(match self { + FilterImpl::Xor(filter) => filter.to_bytes()?, + FilterImpl::Ngram(filter) => { + let mut bytes = filter.to_bytes()?; + // major ranges from [0, 7] is Xor8Filter + bytes.insert(0, b'n'); + bytes + } + }) + } + + fn from_bytes(buf: &[u8]) -> Result<(Self, usize), Self::CodecError> { + Ok(if buf[0] == b'n' { + BloomFilter::from_bytes(&buf[1..]) + .map(|(filter, len)| (FilterImpl::Ngram(filter), len))? + } else { + Xor8Filter::from_bytes(buf).map(|(filter, len)| (FilterImpl::Xor(filter), len))? + }) + } +} + +impl FilterBuilder for FilterImplBuilder { + type Filter = FilterImpl; + type Error = ErrorCode; + + fn add_key(&mut self, key: &K) { + match self { + FilterImplBuilder::Xor(filter) => filter.add_key(key), + FilterImplBuilder::Ngram(filter) => filter.add_key(key), + } + } + + fn add_keys(&mut self, keys: &[K]) { + match self { + FilterImplBuilder::Xor(filter) => filter.add_keys(keys), + FilterImplBuilder::Ngram(filter) => filter.add_keys(keys), + } + } + + fn add_digests<'i, I: IntoIterator>(&mut self, digests: I) { + match self { + FilterImplBuilder::Xor(filter) => filter.add_digests(digests), + FilterImplBuilder::Ngram(filter) => filter.add_digests(digests), + } + } + + fn build(&mut self) -> Result { + match self { + FilterImplBuilder::Xor(filter) => Ok(FilterImpl::Xor(filter.build()?)), + FilterImplBuilder::Ngram(filter) => Ok(FilterImpl::Ngram(filter.build()?)), + } + } +} diff --git a/src/query/storages/common/index/src/lib.rs b/src/query/storages/common/index/src/lib.rs index 779ac928f9a4a..50f57df6908f8 100644 --- a/src/query/storages/common/index/src/lib.rs +++ b/src/query/storages/common/index/src/lib.rs @@ -27,7 +27,9 @@ mod range_index; pub use bloom_index::BloomIndex; pub use bloom_index::BloomIndexBuilder; pub use bloom_index::BloomIndexMeta; +pub use bloom_index::BloomIndexResult; pub use bloom_index::FilterEvalResult; +pub use bloom_index::NgramArgs; pub use eliminate_cast::eliminate_cast; pub use index::Index; pub use inverted_index::extract_component_fields; diff --git a/src/query/storages/common/index/tests/it/bloom_pruner.rs b/src/query/storages/common/index/tests/it/bloom_pruner.rs index abc8b4a5f6899..a81dc3f1cc35c 100644 --- a/src/query/storages/common/index/tests/it/bloom_pruner.rs +++ b/src/query/storages/common/index/tests/it/bloom_pruner.rs @@ -56,6 +56,7 @@ use databend_storages_common_index::BloomIndex; use databend_storages_common_index::BloomIndexBuilder; use databend_storages_common_index::FilterEvalResult; use databend_storages_common_index::Index; +use databend_storages_common_index::NgramArgs; use databend_storages_common_table_meta::meta::ColumnStatistics; use goldenfile::Mint; @@ -135,7 +136,10 @@ fn test_base(file: &mut impl Write) { ), DataBlock::new_from_columns(vec![ UInt8Type::from_data(vec![2, 3]), - StringType::from_data(vec!["b", "c"]), + StringType::from_data(vec![ + "The quick brown fox jumps over the lazy dog", + "The early bird catches the worm", + ]), Column::Map(Box::new( ArrayColumn::>::new( KvColumn { @@ -164,6 +168,7 @@ fn test_base(file: &mut impl Write) { ]; let block = DataBlock::concat(&blocks).unwrap(); let bloom_columns = bloom_columns_map(&schema, &[0, 1, 2, 3]); + let ngram_args = ngram_args(&schema, &[1]); for v in [0, 1, 2] { eval_index( @@ -173,11 +178,13 @@ fn test_base(file: &mut impl Write) { DataType::Number(NumberDataType::UInt8), &block, &bloom_columns, + &ngram_args, schema.clone(), + false, ); } - for v in ["a", "b", "d"] { + for v in ["%fox jumps%", "%bird catches%", "%the doctor%"] { eval_index( file, "1", @@ -185,7 +192,27 @@ fn test_base(file: &mut impl Write) { DataType::String, &block, &bloom_columns, + &ngram_args, schema.clone(), + true, + ); + } + + for v in [ + "The quick brown fox jumps over the lazy dog", + "The early bird catches the worm", + "d", + ] { + eval_index( + file, + "1", + Scalar::String(v.to_string()), + DataType::String, + &block, + &bloom_columns, + &ngram_args, + schema.clone(), + false, ); } @@ -200,6 +227,7 @@ fn test_base(file: &mut impl Write) { DataType::String, &block, &bloom_columns, + &ngram_args, schema.clone(), ); } @@ -220,6 +248,7 @@ fn test_base(file: &mut impl Write) { v_type, &block, &bloom_columns, + &ngram_args, schema.clone(), ); } @@ -233,20 +262,42 @@ fn test_specify(file: &mut impl Write) { let blocks = [DataBlock::new_from_columns(vec![ UInt8Type::from_data(vec![1, 2]), - StringType::from_data(vec!["a", "b"]), + StringType::from_data(vec![ + "The quick brown fox jumps over the lazy dog", + "The early bird catches the worm", + ]), ])]; let block = DataBlock::concat(&blocks).unwrap(); - let bloom_columns = bloom_columns_map(&schema, &[0]); + { + let bloom_columns = bloom_columns_map(&schema, &[0]); - eval_index( - file, - "1", - Scalar::String("d".to_string()), - DataType::String, - &block, - &bloom_columns, - schema, - ); + eval_index( + file, + "1", + Scalar::String("d".to_string()), + DataType::String, + &block, + &bloom_columns, + &[], + schema.clone(), + false, + ); + } + { + let ngram_args = ngram_args(&schema, &[0]); + + eval_index( + file, + "1", + Scalar::String("d".to_string()), + DataType::String, + &block, + &BTreeMap::new(), + &ngram_args, + schema, + true, + ); + } } fn test_long_string(file: &mut impl Write) { @@ -272,7 +323,9 @@ fn test_long_string(file: &mut impl Write) { DataType::String, &block, &bloom_columns, + &[], schema, + false, ); } @@ -441,6 +494,7 @@ fn eval_text( .collect(); let schema = Arc::new(TableSchema::new(fields)); let bloom_columns = bloom_columns_map(&schema, cols); + let ngram_args = ngram_args(&schema, cols); let block = DataBlock::new_from_columns(columns.iter().map(|(_, _, col)| col.clone()).collect()); @@ -455,13 +509,14 @@ fn eval_text( let expr = type_check::rewrite_function_to_cast(expr); let expr = expr.project_column_ref(|i| columns[*i].0.to_string()); - eval_index_expr(file, &block, &bloom_columns, schema, expr); + eval_index_expr(file, &block, &bloom_columns, &ngram_args, schema, expr); } fn eval_index_expr( file: &mut impl Write, block: &DataBlock, bloom_columns: &BTreeMap, + ngram_args: &[NgramArgs], schema: Arc, expr: Expr, ) { @@ -477,17 +532,42 @@ fn eval_index_expr( expr }; - let fields = bloom_columns.values().cloned().collect::>(); - let (_, scalars) = BloomIndex::filter_index_field(&expr, &fields).unwrap(); + let bloom_fields = bloom_columns.values().cloned().collect::>(); + let ngram_fields = ngram_args + .iter() + .map(|arg| arg.field().clone()) + .collect::>(); + let result = BloomIndex::filter_index_field(&expr, bloom_fields, ngram_fields).unwrap(); - let mut scalar_map = HashMap::::new(); - for (scalar, ty) in scalars.into_iter() { - scalar_map.entry(scalar).or_insert_with_key(|scalar| { + let mut eq_scalar_map = HashMap::::new(); + for (_, scalar, ty) in result.bloom_scalars.into_iter() { + eq_scalar_map.entry(scalar).or_insert_with_key(|scalar| { BloomIndex::calculate_scalar_digest(&func_ctx, scalar, &ty).unwrap() }); } - let mut builder = BloomIndexBuilder::create(func_ctx.clone(), bloom_columns.clone()); + let mut like_scalar_map = HashMap::>::new(); + for (field, (_, scalar)) in result + .ngram_fields + .iter() + .zip(result.ngram_scalars.into_iter()) + { + let Some(ngram_arg) = ngram_args.iter().find(|arg| arg.field() == field) else { + continue; + }; + let Some(digests) = BloomIndex::calculate_ngram_nullable_column( + Value::Scalar(scalar.clone()), + ngram_arg.gram_size(), + BloomIndex::ngram_hash, + ) + .next() else { + continue; + }; + like_scalar_map.entry(scalar).or_insert(digests); + } + + let mut builder = + BloomIndexBuilder::create(func_ctx.clone(), bloom_columns.clone(), ngram_args).unwrap(); builder.add_block(block).unwrap(); let index = builder.finalize().unwrap().unwrap(); @@ -514,7 +594,14 @@ fn eval_index_expr( .collect(); let (expr, domains) = index - .rewrite_expr(expr, &scalar_map, &column_stats, schema) + .rewrite_expr( + expr, + &eq_scalar_map, + &like_scalar_map, + ngram_args, + &column_stats, + schema, + ) .unwrap(); let result = match ConstantFolder::fold_with_domain(&expr, &domains, &func_ctx, &BUILTIN_FUNCTIONS).0 { @@ -539,11 +626,13 @@ fn eval_index( ty: DataType, block: &DataBlock, bloom_columns: &BTreeMap, + ngram_args: &[NgramArgs], schema: Arc, + is_like: bool, ) { let expr = check_function( None, - "eq", + if is_like { "like" } else { "eq" }, &[], &[ Expr::ColumnRef(ColumnRef { @@ -562,7 +651,7 @@ fn eval_index( ) .unwrap(); - eval_index_expr(file, block, bloom_columns, schema, expr) + eval_index_expr(file, block, bloom_columns, ngram_args, schema, expr) } #[allow(clippy::too_many_arguments)] @@ -576,6 +665,7 @@ fn eval_map_index( ty: DataType, block: &DataBlock, bloom_columns: &BTreeMap, + ngram_args: &[NgramArgs], schema: Arc, ) { let fields = schema.fields.clone(); @@ -611,7 +701,7 @@ fn eval_map_index( check_function(None, "eq", &[], &[get_expr, const_expr], &BUILTIN_FUNCTIONS).unwrap(); let expr = check_function(None, "is_true", &[], &[eq_expr], &BUILTIN_FUNCTIONS).unwrap(); - eval_index_expr(file, block, bloom_columns, schema, expr); + eval_index_expr(file, block, bloom_columns, ngram_args, schema, expr); } fn bloom_columns_map( @@ -628,3 +718,15 @@ fn bloom_columns_map( } bloom_columns_map } + +fn ngram_args(schema: &TableSchema, cols: &[FieldIndex]) -> Vec { + let mut ngram_args = Vec::new(); + for &i in cols { + let table_field = schema.field(i); + let data_type = DataType::from(table_field.data_type()); + if Xor8Filter::supported_type(&data_type) { + ngram_args.push(NgramArgs::new(i, table_field.clone(), 3, 1024)) + } + } + ngram_args +} diff --git a/src/query/storages/common/index/tests/it/testdata/test_bloom_filter.txt b/src/query/storages/common/index/tests/it/testdata/test_bloom_filter.txt index 7f59e75245a55..e2b295fd3435b 100644 --- a/src/query/storages/common/index/tests/it/testdata/test_bloom_filter.txt +++ b/src/query/storages/common/index/tests/it/testdata/test_bloom_filter.txt @@ -2,7 +2,7 @@ | Column ID | Type | Column Data | +-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | 0 | UInt8 | Column(UInt8([1, 1, 2, 3])) | -| 1 | String | Column(StringColumn[a, a, b, c]) | +| 1 | String | Column(StringColumn[a, a, The quick brown fox jumps over the lazy dog, The early bird catches the worm]) | | 2 | Map(UInt8, String) | Column(ArrayColumn { values: Tuple([UInt8([1, 2, 1, 2, 1, 2, 3]), StringColumn[a, b, a, b, b, c, d]]), offsets: [0, 2, 4, 6, 7] }) | | 3 | Map(String, Variant) | Column(ArrayColumn { values: Tuple([StringColumn[a, b, a, b, b, c, d], Variant([0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003646566, 0x2000000040000000, 0x200000001000000378797a])]), offsets: [0, 2, 4, 6, 7] }) | +-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -16,7 +16,7 @@ result : MustFalse | Column ID | Type | Column Data | +-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | 0 | UInt8 | Column(UInt8([1, 1, 2, 3])) | -| 1 | String | Column(StringColumn[a, a, b, c]) | +| 1 | String | Column(StringColumn[a, a, The quick brown fox jumps over the lazy dog, The early bird catches the worm]) | | 2 | Map(UInt8, String) | Column(ArrayColumn { values: Tuple([UInt8([1, 2, 1, 2, 1, 2, 3]), StringColumn[a, b, a, b, b, c, d]]), offsets: [0, 2, 4, 6, 7] }) | | 3 | Map(String, Variant) | Column(ArrayColumn { values: Tuple([StringColumn[a, b, a, b, b, c, d], Variant([0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003646566, 0x2000000040000000, 0x200000001000000378797a])]), offsets: [0, 2, 4, 6, 7] }) | +-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -30,7 +30,7 @@ result : Uncertain | Column ID | Type | Column Data | +-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | 0 | UInt8 | Column(UInt8([1, 1, 2, 3])) | -| 1 | String | Column(StringColumn[a, a, b, c]) | +| 1 | String | Column(StringColumn[a, a, The quick brown fox jumps over the lazy dog, The early bird catches the worm]) | | 2 | Map(UInt8, String) | Column(ArrayColumn { values: Tuple([UInt8([1, 2, 1, 2, 1, 2, 3]), StringColumn[a, b, a, b, b, c, d]]), offsets: [0, 2, 4, 6, 7] }) | | 3 | Map(String, Variant) | Column(ArrayColumn { values: Tuple([StringColumn[a, b, a, b, b, c, d], Variant([0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003646566, 0x2000000040000000, 0x200000001000000378797a])]), offsets: [0, 2, 4, 6, 7] }) | +-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -44,13 +44,13 @@ result : Uncertain | Column ID | Type | Column Data | +-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | 0 | UInt8 | Column(UInt8([1, 1, 2, 3])) | -| 1 | String | Column(StringColumn[a, a, b, c]) | +| 1 | String | Column(StringColumn[a, a, The quick brown fox jumps over the lazy dog, The early bird catches the worm]) | | 2 | Map(UInt8, String) | Column(ArrayColumn { values: Tuple([UInt8([1, 2, 1, 2, 1, 2, 3]), StringColumn[a, b, a, b, b, c, d]]), offsets: [0, 2, 4, 6, 7] }) | | 3 | Map(String, Variant) | Column(ArrayColumn { values: Tuple([StringColumn[a, b, a, b, b, c, d], Variant([0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003646566, 0x2000000040000000, 0x200000001000000378797a])]), offsets: [0, 2, 4, 6, 7] }) | +-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -expr : eq(1, "a") -filter : eq(1, "a") -domains : {"1": String(StringDomain { min: "a", max: Some("c") })} +expr : like(1, "%fox jumps%") +filter : like(1, "%fox jumps%") +domains : {"1": String(StringDomain { min: "The early bird catches the worm", max: Some("a") })} result : Uncertain @@ -58,13 +58,13 @@ result : Uncertain | Column ID | Type | Column Data | +-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | 0 | UInt8 | Column(UInt8([1, 1, 2, 3])) | -| 1 | String | Column(StringColumn[a, a, b, c]) | +| 1 | String | Column(StringColumn[a, a, The quick brown fox jumps over the lazy dog, The early bird catches the worm]) | | 2 | Map(UInt8, String) | Column(ArrayColumn { values: Tuple([UInt8([1, 2, 1, 2, 1, 2, 3]), StringColumn[a, b, a, b, b, c, d]]), offsets: [0, 2, 4, 6, 7] }) | | 3 | Map(String, Variant) | Column(ArrayColumn { values: Tuple([StringColumn[a, b, a, b, b, c, d], Variant([0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003646566, 0x2000000040000000, 0x200000001000000378797a])]), offsets: [0, 2, 4, 6, 7] }) | +-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -expr : eq(1, "b") -filter : eq(1, "b") -domains : {"1": String(StringDomain { min: "a", max: Some("c") })} +expr : like(1, "%bird catches%") +filter : like(1, "%bird catches%") +domains : {"1": String(StringDomain { min: "The early bird catches the worm", max: Some("a") })} result : Uncertain @@ -72,13 +72,55 @@ result : Uncertain | Column ID | Type | Column Data | +-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | 0 | UInt8 | Column(UInt8([1, 1, 2, 3])) | -| 1 | String | Column(StringColumn[a, a, b, c]) | +| 1 | String | Column(StringColumn[a, a, The quick brown fox jumps over the lazy dog, The early bird catches the worm]) | +| 2 | Map(UInt8, String) | Column(ArrayColumn { values: Tuple([UInt8([1, 2, 1, 2, 1, 2, 3]), StringColumn[a, b, a, b, b, c, d]]), offsets: [0, 2, 4, 6, 7] }) | +| 3 | Map(String, Variant) | Column(ArrayColumn { values: Tuple([StringColumn[a, b, a, b, b, c, d], Variant([0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003646566, 0x2000000040000000, 0x200000001000000378797a])]), offsets: [0, 2, 4, 6, 7] }) | ++-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +expr : like(1, "%the doctor%") +filter : __bloom_column_1_1 +domains : {"1": String(StringDomain { min: "The early bird catches the worm", max: Some("a") }), "__bloom_column_1_1": Boolean(BooleanDomain { has_false: true, has_true: false })} +result : MustFalse + + ++-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Column ID | Type | Column Data | ++-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| 0 | UInt8 | Column(UInt8([1, 1, 2, 3])) | +| 1 | String | Column(StringColumn[a, a, The quick brown fox jumps over the lazy dog, The early bird catches the worm]) | +| 2 | Map(UInt8, String) | Column(ArrayColumn { values: Tuple([UInt8([1, 2, 1, 2, 1, 2, 3]), StringColumn[a, b, a, b, b, c, d]]), offsets: [0, 2, 4, 6, 7] }) | +| 3 | Map(String, Variant) | Column(ArrayColumn { values: Tuple([StringColumn[a, b, a, b, b, c, d], Variant([0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003646566, 0x2000000040000000, 0x200000001000000378797a])]), offsets: [0, 2, 4, 6, 7] }) | ++-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +expr : eq(1, "The quick brown fox jumps over the lazy dog") +filter : eq(1, "The quick brown fox jumps over the lazy dog") +domains : {"1": String(StringDomain { min: "The early bird catches the worm", max: Some("a") })} +result : Uncertain + + ++-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Column ID | Type | Column Data | ++-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| 0 | UInt8 | Column(UInt8([1, 1, 2, 3])) | +| 1 | String | Column(StringColumn[a, a, The quick brown fox jumps over the lazy dog, The early bird catches the worm]) | +| 2 | Map(UInt8, String) | Column(ArrayColumn { values: Tuple([UInt8([1, 2, 1, 2, 1, 2, 3]), StringColumn[a, b, a, b, b, c, d]]), offsets: [0, 2, 4, 6, 7] }) | +| 3 | Map(String, Variant) | Column(ArrayColumn { values: Tuple([StringColumn[a, b, a, b, b, c, d], Variant([0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003646566, 0x2000000040000000, 0x200000001000000378797a])]), offsets: [0, 2, 4, 6, 7] }) | ++-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +expr : eq(1, "The early bird catches the worm") +filter : eq(1, "The early bird catches the worm") +domains : {"1": String(StringDomain { min: "The early bird catches the worm", max: Some("a") })} +result : Uncertain + + ++-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Column ID | Type | Column Data | ++-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| 0 | UInt8 | Column(UInt8([1, 1, 2, 3])) | +| 1 | String | Column(StringColumn[a, a, The quick brown fox jumps over the lazy dog, The early bird catches the worm]) | | 2 | Map(UInt8, String) | Column(ArrayColumn { values: Tuple([UInt8([1, 2, 1, 2, 1, 2, 3]), StringColumn[a, b, a, b, b, c, d]]), offsets: [0, 2, 4, 6, 7] }) | | 3 | Map(String, Variant) | Column(ArrayColumn { values: Tuple([StringColumn[a, b, a, b, b, c, d], Variant([0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003646566, 0x2000000040000000, 0x200000001000000378797a])]), offsets: [0, 2, 4, 6, 7] }) | +-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ expr : eq(1, "d") filter : __bloom_column_1_1 -domains : {"1": String(StringDomain { min: "a", max: Some("c") }), "__bloom_column_1_1": Boolean(BooleanDomain { has_false: true, has_true: false })} +domains : {"1": String(StringDomain { min: "The early bird catches the worm", max: Some("a") }), "__bloom_column_1_1": Boolean(BooleanDomain { has_false: true, has_true: false })} result : MustFalse @@ -86,7 +128,7 @@ result : MustFalse | Column ID | Type | Column Data | +-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | 0 | UInt8 | Column(UInt8([1, 1, 2, 3])) | -| 1 | String | Column(StringColumn[a, a, b, c]) | +| 1 | String | Column(StringColumn[a, a, The quick brown fox jumps over the lazy dog, The early bird catches the worm]) | | 2 | Map(UInt8, String) | Column(ArrayColumn { values: Tuple([UInt8([1, 2, 1, 2, 1, 2, 3]), StringColumn[a, b, a, b, b, c, d]]), offsets: [0, 2, 4, 6, 7] }) | | 3 | Map(String, Variant) | Column(ArrayColumn { values: Tuple([StringColumn[a, b, a, b, b, c, d], Variant([0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003646566, 0x2000000040000000, 0x200000001000000378797a])]), offsets: [0, 2, 4, 6, 7] }) | +-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -101,7 +143,7 @@ result : Uncertain | Column ID | Type | Column Data | +-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | 0 | UInt8 | Column(UInt8([1, 1, 2, 3])) | -| 1 | String | Column(StringColumn[a, a, b, c]) | +| 1 | String | Column(StringColumn[a, a, The quick brown fox jumps over the lazy dog, The early bird catches the worm]) | | 2 | Map(UInt8, String) | Column(ArrayColumn { values: Tuple([UInt8([1, 2, 1, 2, 1, 2, 3]), StringColumn[a, b, a, b, b, c, d]]), offsets: [0, 2, 4, 6, 7] }) | | 3 | Map(String, Variant) | Column(ArrayColumn { values: Tuple([StringColumn[a, b, a, b, b, c, d], Variant([0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003646566, 0x2000000040000000, 0x200000001000000378797a])]), offsets: [0, 2, 4, 6, 7] }) | +-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -116,7 +158,7 @@ result : Uncertain | Column ID | Type | Column Data | +-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | 0 | UInt8 | Column(UInt8([1, 1, 2, 3])) | -| 1 | String | Column(StringColumn[a, a, b, c]) | +| 1 | String | Column(StringColumn[a, a, The quick brown fox jumps over the lazy dog, The early bird catches the worm]) | | 2 | Map(UInt8, String) | Column(ArrayColumn { values: Tuple([UInt8([1, 2, 1, 2, 1, 2, 3]), StringColumn[a, b, a, b, b, c, d]]), offsets: [0, 2, 4, 6, 7] }) | | 3 | Map(String, Variant) | Column(ArrayColumn { values: Tuple([StringColumn[a, b, a, b, b, c, d], Variant([0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003646566, 0x2000000040000000, 0x200000001000000378797a])]), offsets: [0, 2, 4, 6, 7] }) | +-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -131,7 +173,7 @@ result : MustFalse | Column ID | Type | Column Data | +-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | 0 | UInt8 | Column(UInt8([1, 1, 2, 3])) | -| 1 | String | Column(StringColumn[a, a, b, c]) | +| 1 | String | Column(StringColumn[a, a, The quick brown fox jumps over the lazy dog, The early bird catches the worm]) | | 2 | Map(UInt8, String) | Column(ArrayColumn { values: Tuple([UInt8([1, 2, 1, 2, 1, 2, 3]), StringColumn[a, b, a, b, b, c, d]]), offsets: [0, 2, 4, 6, 7] }) | | 3 | Map(String, Variant) | Column(ArrayColumn { values: Tuple([StringColumn[a, b, a, b, b, c, d], Variant([0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003646566, 0x2000000040000000, 0x200000001000000378797a])]), offsets: [0, 2, 4, 6, 7] }) | +-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -146,7 +188,7 @@ result : Uncertain | Column ID | Type | Column Data | +-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | 0 | UInt8 | Column(UInt8([1, 1, 2, 3])) | -| 1 | String | Column(StringColumn[a, a, b, c]) | +| 1 | String | Column(StringColumn[a, a, The quick brown fox jumps over the lazy dog, The early bird catches the worm]) | | 2 | Map(UInt8, String) | Column(ArrayColumn { values: Tuple([UInt8([1, 2, 1, 2, 1, 2, 3]), StringColumn[a, b, a, b, b, c, d]]), offsets: [0, 2, 4, 6, 7] }) | | 3 | Map(String, Variant) | Column(ArrayColumn { values: Tuple([StringColumn[a, b, a, b, b, c, d], Variant([0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003646566, 0x2000000040000000, 0x200000001000000378797a])]), offsets: [0, 2, 4, 6, 7] }) | +-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -161,7 +203,7 @@ result : MustFalse | Column ID | Type | Column Data | +-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | 0 | UInt8 | Column(UInt8([1, 1, 2, 3])) | -| 1 | String | Column(StringColumn[a, a, b, c]) | +| 1 | String | Column(StringColumn[a, a, The quick brown fox jumps over the lazy dog, The early bird catches the worm]) | | 2 | Map(UInt8, String) | Column(ArrayColumn { values: Tuple([UInt8([1, 2, 1, 2, 1, 2, 3]), StringColumn[a, b, a, b, b, c, d]]), offsets: [0, 2, 4, 6, 7] }) | | 3 | Map(String, Variant) | Column(ArrayColumn { values: Tuple([StringColumn[a, b, a, b, b, c, d], Variant([0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003616263, 0x20000000200000025064, 0x2000000010000003646566, 0x2000000040000000, 0x200000001000000378797a])]), offsets: [0, 2, 4, 6, 7] }) | +-----------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -172,18 +214,30 @@ domains : {"3": Map(Some(Tuple([String(StringDomain { min: "", max: None }), Un result : Uncertain -+-----------+--------+----------------------------+ -| Column ID | Type | Column Data | -+-----------+--------+----------------------------+ -| 0 | UInt8 | Column(UInt8([1, 2])) | -| 1 | String | Column(StringColumn[a, b]) | -+-----------+--------+----------------------------+ ++-----------+--------+----------------------------------------------------------------------------------------------------+ +| Column ID | Type | Column Data | ++-----------+--------+----------------------------------------------------------------------------------------------------+ +| 0 | UInt8 | Column(UInt8([1, 2])) | +| 1 | String | Column(StringColumn[The quick brown fox jumps over the lazy dog, The early bird catches the worm]) | ++-----------+--------+----------------------------------------------------------------------------------------------------+ expr : eq(1, "d") filter : eq(1, "d") domains : {"1": String(StringDomain { min: "", max: None })} result : Uncertain ++-----------+--------+----------------------------------------------------------------------------------------------------+ +| Column ID | Type | Column Data | ++-----------+--------+----------------------------------------------------------------------------------------------------+ +| 0 | UInt8 | Column(UInt8([1, 2])) | +| 1 | String | Column(StringColumn[The quick brown fox jumps over the lazy dog, The early bird catches the worm]) | ++-----------+--------+----------------------------------------------------------------------------------------------------+ +expr : like(1, "d") +filter : like(1, "d") +domains : {"1": String(StringDomain { min: "", max: None })} +result : Uncertain + + +-----------+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Column ID | Type | Column Data | +-----------+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ diff --git a/src/query/storages/fuse/src/io/read/bloom/block_filter_reader.rs b/src/query/storages/fuse/src/io/read/bloom/block_filter_reader.rs index aa72b09d91a8c..3d5bc526aeaf9 100644 --- a/src/query/storages/fuse/src/io/read/bloom/block_filter_reader.rs +++ b/src/query/storages/fuse/src/io/read/bloom/block_filter_reader.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::BTreeMap; use std::collections::HashSet; use std::future::Future; use std::sync::Arc; @@ -29,7 +30,7 @@ use databend_common_expression::TableDataType; use databend_common_expression::TableField; use databend_common_expression::TableSchema; use databend_storages_common_cache::LoadParams; -use databend_storages_common_index::filters::Xor8Filter; +use databend_storages_common_index::filters::FilterImpl; use databend_storages_common_index::BloomIndexMeta; use databend_storages_common_table_meta::meta::Location; use databend_storages_common_table_meta::meta::SingleColumnMeta; @@ -95,11 +96,11 @@ async fn load_bloom_filter_by_columns<'a>( let column_needed: HashSet<&String> = HashSet::from_iter(column_needed); // 2.2 collects the column metas and their column ids let index_column_chunk_metas = &bloom_index_meta.columns; - let mut col_metas = Vec::with_capacity(column_needed.len()); + let mut col_metas = BTreeMap::new(); for column_name in column_needed { for (idx, (name, column_meta)) in index_column_chunk_metas.iter().enumerate() { if name == column_name { - col_metas.push((idx as ColumnId, (name, column_meta))); + col_metas.insert(idx as ColumnId, (name, column_meta)); break; } } @@ -118,7 +119,7 @@ async fn load_bloom_filter_by_columns<'a>( let futs = col_metas .iter() .map(|(idx, (_, col_chunk_meta))| { - load_column_xor8_filter( + load_column_bloom_filter( *idx, col_chunk_meta, index_path, @@ -147,13 +148,13 @@ async fn load_bloom_filter_by_columns<'a>( /// Loads bytes and index of the given column. /// read data from cache, or populate cache items if possible #[fastrace::trace] -async fn load_column_xor8_filter<'a>( +async fn load_column_bloom_filter<'a>( idx: ColumnId, col_chunk_meta: &'a SingleColumnMeta, index_path: &'a str, dal: &'a Operator, bloom_index_schema_desc: SchemaDescPtr, -) -> Result> { +) -> Result> { let storage_runtime = GlobalIORuntime::instance(); let bytes = { let column_data_reader = BloomColumnFilterReader::new( diff --git a/src/query/storages/fuse/src/io/read/bloom/column_filter_reader.rs b/src/query/storages/fuse/src/io/read/bloom/column_filter_reader.rs index 9a591b743b674..0dc0bc45bf134 100644 --- a/src/query/storages/fuse/src/io/read/bloom/column_filter_reader.rs +++ b/src/query/storages/fuse/src/io/read/bloom/column_filter_reader.rs @@ -24,7 +24,7 @@ use databend_storages_common_cache::HybridCacheReader; use databend_storages_common_cache::LoadParams; use databend_storages_common_cache::Loader; use databend_storages_common_index::filters::Filter; -use databend_storages_common_index::filters::Xor8Filter; +use databend_storages_common_index::filters::FilterImpl; use databend_storages_common_table_meta::meta::SingleColumnMeta; use opendal::Operator; use parquet::arrow::arrow_reader::ParquetRecordBatchReader; @@ -35,7 +35,7 @@ use parquet::schema::types::SchemaDescPtr; use crate::io::read::block::parquet::RowGroupImplBuilder; -type CachedReader = HybridCacheReader; +type CachedReader = HybridCacheReader; /// Load the filter of a given bloom index column. Also /// - generates the proper cache key @@ -61,7 +61,7 @@ impl BloomColumnFilterReader { num_values, } = column_chunk_meta; - let loader = Xor8FilterLoader { + let loader = BloomFilterLoader { cache_key, operator, offset: *offset, @@ -71,7 +71,7 @@ impl BloomColumnFilterReader { column_id, }; - let cached_reader = CachedReader::new(Xor8Filter::cache(), loader); + let cached_reader = CachedReader::new(FilterImpl::cache(), loader); let param = LoadParams { location: index_path, @@ -87,13 +87,13 @@ impl BloomColumnFilterReader { } #[async_backtrace::framed] - pub async fn read(&self) -> Result> { + pub async fn read(&self) -> Result> { self.cached_reader.read(&self.param).await } } /// Loader that fetch range of the target object with customized cache key -pub struct Xor8FilterLoader { +pub struct BloomFilterLoader { pub offset: u64, pub len: u64, pub num_values: u64, @@ -104,9 +104,9 @@ pub struct Xor8FilterLoader { } #[async_trait::async_trait] -impl Loader for Xor8FilterLoader { +impl Loader for BloomFilterLoader { #[async_backtrace::framed] - async fn load(&self, params: &LoadParams) -> Result { + async fn load(&self, params: &LoadParams) -> Result { let bytes = self .operator .read_with(¶ms.location) @@ -143,7 +143,7 @@ impl Loader for Xor8FilterLoader { .index(0) .unwrap(); metrics_inc_block_index_read_bytes(filter_bytes.len() as u64); - let (filter, _size) = Xor8Filter::from_bytes(filter_bytes)?; + let (filter, _size) = FilterImpl::from_bytes(filter_bytes)?; Ok(filter) } diff --git a/src/query/storages/fuse/src/io/write/block_writer.rs b/src/query/storages/fuse/src/io/write/block_writer.rs index 11881a7382d63..f23b8ebf1cad5 100644 --- a/src/query/storages/fuse/src/io/write/block_writer.rs +++ b/src/query/storages/fuse/src/io/write/block_writer.rs @@ -39,6 +39,7 @@ use databend_common_metrics::storage::metrics_inc_block_write_milliseconds; use databend_common_metrics::storage::metrics_inc_block_write_nums; use databend_common_native::write::NativeWriter; use databend_storages_common_blocks::blocks_to_parquet; +use databend_storages_common_index::NgramArgs; use databend_storages_common_table_meta::meta::BlockMeta; use databend_storages_common_table_meta::meta::ClusterStatistics; use databend_storages_common_table_meta::meta::ColumnMeta; @@ -139,6 +140,7 @@ pub struct BlockBuilder { pub write_settings: WriteSettings, pub cluster_stats_gen: ClusterStatsGenerator, pub bloom_columns_map: BTreeMap, + pub ngram_args: Vec, pub inverted_index_builders: Vec, pub virtual_column_builder: Option, pub table_meta_timestamps: TableMetaTimestamps, @@ -159,6 +161,7 @@ impl BlockBuilder { &data_block, bloom_index_location, self.bloom_columns_map.clone(), + &self.ngram_args, )?; let column_distinct_count = bloom_index_state .as_ref() diff --git a/src/query/storages/fuse/src/io/write/bloom_index_writer.rs b/src/query/storages/fuse/src/io/write/bloom_index_writer.rs index 5f2b621f5edd9..26baa1a78939e 100644 --- a/src/query/storages/fuse/src/io/write/bloom_index_writer.rs +++ b/src/query/storages/fuse/src/io/write/bloom_index_writer.rs @@ -29,6 +29,7 @@ use databend_storages_common_blocks::blocks_to_parquet; use databend_storages_common_index::filters::BlockFilter; use databend_storages_common_index::BloomIndex; use databend_storages_common_index::BloomIndexBuilder; +use databend_storages_common_index::NgramArgs; use databend_storages_common_io::ReadSettings; use databend_storages_common_table_meta::meta::column_oriented_segment::BlockReadInfo; use databend_storages_common_table_meta::meta::Location; @@ -70,9 +71,11 @@ impl BloomIndexState { block: &DataBlock, location: Location, bloom_columns_map: BTreeMap, + ngram_args: &[NgramArgs], ) -> Result> { // write index - let mut builder = BloomIndexBuilder::create(ctx.get_function_context()?, bloom_columns_map); + let mut builder = + BloomIndexBuilder::create(ctx.get_function_context()?, bloom_columns_map, ngram_args)?; builder.add_block(block)?; let maybe_bloom_index = builder.finalize()?; if let Some(bloom_index) = maybe_bloom_index { @@ -89,6 +92,7 @@ pub struct BloomIndexRebuilder { pub table_dal: Operator, pub storage_format: FuseStorageFormat, pub bloom_columns_map: BTreeMap, + pub ngram_args: Vec, } impl BloomIndexRebuilder { @@ -132,7 +136,8 @@ impl BloomIndexRebuilder { let mut builder = BloomIndexBuilder::create( self.table_ctx.get_function_context()?, self.bloom_columns_map.clone(), - ); + &self.ngram_args, + )?; builder.add_block(&data_block)?; let maybe_bloom_index = builder.finalize()?; diff --git a/src/query/storages/fuse/src/io/write/stream/block_builder.rs b/src/query/storages/fuse/src/io/write/stream/block_builder.rs index 64240200f1cbc..b388419c66bf6 100644 --- a/src/query/storages/fuse/src/io/write/stream/block_builder.rs +++ b/src/query/storages/fuse/src/io/write/stream/block_builder.rs @@ -38,6 +38,7 @@ use databend_common_native::write::NativeWriter; use databend_storages_common_index::BloomIndex; use databend_storages_common_index::BloomIndexBuilder; use databend_storages_common_index::Index; +use databend_storages_common_index::NgramArgs; use databend_storages_common_index::RangeIndex; use databend_storages_common_table_meta::meta::BlockMeta; use databend_storages_common_table_meta::meta::ColumnMeta; @@ -207,7 +208,8 @@ impl StreamBlockBuilder { let bloom_index_builder = BloomIndexBuilder::create( properties.ctx.get_function_context()?, properties.bloom_columns_map.clone(), - ); + &properties.ngram_args, + )?; let cluster_stats_state = ClusterStatisticsState::new(properties.cluster_stats_builder.clone()); @@ -356,6 +358,7 @@ pub struct StreamBlockProperties { stats_columns: Vec, distinct_columns: Vec, bloom_columns_map: BTreeMap, + ngram_args: Vec, inverted_index_builders: Vec, table_meta_timestamps: TableMetaTimestamps, } @@ -385,6 +388,7 @@ impl StreamBlockProperties { let bloom_columns_map = table .bloom_index_cols .bloom_index_fields(source_schema.clone(), BloomIndex::supported_type)?; + let ngram_args = FuseTable::create_ngram_index_args(&table.table_info.meta)?; let bloom_column_ids = bloom_columns_map .values() .map(|v| v.column_id()) @@ -420,6 +424,7 @@ impl StreamBlockProperties { stats_columns, distinct_columns, bloom_columns_map, + ngram_args, inverted_index_builders, table_meta_timestamps, })) diff --git a/src/query/storages/fuse/src/operations/changes.rs b/src/query/storages/fuse/src/operations/changes.rs index fac4f85b7a183..49431a507c471 100644 --- a/src/query/storages/fuse/src/operations/changes.rs +++ b/src/query/storages/fuse/src/operations/changes.rs @@ -298,6 +298,7 @@ impl FuseTable { ) }; let bloom_index_cols = self.bloom_index_cols(); + let ngram_args = Self::create_ngram_index_args(&self.table_info.meta)?; let mut pruner = FusePruner::create_with_pages( &ctx, self.get_operator(), @@ -306,6 +307,7 @@ impl FuseTable { cluster_key_meta, cluster_keys, bloom_index_cols, + ngram_args, None, )?; diff --git a/src/query/storages/fuse/src/operations/common/processors/transform_serialize_block.rs b/src/query/storages/fuse/src/operations/common/processors/transform_serialize_block.rs index 7b2fc6a19ec74..935db0909cf44 100644 --- a/src/query/storages/fuse/src/operations/common/processors/transform_serialize_block.rs +++ b/src/query/storages/fuse/src/operations/common/processors/transform_serialize_block.rs @@ -151,6 +151,7 @@ impl TransformSerializeBlock { let bloom_columns_map = table .bloom_index_cols .bloom_index_fields(source_schema.clone(), BloomIndex::supported_type)?; + let ngram_args = FuseTable::create_ngram_index_args(&table.table_info.meta)?; let inverted_index_builders = create_inverted_index_builders(&table.table_info.meta); let virtual_column_builder = if ctx @@ -170,6 +171,7 @@ impl TransformSerializeBlock { write_settings: table.get_write_settings(), cluster_stats_gen, bloom_columns_map, + ngram_args, inverted_index_builders, virtual_column_builder, table_meta_timestamps, diff --git a/src/query/storages/fuse/src/operations/merge.rs b/src/query/storages/fuse/src/operations/merge.rs index 6fb547541e33d..2c2f0d5f8d015 100644 --- a/src/query/storages/fuse/src/operations/merge.rs +++ b/src/query/storages/fuse/src/operations/merge.rs @@ -92,6 +92,7 @@ impl FuseTable { let bloom_columns_map = self .bloom_index_cols() .bloom_index_fields(new_schema.clone(), BloomIndex::supported_type)?; + let ngram_args = FuseTable::create_ngram_index_args(&self.table_info.meta)?; let inverted_index_builders = create_inverted_index_builders(&self.table_info.meta); let block_builder = BlockBuilder { @@ -101,6 +102,7 @@ impl FuseTable { write_settings: self.get_write_settings(), cluster_stats_gen, bloom_columns_map, + ngram_args, inverted_index_builders, // todo virtual_column_builder: None, diff --git a/src/query/storages/fuse/src/operations/mutation_source.rs b/src/query/storages/fuse/src/operations/mutation_source.rs index e3bede929be06..37c7a761cb406 100644 --- a/src/query/storages/fuse/src/operations/mutation_source.rs +++ b/src/query/storages/fuse/src/operations/mutation_source.rs @@ -190,6 +190,7 @@ impl FuseTable { self.schema_with_stream(), &push_down, self.bloom_index_cols(), + Self::create_ngram_index_args(&self.table_info.meta)?, None, )?; diff --git a/src/query/storages/fuse/src/operations/read_partitions.rs b/src/query/storages/fuse/src/operations/read_partitions.rs index f0d406524fdcf..86f766f90bc1f 100644 --- a/src/query/storages/fuse/src/operations/read_partitions.rs +++ b/src/query/storages/fuse/src/operations/read_partitions.rs @@ -39,6 +39,8 @@ use databend_common_exception::Result; use databend_common_expression::Scalar; use databend_common_expression::TableSchemaRef; use databend_common_functions::BUILTIN_FUNCTIONS; +use databend_common_meta_app::schema::TableIndexType; +use databend_common_meta_app::schema::TableMeta; use databend_common_pipeline_core::ExecutionInfo; use databend_common_pipeline_core::Pipeline; use databend_common_sql::DefaultExprBinder; @@ -46,6 +48,7 @@ use databend_common_storage::ColumnNodes; use databend_storages_common_cache::CacheAccessor; use databend_storages_common_cache::CachedObject; use databend_storages_common_index::BloomIndex; +use databend_storages_common_index::NgramArgs; use databend_storages_common_pruner::BlockMetaIndex; use databend_storages_common_pruner::TopNPrunner; use databend_storages_common_table_meta::meta::column_oriented_segment::meta_name; @@ -65,6 +68,7 @@ use databend_storages_common_table_meta::meta::BlockMeta; use databend_storages_common_table_meta::meta::ColumnStatistics; use databend_storages_common_table_meta::table::ChangeType; use databend_storages_common_table_meta::table::ClusterType; +use itertools::Itertools; use log::info; use opendal::Operator; use sha2::Digest; @@ -95,6 +99,9 @@ use crate::FuseLazyPartInfo; use crate::FuseSegmentFormat; use crate::FuseTable; +const DEFAULT_GRAM_SIZE: usize = 3; +const DEFAULT_BLOOM_SIZE: u64 = 1024 * 1024; + impl FuseTable { #[fastrace::trace] #[async_backtrace::framed] @@ -622,6 +629,7 @@ impl FuseTable { table_schema: TableSchemaRef, dal: Operator, ) -> Result { + let ngram_args = Self::create_ngram_index_args(&self.table_info.meta)?; let bloom_index_builder = if ctx .get_settings() .get_enable_auto_fix_missing_bloom_index()? @@ -638,6 +646,7 @@ impl FuseTable { table_dal: dal.clone(), storage_format, bloom_columns_map, + ngram_args: ngram_args.clone(), }) } else { None @@ -651,6 +660,7 @@ impl FuseTable { table_schema.clone(), &push_downs, self.bloom_index_cols(), + ngram_args, bloom_index_builder, )? } else { @@ -664,12 +674,44 @@ impl FuseTable { self.cluster_key_meta.clone(), cluster_keys, self.bloom_index_cols(), + ngram_args, bloom_index_builder, )? }; Ok(pruner) } + pub fn create_ngram_index_args(table_meta: &TableMeta) -> Result> { + let mut ngram_index_args = Vec::with_capacity(table_meta.indexes.len()); + for index in table_meta.indexes.values() { + if !matches!(index.index_type, TableIndexType::Ngram) { + continue; + } + if !index.sync_creation { + continue; + } + + let Some((pos, field)) = table_meta + .schema + .fields() + .iter() + .find_position(|field| field.column_id() == index.column_ids[0]) + else { + continue; + }; + let gram_size = match index.options.get("gram_size") { + None => DEFAULT_GRAM_SIZE, + Some(s) => s.parse::()?, + }; + let bloom_size = match index.options.get("bloom_size") { + None => DEFAULT_BLOOM_SIZE, + Some(s) => s.parse::()?, + }; + ngram_index_args.push(NgramArgs::new(pos, field.clone(), gram_size, bloom_size)); + } + Ok(ngram_index_args) + } + pub fn check_prune_cache( derterministic_cache_key: &Option, ) -> Option<(PartStatistics, Partitions)> { diff --git a/src/query/storages/fuse/src/operations/recluster.rs b/src/query/storages/fuse/src/operations/recluster.rs index e80ac74ad039a..816112c0176ee 100644 --- a/src/query/storages/fuse/src/operations/recluster.rs +++ b/src/query/storages/fuse/src/operations/recluster.rs @@ -249,6 +249,7 @@ impl FuseTable { None, vec![], BloomIndexColumns::None, + vec![], max_concurrency, bloom_index_builder, )?; diff --git a/src/query/storages/fuse/src/operations/replace_into/mutator/replace_into_operation_agg.rs b/src/query/storages/fuse/src/operations/replace_into/mutator/replace_into_operation_agg.rs index b00d6592e35c9..b9a4feed42ebc 100644 --- a/src/query/storages/fuse/src/operations/replace_into/mutator/replace_into_operation_agg.rs +++ b/src/query/storages/fuse/src/operations/replace_into/mutator/replace_into_operation_agg.rs @@ -48,7 +48,7 @@ use databend_storages_common_cache::CacheAccessor; use databend_storages_common_cache::CacheManager; use databend_storages_common_cache::LoadParams; use databend_storages_common_index::filters::Filter; -use databend_storages_common_index::filters::Xor8Filter; +use databend_storages_common_index::filters::FilterImpl; use databend_storages_common_index::BloomIndex; use databend_storages_common_io::ReadSettings; use databend_storages_common_table_meta::meta::BlockMeta; @@ -754,12 +754,12 @@ impl AggregationContext { location: &Location, index_len: u64, bloom_on_conflict_field_index: &[FieldIndex], - ) -> Result>>> { + ) -> Result>>> { // different block may have different version of bloom filter index let mut col_names = Vec::with_capacity(bloom_on_conflict_field_index.len()); for idx in bloom_on_conflict_field_index { - let bloom_column_name = BloomIndex::build_filter_column_name( + let bloom_column_name = BloomIndex::build_filter_bloom_name( location.1, &self.on_conflict_fields[*idx].table_field, )?; diff --git a/src/query/storages/fuse/src/pruning/bloom_pruner.rs b/src/query/storages/fuse/src/pruning/bloom_pruner.rs index 4608158b57449..2f7df2a735bf8 100644 --- a/src/query/storages/fuse/src/pruning/bloom_pruner.rs +++ b/src/query/storages/fuse/src/pruning/bloom_pruner.rs @@ -26,10 +26,12 @@ use databend_common_expression::Scalar; use databend_common_expression::TableField; use databend_common_expression::TableSchema; use databend_common_expression::TableSchemaRef; +use databend_common_expression::Value; use databend_common_sql::BloomIndexColumns; use databend_storages_common_index::filters::BlockFilter; use databend_storages_common_index::BloomIndex; use databend_storages_common_index::FilterEvalResult; +use databend_storages_common_index::NgramArgs; use databend_storages_common_table_meta::meta::column_oriented_segment::BlockReadInfo; use databend_storages_common_table_meta::meta::Location; use databend_storages_common_table_meta::meta::StatisticsOfColumns; @@ -63,8 +65,14 @@ pub struct BloomPrunerCreator { /// the expression that would be evaluate filter_expression: Expr, - /// pre calculated digest for constant Scalar - scalar_map: HashMap, + /// pre calculated digest for constant Scalar for eq conditions + eq_scalar_map: HashMap, + + /// pre calculated digest for constant Scalar for like conditions + like_scalar_map: HashMap>, + + /// Ngram args aligned with BloomColumn using Ngram + ngram_args: Vec, /// the data accessor dal: Operator, @@ -83,6 +91,7 @@ impl BloomPrunerCreator { dal: Operator, filter_expr: Option<&Expr>, bloom_index_cols: BloomIndexColumns, + ngram_args: Vec, bloom_index_builder: Option, ) -> Result>> { let Some(expr) = filter_expr else { @@ -91,26 +100,47 @@ impl BloomPrunerCreator { let bloom_columns_map = bloom_index_cols.bloom_index_fields(schema.clone(), BloomIndex::supported_type)?; let bloom_column_fields = bloom_columns_map.values().cloned().collect::>(); - let (index_fields, scalars) = BloomIndex::filter_index_field(expr, &bloom_column_fields)?; + let ngram_column_fields: Vec = + ngram_args.iter().map(|arg| arg.field().clone()).collect(); + let mut result = + BloomIndex::filter_index_field(expr, bloom_column_fields, ngram_column_fields)?; - if index_fields.is_empty() { + if result.bloom_fields.is_empty() && result.ngram_fields.is_empty() { return Ok(None); } // convert to filter column names - let mut scalar_map = HashMap::::new(); - for (scalar, ty) in scalars.into_iter() { - if let Entry::Vacant(e) = scalar_map.entry(scalar) { + let mut eq_scalar_map = HashMap::::new(); + for (_, scalar, ty) in result.bloom_scalars.into_iter() { + if let Entry::Vacant(e) = eq_scalar_map.entry(scalar) { let digest = BloomIndex::calculate_scalar_digest(&func_ctx, e.key(), &ty)?; e.insert(digest); } } + let mut like_scalar_map = HashMap::>::new(); + for (i, scalar) in result.ngram_scalars.into_iter() { + let Some(digests) = BloomIndex::calculate_ngram_nullable_column( + Value::Scalar(scalar.clone()), + ngram_args[i].gram_size(), + BloomIndex::ngram_hash, + ) + .next() else { + continue; + }; + if let Entry::Vacant(e) = like_scalar_map.entry(scalar) { + e.insert(digests); + } + } + let mut index_fields = result.bloom_fields; + index_fields.append(&mut result.ngram_fields); Ok(Some(Arc::new(Self { func_ctx, index_fields, filter_expression: expr.clone(), - scalar_map, + eq_scalar_map, + like_scalar_map, + ngram_args, dal, data_schema: schema.clone(), bloom_index_builder, @@ -134,7 +164,13 @@ impl BloomPrunerCreator { Vec::with_capacity(self.index_fields.len()), |mut acc, field| { if column_ids_of_indexed_block.contains(&field.column_id()) { - acc.push(BloomIndex::build_filter_column_name(version, field)?); + acc.push(BloomIndex::build_filter_bloom_name(version, field)?); + } + if let Some(ngram_arg) = self.ngram_args.iter().find(|arg| arg.field() == field) { + acc.push(BloomIndex::build_filter_ngram_name( + field, + ngram_arg.gram_size(), + )); } Ok::<_, ErrorCode>(acc) }, @@ -186,7 +222,9 @@ impl BloomPrunerCreator { )? .apply( self.filter_expression.clone(), - &self.scalar_map, + &self.eq_scalar_map, + &self.like_scalar_map, + &self.ngram_args, column_stats, self.data_schema.clone(), )? != FilterEvalResult::MustFalse), diff --git a/src/query/storages/fuse/src/pruning/fuse_pruner.rs b/src/query/storages/fuse/src/pruning/fuse_pruner.rs index d19c37cf36794..a30493089d25c 100644 --- a/src/query/storages/fuse/src/pruning/fuse_pruner.rs +++ b/src/query/storages/fuse/src/pruning/fuse_pruner.rs @@ -31,6 +31,7 @@ use databend_common_sql::DefaultExprBinder; use databend_storages_common_cache::CacheAccessor; use databend_storages_common_cache::CacheManager; use databend_storages_common_cache::SegmentBlockMetasCache; +use databend_storages_common_index::NgramArgs; use databend_storages_common_index::RangeIndex; use databend_storages_common_pruner::BlockMetaIndex; use databend_storages_common_pruner::InternalColumnPruner; @@ -94,6 +95,7 @@ impl PruningContext { cluster_key_meta: Option, cluster_keys: Vec>, bloom_index_cols: BloomIndexColumns, + ngram_args: Vec, max_concurrency: usize, bloom_index_builder: Option, ) -> Result> { @@ -149,6 +151,7 @@ impl PruningContext { dal.clone(), filter_expr.as_ref(), bloom_index_cols, + ngram_args, bloom_index_builder, )?; @@ -219,6 +222,7 @@ impl FusePruner { table_schema: TableSchemaRef, push_down: &Option, bloom_index_cols: BloomIndexColumns, + ngram_args: Vec, bloom_index_builder: Option, ) -> Result { Self::create_with_pages( @@ -229,6 +233,7 @@ impl FusePruner { None, vec![], bloom_index_cols, + ngram_args, bloom_index_builder, ) } @@ -242,6 +247,7 @@ impl FusePruner { cluster_key_meta: Option, cluster_keys: Vec>, bloom_index_cols: BloomIndexColumns, + ngram_args: Vec, bloom_index_builder: Option, ) -> Result { let max_concurrency = { @@ -267,6 +273,7 @@ impl FusePruner { cluster_key_meta, cluster_keys, bloom_index_cols, + ngram_args, max_concurrency, bloom_index_builder, )?; diff --git a/src/query/storages/fuse/src/statistics/reducers.rs b/src/query/storages/fuse/src/statistics/reducers.rs index 4a59f9bcc6e03..05a7a852a5815 100644 --- a/src/query/storages/fuse/src/statistics/reducers.rs +++ b/src/query/storages/fuse/src/statistics/reducers.rs @@ -234,7 +234,6 @@ pub fn reduce_block_metas>( compressed_byte_size += b.file_size; index_size += b.bloom_filter_index_size; index_size += b.inverted_index_size.unwrap_or_default(); - index_size += b.ngram_filter_index_size.unwrap_or_default(); if let Some(virtual_block_meta) = &b.virtual_block_meta { index_size += virtual_block_meta.virtual_column_size; virtual_block_count += 1; diff --git a/src/query/storages/fuse/src/table_functions/fuse_block.rs b/src/query/storages/fuse/src/table_functions/fuse_block.rs index 20990f1aac3ef..e81525fbc11b8 100644 --- a/src/query/storages/fuse/src/table_functions/fuse_block.rs +++ b/src/query/storages/fuse/src/table_functions/fuse_block.rs @@ -65,10 +65,6 @@ impl TableMetaFunc for FuseBlock { "inverted_index_size", TableDataType::Nullable(Box::new(TableDataType::Number(NumberDataType::UInt64))), ), - TableField::new( - "ngram_index_size", - TableDataType::Nullable(Box::new(TableDataType::Number(NumberDataType::UInt64))), - ), ]) } @@ -90,7 +86,6 @@ impl TableMetaFunc for FuseBlock { let mut bloom_filter_location = vec![]; let mut bloom_filter_size = Vec::with_capacity(len); let mut inverted_index_size = Vec::with_capacity(len); - let mut ngram_index_size = Vec::with_capacity(len); let segments_io = SegmentsIO::create(ctx.clone(), tbl.operator.clone(), tbl.schema()); @@ -118,7 +113,6 @@ impl TableMetaFunc for FuseBlock { ); bloom_filter_size.push(block.bloom_filter_index_size); inverted_index_size.push(block.inverted_index_size); - ngram_index_size.push(block.ngram_filter_index_size); row_num += 1; if row_num >= limit { @@ -163,10 +157,6 @@ impl TableMetaFunc for FuseBlock { DataType::Nullable(Box::new(DataType::Number(NumberDataType::UInt64))), Value::Column(UInt64Type::from_opt_data(inverted_index_size)), ), - BlockEntry::new( - DataType::Nullable(Box::new(DataType::Number(NumberDataType::UInt64))), - Value::Column(UInt64Type::from_opt_data(ngram_index_size)), - ), ], row_num, )) diff --git a/tests/sqllogictests/suites/ee/08_ee_ngram_index/08_0000_ngram_index_base.test b/tests/sqllogictests/suites/ee/08_ee_ngram_index/08_0000_ngram_index_base.test index 13a19c7368cc5..ead31ef70932a 100644 --- a/tests/sqllogictests/suites/ee/08_ee_ngram_index/08_0000_ngram_index_base.test +++ b/tests/sqllogictests/suites/ee/08_ee_ngram_index/08_0000_ngram_index_base.test @@ -27,6 +27,18 @@ CREATE TABLE t1 (id int, content string, NGRAM INDEX idx1 (content) gram_size = statement ok CREATE TABLE t2 (id int, content string) +statement error +CREATE NGRAM INDEX idx2 ON t2(content) gram_size = 0 + +statement error +CREATE NGRAM INDEX idx2 ON t2(content) bitmap_size = 0 + +statement error +CREATE NGRAM INDEX idx2 ON t2(content) bitmap_size = 511 + +statement error +CREATE NGRAM INDEX idx2 ON t2(content) bitmap_size = 10485761 + statement ok CREATE NGRAM INDEX idx2 ON t2(content) gram_size = 5 @@ -47,6 +59,11 @@ select name, type, original, definition from system.indexes; ---- idx1 NGRAM (empty) t1(content)gram_size='5' +query T +show create table t1; +---- +t1 CREATE TABLE t1 ( id INT NULL, content VARCHAR NULL, SYNC NGRAM INDEX idx1 (content) gram_size = '5' ) ENGINE=FUSE + statement ok use default diff --git a/tests/sqllogictests/suites/mode/standalone/ee/explain_ngram_index.test b/tests/sqllogictests/suites/mode/standalone/ee/explain_ngram_index.test new file mode 100644 index 0000000000000..241631c0d9a38 --- /dev/null +++ b/tests/sqllogictests/suites/mode/standalone/ee/explain_ngram_index.test @@ -0,0 +1,345 @@ +## Copyright 2023 Databend Cloud +## +## Licensed under the Elastic License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## https://www.elastic.co/licensing/elastic-license +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. + +statement ok +DROP DATABASE IF EXISTS test_ngram_index_db + +statement ok +CREATE DATABASE test_ngram_index_db + +statement ok +USE test_ngram_index_db + +statement ok +DROP TABLE IF EXISTS t1 + +statement ok +CREATE TABLE t1 (id int, content string) row_per_block=2 storage_format='parquet' + +statement ok +CREATE NGRAM INDEX IF NOT EXISTS idx1 ON t1(content) + +statement ok +INSERT INTO t1 VALUES +(1, 'The quick brown fox jumps over the lazy dog'), +(2, 'A picture is worth a thousand words'), +(3, 'The early bird catches the worm'), +(4, 'Actions speak louder than words'), +(5, 'Time flies like an arrow; fruit flies like a banana'), +(6, 'Beauty is in the eye of the beholder'), +(7, 'When life gives you lemons, make lemonade'), +(8, 'Put all your eggs in one basket'), +(9, 'You can not judge a book by its cover'), +(10, 'An apple a day keeps the doctor away'), +(11, '江火暗还明'), +(12, '孤舟枕浪轻'), +(13, '风来云欲语'), +(14, '潮退月无声'), +(15, '客梦三更短'), +(16, '乡愁一水横') + +query T +EXPLAIN SELECT id, content FROM t1 WHERE content LIKE '%yo%' +---- +Filter +├── output columns: [t1.id (#0), t1.content (#1)] +├── filters: [is_true(like(t1.content (#1), '%yo%'))] +├── estimated rows: 8.00 +└── TableScan + ├── table: default.test_ngram_index_db.t1 + ├── output columns: [id (#0), content (#1)] + ├── read rows: 16 + ├── read size: 1.03 KiB + ├── partitions total: 8 + ├── partitions scanned: 8 + ├── pruning stats: [segments: , blocks: ] + ├── push downs: [filters: [is_true(like(t1.content (#1), '%yo%'))], limit: NONE] + └── estimated rows: 16.00 + +query T +EXPLAIN SELECT id, content FROM t1 WHERE content LIKE '%your eggs%' +---- +Filter +├── output columns: [t1.id (#0), t1.content (#1)] +├── filters: [is_true(like(t1.content (#1), '%your eggs%'))] +├── estimated rows: 0.06 +└── TableScan + ├── table: default.test_ngram_index_db.t1 + ├── output columns: [id (#0), content (#1)] + ├── read rows: 2 + ├── read size: < 1 KiB + ├── partitions total: 8 + ├── partitions scanned: 1 + ├── pruning stats: [segments: , blocks: ] + ├── push downs: [filters: [is_true(like(t1.content (#1), '%your eggs%'))], limit: NONE] + └── estimated rows: 16.00 + +query T +EXPLAIN SELECT id, content FROM t1 WHERE content LIKE '%your eggs' +---- +Filter +├── output columns: [t1.id (#0), t1.content (#1)] +├── filters: [is_true(like(t1.content (#1), '%your eggs'))] +├── estimated rows: 0.03 +└── TableScan + ├── table: default.test_ngram_index_db.t1 + ├── output columns: [id (#0), content (#1)] + ├── read rows: 2 + ├── read size: < 1 KiB + ├── partitions total: 8 + ├── partitions scanned: 1 + ├── pruning stats: [segments: , blocks: ] + ├── push downs: [filters: [is_true(like(t1.content (#1), '%your eggs'))], limit: NONE] + └── estimated rows: 16.00 + +query T +EXPLAIN SELECT id, content FROM t1 WHERE content LIKE 'A picture%' +---- +Filter +├── output columns: [t1.id (#0), t1.content (#1)] +├── filters: [is_true(t1.content (#1) >= 'A picture'), is_true(t1.content (#1) < 'A picturf')] +├── estimated rows: 3.20 +└── TableScan + ├── table: default.test_ngram_index_db.t1 + ├── output columns: [id (#0), content (#1)] + ├── read rows: 2 + ├── read size: < 1 KiB + ├── partitions total: 8 + ├── partitions scanned: 1 + ├── pruning stats: [segments: , blocks: ] + ├── push downs: [filters: [and_filters(t1.content (#1) >= 'A picture', t1.content (#1) < 'A picturf')], limit: NONE] + └── estimated rows: 16.00 + +query T +EXPLAIN SELECT id, content FROM t1 WHERE content LIKE '%风来%' +---- +Filter +├── output columns: [t1.id (#0), t1.content (#1)] +├── filters: [is_true(like(t1.content (#1), '%风来%'))] +├── estimated rows: 8.00 +└── TableScan + ├── table: default.test_ngram_index_db.t1 + ├── output columns: [id (#0), content (#1)] + ├── read rows: 16 + ├── read size: 1.03 KiB + ├── partitions total: 8 + ├── partitions scanned: 8 + ├── pruning stats: [segments: , blocks: ] + ├── push downs: [filters: [is_true(like(t1.content (#1), '%风来%'))], limit: NONE] + └── estimated rows: 16.00 + +query T +EXPLAIN SELECT id, content FROM t1 WHERE content LIKE '%月无声%' +---- +Filter +├── output columns: [t1.id (#0), t1.content (#1)] +├── filters: [is_true(like(t1.content (#1), '%月无声%'))] +├── estimated rows: 4.00 +└── TableScan + ├── table: default.test_ngram_index_db.t1 + ├── output columns: [id (#0), content (#1)] + ├── read rows: 2 + ├── read size: < 1 KiB + ├── partitions total: 8 + ├── partitions scanned: 1 + ├── pruning stats: [segments: , blocks: ] + ├── push downs: [filters: [is_true(like(t1.content (#1), '%月无声%'))], limit: NONE] + └── estimated rows: 16.00 + +query T +SELECT id, content FROM t1 WHERE content LIKE '%your eggs%' +---- +8 Put all your eggs in one basket + +statement ok +DROP TABLE IF EXISTS t2 + +statement ok +CREATE TABLE t2 (id int, content string) row_per_block=2 storage_format='native' + +statement ok +CREATE NGRAM INDEX IF NOT EXISTS idx1 ON t2(content) + +statement ok +INSERT INTO t2 VALUES +(1, 'The quick brown fox jumps over the lazy dog'), +(2, 'A picture is worth a thousand words'), +(3, 'The early bird catches the worm'), +(4, 'Actions speak louder than words'), +(5, 'Time flies like an arrow; fruit flies like a banana'), +(6, 'Beauty is in the eye of the beholder'), +(7, 'When life gives you lemons, make lemonade'), +(8, 'Put all your eggs in one basket'), +(9, 'You can not judge a book by its cover'), +(10, 'An apple a day keeps the doctor away'), +(11, '江火暗还明'), +(12, '孤舟枕浪轻'), +(13, '风来云欲语'), +(14, '潮退月无声'), +(15, '客梦三更短'), +(16, '乡愁一水横') + +query T +EXPLAIN SELECT id, content FROM t2 WHERE content LIKE '%your eggs%' +---- +TableScan +├── table: default.test_ngram_index_db.t2 +├── output columns: [id (#0), content (#1)] +├── read rows: 2 +├── read size: < 1 KiB +├── partitions total: 8 +├── partitions scanned: 1 +├── pruning stats: [segments: , blocks: ] +├── push downs: [filters: [is_true(like(t2.content (#1), '%your eggs%'))], limit: NONE] +└── estimated rows: 0.06 + +query T +EXPLAIN SELECT id, content FROM t2 WHERE content LIKE '%your eggs' +---- +TableScan +├── table: default.test_ngram_index_db.t2 +├── output columns: [id (#0), content (#1)] +├── read rows: 2 +├── read size: < 1 KiB +├── partitions total: 8 +├── partitions scanned: 1 +├── pruning stats: [segments: , blocks: ] +├── push downs: [filters: [is_true(like(t2.content (#1), '%your eggs'))], limit: NONE] +└── estimated rows: 0.03 + +query T +EXPLAIN SELECT id, content FROM t2 WHERE content LIKE 'A picture%' +---- +TableScan +├── table: default.test_ngram_index_db.t2 +├── output columns: [id (#0), content (#1)] +├── read rows: 2 +├── read size: < 1 KiB +├── partitions total: 8 +├── partitions scanned: 1 +├── pruning stats: [segments: , blocks: ] +├── push downs: [filters: [and_filters(t2.content (#1) >= 'A picture', t2.content (#1) < 'A picturf')], limit: NONE] +└── estimated rows: 3.20 + + +query T +SELECT id, content FROM t1 WHERE content LIKE '%your eggs%' +---- +8 Put all your eggs in one basket + +query T +EXPLAIN SELECT id, content FROM t2 WHERE content LIKE '%yo%' +---- +TableScan +├── table: default.test_ngram_index_db.t2 +├── output columns: [id (#0), content (#1)] +├── read rows: 16 +├── read size: 1.26 KiB +├── partitions total: 8 +├── partitions scanned: 8 +├── pruning stats: [segments: , blocks: ] +├── push downs: [filters: [is_true(like(t2.content (#1), '%yo%'))], limit: NONE] +└── estimated rows: 8.00 + +query T +EXPLAIN SELECT id, content FROM t2 WHERE content LIKE '%风来%' +---- +TableScan +├── table: default.test_ngram_index_db.t2 +├── output columns: [id (#0), content (#1)] +├── read rows: 16 +├── read size: 1.26 KiB +├── partitions total: 8 +├── partitions scanned: 8 +├── pruning stats: [segments: , blocks: ] +├── push downs: [filters: [is_true(like(t2.content (#1), '%风来%'))], limit: NONE] +└── estimated rows: 8.00 + +query T +EXPLAIN SELECT id, content FROM t2 WHERE content LIKE '%月无声%' +---- +TableScan +├── table: default.test_ngram_index_db.t2 +├── output columns: [id (#0), content (#1)] +├── read rows: 2 +├── read size: < 1 KiB +├── partitions total: 8 +├── partitions scanned: 1 +├── pruning stats: [segments: , blocks: ] +├── push downs: [filters: [is_true(like(t2.content (#1), '%月无声%'))], limit: NONE] +└── estimated rows: 4.00 + +# Tests whether the ngram index with the same column and the same parameters +# will use the old index after the column is removed +statement ok +DROP ngram INDEX idx1 ON t2 + +statement ok +ALTER TABLE t2 DROP COLUMN content; + +statement ok +ALTER TABLE t2 ADD COLUMN content string; + +statement ok +INSERT INTO t2 VALUES +(17, 'The quick brown fox jumps over the lazy dog'), +(18, 'A picture is worth a thousand words'), +(19, 'The early bird catches the worm'), +(20, 'Actions speak louder than words'), +(21, 'Time flies like an arrow; fruit flies like a banana'), +(22, 'Beauty is in the eye of the beholder'), +(23, 'When life gives you lemons, make lemonade'), +(24, 'Put all your eggs in one basket'), +(25, 'You can not judge a book by its cover'), +(26, 'An apple a day keeps the doctor away') + +query T +EXPLAIN SELECT id, content FROM t2 WHERE content LIKE '%your eggs%' +---- +TableScan +├── table: default.test_ngram_index_db.t2 +├── output columns: [id (#0), content (#1)] +├── read rows: 10 +├── read size: < 1 KiB +├── partitions total: 2 +├── partitions scanned: 5 +├── pruning stats: [segments: , blocks: ] +├── push downs: [filters: [is_true(like(t2.content (#1), '%your eggs%'))], limit: NONE] +└── estimated rows: 0.10 + +statement ok +CREATE NGRAM INDEX IF NOT EXISTS idx1 ON t2(content) + +statement ok +INSERT INTO t2 VALUES (27, 'The Anthem of man is the Anthem of courage') + +query T +EXPLAIN SELECT id, content FROM t2 WHERE content LIKE '%your eggs%' +---- +TableScan +├── table: default.test_ngram_index_db.t2 +├── output columns: [id (#0), content (#1)] +├── read rows: 10 +├── read size: < 1 KiB +├── partitions total: 3 +├── partitions scanned: 5 +├── pruning stats: [segments: , blocks: ] +├── push downs: [filters: [is_true(like(t2.content (#1), '%your eggs%'))], limit: NONE] +└── estimated rows: 0.11 + +statement ok +USE default + +statement ok +DROP DATABASE IF EXISTS test_ngram_index_db