diff --git a/Cargo.lock b/Cargo.lock index c2959a314380f..70cb8d79605f7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -635,7 +635,7 @@ source = "git+https://github.com/datafuse-extras/async-backtrace.git?rev=dea4553 dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -736,7 +736,7 @@ source = "git+https://github.com/datafuse-extras/async-recursion.git?rev=a353334 dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -758,7 +758,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -775,7 +775,7 @@ checksum = "d556ec1359574147ec0c4fc5eb525f3f23263a592b1a9c07e0a75b427de55c97" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -1346,7 +1346,7 @@ dependencies = [ "regex", "rustc-hash 1.1.0", "shlex", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -1534,7 +1534,7 @@ dependencies = [ "proc-macro-crate 3.1.0", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", "syn_derive", ] @@ -1676,7 +1676,7 @@ checksum = "1ee891b04274a59bd38b412188e24b849617b2e45a0fd8d057deb63e7403761b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -2138,7 +2138,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -2812,7 +2812,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edb49164822f3ee45b17acd4a208cfc1251410cf0cad9a833234c9890774dd9f" dependencies = [ "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -2855,7 +2855,7 @@ dependencies = [ "proc-macro2", "quote", "strsim 0.11.1", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -2866,7 +2866,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -3292,7 +3292,7 @@ dependencies = [ "hex", "hyper-util", "itertools 0.13.0", - "jiff 0.2.1", + "jiff 0.2.4", "jsonb", "lexical-core", "log", @@ -3339,7 +3339,7 @@ dependencies = [ "geozero", "goldenfile", "hex", - "jiff 0.2.1", + "jiff 0.2.4", "jsonb", "lexical-core", "match-template", @@ -3394,7 +3394,7 @@ dependencies = [ "jaq-interpret", "jaq-parse", "jaq-std", - "jiff 0.2.1", + "jiff 0.2.4", "jsonb", "lexical-core", "libm", @@ -3487,7 +3487,7 @@ dependencies = [ "geo", "geozero", "hex", - "jiff 0.2.1", + "jiff 0.2.4", "lexical-core", "micromarshal", "rmp-serde", @@ -4590,7 +4590,7 @@ dependencies = [ "databend-storages-common-cache", "futures", "itertools 0.13.0", - "jiff 0.2.1", + "jiff 0.2.4", "jsonb", "log", "once_cell", @@ -4920,7 +4920,7 @@ dependencies = [ "databend-common-exception", "databend-common-expression", "dtparse", - "jiff 0.2.1", + "jiff 0.2.4", "num-traits", ] @@ -5186,7 +5186,7 @@ dependencies = [ "hyper-util", "indicatif", "itertools 0.13.0", - "jiff 0.2.1", + "jiff 0.2.4", "jsonb", "jwt-simple", "log", @@ -5290,7 +5290,7 @@ dependencies = [ "derive-visitor", "ethnum", "itertools 0.13.0", - "jiff 0.2.1", + "jiff 0.2.4", "jsonb", "rand 0.8.5", "reqwest", @@ -5467,7 +5467,7 @@ dependencies = [ "enum-ordinalize", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -5569,7 +5569,7 @@ checksum = "0c8e41236d5a9f04da3072d7186a76aba734e7bfd2cd05f7877fde172b65fb11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -5707,7 +5707,7 @@ dependencies = [ 
"darling", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -5717,7 +5717,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b" dependencies = [ "derive_builder_core", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -5738,7 +5738,7 @@ dependencies = [ "convert_case 0.6.0", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", "unicode-xid", ] @@ -5815,7 +5815,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -5928,7 +5928,7 @@ dependencies = [ "enum-ordinalize", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -6012,7 +6012,7 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -6032,7 +6032,7 @@ checksum = "0d28318a75d4aead5c4db25382e8ef717932d0346600cacae6357eb5941bc5ff" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -6044,7 +6044,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -6065,7 +6065,7 @@ checksum = "de0d48a183585823424a4ce1aa132d174a6a81bd540895822eb4c8373a8e49e8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -6131,7 +6131,7 @@ checksum = "3bf679796c0322556351f287a51b49e48f7c4986e727b5dd78c972d30e2e16cc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -6283,7 +6283,7 @@ dependencies = [ "proc-macro-error2", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -6529,7 +6529,7 @@ checksum = "b0fa992f1656e1707946bbba340ad244f0814009ef8c0118eb7b658395f19a2e" dependencies = [ "frunk_proc_macro_helpers", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -6541,7 +6541,7 @@ dependencies = [ "frunk_core", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -6553,7 +6553,7 @@ dependencies = [ "frunk_core", "frunk_proc_macro_helpers", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -6651,7 +6651,7 @@ checksum = "5ac45ed0bddbd110eb68862768a194f88700f5b91c39931d2f432fab67a16d08" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -6716,7 +6716,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -7388,7 +7388,7 @@ checksum = "999ce923619f88194171a67fb3e6d613653b8d4d6078b529b15a765da0edcc17" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -8680,7 +8680,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -9139,10 +9139,11 @@ dependencies = [ [[package]] name = "jiff" -version = "0.2.1" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3590fea8e9e22d449600c9bbd481a8163bef223e4ff938e5f55899f8cf1adb93" +checksum = "d699bc6dfc879fb1bf9bdff0d4c56f0884fc6f0d0eb0fba397a6d00cd9a6b85e" dependencies = [ + "jiff-static", "jiff-tzdb", "jiff-tzdb-platform", "log", @@ -9152,6 +9153,17 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "jiff-static" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8d16e75759ee0aa64c57a56acbf43916987b20c77373cb7e808979e02b93c9f9" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "jiff-tzdb" version = "0.1.2" @@ -10092,7 +10104,7 @@ dependencies = [ "proc-macro-error 1.0.4", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", "termcolor", "thiserror 1.0.65", ] @@ -10270,7 +10282,7 @@ dependencies = [ "proc-macro-error 1.0.4", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -10465,7 +10477,7 @@ dependencies = [ "proc-macro-crate 3.1.0", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -10649,7 +10661,7 @@ dependencies = [ "proc-macro2", "quote", "semver", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -10691,7 +10703,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -11043,7 +11055,7 @@ dependencies = [ "regex", "regex-syntax 0.8.4", "structmeta", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -11195,7 +11207,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -11342,7 +11354,7 @@ dependencies = [ "proc-macro-crate 3.1.0", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -11476,7 +11488,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7" dependencies = [ "proc-macro2", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -11576,7 +11588,7 @@ dependencies = [ "proc-macro-error-attr2", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -11587,9 +11599,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.92" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" +checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84" dependencies = [ "unicode-ident", ] @@ -11665,7 +11677,7 @@ checksum = "440f724eba9f6996b75d63681b0a92b06947f1457076d503a4d2e2c8f56442b8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -11722,7 +11734,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.95", + "syn 2.0.100", "tempfile", ] @@ -11736,7 +11748,7 @@ dependencies = [ "itertools 0.14.0", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -11941,7 +11953,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -11954,7 +11966,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -12069,9 +12081,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.36" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" dependencies = [ "proc-macro2", ] @@ -12215,7 +12227,7 @@ checksum = "8b86292cf41ccfc96c5de7165c1c53d5b4ac540c5bab9d1857acbe9eba5f1a0b" dependencies = [ "proc-macro-hack", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -12268,7 +12280,7 @@ version = "0.1.1" source = 
"git+https://github.com/datafuse-extras/recursive.git?rev=6af35a1#6af35a1e59e7050f86ee19fbd0a79535d016c87d" dependencies = [ "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -12713,7 +12725,7 @@ dependencies = [ "proc-macro2", "quote", "rquickjs-core", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -13207,7 +13219,7 @@ checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -13249,7 +13261,7 @@ checksum = "6c64451ba24fc7a6a2d60fc75dd9c83c90903b19028d4eff35e88fc1e86564e9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -13319,7 +13331,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -13577,7 +13589,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -13789,7 +13801,7 @@ dependencies = [ "quote", "sqlx-core", "sqlx-macros-core", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -13812,7 +13824,7 @@ dependencies = [ "sqlx-mysql", "sqlx-postgres", "sqlx-sqlite", - "syn 2.0.95", + "syn 2.0.100", "tempfile", "tokio", "url", @@ -14016,7 +14028,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -14027,7 +14039,7 @@ checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -14068,7 +14080,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -14224,9 +14236,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.95" +version = "2.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46f71c0377baf4ef1cc3e3402ded576dccc315800fbc62dfc7fe04b009773b4a" +checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" dependencies = [ "proc-macro2", "quote", @@ -14253,7 +14265,7 @@ dependencies = [ "proc-macro-error 1.0.4", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -14279,7 +14291,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -14570,7 +14582,7 @@ checksum = "e71277381bd8b17eea2126a849dced540862c498398d4dd52405233a5d3cc643" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -14637,7 +14649,7 @@ checksum = "ae71770322cbd277e69d762a16c444af02aa0575ac0d174f0b9562d3b37f8602" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -14648,7 +14660,7 @@ checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -14808,7 +14820,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -15018,7 +15030,7 @@ dependencies = [ "prost-build", "prost-types", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -15100,7 +15112,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -15234,7 +15246,7 @@ checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8" 
dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -15245,7 +15257,7 @@ checksum = "560b82d656506509d43abe30e0ba64c56b1953ab3d4fe7ba5902747a7a3cedd5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -15281,7 +15293,7 @@ checksum = "70b20a22c42c8f1cd23ce5e34f165d4d37038f5b663ad20fb6adbdf029172483" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -15600,7 +15612,7 @@ checksum = "d674d135b4a8c1d7e813e2f8d1c9a58308aee4a680323066025e53132218bd91" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -15763,7 +15775,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", "wasm-bindgen-shared", ] @@ -15797,7 +15809,7 @@ checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -15972,7 +15984,7 @@ dependencies = [ "anyhow", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", "wasmtime-component-util", "wasmtime-wit-bindgen", "wit-parser", @@ -16088,7 +16100,7 @@ checksum = "df09be00c38f49172ca9936998938476e3f2df782673a39ae2ef9fb0838341b6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -16234,7 +16246,7 @@ dependencies = [ "proc-macro2", "quote", "shellexpand", - "syn 2.0.95", + "syn 2.0.100", "witx", ] @@ -16246,7 +16258,7 @@ checksum = "9b8eb1a5783540696c59cefbfc9e52570c2d5e62bd47bdf0bdcef29231879db2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", "wiggle-generate", ] @@ -16363,7 +16375,7 @@ checksum = "2bbd5b46c938e506ecbce286b6628a02171d56153ba733b6c741fc627ec9579b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -16374,7 +16386,7 @@ checksum = "053c4c462dc91d3b1504c6fe5a726dd15e216ba718e84a0e46a88fbe5ded3515" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -16833,7 +16845,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", "synstructure", ] @@ -16870,7 +16882,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -16881,7 +16893,7 @@ checksum = "6352c01d0edd5db859a63e2605f4ea3183ddbd15e2c4a9e7d32184df75e4f154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -16901,7 +16913,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", "synstructure", ] @@ -16930,7 +16942,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 1c9184126890d..d217d582b50a9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -587,9 +587,8 @@ result_large_err = "allow" [profile.release] debug = 1 lto = "thin" -overflow-checks = false -opt-level = "s" # defaults to be 3 -incremental = true +opt-level = "s" # defaults to be 3 +#incremental = true [profile.ci] inherits = "release" diff --git a/src/common/base/src/runtime/memory/mem_stat.rs b/src/common/base/src/runtime/memory/mem_stat.rs index 
295ff6e4927ff..d646d925750ce 100644 --- a/src/common/base/src/runtime/memory/mem_stat.rs +++ b/src/common/base/src/runtime/memory/mem_stat.rs @@ -40,14 +40,14 @@ pub struct MemStat { name: Option<String>, pub(crate) used: AtomicI64, - pub(crate) peek_used: AtomicI64, + pub(crate) peak_used: AtomicI64, /// The limit of max used memory for this tracker. /// /// Set to 0 to disable the limit. limit: AtomicI64, - parent_memory_stat: Vec<Arc<MemStat>>, + parent_memory_stat: Option<Arc<MemStat>>, } impl MemStat { @@ -56,17 +56,17 @@ impl MemStat { id: 0, name: None, used: AtomicI64::new(0), - peek_used: AtomicI64::new(0), + peak_used: AtomicI64::new(0), limit: AtomicI64::new(0), - parent_memory_stat: vec![], + parent_memory_stat: None, } } pub fn create(name: String) -> Arc<MemStat> { - MemStat::create_child(name, vec![]) + MemStat::create_child(name, None) } - pub fn create_child(name: String, parent_memory_stat: Vec<Arc<MemStat>>) -> Arc<MemStat> { + pub fn create_child(name: String, parent_memory_stat: Option<Arc<MemStat>>) -> Arc<MemStat> { let id = match GlobalSequence::next() { 0 => GlobalSequence::next(), id => id, @@ -76,16 +76,12 @@ impl MemStat { id, name: Some(name), used: AtomicI64::new(0), - peek_used: AtomicI64::new(0), + peak_used: AtomicI64::new(0), limit: AtomicI64::new(0), parent_memory_stat, }) } - pub fn get_parent_memory_stat(&self) -> Vec<Arc<MemStat>> { - self.parent_memory_stat.clone() - } - pub fn set_limit(&self, mut size: i64) { // It may cause the process unable to run if memory limit is too low. if size > 0 && size < MINIMUM_MEMORY_LIMIT { @@ -107,19 +103,15 @@ impl MemStat { let mut used = self.used.fetch_add(batch_memory_used, Ordering::Relaxed); used += batch_memory_used; - self.peek_used.fetch_max(used, Ordering::Relaxed); + self.peak_used.fetch_max(used, Ordering::Relaxed); - for (idx, parent_memory_stat) in self.parent_memory_stat.iter().enumerate() { + if let Some(parent_memory_stat) = self.parent_memory_stat.as_ref() { if let Err(cause) = parent_memory_stat .record_memory::<NEED_ROLLBACK>(batch_memory_used, current_memory_alloc) { if NEED_ROLLBACK { // We only roll back the memory that alloc failed self.used.fetch_sub(current_memory_alloc, Ordering::Relaxed); - - for index in 0..idx { - self.parent_memory_stat[index].rollback(current_memory_alloc); - } } return Err(cause); @@ -142,8 +134,8 @@ impl MemStat { pub fn rollback(&self, memory_usage: i64) { self.used.fetch_sub(memory_usage, Ordering::Relaxed); - for parent_memory_stat in &self.parent_memory_stat { - parent_memory_stat.rollback(memory_usage) + if let Some(parent_memory_stat) = &self.parent_memory_stat { + parent_memory_stat.rollback(memory_usage); } } @@ -171,7 +163,7 @@ impl MemStat { #[inline] pub fn get_peek_memory_usage(&self) -> i64 { - self.peek_used.load(Ordering::Relaxed) + self.peak_used.load(Ordering::Relaxed) } } @@ -268,7 +260,7 @@ mod tests { fn test_multiple_level_mem_stat() -> Result<()> { let mem_stat = MemStat::create("TEST".to_string()); let child_mem_stat = - MemStat::create_child("TEST_CHILD".to_string(), vec![mem_stat.clone()]); + MemStat::create_child("TEST_CHILD".to_string(), Some(mem_stat.clone())); mem_stat.record_memory::(1, 1).unwrap(); mem_stat.record_memory::(2, 2).unwrap(); @@ -292,7 +284,7 @@ mod tests { let mem_stat = MemStat::create("TEST".to_string()); mem_stat.set_limit(MINIMUM_MEMORY_LIMIT * 2); let child_mem_stat = - MemStat::create_child("TEST_CHILD".to_string(), vec![mem_stat.clone()]); + MemStat::create_child("TEST_CHILD".to_string(), Some(mem_stat.clone())); child_mem_stat.set_limit(MINIMUM_MEMORY_LIMIT); mem_stat.record_memory::(1, 1).unwrap(); @@ -322,7 +314,7 @@ mod
tests { let mem_stat = MemStat::create("TEST".to_string()); mem_stat.set_limit(MINIMUM_MEMORY_LIMIT); let child_mem_stat = - MemStat::create_child("TEST_CHILD".to_string(), vec![mem_stat.clone()]); + MemStat::create_child("TEST_CHILD".to_string(), Some(mem_stat.clone())); child_mem_stat.set_limit(MINIMUM_MEMORY_LIMIT * 2); assert!(child_mem_stat @@ -335,7 +327,7 @@ mod tests { let mem_stat = MemStat::create("TEST".to_string()); mem_stat.set_limit(MINIMUM_MEMORY_LIMIT * 2); let child_mem_stat = - MemStat::create_child("TEST_CHILD".to_string(), vec![mem_stat.clone()]); + MemStat::create_child("TEST_CHILD".to_string(), Some(mem_stat.clone())); child_mem_stat.set_limit(MINIMUM_MEMORY_LIMIT); assert!(child_mem_stat diff --git a/src/common/base/src/runtime/memory/stat_buffer_global.rs b/src/common/base/src/runtime/memory/stat_buffer_global.rs index 4eb20f411f296..cce85443b3054 100644 --- a/src/common/base/src/runtime/memory/stat_buffer_global.rs +++ b/src/common/base/src/runtime/memory/stat_buffer_global.rs @@ -90,7 +90,7 @@ impl GlobalStatBuffer { .used .fetch_add(memory_usage, Ordering::Relaxed); self.global_mem_stat - .peek_used + .peak_used .fetch_max(used + memory_usage, Ordering::Relaxed); return Ok(()); } @@ -126,7 +126,7 @@ impl GlobalStatBuffer { .used .fetch_add(memory_usage, Ordering::Relaxed); self.global_mem_stat - .peek_used + .peak_used .fetch_max(used + memory_usage, Ordering::Relaxed); return; } diff --git a/src/common/base/src/runtime/memory/stat_buffer_mem_stat.rs b/src/common/base/src/runtime/memory/stat_buffer_mem_stat.rs index 8c890598dd61b..71f035fb9d726 100644 --- a/src/common/base/src/runtime/memory/stat_buffer_mem_stat.rs +++ b/src/common/base/src/runtime/memory/stat_buffer_mem_stat.rs @@ -93,7 +93,7 @@ impl MemStatBuffer { if self.destroyed_thread_local_macro { let used = mem_stat.used.fetch_add(usage, Ordering::Relaxed); mem_stat - .peek_used + .peak_used .fetch_max(used + usage, Ordering::Relaxed); return Ok(()); } @@ -134,7 +134,7 @@ impl MemStatBuffer { if self.destroyed_thread_local_macro { let used = mem_stat.used.fetch_add(memory_usage, Ordering::Relaxed); mem_stat - .peek_used + .peak_used .fetch_max(used + memory_usage, Ordering::Relaxed); return; } diff --git a/src/query/expression/src/aggregate/aggregate_hashtable.rs b/src/query/expression/src/aggregate/aggregate_hashtable.rs index 3fae88f9f636b..0cf85106e336f 100644 --- a/src/query/expression/src/aggregate/aggregate_hashtable.rs +++ b/src/query/expression/src/aggregate/aggregate_hashtable.rs @@ -177,7 +177,7 @@ impl AggregateHashTable { row_count: usize, ) -> Result { state.row_count = row_count; - group_hash_columns(group_columns, &mut state.group_hashes); + group_hash_columns(group_columns, state.group_hashes.as_mut_slice()); let new_group_count = if self.direct_append { for idx in 0..row_count { @@ -337,7 +337,7 @@ impl AggregateHashTable { unsafe { row_match_columns( group_columns, - &state.addresses, + state.addresses.as_slice(), &mut state.group_compare_vector, &mut state.temp_vector, need_compare_count, diff --git a/src/query/expression/src/aggregate/mod.rs b/src/query/expression/src/aggregate/mod.rs index 6911a0efc3cf3..28a8a0a9eae0f 100644 --- a/src/query/expression/src/aggregate/mod.rs +++ b/src/query/expression/src/aggregate/mod.rs @@ -38,10 +38,10 @@ pub use payload::*; pub use payload_flush::*; pub use probe_state::*; -pub type SelectVector = [usize; BATCH_SIZE]; +pub type SelectVector = Vec; pub fn new_sel() -> SelectVector { - [0; BATCH_SIZE] + vec![0; BATCH_SIZE] } // A batch size to probe, 
flush, repartition, etc. diff --git a/src/query/expression/src/aggregate/partitioned_payload.rs b/src/query/expression/src/aggregate/partitioned_payload.rs index 5b27d6939f330..c5163811ffd90 100644 --- a/src/query/expression/src/aggregate/partitioned_payload.rs +++ b/src/query/expression/src/aggregate/partitioned_payload.rs @@ -16,6 +16,8 @@ use std::sync::Arc; use bumpalo::Bump; use itertools::Itertools; +use serde::Deserializer; +use serde::Serializer; use super::payload::Payload; use super::probe_state::ProbeState; @@ -50,6 +52,18 @@ pub struct PartitionedPayload { unsafe impl Send for PartitionedPayload {} unsafe impl Sync for PartitionedPayload {} +impl serde::Serialize for PartitionedPayload { + fn serialize(&self, _: S) -> Result { + unreachable!("PartitionedPayload must not be exchanged between multiple nodes.") + } +} + +impl<'de> serde::Deserialize<'de> for PartitionedPayload { + fn deserialize>(_: D) -> Result { + unreachable!("PartitionedPayload must not be exchanged between multiple nodes.") + } +} + impl PartitionedPayload { pub fn new( group_types: Vec, @@ -69,7 +83,7 @@ impl PartitionedPayload { let payloads = (0..partition_count) .map(|_| { Payload::new( - arenas[0].clone(), + arenas.clone(), group_types.clone(), aggrs.clone(), states_layout.clone(), @@ -116,9 +130,9 @@ impl PartitionedPayload { if self.payloads.len() == 1 { self.payloads[0].reserve_append_rows( &state.empty_vector, - &state.group_hashes, - &mut state.addresses, - &mut state.page_index, + state.group_hashes.as_slice(), + state.addresses.as_mut_slice(), + state.page_index.as_mut_slice(), new_group_rows, group_columns, ); @@ -143,9 +157,9 @@ impl PartitionedPayload { self.payloads[partition_index].reserve_append_rows( sel, - &state.group_hashes, - &mut state.addresses, - &mut state.page_index, + state.group_hashes.as_slice(), + state.addresses.as_mut_slice(), + state.page_index.as_mut_slice(), count, group_columns, ); diff --git a/src/query/expression/src/aggregate/payload.rs b/src/query/expression/src/aggregate/payload.rs index 788f187ed9699..28c6303aa3045 100644 --- a/src/query/expression/src/aggregate/payload.rs +++ b/src/query/expression/src/aggregate/payload.rs @@ -46,7 +46,7 @@ use crate::MAX_PAGE_SIZE; // [HASH] is the hash data of the groups // [STATE_ADDRS] is the state_addrs of the aggregate functions, 8 bytes each pub struct Payload { - pub arena: Arc, + pub arena: Vec>, // if true, the states are moved out of the payload into other payload, and will not be dropped pub state_move_out: bool, pub group_types: Vec, @@ -94,7 +94,7 @@ pub type Pages = Vec; impl Payload { pub fn new( - arena: Arc, + arena: Vec>, group_types: Vec, aggrs: Vec, states_layout: Option, @@ -267,7 +267,7 @@ impl Payload { unsafe { serialize_column_to_rowformat( - &self.arena, + &self.arena[0], col, select_vector, new_group_rows, @@ -297,7 +297,7 @@ impl Payload { // write states let (array_layout, padded_size) = layout.repeat(new_group_rows).unwrap(); // Bump only allocates but does not drop, so there is no use after free for any item. 
- let place = self.arena.alloc_layout(array_layout); + let place = self.arena[0].alloc_layout(array_layout); for (idx, place) in select_vector .iter() .take(new_group_rows) @@ -385,7 +385,11 @@ impl Payload { ); } - pub fn scatter(&self, state: &mut PayloadFlushState, partition_count: usize) -> bool { + pub fn scatter_with_seed( + &self, + state: &mut PayloadFlushState, + partitions: usize, + ) -> bool { if state.flush_page >= self.pages.len() { return false; } @@ -397,23 +401,27 @@ impl Payload { state.flush_page += 1; state.flush_page_row = 0; state.row_count = 0; - return self.scatter(state, partition_count); + return self.scatter_with_seed::(state, partitions); } let end = (state.flush_page_row + BATCH_SIZE).min(page.rows); let rows = end - state.flush_page_row; state.row_count = rows; - state.probe_state.reset_partitions(partition_count); + state.probe_state.reset_partitions(partitions); + + let mods: StrengthReducedU64 = StrengthReducedU64::new(partitions as u64); - let mods: StrengthReducedU64 = StrengthReducedU64::new(partition_count as u64); for idx in 0..rows { state.addresses[idx] = self.data_ptr(page, idx + state.flush_page_row); - let hash = unsafe { read::(state.addresses[idx].add(self.hash_offset) as _) }; + let mut hash = unsafe { read::(state.addresses[idx].add(self.hash_offset) as _) }; - let partition_idx = (hash % mods) as usize; + if SEED != 0 { + hash = Self::combine_hash(hash, SEED); + } + let partition_idx = (hash % mods) as usize; let sel = &mut state.probe_state.partition_entries[partition_idx]; sel[state.probe_state.partition_count[partition_idx]] = idx; state.probe_state.partition_count[partition_idx] += 1; @@ -422,6 +430,10 @@ impl Payload { true } + pub fn scatter(&self, state: &mut PayloadFlushState, partitions: usize) -> bool { + self.scatter_with_seed::<0>(state, partitions) + } + pub fn empty_block(&self, fake_rows: Option) -> DataBlock { let fake_rows = fake_rows.unwrap_or(0); let columns = (0..self.aggrs.len()) @@ -434,6 +446,18 @@ impl Payload { .collect_vec(); DataBlock::new_from_columns(columns) } + + #[allow(unused_parens)] + fn combine_hash(hash: u64, seed: u64) -> u64 { + static KMUL: u64 = 0x9ddfea08eb382d69; + + let mut a = (seed ^ hash).wrapping_mul(KMUL); + a ^= (a >> 47); + + let mut b = (hash ^ a).wrapping_mul(KMUL); + b ^= (b >> 47); + b.wrapping_mul(KMUL) + } } impl Drop for Payload { diff --git a/src/query/expression/src/aggregate/payload_flush.rs b/src/query/expression/src/aggregate/payload_flush.rs index 4fe9f35830227..2b3161c252950 100644 --- a/src/query/expression/src/aggregate/payload_flush.rs +++ b/src/query/expression/src/aggregate/payload_flush.rs @@ -164,22 +164,6 @@ impl Payload { Ok(Some(DataBlock::new_from_columns(cols))) } - pub fn group_by_flush_all(&self) -> Result { - let mut state = PayloadFlushState::default(); - let mut blocks = vec![]; - - while self.flush(&mut state) { - let cols = state.take_group_columns(); - blocks.push(DataBlock::new_from_columns(cols)); - } - - if blocks.is_empty() { - return Ok(self.empty_block(None)); - } - - DataBlock::concat(&blocks) - } - pub fn flush(&self, state: &mut PayloadFlushState) -> bool { if state.flush_page >= self.pages.len() { return false; diff --git a/src/query/expression/src/aggregate/payload_row.rs b/src/query/expression/src/aggregate/payload_row.rs index ce8b908e0b5ae..ee73c9142e30c 100644 --- a/src/query/expression/src/aggregate/payload_row.rs +++ b/src/query/expression/src/aggregate/payload_row.rs @@ -421,7 +421,7 @@ unsafe fn row_match_binary_column( } } - 
select_vector.clone_from_slice(temp_vector); + select_vector.clone_from_slice(temp_vector.as_slice()); *count = match_count; } @@ -502,7 +502,7 @@ unsafe fn row_match_string_column( } } - select_vector.clone_from_slice(temp_vector); + select_vector.clone_from_slice(temp_vector.as_slice()); *count = match_count; } @@ -567,7 +567,7 @@ unsafe fn row_match_column_type( } } - select_vector.clone_from_slice(temp_vector); + select_vector.clone_from_slice(temp_vector.as_slice()); *count = match_count; } @@ -604,6 +604,6 @@ unsafe fn row_match_generic_column( *no_match_count += 1; } } - select_vector.clone_from_slice(temp_vector); + select_vector.clone_from_slice(temp_vector.as_slice()); *count = match_count; } diff --git a/src/query/expression/src/aggregate/probe_state.rs b/src/query/expression/src/aggregate/probe_state.rs index 896c1ff46cca9..5b1cb702abb18 100644 --- a/src/query/expression/src/aggregate/probe_state.rs +++ b/src/query/expression/src/aggregate/probe_state.rs @@ -20,10 +20,10 @@ use crate::BATCH_SIZE; /// ProbeState is the state to probe HT /// It could be reuse during multiple probe process pub struct ProbeState { - pub group_hashes: [u64; BATCH_SIZE], - pub addresses: [*const u8; BATCH_SIZE], - pub page_index: [usize; BATCH_SIZE], - pub state_places: [StateAddr; BATCH_SIZE], + pub group_hashes: Vec, + pub addresses: Vec<*const u8>, + pub page_index: Vec, + pub state_places: Vec, pub group_compare_vector: SelectVector, pub no_match_vector: SelectVector, pub empty_vector: SelectVector, @@ -37,10 +37,10 @@ pub struct ProbeState { impl Default for ProbeState { fn default() -> Self { Self { - group_hashes: [0_u64; BATCH_SIZE], - addresses: [std::ptr::null::(); BATCH_SIZE], - page_index: [0; BATCH_SIZE], - state_places: [StateAddr::new(0); BATCH_SIZE], + group_hashes: vec![0_u64; BATCH_SIZE], + addresses: vec![std::ptr::null::(); BATCH_SIZE], + page_index: vec![0; BATCH_SIZE], + state_places: vec![StateAddr::new(0); BATCH_SIZE], group_compare_vector: new_sel(), no_match_vector: new_sel(), empty_vector: new_sel(), @@ -64,8 +64,8 @@ impl ProbeState { pub fn reset_partitions(&mut self, partition_count: usize) { if self.partition_entries.len() < partition_count { - self.partition_entries.resize(partition_count, new_sel()); self.partition_count.resize(partition_count, 0); + self.partition_entries.resize_with(partition_count, new_sel); } for i in 0..partition_count { diff --git a/src/query/expression/src/lib.rs b/src/query/expression/src/lib.rs index e402fe927d1a2..0de870bd811ff 100755 --- a/src/query/expression/src/lib.rs +++ b/src/query/expression/src/lib.rs @@ -43,6 +43,7 @@ #![feature(alloc_layout_extra)] #![feature(debug_closure_helpers)] #![feature(never_type)] +extern crate core; #[allow(dead_code)] mod block; diff --git a/src/query/pipeline/core/Cargo.toml b/src/query/pipeline/core/Cargo.toml index 64866ff49dc29..3464bd77c300d 100644 --- a/src/query/pipeline/core/Cargo.toml +++ b/src/query/pipeline/core/Cargo.toml @@ -17,6 +17,7 @@ futures = { workspace = true } log = { workspace = true } petgraph = { workspace = true } serde = { workspace = true } +typetag = { workspace = true } [dev-dependencies] serde = { workspace = true } diff --git a/src/query/pipeline/core/src/lib.rs b/src/query/pipeline/core/src/lib.rs index d064965129771..a8a59cadeb076 100644 --- a/src/query/pipeline/core/src/lib.rs +++ b/src/query/pipeline/core/src/lib.rs @@ -15,6 +15,8 @@ #![feature(once_cell_try)] #![feature(variant_count)] #![feature(associated_type_defaults)] +#![feature(adt_const_params)] 
+#![feature(let_chains)] #![allow(clippy::arc_with_non_send_sync)] #![allow(clippy::useless_asref)] diff --git a/src/query/pipeline/core/src/pipeline.rs b/src/query/pipeline/core/src/pipeline.rs index 8072b7b997b88..808849efa2bf7 100644 --- a/src/query/pipeline/core/src/pipeline.rs +++ b/src/query/pipeline/core/src/pipeline.rs @@ -20,6 +20,7 @@ use std::sync::atomic::Ordering; use std::sync::Arc; use std::time::Instant; +use databend_common_base::base::tokio::sync::Barrier; use databend_common_base::runtime::defer; use databend_common_base::runtime::drop_guard; use databend_common_exception::ErrorCode; @@ -32,10 +33,14 @@ use crate::finished_chain::ExecutionInfo; use crate::finished_chain::FinishedCallbackChain; use crate::pipe::Pipe; use crate::pipe::PipeItem; +use crate::processors::BatchExchangeProcessor; +use crate::processors::BatchMergePartitionProcessor; +use crate::processors::BatchPartitionProcessor; use crate::processors::DuplicateProcessor; use crate::processors::Exchange; use crate::processors::InputPort; use crate::processors::MergePartitionProcessor; +use crate::processors::OnePartitionProcessor; use crate::processors::OutputPort; use crate::processors::PartitionProcessor; use crate::processors::PlanScope; @@ -447,23 +452,43 @@ impl Pipeline { } } - pub fn exchange(&mut self, n: usize, exchange: Arc) { + pub fn exchange(&mut self, n: usize, exchange: Arc) -> Result<()> { + debug_assert_ne!(n, 0); + + if !T::MULTIWAY_SORT { + return self.batch_exchange(n, exchange); + } + if let Some(pipe) = self.pipes.last() { if pipe.output_length < 1 { - return; + return Ok(()); } let input_len = pipe.output_length; + let barrier = Arc::new(Barrier::new(input_len)); let mut items = Vec::with_capacity(input_len); - for _index in 0..input_len { + for index in 0..input_len { let input = InputPort::create(); - let outputs: Vec<_> = (0..n).map(|_| OutputPort::create()).collect(); - items.push(PipeItem::create( - PartitionProcessor::create(input.clone(), outputs.clone(), exchange.clone()), - vec![input], - outputs, - )); + let outputs = (0..n).map(|_| OutputPort::create()).collect::>(); + let partition_processor = match n { + 1 => OnePartitionProcessor::create( + input.clone(), + outputs[0].clone(), + exchange.clone(), + index, + barrier.clone(), + ), + _ => PartitionProcessor::create( + input.clone(), + outputs.clone(), + exchange.clone(), + index, + barrier.clone(), + ), + }; + + items.push(PipeItem::create(partition_processor, vec![input], outputs)); } // partition data block @@ -481,7 +506,7 @@ impl Pipeline { let output = OutputPort::create(); let inputs: Vec<_> = (0..input_len).map(|_| InputPort::create()).collect(); items.push(PipeItem::create( - MergePartitionProcessor::create( + MergePartitionProcessor::::create( inputs.clone(), output.clone(), exchange.clone(), @@ -492,8 +517,43 @@ impl Pipeline { } // merge partition - self.add_pipe(Pipe::create(input_len * n, n, items)) + self.add_pipe(Pipe::create(input_len * n, n, items)); } + + Ok(()) + } + + fn batch_exchange(&mut self, n: usize, exchange: Arc) -> Result<()> { + self.add_transform(|input, output| { + Ok(BatchPartitionProcessor::create( + input, + output, + n, + exchange.clone(), + )) + })?; + + let input_len = self.output_len(); + let inputs = (0..input_len) + .map(|_| InputPort::create()) + .collect::>(); + let outputs = (0..n).map(|_| OutputPort::create()).collect::>(); + + self.add_pipe(Pipe::create(input_len, n, vec![PipeItem::create( + BatchExchangeProcessor::create(inputs.clone(), outputs.clone(), exchange.clone()), + 
inputs, + outputs, + )])); + + self.add_transform(|input, output| { + Ok(BatchMergePartitionProcessor::create( + input, + output, + exchange.clone(), + )) + })?; + + Ok(()) } #[track_caller] diff --git a/src/query/pipeline/core/src/processors/mod.rs b/src/query/pipeline/core/src/processors/mod.rs index c3b0e1772a341..00023c709fd0d 100644 --- a/src/query/pipeline/core/src/processors/mod.rs +++ b/src/query/pipeline/core/src/processors/mod.rs @@ -37,7 +37,12 @@ pub use profile::PlanScope; pub use profile::PlanScopeGuard; pub use resize_processor::create_resize_item; pub use resize_processor::ResizeProcessor; +pub use shuffle_processor::BatchExchangeProcessor; +pub use shuffle_processor::BatchMergePartitionProcessor; +pub use shuffle_processor::BatchPartitionProcessor; pub use shuffle_processor::Exchange; pub use shuffle_processor::MergePartitionProcessor; +pub use shuffle_processor::MultiwayStrategy; +pub use shuffle_processor::OnePartitionProcessor; pub use shuffle_processor::PartitionProcessor; pub use shuffle_processor::ShuffleProcessor; diff --git a/src/query/pipeline/core/src/processors/processor.rs b/src/query/pipeline/core/src/processors/processor.rs index ce70053b80ded..d9e885a1b69ad 100644 --- a/src/query/pipeline/core/src/processors/processor.rs +++ b/src/query/pipeline/core/src/processors/processor.rs @@ -80,6 +80,22 @@ pub trait Processor: Send { Err(ErrorCode::Unimplemented("Unimplemented async_process.")) } + fn prepare_spill_payload(&mut self) -> Result { + Err(ErrorCode::Unimplemented( + "Unimplemented prepare_spill_payload", + )) + } + + async fn flush_spill_payload(&mut self) -> Result { + Err(ErrorCode::Unimplemented( + "Unimplemented flush_spill_payload", + )) + } + + fn configure_peer_nodes(&mut self, _nodes: &[String]) { + // do nothing by default + } + fn details_status(&self) -> Option { None } @@ -198,6 +214,10 @@ impl ProcessorPtr { .boxed() } + pub fn configure_peer_nodes(&self, nodes: &[String]) { + unsafe { (*self.inner.get()).configure_peer_nodes(nodes) } + } + /// # Safety pub unsafe fn details_status(&self) -> Option { (*self.inner.get()).details_status() diff --git a/src/query/pipeline/core/src/processors/shuffle_processor.rs b/src/query/pipeline/core/src/processors/shuffle_processor.rs index 2b57c3b3cc333..3ba0673135f34 100644 --- a/src/query/pipeline/core/src/processors/shuffle_processor.rs +++ b/src/query/pipeline/core/src/processors/shuffle_processor.rs @@ -13,9 +13,16 @@ // limitations under the License. 
use std::any::Any; +use std::cmp::Ordering; +use std::collections::VecDeque; use std::sync::Arc; +use databend_common_base::base::tokio::sync::Barrier; use databend_common_exception::Result; +use databend_common_expression::local_block_meta_serde; +use databend_common_expression::BlockMetaInfo; +use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::BlockMetaInfoPtr; use databend_common_expression::DataBlock; use crate::processors::Event; @@ -25,6 +32,7 @@ use crate::processors::OutputPort; use crate::processors::Processor; use crate::processors::ProcessorPtr; +#[derive(Eq, PartialEq)] pub enum MultiwayStrategy { Random, Custom, @@ -32,14 +40,43 @@ pub enum MultiwayStrategy { pub trait Exchange: Send + Sync + 'static { const NAME: &'static str; + const MULTIWAY_SORT: bool = false; const SKIP_EMPTY_DATA_BLOCK: bool = false; - const STRATEGY: MultiwayStrategy = MultiwayStrategy::Random; fn partition(&self, data_block: DataBlock, n: usize) -> Result>; - fn multiway_pick(&self, _partitions: &[Option]) -> Result { + fn init_way(&self, _index: usize, _first_data: &DataBlock) -> Result<()> { + Ok(()) + } + + fn sorting_function(_: &DataBlock, _: &DataBlock) -> Ordering { unimplemented!() } + + fn multiway_pick(&self, data_blocks: &mut [Option]) -> Option { + let position = + data_blocks + .iter() + .enumerate() + .filter_map(|(idx, x)| x.as_ref().map(|d| (idx, d))) + .min_by(|(left_idx, left_block), (right_idx, right_block)| { + match Self::sorting_function(left_block, right_block) { + Ordering::Less => Ordering::Less, + Ordering::Greater => Ordering::Greater, + Ordering::Equal => left_idx.cmp(right_idx), + } + }); + + position.map(|(idx, _)| idx) + } + + fn output_window_size(&self) -> usize { + 3 + } + + fn merge_output(&self, data_blocks: Vec) -> Result> { + Ok(data_blocks) + } } pub struct ShuffleProcessor { @@ -166,6 +203,11 @@ pub struct PartitionProcessor { exchange: Arc, input_data: Option, partitioned_data: Vec>, + + index: usize, + initialized: bool, + barrier: Arc, + hit: usize, } impl PartitionProcessor { @@ -173,18 +215,26 @@ impl PartitionProcessor { input: Arc, outputs: Vec>, exchange: Arc, + index: usize, + barrier: Arc, ) -> ProcessorPtr { let partitioned_data = vec![None; outputs.len()]; + let hit = index % outputs.len(); ProcessorPtr::create(Box::new(PartitionProcessor { input, outputs, exchange, partitioned_data, input_data: None, + initialized: !T::MULTIWAY_SORT, + index, + barrier, + hit, })) } } +#[async_trait::async_trait] impl Processor for PartitionProcessor { fn name(&self) -> String { format!("ShufflePartition({})", T::NAME) @@ -198,7 +248,15 @@ impl Processor for PartitionProcessor { let mut all_output_finished = true; let mut all_data_pushed_output = true; - for (index, output) in self.outputs.iter().enumerate() { + for _index in 0..self.outputs.len() { + let index = self.hit; + let output = &self.outputs[self.hit]; + self.hit += 1; + + if self.hit == self.outputs.len() { + self.hit = 0; + } + if output.is_finished() { self.partitioned_data[index].take(); continue; @@ -208,20 +266,25 @@ impl Processor for PartitionProcessor { if output.can_push() { if let Some(block) = self.partitioned_data[index].take() { - output.push_data(Ok(block)); - - continue; + if !block.is_empty() || block.get_meta().is_some() { + output.push_data(Ok(block)); + return Ok(Event::NeedConsume); + } } } - if self.partitioned_data[index].is_some() { + if !output.can_push() || self.partitioned_data[index].is_some() { all_data_pushed_output = false; } } if 
all_output_finished { self.input.finish(); - return Ok(Event::Finished); + + return match self.initialized { + true => Ok(Event::Finished), + false => Ok(Event::Async), + }; } if !all_data_pushed_output { @@ -229,9 +292,20 @@ impl Processor for PartitionProcessor { return Ok(Event::NeedConsume); } + if self.input_data.is_some() { + return match self.initialized { + true => Ok(Event::Sync), + false => Ok(Event::Async), + }; + } + if self.input.has_data() { self.input_data = Some(self.input.pull_data().unwrap()?); - return Ok(Event::Sync); + + return match self.initialized { + true => Ok(Event::Sync), + false => Ok(Event::Async), + }; } if self.input.is_finished() { @@ -239,7 +313,10 @@ impl Processor for PartitionProcessor { output.finish(); } - return Ok(Event::Finished); + return match self.initialized { + true => Ok(Event::Finished), + false => Ok(Event::Async), + }; } self.input.set_need_data(); @@ -254,25 +331,160 @@ impl Processor for PartitionProcessor { let partitioned = self.exchange.partition(block, self.outputs.len())?; - for (index, block) in partitioned.into_iter().enumerate() { - if block.is_empty() && block.get_meta().is_none() { - continue; - } + if partitioned.is_empty() { + return Ok(()); + } + assert_eq!(partitioned.len(), self.outputs.len()); + for (index, block) in partitioned.into_iter().enumerate() { self.partitioned_data[index] = Some(block); } } Ok(()) } + + async fn async_process(&mut self) -> Result<()> { + self.initialized = true; + if let Some(data_block) = self.input_data.as_ref() { + self.exchange.init_way(self.index, data_block)?; + } + + self.barrier.wait().await; + Ok(()) + } } -pub struct MergePartitionProcessor { +pub struct OnePartitionProcessor { + input: Arc, + output: Arc, + exchange: Arc, + input_data: Option, + + index: usize, + initialized: bool, + barrier: Arc, +} + +impl OnePartitionProcessor { + pub fn create( + input: Arc, + outputs: Arc, + exchange: Arc, + index: usize, + barrier: Arc, + ) -> ProcessorPtr { + ProcessorPtr::create(Box::new(OnePartitionProcessor { + input, + output: outputs, + exchange, + input_data: None, + initialized: !T::MULTIWAY_SORT, + index, + barrier, + })) + } +} + +#[async_trait::async_trait] +impl Processor for OnePartitionProcessor { + fn name(&self) -> String { + format!("ShufflePartition({})", T::NAME) + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if self.output.is_finished() { + self.input.finish(); + + return match self.initialized { + true => Ok(Event::Finished), + false => Ok(Event::Async), + }; + } + + if !self.output.can_push() { + self.input.set_not_need_data(); + return Ok(Event::NeedConsume); + } + + if self.input_data.is_some() { + if !self.initialized { + return Ok(Event::Async); + } + + let block = self.input_data.take().unwrap(); + let mut partitioned_data = self.exchange.partition(block, 1)?; + + if let Some(block) = partitioned_data.pop() { + debug_assert!(partitioned_data.is_empty()); + self.output.push_data(Ok(block)); + return Ok(Event::NeedConsume); + } + } + + if self.input.has_data() { + if !self.initialized { + self.input_data = Some(self.input.pull_data().unwrap()?); + return Ok(Event::Async); + } + + let data_block = self.input.pull_data().unwrap()?; + let mut partitioned_data = self.exchange.partition(data_block, 1)?; + + if let Some(block) = partitioned_data.pop() { + debug_assert!(partitioned_data.is_empty()); + self.output.push_data(Ok(block)); + return Ok(Event::NeedConsume); + } + } + if self.input.is_finished() { + 
self.output.finish(); + + return match self.initialized { + true => Ok(Event::Finished), + false => Ok(Event::Async), + }; + } + + self.input.set_need_data(); + Ok(Event::NeedData) + } + + async fn async_process(&mut self) -> Result<()> { + self.initialized = true; + if let Some(data_block) = self.input_data.as_ref() { + self.exchange.init_way(self.index, data_block)?; + } + + self.barrier.wait().await; + Ok(()) + } +} + +#[derive(Clone, PartialEq)] +enum PortStatus { + Idle, + HasData, + Finished, +} + +pub struct MergePartitionProcessor { output: Arc, inputs: Vec>, inputs_data: Vec>, + exchange: Arc, + + initialize: bool, + finished_inputs: usize, + waiting_inputs: VecDeque, + wakeup_inputs: VecDeque, + inputs_status: Vec, } impl MergePartitionProcessor { @@ -282,18 +494,30 @@ impl MergePartitionProcessor { exchange: Arc, ) -> ProcessorPtr { let inputs_data = vec![None; inputs.len()]; - ProcessorPtr::create(Box::new(MergePartitionProcessor { + let inputs_status = vec![PortStatus::Idle; inputs.len()]; + let waiting_inputs = VecDeque::with_capacity(inputs.len()); + let wakeup_inputs = VecDeque::with_capacity(inputs.len()); + + ProcessorPtr::create(Box::new(MergePartitionProcessor:: { output, inputs, - exchange, inputs_data, + exchange, + inputs_status, + waiting_inputs, + initialize: false, + finished_inputs: 0, + wakeup_inputs, })) } } impl Processor for MergePartitionProcessor { fn name(&self) -> String { - format!("ShuffleMergePartition({})", T::NAME) + match T::MULTIWAY_SORT { + true => format!("ShuffleSortMergePartition({})", T::NAME), + false => format!("ShuffleMergePartition({})", T::NAME), + } } fn as_any(&mut self) -> &mut dyn Any { @@ -314,8 +538,7 @@ impl Processor for MergePartitionProcessor { } let mut all_inputs_finished = true; - let mut need_pick_block_to_push = matches!(T::STRATEGY, MultiwayStrategy::Custom); - + let mut need_pick_block_to_push = true; for (index, input) in self.inputs.iter().enumerate() { if input.is_finished() { continue; @@ -323,19 +546,8 @@ impl Processor for MergePartitionProcessor { all_inputs_finished = false; - if input.has_data() { - match T::STRATEGY { - MultiwayStrategy::Random => { - if self.output.can_push() { - self.output.push_data(Ok(input.pull_data().unwrap()?)); - } - } - MultiwayStrategy::Custom => { - if self.inputs_data[index].is_none() { - self.inputs_data[index] = Some(input.pull_data().unwrap()?); - } - } - } + if input.has_data() && self.inputs_data[index].is_none() { + self.inputs_data[index] = Some(input.pull_data().unwrap()?); } if self.inputs_data[index].is_none() { @@ -345,20 +557,456 @@ impl Processor for MergePartitionProcessor { input.set_need_data(); } + if need_pick_block_to_push { + if let Some(pick_index) = self.exchange.multiway_pick(&mut self.inputs_data) { + if let Some(block) = self.inputs_data[pick_index].take() { + self.output.push_data(Ok(block)); + return Ok(Event::NeedConsume); + } + } + } + if all_inputs_finished { self.output.finish(); return Ok(Event::Finished); } - if need_pick_block_to_push { - let pick_index = self.exchange.multiway_pick(&self.inputs_data)?; + Ok(Event::NeedData) + } - if let Some(block) = self.inputs_data[pick_index].take() { - self.output.push_data(Ok(block)); + fn event_with_cause(&mut self, cause: EventCause) -> Result { + if T::MULTIWAY_SORT { + return self.event(); + } + + if let EventCause::Output(_) = cause { + if self.output.is_finished() { + for input in &self.inputs { + input.finish(); + } + + return Ok(Event::Finished); + } + + if !self.output.can_push() { + return 
Ok(Event::NeedConsume); + } + + while let Some(idx) = self.wakeup_inputs.pop_front() { + self.inputs[idx].set_need_data(); + } + } + + if !self.initialize && self.waiting_inputs.is_empty() { + self.initialize = true; + + for input in &self.inputs { + input.set_need_data(); + } + + return Ok(Event::NeedData); + } + + if let EventCause::Input(idx) = cause { + if self.inputs[idx].is_finished() && self.inputs_status[idx] != PortStatus::Finished { + self.finished_inputs += 1; + self.inputs_status[idx] = PortStatus::Finished; + } + + if self.inputs[idx].has_data() && self.inputs_status[idx] != PortStatus::HasData { + self.waiting_inputs.push_back(idx); + self.inputs_status[idx] = PortStatus::HasData; + } + } + + if self.finished_inputs == self.inputs.len() { + self.output.finish(); + return Ok(Event::Finished); + } + + while !self.waiting_inputs.is_empty() && self.output.can_push() { + let idx = self.waiting_inputs.pop_front().unwrap(); + self.output.push_data(self.inputs[idx].pull_data().unwrap()); + self.inputs_status[idx] = PortStatus::Idle; + + if self.inputs[idx].is_finished() { + if self.inputs_status[idx] != PortStatus::Finished { + self.finished_inputs += 1; + self.inputs_status[idx] = PortStatus::Finished; + } + + continue; + } + + self.wakeup_inputs.push_back(idx); + } + + match self.waiting_inputs.is_empty() { + true => Ok(Event::NeedData), + false => Ok(Event::NeedConsume), + } + } +} + +#[derive(Debug)] +pub struct ExchangeMeta { + data_blocks: Vec, +} + +local_block_meta_serde!(ExchangeMeta); + +#[typetag::serde(name = "LocalExchangeMeta")] +impl BlockMetaInfo for ExchangeMeta {} + +impl ExchangeMeta { + pub fn create(blocks: Vec) -> BlockMetaInfoPtr { + Box::new(ExchangeMeta { + data_blocks: blocks, + }) + } +} + +pub struct BatchPartitionProcessor { + input: Arc, + output: Arc, + + input_data: Option, + output_data: Option, + + exchange: Arc, + to_partition: usize, +} + +impl BatchPartitionProcessor { + pub fn create( + input: Arc, + output: Arc, + to_partition: usize, + exchange: Arc, + ) -> ProcessorPtr { + ProcessorPtr::create(Box::new(BatchPartitionProcessor { + input, + output, + exchange, + to_partition, + input_data: None, + output_data: None, + })) + } +} + +impl Processor for BatchPartitionProcessor { + fn name(&self) -> String { + String::from("PartitionProcessor") + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if self.output.is_finished() { + self.input.finish(); + return Ok(Event::Finished); + } + + if !self.output.can_push() { + self.input.set_not_need_data(); + return Ok(Event::NeedConsume); + } + + if let Some(data_block) = self.output_data.take() { + self.output.push_data(Ok(data_block)); + return Ok(Event::NeedConsume); + } + + if self.input.has_data() { + self.input_data = Some(self.input.pull_data().unwrap()?); + return Ok(Event::Sync); + } + + if self.input.is_finished() { + self.output.finish(); + return Ok(Event::Finished); + } + + self.input.set_need_data(); + Ok(Event::NeedData) + } + + fn process(&mut self) -> Result<()> { + if let Some(block) = self.input_data.take() { + if T::SKIP_EMPTY_DATA_BLOCK && block.is_empty() { + return Ok(()); + } + + let partitioned_data = self.exchange.partition(block, self.to_partition)?; + self.output_data = Some(DataBlock::empty_with_meta(ExchangeMeta::create( + partitioned_data, + ))); + } + + Ok(()) + } +} + +pub struct BatchExchangeProcessor { + input: Vec>, + output: Vec>, + + initialize: bool, + + finished_input_size: usize, + input_finish_status: Vec, + waiting_inputs: 
VecDeque, + + finished_output_size: usize, + pending_outputs: Vec, + output_finish_status: Vec, + + exchange: Arc, + matrix: Vec>, +} + +impl BatchExchangeProcessor { + pub fn create( + input: Vec>, + output: Vec>, + exchange: Arc, + ) -> ProcessorPtr { + let pending_outputs = vec![false; output.len()]; + let input_finish_status = vec![false; input.len()]; + let output_finish_status = vec![false; output.len()]; + + let mut matrix = Vec::with_capacity(output.len()); + + for _ in 0..output.capacity() { + matrix.push(VecDeque::new()); + } + + ProcessorPtr::create(Box::new(BatchExchangeProcessor { + input, + output, + matrix, + exchange, + pending_outputs, + input_finish_status, + output_finish_status, + + initialize: false, + finished_input_size: 0, + finished_output_size: 0, + waiting_inputs: VecDeque::new(), + })) + } +} + +impl Processor for BatchExchangeProcessor { + fn name(&self) -> String { + String::from("BatchExchangeProcessor") + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event_with_cause(&mut self, cause: EventCause) -> Result { + if let EventCause::Input(index) = cause { + if self.input[index].has_data() { + let mut data_block = self.input[index].pull_data().unwrap()?; + + let meta = data_block.take_meta().unwrap(); + let meta = ExchangeMeta::downcast_from(meta).unwrap(); + + for (idx, block) in meta.data_blocks.into_iter().enumerate() { + self.matrix[idx].push_back(block); + } + } + + if self.input[index].is_finished() { + if !self.input_finish_status[index] { + self.finished_input_size += 1; + self.input_finish_status[index] = true; + } + } else { + self.waiting_inputs.push_back(index); + } + } + + if let EventCause::Output(index) = cause { + if self.output[index].is_finished() && !self.output_finish_status[index] { + self.finished_output_size += 1; + self.output_finish_status[index] = true; + } + + if self.output[index].can_push() { + self.pending_outputs[index] = true; + } + } + + if !self.initialize { + self.initialize = true; + + for input in &self.input { + input.set_need_data(); + } + + return Ok(Event::NeedData); + } + + if self.finished_output_size == self.output.len() { + for input in &self.input { + input.finish(); + } + + return Ok(Event::Finished); + } + + let all_input_finished = self.finished_input_size == self.input.len(); + + let mut sent_all_data = true; + for (idx, data) in self.matrix.iter_mut().enumerate() { + if data.is_empty() || self.output_finish_status[idx] { + continue; + } + + sent_all_data = false; + if self.pending_outputs[idx] + && (all_input_finished || (data.len() >= self.exchange.output_window_size())) + { + self.pending_outputs[idx] = false; + let mut output_data = Vec::with_capacity(self.exchange.output_window_size()); + + for _index in 0..self.exchange.output_window_size() { + if let Some(data) = data.pop_front() { + output_data.push(data); + } + } + + self.output[idx].push_data(Ok(DataBlock::empty_with_meta(ExchangeMeta::create( + output_data, + )))); return Ok(Event::NeedConsume); } } + while let Some(index) = self.waiting_inputs.pop_front() { + if !self.input[index].is_finished() { + self.input[index].set_need_data(); + return Ok(Event::NeedData); + } else if !self.input_finish_status[index] { + self.input_finish_status[index] = true; + self.finished_input_size += 1; + } + } + + let all_input_finished = self.finished_input_size == self.input.len(); + if sent_all_data && all_input_finished { + for output in &self.output { + output.finish(); + } + + return Ok(Event::Finished); + } + + Ok(Event::NeedConsume) + } +} + +pub 
+pub struct BatchMergePartitionProcessor<T: Exchange> {
+    input: Arc<InputPort>,
+    output: Arc<OutputPort>,
+
+    input_data: Option<DataBlock>,
+    output_data: VecDeque<DataBlock>,
+
+    exchange: Arc<T>,
+}
+
+impl<T: Exchange> BatchMergePartitionProcessor<T> {
+    pub fn create(
+        input: Arc<InputPort>,
+        output: Arc<OutputPort>,
+        exchange: Arc<T>,
+    ) -> ProcessorPtr {
+        ProcessorPtr::create(Box::new(BatchMergePartitionProcessor {
+            input,
+            output,
+            input_data: None,
+            output_data: VecDeque::new(),
+            exchange,
+        }))
+    }
+}
+
+impl<T: Exchange> Processor for BatchMergePartitionProcessor<T> {
+    fn name(&self) -> String {
+        String::from("MergePartitionProcessor")
+    }
+
+    fn as_any(&mut self) -> &mut dyn Any {
+        self
+    }
+
+    fn event(&mut self) -> Result<Event> {
+        if self.output.is_finished() {
+            self.input.finish();
+            return Ok(Event::Finished);
+        }
+
+        if !self.output.can_push() {
+            self.input.set_not_need_data();
+            return Ok(Event::NeedConsume);
+        }
+
+        if let Some(data_block) = self.output_data.pop_front() {
+            self.output.push_data(Ok(data_block));
+            return Ok(Event::NeedConsume);
+        }
+
+        if self.input.has_data() {
+            self.input_data = Some(self.input.pull_data().unwrap()?);
+            return Ok(Event::Sync);
+        }
+
+        if self.input.is_finished() {
+            self.output.finish();
+            return Ok(Event::Finished);
+        }
+
+        self.input.set_need_data();
         Ok(Event::NeedData)
     }
+
+    fn process(&mut self) -> Result<()> {
+        if let Some(mut block) = self.input_data.take() {
+            let meta = block.take_meta().unwrap();
+            let meta = ExchangeMeta::downcast_from(meta).unwrap();
+            self.output_data
+                .extend(self.exchange.merge_output(meta.data_blocks)?);
+        }
+
+        Ok(())
+    }
 }
+
+// pub struct BatchSortingExchangeProcessor<T: Exchange> {
+//     exchange: Arc<T>,
+//
+//     inputs: Vec<Arc<InputPort>>,
+//     outputs: Vec<Arc<OutputPort>>,
+// }
+//
+// impl Processor for BatchSortingExchangeProcessor {
+//     fn name(&self) -> String {
+//         String::from("BatchSortingShuffleProcessor")
+//     }
+//
+//     fn as_any(&mut self) -> &mut dyn Any {
+//         self
+//     }
+//
+//     fn event_with_cause(&mut self, _cause: EventCause) -> Result<Event> {
+//         todo!()
+//     }
+// }
diff --git a/src/query/pipeline/transforms/src/processors/transforms/transform_accumulating.rs b/src/query/pipeline/transforms/src/processors/transforms/transform_accumulating.rs
index 1ffc9dcbcf2ea..dc46a6c6d759d 100644
--- a/src/query/pipeline/transforms/src/processors/transforms/transform_accumulating.rs
+++ b/src/query/pipeline/transforms/src/processors/transforms/transform_accumulating.rs
@@ -18,6 +18,7 @@ use std::marker::PhantomData;
 use std::sync::Arc;
 
 use databend_common_base::runtime::drop_guard;
+use databend_common_exception::ErrorCode;
 use databend_common_exception::Result;
 use databend_common_expression::BlockMetaInfo;
 use databend_common_expression::BlockMetaInfoDowncast;
@@ -27,9 +28,12 @@ use databend_common_pipeline_core::processors::InputPort;
 use databend_common_pipeline_core::processors::OutputPort;
 use databend_common_pipeline_core::processors::Processor;
 
+#[async_trait::async_trait]
 pub trait AccumulatingTransform: Send {
     const NAME: &'static str;
 
+    const SUPPORT_SPILL: bool = false;
+
     fn transform(&mut self, data: DataBlock) -> Result<Vec<DataBlock>>;
 
     fn on_finish(&mut self, _output: bool) -> Result<Vec<DataBlock>> {
@@ -37,6 +41,24 @@
     }
 
     fn interrupt(&self) {}
+
+    fn configure_peer_nodes(&mut self, _nodes: &[String]) {}
+
+    fn need_spill(&self) -> bool {
+        false
+    }
+
+    fn prepare_spill_payload(&mut self) -> Result<bool> {
+        Err(ErrorCode::Unimplemented(
+            "Unimplemented prepare_spill_payload",
+        ))
+    }
+
+    async fn flush_spill_payload(&mut self) -> Result<bool> {
+        Err(ErrorCode::Unimplemented(
+            "Unimplemented flush_spill_payload",
+        ))
+    }
 }
 
 pub struct
AccumulatingTransformer { @@ -47,6 +69,10 @@ pub struct AccumulatingTransformer { called_on_finish: bool, input_data: Option, output_data: VecDeque, + + has_spill: bool, + flush_spill_payload: bool, + prepare_spill_payload: bool, } impl AccumulatingTransformer { @@ -58,6 +84,9 @@ impl AccumulatingTransformer { input_data: None, output_data: VecDeque::with_capacity(1), called_on_finish: false, + has_spill: false, + flush_spill_payload: false, + prepare_spill_payload: false, }) } } @@ -93,6 +122,14 @@ impl Processor for AccumulatingTransformer Processor for AccumulatingTransformer Ok(Event::Sync), + true => { + // To avoid downstream out-of-memory, once a spill occurs, all data must be spilled entirely. + if self.has_spill { + self.has_spill = false; + self.prepare_spill_payload = true; + } + + Ok(Event::Sync) + } false => { self.output.finish(); Ok(Event::Finished) @@ -126,9 +171,21 @@ impl Processor for AccumulatingTransformer Result<()> { + if self.prepare_spill_payload { + self.prepare_spill_payload = false; + self.flush_spill_payload = self.prepare_spill_payload()?; + return Ok(()); + } + if let Some(data_block) = self.input_data.take() { self.output_data.extend(self.inner.transform(data_block)?); + self.prepare_spill_payload = self.inner.need_spill(); + self.has_spill |= self.prepare_spill_payload; return Ok(()); } @@ -140,8 +197,25 @@ impl Processor for AccumulatingTransformer Result<()> { + if self.flush_spill_payload { + self.flush_spill_payload = false; + self.prepare_spill_payload = self.flush_spill_payload().await?; + } + + Ok(()) + } + + fn prepare_spill_payload(&mut self) -> Result { + self.inner.prepare_spill_payload() + } + + async fn flush_spill_payload(&mut self) -> Result { + self.inner.flush_spill_payload().await + } + + fn configure_peer_nodes(&mut self, nodes: &[String]) { + self.inner.configure_peer_nodes(nodes) } } diff --git a/src/query/service/src/interpreters/common/query_log.rs b/src/query/service/src/interpreters/common/query_log.rs index 2896858421360..2e9dd024b642b 100644 --- a/src/query/service/src/interpreters/common/query_log.rs +++ b/src/query/service/src/interpreters/common/query_log.rs @@ -225,7 +225,7 @@ impl InterpreterQueryLog { has_profiles: false, txn_state, txn_id, - peek_memory_usage: HashMap::new(), + peak_memory_usage: HashMap::new(), }) } @@ -337,7 +337,7 @@ impl InterpreterQueryLog { let txn_id = guard.txn_id().to_string(); drop(guard); - let peek_memory_usage = ctx.get_node_peek_memory_usage(); + let peak_memory_usage = ctx.get_node_peek_memory_usage(); Self::write_log(QueryLogElement { log_type, @@ -402,7 +402,7 @@ impl InterpreterQueryLog { has_profiles, txn_state, txn_id, - peek_memory_usage, + peak_memory_usage, }) } } diff --git a/src/query/service/src/pipelines/builders/builder_aggregate.rs b/src/query/service/src/pipelines/builders/builder_aggregate.rs index fde91de7ca754..e2a81b426cfed 100644 --- a/src/query/service/src/pipelines/builders/builder_aggregate.rs +++ b/src/query/service/src/pipelines/builders/builder_aggregate.rs @@ -37,13 +37,11 @@ use databend_common_sql::IndexType; use databend_common_storage::DataOperator; use itertools::Itertools; -use crate::pipelines::processors::transforms::aggregator::build_partition_bucket; +use crate::pipelines::processors::transforms::aggregator::build_final_aggregate; use crate::pipelines::processors::transforms::aggregator::create_udaf_script_function; -use crate::pipelines::processors::transforms::aggregator::AggregateInjector; use 
crate::pipelines::processors::transforms::aggregator::AggregatorParams; use crate::pipelines::processors::transforms::aggregator::FinalSingleStateAggregator; use crate::pipelines::processors::transforms::aggregator::PartialSingleStateAggregator; -use crate::pipelines::processors::transforms::aggregator::TransformAggregateSpillWriter; use crate::pipelines::processors::transforms::aggregator::TransformExpandGroupingSets; use crate::pipelines::processors::transforms::aggregator::TransformPartialAggregate; use crate::pipelines::PipelineBuilder; @@ -153,36 +151,21 @@ impl PipelineBuilder { }); } + let location_prefix = self.ctx.query_id_spill_prefix(); + let operator = DataOperator::instance().spill_operator(); self.main_pipeline.add_transform(|input, output| { Ok(ProcessorPtr::create(TransformPartialAggregate::try_create( self.ctx.clone(), input, output, + operator.clone(), params.clone(), partial_agg_config.clone(), + location_prefix.clone(), )?)) })?; - // If cluster mode, spill write will be completed in exchange serialize, because we need scatter the block data first - if !self.is_exchange_neighbor { - let operator = DataOperator::instance().spill_operator(); - let location_prefix = self.ctx.query_id_spill_prefix(); - - self.main_pipeline.add_transform(|input, output| { - Ok(ProcessorPtr::create( - TransformAggregateSpillWriter::try_create( - self.ctx.clone(), - input, - output, - operator.clone(), - params.clone(), - location_prefix.clone(), - )?, - )) - })?; - } - - self.exchange_injector = AggregateInjector::create(self.ctx.clone(), params.clone()); + self.enable_multiway_sort = true; Ok(()) } @@ -215,15 +198,13 @@ impl PipelineBuilder { return Ok(()); } - let old_inject = self.exchange_injector.clone(); - let input: &PhysicalPlan = &aggregate.input; - if matches!(input, PhysicalPlan::ExchangeSource(_)) { - self.exchange_injector = AggregateInjector::create(self.ctx.clone(), params.clone()); - } + let old_value = self.enable_multiway_sort; + self.enable_multiway_sort |= matches!(input, PhysicalPlan::ExchangeSource(_)); + self.build_pipeline(&aggregate.input)?; - self.exchange_injector = old_inject; - build_partition_bucket(&mut self.main_pipeline, params.clone()) + self.enable_multiway_sort = old_value; + build_final_aggregate(self.ctx.clone(), &mut self.main_pipeline, params.clone()) } fn build_aggregator_params( diff --git a/src/query/service/src/pipelines/builders/builder_exchange.rs b/src/query/service/src/pipelines/builders/builder_exchange.rs index 6c27b81ae366e..26af09a82e82f 100644 --- a/src/query/service/src/pipelines/builders/builder_exchange.rs +++ b/src/query/service/src/pipelines/builders/builder_exchange.rs @@ -24,7 +24,6 @@ impl PipelineBuilder { let mut build_res = exchange_manager.get_fragment_source( &exchange_source.query_id, exchange_source.source_fragment_id, - self.exchange_injector.clone(), )?; // add profile diff --git a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs index dd9ab7edd4a38..cc3a2b2ab90ad 100644 --- a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs +++ b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs @@ -49,7 +49,7 @@ impl PipelineBuilder { self.main_pipeline.exchange( num_processors, HilbertPartitionExchange::create(partition.num_partitions), - ); + )?; let settings = self.ctx.get_settings(); let disk_bytes_limit = settings.get_window_partition_spilling_to_disk_bytes_limit()?; diff --git 
a/src/query/service/src/pipelines/builders/builder_window.rs b/src/query/service/src/pipelines/builders/builder_window.rs index ae0dffc1dc73b..6e531b5200c4b 100644 --- a/src/query/service/src/pipelines/builders/builder_window.rs +++ b/src/query/service/src/pipelines/builders/builder_window.rs @@ -184,12 +184,12 @@ impl PipelineBuilder { top_n.func, num_partitions as u64, ), - ) + )? } else { self.main_pipeline.exchange( num_processors, WindowPartitionExchange::create(partition_by.clone(), num_partitions), - ); + )?; } let disk_bytes_limit = settings.get_window_partition_spilling_to_disk_bytes_limit()?; diff --git a/src/query/service/src/pipelines/pipeline_build_res.rs b/src/query/service/src/pipelines/pipeline_build_res.rs index fd40f817e3cda..615caf8216153 100644 --- a/src/query/service/src/pipelines/pipeline_build_res.rs +++ b/src/query/service/src/pipelines/pipeline_build_res.rs @@ -24,8 +24,6 @@ use databend_common_pipeline_sources::OneBlockSource; use crate::interpreters::CreateTableInterpreter; use crate::pipelines::processors::transforms::HashJoinBuildState; -use crate::servers::flight::v1::exchange::DefaultExchangeInjector; -use crate::servers::flight::v1::exchange::ExchangeInjector; #[derive(Clone)] pub struct PipelineBuilderData { @@ -38,7 +36,7 @@ pub struct PipelineBuildResult { // Containing some sub queries pipelines, must be complete pipeline pub sources_pipelines: Vec, - pub exchange_injector: Arc, + pub enable_multiway_sort: bool, /// for local fragment data sharing pub builder_data: PipelineBuilderData, pub r_cte_scan_interpreters: Vec, @@ -49,7 +47,7 @@ impl PipelineBuildResult { PipelineBuildResult { main_pipeline: Pipeline::create(), sources_pipelines: vec![], - exchange_injector: DefaultExchangeInjector::create(), + enable_multiway_sort: false, builder_data: PipelineBuilderData { input_join_state: None, input_probe_schema: None, @@ -72,7 +70,7 @@ impl PipelineBuildResult { Ok(PipelineBuildResult { main_pipeline, sources_pipelines: vec![], - exchange_injector: DefaultExchangeInjector::create(), + enable_multiway_sort: false, builder_data: PipelineBuilderData { input_join_state: None, input_probe_schema: None, diff --git a/src/query/service/src/pipelines/pipeline_builder.rs b/src/query/service/src/pipelines/pipeline_builder.rs index 1763a9cc6dfc6..2563bf29cbf2a 100644 --- a/src/query/service/src/pipelines/pipeline_builder.rs +++ b/src/query/service/src/pipelines/pipeline_builder.rs @@ -33,8 +33,6 @@ use crate::interpreters::CreateTableInterpreter; use crate::pipelines::processors::transforms::HashJoinBuildState; use crate::pipelines::processors::HashJoinState; use crate::pipelines::PipelineBuildResult; -use crate::servers::flight::v1::exchange::DefaultExchangeInjector; -use crate::servers::flight::v1::exchange::ExchangeInjector; use crate::sessions::QueryContext; pub struct PipelineBuilder { @@ -49,7 +47,7 @@ pub struct PipelineBuilder { pub merge_into_probe_data_fields: Option>, pub join_state: Option>, - pub(crate) exchange_injector: Arc, + pub(crate) enable_multiway_sort: bool, pub hash_join_states: HashMap>, @@ -72,13 +70,13 @@ impl PipelineBuilder { settings, pipelines: vec![], main_pipeline: Pipeline::with_scopes(scopes), - exchange_injector: DefaultExchangeInjector::create(), merge_into_probe_data_fields: None, join_state: None, hash_join_states: HashMap::new(), r_cte_scan_interpreters: vec![], is_exchange_neighbor: false, contain_sink_processor: false, + enable_multiway_sort: false, } } @@ -105,12 +103,12 @@ impl PipelineBuilder { Ok(PipelineBuildResult { 
main_pipeline: self.main_pipeline, sources_pipelines: self.pipelines, - exchange_injector: self.exchange_injector, builder_data: PipelineBuilderData { input_join_state: self.join_state, input_probe_schema: self.merge_into_probe_data_fields, }, r_cte_scan_interpreters: self.r_cte_scan_interpreters, + enable_multiway_sort: self.enable_multiway_sort, }) } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs index 55688a4347259..5c91c2621275a 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs @@ -12,71 +12,27 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::cmp::Ordering; +use std::collections::HashMap; +use std::sync::atomic::AtomicUsize; +use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::Arc; -use bumpalo::Bump; -use databend_common_exception::ErrorCode; +use arrow_ipc::writer::IpcWriteOptions; +use arrow_ipc::CompressionType; +use databend_common_config::GlobalConfig; use databend_common_exception::Result; use databend_common_expression::BlockMetaInfoDowncast; use databend_common_expression::DataBlock; -use databend_common_expression::PartitionedPayload; use databend_common_expression::Payload; use databend_common_expression::PayloadFlushState; -use databend_common_pipeline_core::processors::ProcessorPtr; -use databend_common_pipeline_core::Pipeline; +use databend_common_pipeline_core::processors::Exchange; use databend_common_settings::FlightCompression; -use databend_common_storage::DataOperator; use crate::pipelines::processors::transforms::aggregator::aggregate_meta::AggregateMeta; -use crate::pipelines::processors::transforms::aggregator::serde::TransformExchangeAggregateSerializer; -use crate::pipelines::processors::transforms::aggregator::serde::TransformExchangeAsyncBarrier; -use crate::pipelines::processors::transforms::aggregator::AggregatorParams; -use crate::pipelines::processors::transforms::aggregator::TransformAggregateDeserializer; -use crate::pipelines::processors::transforms::aggregator::TransformAggregateSerializer; -use crate::pipelines::processors::transforms::aggregator::TransformAggregateSpillWriter; -use crate::servers::flight::v1::exchange::DataExchange; -use crate::servers::flight::v1::exchange::ExchangeInjector; -use crate::servers::flight::v1::exchange::ExchangeSorting; -use crate::servers::flight::v1::exchange::MergeExchangeParams; -use crate::servers::flight::v1::exchange::ShuffleExchangeParams; +use crate::servers::flight::v1::exchange::serde::serialize_block; +use crate::servers::flight::v1::exchange::serde::ExchangeSerializeMeta; use crate::servers::flight::v1::scatter::FlightScatter; -use crate::sessions::QueryContext; - -struct AggregateExchangeSorting {} - -pub fn compute_block_number(bucket: isize, max_partition_count: usize) -> Result { - Ok(max_partition_count as isize * 1000 + bucket) -} - -impl ExchangeSorting for AggregateExchangeSorting { - fn block_number(&self, data_block: &DataBlock) -> Result { - match data_block.get_meta() { - None => Ok(-1), - Some(block_meta_info) => match AggregateMeta::downcast_ref_from(block_meta_info) { - None => Err(ErrorCode::Internal(format!( - "Internal error, AggregateExchangeSorting only recv AggregateMeta {:?}", - 
serde_json::to_string(block_meta_info) - ))), - Some(meta_info) => match meta_info { - AggregateMeta::Partitioned { .. } => unreachable!(), - AggregateMeta::Serialized(v) => { - compute_block_number(v.bucket, v.max_partition_count) - } - AggregateMeta::AggregatePayload(v) => { - compute_block_number(v.bucket, v.max_partition_count) - } - AggregateMeta::AggregateSpilling(_) - | AggregateMeta::Spilled(_) - | AggregateMeta::BucketSpilled(_) => Ok(-1), - }, - }, - } - } -} - -struct HashTableHashScatter { - buckets: usize, -} fn scatter_payload(mut payload: Payload, buckets: usize) -> Result> { let mut buckets = Vec::with_capacity(buckets); @@ -112,222 +68,228 @@ fn scatter_payload(mut payload: Payload, buckets: usize) -> Result> Ok(buckets) } -fn scatter_partitioned_payload( - partitioned_payload: PartitionedPayload, - buckets: usize, -) -> Result> { - let mut buckets = Vec::with_capacity(buckets); +pub struct FlightExchange { + local_id: String, + node_list: Vec, + node_list_lookup: HashMap, - let group_types = partitioned_payload.group_types.clone(); - let aggrs = partitioned_payload.aggrs.clone(); - let partition_count = partitioned_payload.partition_count() as u64; - let mut state = PayloadFlushState::default(); - - for _ in 0..buckets.capacity() { - buckets.push(PartitionedPayload::new( - group_types.clone(), - aggrs.clone(), - partition_count, - partitioned_payload.arenas.clone(), - )); - } + options: IpcWriteOptions, + global_max_partition: Arc, + shuffle_scatter: Arc>, +} - let mut payloads = Vec::with_capacity(buckets.len()); +impl FlightExchange { + pub fn create( + node_list: Vec, + compression: Option, + shuffle_scatter: Arc>, + ) -> Arc { + let compression = match compression { + None => None, + Some(compression) => match compression { + FlightCompression::Lz4 => Some(CompressionType::LZ4_FRAME), + FlightCompression::Zstd => Some(CompressionType::ZSTD), + }, + }; - for _ in 0..payloads.capacity() { - payloads.push(Payload::new( - Arc::new(Bump::new()), - group_types.clone(), - aggrs.clone(), - partitioned_payload.states_layout.clone(), - )); + let node_list_lookup = node_list + .iter() + .cloned() + .enumerate() + .map(|(x, y)| (y, x)) + .collect::>(); + + Arc::new(FlightExchange { + local_id: GlobalConfig::instance().query.node_id.clone(), + node_list, + node_list_lookup, + options: IpcWriteOptions::default() + .try_with_compression(compression) + .unwrap(), + shuffle_scatter, + global_max_partition: Arc::new(AtomicUsize::new(0)), + }) } +} + +impl FlightExchange { + fn default_partition(&self, data_block: DataBlock) -> Result> { + if self.node_list.is_empty() { + let data_block = serialize_block(0, 0, 0, data_block, &self.options)?; + return Ok(vec![data_block]); + } - for mut payload in partitioned_payload.payloads.into_iter() { - // scatter each page of the payload. - while payload.scatter(&mut state, buckets.len()) { - // copy to the corresponding bucket. 
- for (idx, bucket) in payloads.iter_mut().enumerate() { - let count = state.probe_state.partition_count[idx]; + let data_blocks = self.shuffle_scatter.execute(data_block)?; - if count > 0 { - let sel = &state.probe_state.partition_entries[idx]; - bucket.copy_rows(sel, count, &state.addresses); - } + let mut blocks = Vec::with_capacity(data_blocks.len()); + for (idx, data_block) in data_blocks.into_iter().enumerate() { + if self.node_list[idx] == self.local_id { + blocks.push(data_block); + continue; } + + blocks.push(serialize_block(0, 0, 0, data_block, &self.options)?); } - state.clear(); - payload.state_move_out = true; - } - for (idx, payload) in payloads.into_iter().enumerate() { - buckets[idx].combine_single(payload, &mut state, None); + Ok(blocks) } - - Ok(buckets) } -impl FlightScatter for HashTableHashScatter { - fn execute(&self, mut data_block: DataBlock) -> Result> { - if let Some(block_meta) = data_block.take_meta() { - if let Some(block_meta) = AggregateMeta::downcast_from(block_meta) { - let mut blocks = Vec::with_capacity(self.buckets); - match block_meta { - AggregateMeta::Spilled(_) => unreachable!(), - AggregateMeta::BucketSpilled(_) => unreachable!(), - AggregateMeta::Serialized(_) => unreachable!(), - AggregateMeta::Partitioned { .. } => unreachable!(), - AggregateMeta::AggregateSpilling(payload) => { - for p in scatter_partitioned_payload(payload, self.buckets)? { - blocks.push(DataBlock::empty_with_meta( - AggregateMeta::create_agg_spilling(p), - )); - } - } - AggregateMeta::AggregatePayload(p) => { - for payload in scatter_payload(p.payload, self.buckets)? { - blocks.push(DataBlock::empty_with_meta( - AggregateMeta::create_agg_payload( - p.bucket, - payload, - p.max_partition_count, - ), - )); +impl Exchange for FlightExchange { + const NAME: &'static str = "AggregateExchange"; + const MULTIWAY_SORT: bool = MULTIWAY_SORT; + + fn partition(&self, mut data_block: DataBlock, n: usize) -> Result> { + let Some(meta) = data_block.take_meta() else { + // only exchange data + if data_block.is_empty() { + return Ok(vec![]); + } + + return self.default_partition(data_block); + }; + + let Some(_) = AggregateMeta::downcast_ref_from(&meta) else { + return self.default_partition(data_block.add_meta(Some(meta))?); + }; + + assert!(MULTIWAY_SORT); + assert_eq!(self.node_list_lookup.len(), n); + match AggregateMeta::downcast_from(meta).unwrap() { + AggregateMeta::FinalPartition(_) => unreachable!(), + AggregateMeta::InFlightPayload(_) => unreachable!(), + AggregateMeta::SpilledPayload(v) => { + let mut blocks = Vec::with_capacity(n); + let global_max_partition = self.global_max_partition.load(AtomicOrdering::SeqCst); + for node_id in &self.node_list { + let mut node_data_block = match *node_id == v.destination_node { + true => DataBlock::empty_with_meta(AggregateMeta::create_spilled_payload( + v.clone(), + )), + false => { + DataBlock::empty_with_meta(AggregateMeta::create_in_flight_payload( + v.get_sorting_partition(), + v.max_partition, + global_max_partition, + )) } + }; + + if *node_id != self.local_id { + node_data_block = serialize_block( + v.get_sorting_partition(), + v.max_partition, + global_max_partition, + node_data_block, + &self.options, + )? 
} - }; - return Ok(blocks); - } - } + blocks.push(node_data_block); + } - Err(ErrorCode::Internal( - "Internal, HashTableHashScatter only recv AggregateMeta", - )) - } -} + Ok(blocks) + } + AggregateMeta::AggregatePayload(p) => { + if p.payload.len() == 0 { + return Ok(vec![]); + } -pub struct AggregateInjector { - ctx: Arc, - aggregator_params: Arc, -} + let mut blocks = Vec::with_capacity(n); + let global_max_partition = self.global_max_partition.load(AtomicOrdering::SeqCst); + for (idx, payload) in scatter_payload(p.payload, n)?.into_iter().enumerate() { + if self.node_list[idx] == self.local_id { + blocks.push(DataBlock::empty_with_meta( + AggregateMeta::create_agg_payload( + payload, + p.partition, + p.max_partition, + global_max_partition, + ), + )); + + continue; + } -impl AggregateInjector { - pub fn create( - ctx: Arc, - params: Arc, - ) -> Arc { - Arc::new(AggregateInjector { - ctx, - aggregator_params: params, - }) - } -} + let data_block = match payload.len() == 0 { + true => DataBlock::empty(), + false => payload.aggregate_flush_all()?, + }; + + let data_block = + data_block.add_meta(Some(AggregateMeta::create_in_flight_payload( + p.partition, + p.max_partition, + global_max_partition, + )))?; + + let data_block = serialize_block( + p.partition, + p.max_partition, + global_max_partition, + data_block, + &self.options, + )?; + blocks.push(data_block); + } -impl ExchangeInjector for AggregateInjector { - fn flight_scatter( - &self, - _: &Arc, - exchange: &DataExchange, - ) -> Result>> { - match exchange { - DataExchange::Merge(_) => unreachable!(), - DataExchange::Broadcast(_) => unreachable!(), - DataExchange::ShuffleDataExchange(exchange) => { - Ok(Arc::new(Box::new(HashTableHashScatter { - buckets: exchange.destination_ids.len(), - }))) + Ok(blocks) } } } - fn exchange_sorting(&self) -> Option> { - Some(Arc::new(AggregateExchangeSorting {})) - } + fn init_way(&self, _index: usize, block: &DataBlock) -> Result<()> { + let max_partition = match block.get_meta() { + None => 0, + Some(meta) => match AggregateMeta::downcast_ref_from(meta) { + None => 0, + Some(v) => v.get_max_partition(), + }, + }; - fn apply_merge_serializer( - &self, - _: &MergeExchangeParams, - _compression: Option, - pipeline: &mut Pipeline, - ) -> Result<()> { - let params = self.aggregator_params.clone(); - - let operator = DataOperator::instance().spill_operator(); - let location_prefix = self.ctx.query_id_spill_prefix(); - - pipeline.add_transform(|input, output| { - Ok(ProcessorPtr::create( - TransformAggregateSpillWriter::try_create( - self.ctx.clone(), - input, - output, - operator.clone(), - params.clone(), - location_prefix.clone(), - )?, - )) - })?; - - pipeline.add_transform(|input, output| { - TransformAggregateSerializer::try_create(input, output, params.clone()) - }) + self.global_max_partition + .fetch_max(max_partition, std::sync::atomic::Ordering::SeqCst); + Ok(()) } - fn apply_shuffle_serializer( - &self, - shuffle_params: &ShuffleExchangeParams, - compression: Option, - pipeline: &mut Pipeline, - ) -> Result<()> { - let params = self.aggregator_params.clone(); - let operator = DataOperator::instance().spill_operator(); - let location_prefix = self.ctx.query_id_spill_prefix(); - - let schema = shuffle_params.schema.clone(); - let local_id = &shuffle_params.executor_id; - let local_pos = shuffle_params - .destination_ids - .iter() - .position(|x| x == local_id) - .unwrap(); - - pipeline.add_transform(|input, output| { - Ok(ProcessorPtr::create( - 
TransformExchangeAggregateSerializer::try_create( - self.ctx.clone(), - input, - output, - operator.clone(), - location_prefix.clone(), - params.clone(), - compression, - schema.clone(), - local_pos, - )?, - )) - })?; - - pipeline.add_transform(TransformExchangeAsyncBarrier::try_create) - } + fn sorting_function(left_block: &DataBlock, right_block: &DataBlock) -> Ordering { + let Some(left_meta) = left_block.get_meta() else { + return Ordering::Equal; + }; - fn apply_merge_deserializer( - &self, - params: &MergeExchangeParams, - pipeline: &mut Pipeline, - ) -> Result<()> { - pipeline.add_transform(|input, output| { - TransformAggregateDeserializer::try_create(input, output, ¶ms.schema) - }) - } + let (l_partition, l_max_partition) = + match ExchangeSerializeMeta::downcast_ref_from(left_meta) { + Some(meta) => (meta.partition, meta.max_partition), + None => { + let Some(meta) = AggregateMeta::downcast_ref_from(left_meta) else { + return Ordering::Equal; + }; - fn apply_shuffle_deserializer( - &self, - params: &ShuffleExchangeParams, - pipeline: &mut Pipeline, - ) -> Result<()> { - pipeline.add_transform(|input, output| { - TransformAggregateDeserializer::try_create(input, output, ¶ms.schema) - }) + (meta.get_sorting_partition(), meta.get_max_partition()) + } + }; + + let Some(right_meta) = right_block.get_meta() else { + return Ordering::Equal; + }; + + let (r_partition, r_max_partition) = + match ExchangeSerializeMeta::downcast_ref_from(right_meta) { + Some(meta) => (meta.partition, meta.max_partition), + None => { + let Some(meta) = AggregateMeta::downcast_ref_from(right_meta) else { + return Ordering::Equal; + }; + + (meta.get_sorting_partition(), meta.get_max_partition()) + } + }; + + // ORDER BY max_partition asc, partition asc + match l_max_partition.cmp(&r_max_partition) { + Ordering::Less => Ordering::Less, + Ordering::Greater => Ordering::Greater, + Ordering::Equal => l_partition.cmp(&r_partition), + } } } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_meta.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_meta.rs index 2ae3cc620b928..3baee05794962 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_meta.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_meta.rs @@ -19,6 +19,7 @@ use std::sync::Arc; use bumpalo::Bump; use databend_common_exception::Result; +use databend_common_expression::local_block_meta_serde; use databend_common_expression::types::DataType; use databend_common_expression::AggregateFunction; use databend_common_expression::AggregateHashTable; @@ -36,7 +37,8 @@ pub struct SerializedPayload { pub bucket: isize, pub data_block: DataBlock, // use for new agg_hashtable - pub max_partition_count: usize, + pub max_partition: usize, + pub global_max_partition: usize, } impl SerializedPayload { @@ -106,114 +108,158 @@ impl SerializedPayload { } } -pub struct BucketSpilledPayload { - pub bucket: isize, +#[derive(serde::Serialize, serde::Deserialize, Clone, Debug)] +pub struct SpilledPayload { + pub partition: isize, pub location: String, pub data_range: Range, - pub columns_layout: Vec, - pub max_partition_count: usize, + pub destination_node: String, + pub max_partition: usize, + pub global_max_partition: usize, +} + +impl SpilledPayload { + pub fn get_sorting_partition(&self) -> isize { + -(self.max_partition as isize - self.partition) + } } pub struct AggregatePayload { - pub bucket: isize, + pub partition: isize, pub payload: 
Payload, // use for new agg_hashtable - pub max_partition_count: usize, + pub max_partition: usize, + pub global_max_partition: usize, +} + +#[derive(serde::Serialize, serde::Deserialize)] +pub struct InFlightPayload { + pub partition: isize, + pub max_partition: usize, + pub global_max_partition: usize, +} + +pub struct FinalPayload { + pub data: Vec, } +#[derive(serde::Serialize, serde::Deserialize)] pub enum AggregateMeta { - Serialized(SerializedPayload), + SpilledPayload(SpilledPayload), AggregatePayload(AggregatePayload), - AggregateSpilling(PartitionedPayload), - BucketSpilled(BucketSpilledPayload), - Spilled(Vec), - - Partitioned { bucket: isize, data: Vec }, + InFlightPayload(InFlightPayload), + FinalPartition(FinalPayload), } impl AggregateMeta { pub fn create_agg_payload( - bucket: isize, payload: Payload, - max_partition_count: usize, + partition: isize, + max_partition: usize, + global_max_partition: usize, ) -> BlockMetaInfoPtr { Box::new(AggregateMeta::AggregatePayload(AggregatePayload { - bucket, payload, - max_partition_count, + partition, + max_partition, + global_max_partition, })) } - pub fn create_agg_spilling(payload: PartitionedPayload) -> BlockMetaInfoPtr { - Box::new(AggregateMeta::AggregateSpilling(payload)) - } - - pub fn create_serialized( - bucket: isize, - block: DataBlock, - max_partition_count: usize, + pub fn create_in_flight_payload( + partition: isize, + max_partition: usize, + global_max_partition: usize, ) -> BlockMetaInfoPtr { - Box::new(AggregateMeta::Serialized(SerializedPayload { - bucket, - data_block: block, - max_partition_count, + Box::new(AggregateMeta::InFlightPayload(InFlightPayload { + partition, + max_partition, + global_max_partition, })) } - pub fn create_spilled(buckets_payload: Vec) -> BlockMetaInfoPtr { - Box::new(AggregateMeta::Spilled(buckets_payload)) + pub fn create_spilled_payload(payload: SpilledPayload) -> BlockMetaInfoPtr { + Box::new(AggregateMeta::SpilledPayload(payload)) } - pub fn create_bucket_spilled(payload: BucketSpilledPayload) -> BlockMetaInfoPtr { - Box::new(AggregateMeta::BucketSpilled(payload)) + pub fn create_final(blocks: Vec) -> BlockMetaInfoPtr { + Box::new(AggregateMeta::FinalPartition(FinalPayload { data: blocks })) } - pub fn create_partitioned(bucket: isize, data: Vec) -> BlockMetaInfoPtr { - Box::new(AggregateMeta::Partitioned { data, bucket }) + pub fn get_global_max_partition(&self) -> usize { + match self { + AggregateMeta::SpilledPayload(v) => v.global_max_partition, + AggregateMeta::AggregatePayload(v) => v.global_max_partition, + AggregateMeta::InFlightPayload(v) => v.global_max_partition, + AggregateMeta::FinalPartition(_) => unreachable!(), + } } -} -impl serde::Serialize for AggregateMeta { - fn serialize(&self, _: S) -> std::result::Result - where S: serde::Serializer { - unreachable!("AggregateMeta does not support exchanging between multiple nodes") + pub fn get_partition(&self) -> isize { + match self { + AggregateMeta::SpilledPayload(v) => v.partition, + AggregateMeta::AggregatePayload(v) => v.partition, + AggregateMeta::InFlightPayload(v) => v.partition, + AggregateMeta::FinalPartition(_) => unreachable!(), + } } -} -impl<'de> serde::Deserialize<'de> for AggregateMeta { - fn deserialize(_: D) -> std::result::Result - where D: serde::Deserializer<'de> { - unreachable!("AggregateMeta does not support exchanging between multiple nodes") + pub fn get_sorting_partition(&self) -> isize { + match self { + AggregateMeta::AggregatePayload(v) => v.partition, + AggregateMeta::InFlightPayload(v) => 
v.partition, + AggregateMeta::SpilledPayload(v) => v.get_sorting_partition(), + AggregateMeta::FinalPartition(_) => unreachable!(), + } + } + + pub fn get_max_partition(&self) -> usize { + match self { + AggregateMeta::SpilledPayload(v) => v.max_partition, + AggregateMeta::AggregatePayload(v) => v.max_partition, + AggregateMeta::InFlightPayload(v) => v.max_partition, + AggregateMeta::FinalPartition(_) => unreachable!(), + } + } + + pub fn set_global_max_partition(&mut self, global_max_partition: usize) { + match self { + AggregateMeta::SpilledPayload(v) => { + v.global_max_partition = global_max_partition; + } + AggregateMeta::AggregatePayload(v) => { + v.global_max_partition = global_max_partition; + } + AggregateMeta::InFlightPayload(v) => { + v.global_max_partition = global_max_partition; + } + AggregateMeta::FinalPartition(_) => unreachable!(), + } } } impl Debug for AggregateMeta { fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { match self { - AggregateMeta::Partitioned { .. } => { - f.debug_struct("AggregateMeta::Partitioned").finish() + AggregateMeta::FinalPartition(_) => { + f.debug_struct("AggregateMeta::FinalPartition").finish() + } + AggregateMeta::SpilledPayload(_) => { + f.debug_struct("Aggregate::SpilledPayload").finish() } - AggregateMeta::Serialized { .. } => { - f.debug_struct("AggregateMeta::Serialized").finish() + AggregateMeta::InFlightPayload(_) => { + f.debug_struct("Aggregate:InFlightPayload").finish() } - AggregateMeta::Spilled(_) => f.debug_struct("Aggregate::Spilled").finish(), - AggregateMeta::BucketSpilled(_) => f.debug_struct("Aggregate::BucketSpilled").finish(), AggregateMeta::AggregatePayload(_) => { f.debug_struct("AggregateMeta:AggregatePayload").finish() } - AggregateMeta::AggregateSpilling(_) => { - f.debug_struct("AggregateMeta:AggregateSpilling").finish() - } } } } -impl BlockMetaInfo for AggregateMeta { - fn typetag_deserialize(&self) { - unimplemented!("AggregateMeta does not support exchanging between multiple nodes") - } +#[typetag::serde(name = "AggregateMeta")] +impl BlockMetaInfo for AggregateMeta {} - fn typetag_name(&self) -> &'static str { - unimplemented!("AggregateMeta does not support exchanging between multiple nodes") - } -} +local_block_meta_serde!(FinalPayload); +local_block_meta_serde!(AggregatePayload); +local_block_meta_serde!(SerializedPayload); diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/mod.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/mod.rs index bdd17a88364fc..94f2d0ec7bba8 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/mod.rs @@ -15,21 +15,26 @@ mod aggregate_exchange_injector; mod aggregate_meta; mod aggregator_params; -mod new_transform_partition_bucket; mod serde; mod transform_aggregate_expand; mod transform_aggregate_final; mod transform_aggregate_partial; +mod transform_partition_align; +mod transform_partition_bucket; +mod transform_partition_dispatch; +mod transform_partition_exchange; +mod transform_partition_resorting; +mod transform_partition_restore; mod transform_single_key; mod udaf_script; -pub use aggregate_exchange_injector::AggregateInjector; +pub use aggregate_exchange_injector::FlightExchange; pub use aggregate_meta::*; pub use aggregator_params::AggregatorParams; -pub use new_transform_partition_bucket::build_partition_bucket; pub use transform_aggregate_expand::TransformExpandGroupingSets; pub use 
transform_aggregate_final::TransformFinalAggregate; pub use transform_aggregate_partial::TransformPartialAggregate; +pub use transform_partition_bucket::build_final_aggregate; pub use transform_single_key::FinalSingleStateAggregator; pub use transform_single_key::PartialSingleStateAggregator; pub use udaf_script::*; diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/new_transform_partition_bucket.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/new_transform_partition_bucket.rs deleted file mode 100644 index 5c5cddc4258fd..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/new_transform_partition_bucket.rs +++ /dev/null @@ -1,612 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; -use std::collections::btree_map::Entry; -use std::collections::BTreeMap; -use std::sync::Arc; - -use bumpalo::Bump; -use databend_common_exception::ErrorCode; -use databend_common_exception::Result; -use databend_common_expression::BlockMetaInfoDowncast; -use databend_common_expression::DataBlock; -use databend_common_expression::PartitionedPayload; -use databend_common_expression::PayloadFlushState; -use databend_common_pipeline_core::processors::Event; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; -use databend_common_pipeline_core::processors::ProcessorPtr; -use databend_common_pipeline_core::Pipe; -use databend_common_pipeline_core::PipeItem; -use databend_common_pipeline_core::Pipeline; -use databend_common_storage::DataOperator; -use tokio::sync::Semaphore; - -use super::AggregatePayload; -use super::TransformAggregateSpillReader; -use super::TransformFinalAggregate; -use crate::pipelines::processors::transforms::aggregator::aggregate_meta::AggregateMeta; -use crate::pipelines::processors::transforms::aggregator::aggregate_meta::SerializedPayload; -use crate::pipelines::processors::transforms::aggregator::AggregatorParams; - -static SINGLE_LEVEL_BUCKET_NUM: isize = -1; -static MAX_PARTITION_COUNT: usize = 128; - -struct InputPortState { - port: Arc, - bucket: isize, - max_partition_count: usize, -} -pub struct NewTransformPartitionBucket { - output: Arc, - inputs: Vec, - params: Arc, - working_bucket: isize, - pushing_bucket: isize, - initialized_all_inputs: bool, - all_inputs_init: bool, - buckets_blocks: BTreeMap>, - flush_state: PayloadFlushState, - unpartitioned_blocks: Vec, - max_partition_count: usize, -} - -impl NewTransformPartitionBucket { - pub fn create(input_nums: usize, params: Arc) -> Result { - let mut inputs = Vec::with_capacity(input_nums); - - for _index in 0..input_nums { - inputs.push(InputPortState { - bucket: -1, - port: InputPort::create(), - max_partition_count: 0, - }); - } - - Ok(NewTransformPartitionBucket { - params, - inputs, - working_bucket: 0, - pushing_bucket: 0, - output: OutputPort::create(), - 
buckets_blocks: BTreeMap::new(), - unpartitioned_blocks: vec![], - flush_state: PayloadFlushState::default(), - initialized_all_inputs: false, - all_inputs_init: false, - max_partition_count: 0, - }) - } - - pub fn get_inputs(&self) -> Vec> { - let mut inputs = Vec::with_capacity(self.inputs.len()); - - for input_state in &self.inputs { - inputs.push(input_state.port.clone()); - } - - inputs - } - - pub fn get_output(&self) -> Arc { - self.output.clone() - } - - fn initialize_all_inputs(&mut self) -> Result { - self.initialized_all_inputs = true; - // in a cluster where partitions are only 8 and 128, - // we need to pull all data where the partition equals 8 until the partition changes to 128 or there is no data available. - if self.params.cluster_aggregator { - for index in 0..self.inputs.len() { - if self.inputs[index].port.is_finished() { - continue; - } - - // We pull all the data that are not the max_partition_count and all spill data - if self.inputs[index].max_partition_count == MAX_PARTITION_COUNT - && self.inputs[index].bucket > SINGLE_LEVEL_BUCKET_NUM - { - continue; - } - - if !self.inputs[index].port.has_data() { - self.inputs[index].port.set_need_data(); - self.initialized_all_inputs = false; - continue; - } - - let data_block = self.inputs[index].port.pull_data().unwrap()?; - - ( - self.inputs[index].bucket, - self.inputs[index].max_partition_count, - ) = self.add_bucket(data_block)?; - - // we need pull all spill data in init, and data less than max partition - if self.inputs[index].bucket <= SINGLE_LEVEL_BUCKET_NUM - || self.inputs[index].max_partition_count < MAX_PARTITION_COUNT - { - self.inputs[index].port.set_need_data(); - self.initialized_all_inputs = false; - } - } - } else { - // in singleton, the partition is 8, 32, 128. - // We pull the first data to ensure the max partition, - // and then pull all data that is less than the max partition - let mut refresh_index = 0; - for index in 0..self.inputs.len() { - if self.inputs[index].port.is_finished() { - continue; - } - - // We pull all the data that are not the max_partition_count - if self.inputs[index].max_partition_count > 0 - && self.inputs[index].bucket > SINGLE_LEVEL_BUCKET_NUM - && self.inputs[index].max_partition_count == self.max_partition_count - { - continue; - } - - if !self.inputs[index].port.has_data() { - self.inputs[index].port.set_need_data(); - self.initialized_all_inputs = false; - continue; - } - - let data_block = self.inputs[index].port.pull_data().unwrap()?; - - let before_max_partition_count = self.max_partition_count; - ( - self.inputs[index].bucket, - self.inputs[index].max_partition_count, - ) = self.add_bucket(data_block)?; - - // we need pull all spill data in init, and data less than max partition - if self.inputs[index].bucket <= SINGLE_LEVEL_BUCKET_NUM - || self.inputs[index].max_partition_count < self.max_partition_count - { - self.inputs[index].port.set_need_data(); - self.initialized_all_inputs = false; - } - - // max partition count change - if before_max_partition_count > 0 - && before_max_partition_count != self.max_partition_count - { - // set need data for inputs which is less than the max partition - for i in refresh_index..index { - if !self.inputs[i].port.is_finished() - && !self.inputs[i].port.has_data() - && self.inputs[i].max_partition_count != self.max_partition_count - { - self.inputs[i].port.set_need_data(); - self.initialized_all_inputs = false; - } - } - refresh_index = index; - } - } - } - - if self.initialized_all_inputs { - self.all_inputs_init = true; - } - - 
Ok(self.initialized_all_inputs) - } - - #[allow(unused_assignments)] - fn add_bucket(&mut self, mut data_block: DataBlock) -> Result<(isize, usize)> { - let (mut bucket, mut partition_count) = (0, 0); - let mut is_empty_block = false; - if let Some(block_meta) = data_block.get_meta() { - if let Some(block_meta) = AggregateMeta::downcast_ref_from(block_meta) { - (bucket, partition_count) = match block_meta { - AggregateMeta::Partitioned { .. } => unreachable!(), - AggregateMeta::AggregateSpilling(_) => unreachable!(), - AggregateMeta::BucketSpilled(_) => { - let meta = data_block.take_meta().unwrap(); - - if let Some(AggregateMeta::BucketSpilled(payload)) = - AggregateMeta::downcast_from(meta) - { - let bucket = payload.bucket; - let partition_count = payload.max_partition_count; - self.max_partition_count = - self.max_partition_count.max(partition_count); - - let data_block = DataBlock::empty_with_meta( - AggregateMeta::create_bucket_spilled(payload), - ); - match self.buckets_blocks.entry(bucket) { - Entry::Vacant(v) => { - v.insert(vec![data_block]); - } - Entry::Occupied(mut v) => { - v.get_mut().push(data_block); - } - }; - - return Ok((SINGLE_LEVEL_BUCKET_NUM, partition_count)); - } - unreachable!() - } - AggregateMeta::Spilled(_) => { - let meta = data_block.take_meta().unwrap(); - - if let Some(AggregateMeta::Spilled(buckets_payload)) = - AggregateMeta::downcast_from(meta) - { - let partition_count = if !buckets_payload.is_empty() { - buckets_payload[0].max_partition_count - } else { - MAX_PARTITION_COUNT - }; - self.max_partition_count = - self.max_partition_count.max(partition_count); - - for bucket_payload in buckets_payload { - let bucket = bucket_payload.bucket; - let data_block = DataBlock::empty_with_meta( - AggregateMeta::create_bucket_spilled(bucket_payload), - ); - match self.buckets_blocks.entry(bucket) { - Entry::Vacant(v) => { - v.insert(vec![data_block]); - } - Entry::Occupied(mut v) => { - v.get_mut().push(data_block); - } - }; - } - - return Ok((SINGLE_LEVEL_BUCKET_NUM, partition_count)); - } - unreachable!() - } - AggregateMeta::Serialized(payload) => { - is_empty_block = payload.data_block.is_empty(); - self.max_partition_count = - self.max_partition_count.max(payload.max_partition_count); - - (payload.bucket, payload.max_partition_count) - } - AggregateMeta::AggregatePayload(payload) => { - is_empty_block = payload.payload.len() == 0; - self.max_partition_count = - self.max_partition_count.max(payload.max_partition_count); - - (payload.bucket, payload.max_partition_count) - } - }; - } else { - return Err(ErrorCode::Internal(format!( - "Internal, TransformPartitionBucket only recv AggregateMeta, but got {:?}", - block_meta - ))); - } - } else { - return Err(ErrorCode::Internal( - "Internal, TransformPartitionBucket only recv DataBlock with meta.", - )); - } - - if !is_empty_block { - if self.all_inputs_init { - if partition_count != self.max_partition_count { - return Err(ErrorCode::Internal( - "Internal, the partition count does not equal the max partition count on TransformPartitionBucket. 
- ", - )); - } - match self.buckets_blocks.entry(bucket) { - Entry::Vacant(v) => { - v.insert(vec![data_block]); - } - Entry::Occupied(mut v) => { - v.get_mut().push(data_block); - } - }; - } else { - self.unpartitioned_blocks.push(data_block); - } - } - - Ok((bucket, partition_count)) - } - - fn try_push_data_block(&mut self) -> bool { - while self.pushing_bucket < self.working_bucket { - if let Some(bucket_blocks) = self.buckets_blocks.remove(&self.pushing_bucket) { - let data_block = Self::convert_blocks(self.pushing_bucket, bucket_blocks); - self.output.push_data(Ok(data_block)); - self.pushing_bucket += 1; - return true; - } - - self.pushing_bucket += 1; - } - - false - } - - fn partition_block(&mut self, payload: SerializedPayload) -> Result>> { - // already is max partition - if payload.max_partition_count == self.max_partition_count { - let bucket = payload.bucket; - let data_block = - DataBlock::empty_with_meta(Box::new(AggregateMeta::Serialized(payload))); - match self.buckets_blocks.entry(bucket) { - Entry::Vacant(v) => { - v.insert(vec![data_block]); - } - Entry::Occupied(mut v) => { - v.get_mut().push(data_block); - } - }; - return Ok(vec![]); - } - - // need repartition - let mut blocks = Vec::with_capacity(self.max_partition_count); - let p = payload.convert_to_partitioned_payload( - self.params.group_data_types.clone(), - self.params.aggregate_functions.clone(), - self.params.num_states(), - 0, - Arc::new(Bump::new()), - )?; - - let mut partitioned_payload = PartitionedPayload::new( - self.params.group_data_types.clone(), - self.params.aggregate_functions.clone(), - self.max_partition_count as u64, - p.arenas.clone(), - ); - partitioned_payload.combine(p, &mut self.flush_state); - - for (bucket, payload) in partitioned_payload.payloads.into_iter().enumerate() { - blocks.push(Some(DataBlock::empty_with_meta( - AggregateMeta::create_agg_payload( - bucket as isize, - payload, - self.max_partition_count, - ), - ))); - } - - Ok(blocks) - } - - fn partition_payload(&mut self, payload: AggregatePayload) -> Result>> { - // already is max partition - if payload.max_partition_count == self.max_partition_count { - let bucket = payload.bucket; - let data_block = - DataBlock::empty_with_meta(Box::new(AggregateMeta::AggregatePayload(payload))); - match self.buckets_blocks.entry(bucket) { - Entry::Vacant(v) => { - v.insert(vec![data_block]); - } - Entry::Occupied(mut v) => { - v.get_mut().push(data_block); - } - }; - return Ok(vec![]); - } - - // need repartition - let mut blocks = Vec::with_capacity(self.max_partition_count); - let mut partitioned_payload = PartitionedPayload::new( - self.params.group_data_types.clone(), - self.params.aggregate_functions.clone(), - self.max_partition_count as u64, - vec![payload.payload.arena.clone()], - ); - - partitioned_payload.combine_single(payload.payload, &mut self.flush_state, None); - - for (bucket, payload) in partitioned_payload.payloads.into_iter().enumerate() { - blocks.push(Some(DataBlock::empty_with_meta( - AggregateMeta::create_agg_payload( - bucket as isize, - payload, - self.max_partition_count, - ), - ))); - } - - Ok(blocks) - } - - fn convert_blocks(bucket: isize, data_blocks: Vec) -> DataBlock { - let mut data = Vec::with_capacity(data_blocks.len()); - for mut data_block in data_blocks.into_iter() { - if let Some(block_meta) = data_block.take_meta() { - if let Some(block_meta) = AggregateMeta::downcast_from(block_meta) { - data.push(block_meta); - } - } - } - - DataBlock::empty_with_meta(AggregateMeta::create_partitioned(bucket, 
data)) - } -} - -#[async_trait::async_trait] -impl Processor for NewTransformPartitionBucket { - fn name(&self) -> String { - String::from("TransformPartitionBucket") - } - - fn as_any(&mut self) -> &mut dyn Any { - self - } - - fn event(&mut self) -> Result { - if self.output.is_finished() { - for input_state in &self.inputs { - input_state.port.finish(); - } - - self.buckets_blocks.clear(); - return Ok(Event::Finished); - } - - // We pull the first unsplitted data block - if !self.initialized_all_inputs && !self.initialize_all_inputs()? { - return Ok(Event::NeedData); - } - - if !self.unpartitioned_blocks.is_empty() { - // Split data blocks if it's unsplitted. - return Ok(Event::Sync); - } - - if !self.output.can_push() { - for input_state in &self.inputs { - input_state.port.set_not_need_data(); - } - - return Ok(Event::NeedConsume); - } - - let pushed_data_block = self.try_push_data_block(); - - loop { - // Try to pull the next data or until the port is closed - let mut all_inputs_is_finished = true; - let mut all_port_prepared_data = true; - for index in 0..self.inputs.len() { - if self.inputs[index].port.is_finished() { - continue; - } - - all_inputs_is_finished = false; - if self.inputs[index].bucket > self.working_bucket { - continue; - } - - if !self.inputs[index].port.has_data() { - all_port_prepared_data = false; - self.inputs[index].port.set_need_data(); - continue; - } - - let data_block = self.inputs[index].port.pull_data().unwrap()?; - (self.inputs[index].bucket, _) = self.add_bucket(data_block)?; - - if self.inputs[index].bucket <= self.working_bucket { - all_port_prepared_data = false; - self.inputs[index].port.set_need_data(); - } - } - - if all_inputs_is_finished { - break; - } - - if !all_port_prepared_data { - return Ok(Event::NeedData); - } - - self.working_bucket += 1; - } - - if pushed_data_block || self.try_push_data_block() { - return Ok(Event::NeedConsume); - } - - if let Some((bucket, bucket_blocks)) = self.buckets_blocks.pop_first() { - let data_block = Self::convert_blocks(bucket, bucket_blocks); - self.output.push_data(Ok(data_block)); - return Ok(Event::NeedConsume); - } - - self.output.finish(); - Ok(Event::Finished) - } - - fn process(&mut self) -> Result<()> { - let block_meta = self - .unpartitioned_blocks - .pop() - .and_then(|mut block| block.take_meta()) - .and_then(AggregateMeta::downcast_from); - - if let Some(agg_block_meta) = block_meta { - let data_blocks = match agg_block_meta { - AggregateMeta::Spilled(_) => unreachable!(), - AggregateMeta::Partitioned { .. 
} => unreachable!(), - AggregateMeta::AggregateSpilling(_) => unreachable!(), - AggregateMeta::BucketSpilled(_) => unreachable!(), - AggregateMeta::Serialized(payload) => self.partition_block(payload)?, - AggregateMeta::AggregatePayload(payload) => self.partition_payload(payload)?, - }; - - for (bucket, block) in data_blocks.into_iter().enumerate() { - if let Some(data_block) = block { - match self.buckets_blocks.entry(bucket as isize) { - Entry::Vacant(v) => { - v.insert(vec![data_block]); - } - Entry::Occupied(mut v) => { - v.get_mut().push(data_block); - } - }; - } - } - } - - Ok(()) - } -} - -pub fn build_partition_bucket( - pipeline: &mut Pipeline, - params: Arc, -) -> Result<()> { - let input_nums = pipeline.output_len(); - let transform = NewTransformPartitionBucket::create(input_nums, params.clone())?; - - let output = transform.get_output(); - let inputs_port = transform.get_inputs(); - - pipeline.add_pipe(Pipe::create(inputs_port.len(), 1, vec![PipeItem::create( - ProcessorPtr::create(Box::new(transform)), - inputs_port, - vec![output], - )])); - - pipeline.try_resize(input_nums)?; - - let semaphore = Arc::new(Semaphore::new(params.max_spill_io_requests)); - let operator = DataOperator::instance().spill_operator(); - pipeline.add_transform(|input, output| { - let operator = operator.clone(); - TransformAggregateSpillReader::create(input, output, operator, semaphore.clone()) - })?; - - pipeline.add_transform(|input, output| { - Ok(ProcessorPtr::create(TransformFinalAggregate::try_create( - input, - output, - params.clone(), - )?)) - })?; - Ok(()) -} diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/mod.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/mod.rs index 76a55b10e85b3..16152f57bef77 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/mod.rs @@ -12,21 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -mod serde_meta; -mod transform_aggregate_serializer; -mod transform_aggregate_spill_writer; mod transform_deserializer; -mod transform_exchange_aggregate_serializer; -mod transform_exchange_async_barrier; -mod transform_spill_reader; -pub use serde_meta::*; -pub use transform_aggregate_serializer::*; -pub use transform_aggregate_spill_writer::*; pub use transform_deserializer::*; -pub use transform_exchange_aggregate_serializer::*; -pub use transform_exchange_async_barrier::*; -pub use transform_spill_reader::*; + +pub use crate::pipelines::processors::transforms::aggregator::transform_partition_restore::*; pub mod exchange_defines { use arrow_ipc::writer::IpcWriteOptions; diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/serde_meta.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/serde_meta.rs deleted file mode 100644 index b83cf2c97c90e..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/serde_meta.rs +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::ops::Range; - -use databend_common_expression::BlockMetaInfo; -use databend_common_expression::BlockMetaInfoDowncast; -use databend_common_expression::BlockMetaInfoPtr; - -pub const BUCKET_TYPE: usize = 1; -pub const SPILLED_TYPE: usize = 2; - -// Cannot change to enum, because bincode cannot deserialize custom enum -#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq)] -pub struct AggregateSerdeMeta { - pub typ: usize, - pub bucket: isize, - pub location: Option, - pub data_range: Option>, - pub columns_layout: Vec, - // use for new agg hashtable - pub max_partition_count: usize, - pub is_empty: bool, -} - -impl AggregateSerdeMeta { - pub fn create_agg_payload( - bucket: isize, - max_partition_count: usize, - is_empty: bool, - ) -> BlockMetaInfoPtr { - Box::new(AggregateSerdeMeta { - typ: BUCKET_TYPE, - bucket, - location: None, - data_range: None, - columns_layout: vec![], - max_partition_count, - is_empty, - }) - } - - pub fn create_spilled( - bucket: isize, - location: String, - data_range: Range, - columns_layout: Vec, - is_empty: bool, - ) -> BlockMetaInfoPtr { - Box::new(AggregateSerdeMeta { - typ: SPILLED_TYPE, - bucket, - columns_layout, - location: Some(location), - data_range: Some(data_range), - max_partition_count: 0, - is_empty, - }) - } - - pub fn create_agg_spilled( - bucket: isize, - location: String, - data_range: Range, - columns_layout: Vec, - max_partition_count: usize, - ) -> BlockMetaInfoPtr { - Box::new(AggregateSerdeMeta { - typ: SPILLED_TYPE, - bucket, - columns_layout, - location: Some(location), - data_range: Some(data_range), - max_partition_count, - is_empty: false, - }) - } -} - -#[typetag::serde(name = "aggregate_serde")] -impl BlockMetaInfo for AggregateSerdeMeta { - fn equals(&self, info: &Box) -> bool { - AggregateSerdeMeta::downcast_ref_from(info).is_some_and(|other| self == other) - } - - fn clone_self(&self) -> Box { - Box::new(self.clone()) - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_aggregate_serializer.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_aggregate_serializer.rs deleted file mode 100644 index 096485fa98fcc..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_aggregate_serializer.rs +++ /dev/null @@ -1,260 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::any::Any; -use std::fmt::Formatter; -use std::pin::Pin; -use std::sync::Arc; - -use databend_common_exception::Result; -use databend_common_expression::local_block_meta_serde; -use databend_common_expression::BlockMetaInfo; -use databend_common_expression::BlockMetaInfoDowncast; -use databend_common_expression::BlockMetaInfoPtr; -use databend_common_expression::DataBlock; -use databend_common_expression::PayloadFlushState; -use databend_common_pipeline_core::processors::Event; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; -use databend_common_pipeline_core::processors::ProcessorPtr; -use futures::future::BoxFuture; - -use crate::pipelines::processors::transforms::aggregator::AggregateMeta; -use crate::pipelines::processors::transforms::aggregator::AggregatePayload; -use crate::pipelines::processors::transforms::aggregator::AggregateSerdeMeta; -use crate::pipelines::processors::transforms::aggregator::AggregatorParams; -pub struct TransformAggregateSerializer { - params: Arc, - - input: Arc, - output: Arc, - output_data: Option, - input_data: Option, -} - -impl TransformAggregateSerializer { - pub fn try_create( - input: Arc, - output: Arc, - params: Arc, - ) -> Result { - Ok(ProcessorPtr::create(Box::new( - TransformAggregateSerializer { - input, - output, - params, - input_data: None, - output_data: None, - }, - ))) - } -} - -impl Processor for TransformAggregateSerializer { - fn name(&self) -> String { - String::from("TransformAggregateSerializer") - } - - fn as_any(&mut self) -> &mut dyn Any { - self - } - - fn event(&mut self) -> Result { - if self.output.is_finished() { - self.input.finish(); - return Ok(Event::Finished); - } - - if !self.output.can_push() { - self.input.set_not_need_data(); - return Ok(Event::NeedConsume); - } - - if let Some(output_data) = self.output_data.take() { - self.output.push_data(Ok(output_data)); - return Ok(Event::NeedConsume); - } - - if self.input_data.is_some() { - return Ok(Event::Sync); - } - - if self.input.has_data() { - let data_block = self.input.pull_data().unwrap()?; - return self.transform_input_data(data_block); - } - - if self.input.is_finished() { - self.output.finish(); - return Ok(Event::Finished); - } - - self.input.set_need_data(); - Ok(Event::NeedData) - } - - fn process(&mut self) -> Result<()> { - if let Some(stream) = &mut self.input_data { - self.output_data = Option::transpose(stream.next())?; - - if self.output_data.is_none() { - self.input_data = None; - } - } - - Ok(()) - } -} - -impl TransformAggregateSerializer { - fn transform_input_data(&mut self, mut data_block: DataBlock) -> Result { - debug_assert!(data_block.is_empty()); - if let Some(block_meta) = data_block.take_meta() { - if let Some(block_meta) = AggregateMeta::downcast_from(block_meta) { - match block_meta { - AggregateMeta::Spilled(_) => unreachable!(), - AggregateMeta::Serialized(_) => unreachable!(), - AggregateMeta::BucketSpilled(_) => unreachable!(), - AggregateMeta::Partitioned { .. 
} => unreachable!(), - AggregateMeta::AggregateSpilling(_) => unreachable!(), - AggregateMeta::AggregatePayload(p) => { - self.input_data = Some(SerializeAggregateStream::create( - &self.params, - SerializePayload::AggregatePayload(p), - )); - return Ok(Event::Sync); - } - } - } - } - - unreachable!() - } -} - -pub enum SerializePayload { - AggregatePayload(AggregatePayload), -} - -pub enum FlightSerialized { - DataBlock(DataBlock), - Future(BoxFuture<'static, Result>), -} - -unsafe impl Sync for FlightSerialized {} - -pub struct FlightSerializedMeta { - pub serialized_blocks: Vec, -} - -impl FlightSerializedMeta { - pub fn create(blocks: Vec) -> BlockMetaInfoPtr { - Box::new(FlightSerializedMeta { - serialized_blocks: blocks, - }) - } -} - -impl std::fmt::Debug for FlightSerializedMeta { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - f.debug_struct("FlightSerializedMeta").finish() - } -} - -local_block_meta_serde!(FlightSerializedMeta); - -#[typetag::serde(name = "exchange_shuffle")] -impl BlockMetaInfo for FlightSerializedMeta {} - -pub struct SerializeAggregateStream { - _params: Arc, - pub payload: Pin>, - flush_state: PayloadFlushState, - end_iter: bool, - nums: usize, -} - -unsafe impl Send for SerializeAggregateStream {} - -unsafe impl Sync for SerializeAggregateStream {} - -impl SerializeAggregateStream { - pub fn create(params: &Arc, payload: SerializePayload) -> Self { - let payload = Box::pin(payload); - - SerializeAggregateStream { - payload, - flush_state: PayloadFlushState::default(), - _params: params.clone(), - end_iter: false, - nums: 0, - } - } -} - -impl Iterator for SerializeAggregateStream { - type Item = Result; - - fn next(&mut self) -> Option { - Result::transpose(self.next_impl()) - } -} - -impl SerializeAggregateStream { - fn next_impl(&mut self) -> Result> { - if self.end_iter { - return Ok(None); - } - - match self.payload.as_ref().get_ref() { - SerializePayload::AggregatePayload(p) => { - let block = p.payload.aggregate_flush(&mut self.flush_state)?; - - if block.is_none() { - self.end_iter = true; - } - - match block { - Some(block) => { - self.nums += 1; - Ok(Some(block.add_meta(Some( - AggregateSerdeMeta::create_agg_payload( - p.bucket, - p.max_partition_count, - false, - ), - ))?)) - } - None => { - // always return at least one block - if self.nums == 0 { - self.nums += 1; - let block = p.payload.empty_block(Some(1)); - Ok(Some(block.add_meta(Some( - AggregateSerdeMeta::create_agg_payload( - p.bucket, - p.max_partition_count, - true, - ), - ))?)) - } else { - Ok(None) - } - } - } - } - } - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_aggregate_spill_writer.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_aggregate_spill_writer.rs deleted file mode 100644 index 744945849d45a..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_aggregate_spill_writer.rs +++ /dev/null @@ -1,264 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; -use std::sync::Arc; -use std::time::Instant; - -use databend_common_base::base::ProgressValues; -use databend_common_base::runtime::profile::Profile; -use databend_common_base::runtime::profile::ProfileStatisticsName; -use databend_common_catalog::table_context::TableContext; -use databend_common_exception::ErrorCode; -use databend_common_exception::Result; -use databend_common_expression::arrow::serialize_column; -use databend_common_expression::BlockMetaInfoDowncast; -use databend_common_expression::DataBlock; -use databend_common_expression::PartitionedPayload; -use databend_common_pipeline_core::processors::Event; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; -use futures_util::future::BoxFuture; -use log::info; -use opendal::Operator; - -use crate::pipelines::processors::transforms::aggregator::AggregateMeta; -use crate::pipelines::processors::transforms::aggregator::AggregatorParams; -use crate::pipelines::processors::transforms::aggregator::BucketSpilledPayload; -use crate::sessions::QueryContext; -use crate::spillers::Spiller; -use crate::spillers::SpillerConfig; -use crate::spillers::SpillerType; - -pub struct TransformAggregateSpillWriter { - ctx: Arc, - input: Arc, - output: Arc, - _params: Arc, - - spiller: Arc, - spilled_block: Option, - spilling_meta: Option, - spilling_future: Option>>, -} - -impl TransformAggregateSpillWriter { - pub fn try_create( - ctx: Arc, - input: Arc, - output: Arc, - operator: Operator, - params: Arc, - location_prefix: String, - ) -> Result> { - let config = SpillerConfig { - spiller_type: SpillerType::Aggregation, - location_prefix, - disk_spill: None, - use_parquet: ctx.get_settings().get_spilling_file_format()?.is_parquet(), - }; - - let spiller = Spiller::create(ctx.clone(), operator, config.clone())?; - Ok(Box::new(TransformAggregateSpillWriter { - ctx, - input, - output, - _params: params, - spiller: Arc::new(spiller), - spilled_block: None, - spilling_meta: None, - spilling_future: None, - })) - } -} - -#[async_trait::async_trait] -impl Processor for TransformAggregateSpillWriter { - fn name(&self) -> String { - String::from("TransformAggregateSpillWriter") - } - - fn as_any(&mut self) -> &mut dyn Any { - self - } - - fn event(&mut self) -> Result { - if self.output.is_finished() { - self.input.finish(); - return Ok(Event::Finished); - } - - if !self.output.can_push() { - self.input.set_not_need_data(); - return Ok(Event::NeedConsume); - } - - if self.spilling_future.is_some() { - self.input.set_not_need_data(); - return Ok(Event::Async); - } - - while let Some(spilled_block) = self.spilled_block.take() { - if !spilled_block.is_empty() || spilled_block.get_meta().is_some() { - self.output.push_data(Ok(spilled_block)); - return Ok(Event::NeedConsume); - } - } - - if self.spilling_meta.is_some() { - self.input.set_not_need_data(); - return Ok(Event::Sync); - } - - if self.input.has_data() { - let mut data_block = self.input.pull_data().unwrap()?; - - if let Some(block_meta) = data_block - .get_meta() - .and_then(AggregateMeta::downcast_ref_from) - { - if matches!(block_meta, AggregateMeta::AggregateSpilling(_)) { - self.input.set_not_need_data(); - let block_meta = data_block.take_meta().unwrap(); - self.spilling_meta = AggregateMeta::downcast_from(block_meta); - return Ok(Event::Sync); 
- } - } - - self.output.push_data(Ok(data_block)); - return Ok(Event::NeedConsume); - } - - if self.input.is_finished() { - self.output.finish(); - return Ok(Event::Finished); - } - - self.input.set_need_data(); - Ok(Event::NeedData) - } - - fn process(&mut self) -> Result<()> { - if let Some(spilling_meta) = self.spilling_meta.take() { - match spilling_meta { - AggregateMeta::AggregateSpilling(payload) => { - self.spilling_future = Some(agg_spilling_aggregate_payload( - self.ctx.clone(), - self.spiller.clone(), - payload, - )?); - - return Ok(()); - } - _ => { - return Err(ErrorCode::Internal("")); - } - } - } - - Ok(()) - } - - #[async_backtrace::framed] - async fn async_process(&mut self) -> Result<()> { - if let Some(spilling_future) = self.spilling_future.take() { - self.spilled_block = Some(spilling_future.await?); - } - - Ok(()) - } -} - -pub fn agg_spilling_aggregate_payload( - ctx: Arc, - spiller: Arc, - partitioned_payload: PartitionedPayload, -) -> Result>> { - let mut write_size = 0; - let partition_count = partitioned_payload.partition_count(); - let mut write_data = Vec::with_capacity(partition_count); - let mut spilled_buckets_payloads = Vec::with_capacity(partition_count); - // Record how many rows are spilled. - let mut rows = 0; - let location = spiller.create_unique_location(); - for (bucket, payload) in partitioned_payload.payloads.into_iter().enumerate() { - if payload.len() == 0 { - continue; - } - - let data_block = payload.aggregate_flush_all()?; - rows += data_block.num_rows(); - - let begin = write_size; - let columns = data_block.columns().to_vec(); - let mut columns_data = Vec::with_capacity(columns.len()); - let mut columns_layout = Vec::with_capacity(columns.len()); - for column in columns.into_iter() { - let column = column.into_column(data_block.num_rows()); - let column_data = serialize_column(&column); - write_size += column_data.len() as u64; - columns_layout.push(column_data.len() as u64); - columns_data.push(column_data); - } - - write_data.push(columns_data); - spilled_buckets_payloads.push(BucketSpilledPayload { - bucket: bucket as isize, - location: location.clone(), - data_range: begin..write_size, - columns_layout, - max_partition_count: partition_count, - }); - } - - Ok(Box::pin(async move { - let instant = Instant::now(); - if !write_data.is_empty() { - let (location, write_bytes) = spiller - .spill_stream_aggregate_buffer(Some(location), write_data) - .await?; - // perf - { - Profile::record_usize_profile(ProfileStatisticsName::RemoteSpillWriteCount, 1); - Profile::record_usize_profile( - ProfileStatisticsName::RemoteSpillWriteBytes, - write_bytes, - ); - Profile::record_usize_profile( - ProfileStatisticsName::RemoteSpillWriteTime, - instant.elapsed().as_millis() as usize, - ); - } - - { - let progress_val = ProgressValues { - rows, - bytes: write_bytes, - }; - ctx.get_aggregate_spill_progress().incr(&progress_val); - } - - info!( - "Write aggregate spill {} successfully, elapsed: {:?}", - location, - instant.elapsed() - ); - } - - Ok(DataBlock::empty_with_meta(AggregateMeta::create_spilled( - spilled_buckets_payloads, - ))) - })) -} diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_deserializer.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_deserializer.rs index f07f37e77305b..ddd0bd38d4a0f 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_deserializer.rs +++ 
b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_deserializer.rs @@ -17,14 +17,9 @@ use std::sync::Arc; use arrow_schema::Schema as ArrowSchema; use databend_common_exception::ErrorCode; use databend_common_exception::Result; -use databend_common_expression::types::ArrayType; -use databend_common_expression::types::NumberType; -use databend_common_expression::types::UInt64Type; -use databend_common_expression::types::ValueType; -use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::BlockMetaInfoPtr; use databend_common_expression::DataBlock; use databend_common_expression::DataSchemaRef; -use databend_common_io::prelude::bincode_deserialize_from_slice; use databend_common_io::prelude::BinaryRead; use databend_common_pipeline_core::processors::InputPort; use databend_common_pipeline_core::processors::OutputPort; @@ -33,11 +28,6 @@ use databend_common_pipeline_transforms::processors::BlockMetaTransform; use databend_common_pipeline_transforms::processors::BlockMetaTransformer; use databend_common_pipeline_transforms::processors::UnknownMode; -use crate::pipelines::processors::transforms::aggregator::exchange_defines; -use crate::pipelines::processors::transforms::aggregator::AggregateMeta; -use crate::pipelines::processors::transforms::aggregator::AggregateSerdeMeta; -use crate::pipelines::processors::transforms::aggregator::BucketSpilledPayload; -use crate::pipelines::processors::transforms::aggregator::BUCKET_TYPE; use crate::servers::flight::v1::exchange::serde::deserialize_block; use crate::servers::flight::v1::exchange::serde::ExchangeDeserializeMeta; use crate::servers::flight::v1::packets::DataPacket; @@ -69,8 +59,9 @@ impl TransformDeserializer { fn recv_data(&self, dict: Vec<DataPacket>, fragment_data: FragmentData) -> Result<DataBlock> { const ROW_HEADER_SIZE: usize = std::mem::size_of::<u32>(); - let meta = bincode_deserialize_from_slice(&fragment_data.get_meta()[ROW_HEADER_SIZE..]) - .map_err(|_| ErrorCode::BadBytes("block meta deserialize error when exchange"))?; + let meta: Option<BlockMetaInfoPtr> = + serde_json::from_slice(&fragment_data.get_meta()[ROW_HEADER_SIZE..]) + .map_err(|_| ErrorCode::BadBytes("block meta deserialize error when exchange"))?; let mut row_count_meta = &fragment_data.get_meta()[..ROW_HEADER_SIZE]; let row_count: u32 = row_count_meta.read_scalar()?; @@ -79,91 +70,8 @@ impl TransformDeserializer { return Ok(DataBlock::new_with_meta(vec![], 0, meta)); } - let data_block = match &meta { - None => { - deserialize_block(dict, fragment_data, &self.schema, self.arrow_schema.clone())? - } - Some(meta) => match AggregateSerdeMeta::downcast_ref_from(meta) { - None => { - deserialize_block(dict, fragment_data, &self.schema, self.arrow_schema.clone())?
- } - Some(meta) => { - return match meta.typ == BUCKET_TYPE { - true => { - let mut block = deserialize_block( - dict, - fragment_data, - &self.schema, - self.arrow_schema.clone(), - )?; - - if meta.is_empty { - block = block.slice(0..0); - } - - Ok(DataBlock::empty_with_meta( - AggregateMeta::create_serialized( - meta.bucket, - block, - meta.max_partition_count, - ), - )) - } - false => { - let data_schema = Arc::new(exchange_defines::spilled_schema()); - let arrow_schema = Arc::new(exchange_defines::spilled_arrow_schema()); - let data_block = deserialize_block( - dict, - fragment_data, - &data_schema, - arrow_schema.clone(), - )?; - - let columns = data_block - .columns() - .iter() - .map(|c| c.value.clone().into_column()) - .try_collect::>() - .unwrap(); - - let buckets = - NumberType::::try_downcast_column(&columns[0]).unwrap(); - let data_range_start = - NumberType::::try_downcast_column(&columns[1]).unwrap(); - let data_range_end = - NumberType::::try_downcast_column(&columns[2]).unwrap(); - let columns_layout = - ArrayType::::try_downcast_column(&columns[3]).unwrap(); - - let columns_layout_data = columns_layout.values().as_slice(); - let columns_layout_offsets = columns_layout.offsets(); - - let mut buckets_payload = Vec::with_capacity(data_block.num_rows()); - for index in 0..data_block.num_rows() { - unsafe { - buckets_payload.push(BucketSpilledPayload { - bucket: *buckets.get_unchecked(index) as isize, - location: meta.location.clone().unwrap(), - data_range: *data_range_start.get_unchecked(index) - ..*data_range_end.get_unchecked(index), - columns_layout: columns_layout_data[columns_layout_offsets - [index] - as usize - ..columns_layout_offsets[index + 1] as usize] - .to_vec(), - max_partition_count: meta.max_partition_count, - }); - } - } - - Ok(DataBlock::empty_with_meta(AggregateMeta::create_spilled( - buckets_payload, - ))) - } - }; - } - }, - }; + let data_block = + deserialize_block(dict, fragment_data, &self.schema, self.arrow_schema.clone())?; match data_block.num_columns() == 0 { true => Ok(DataBlock::new_with_meta(vec![], row_count as usize, meta)), diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_aggregate_serializer.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_aggregate_serializer.rs deleted file mode 100644 index 1274bd7fc94d9..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_aggregate_serializer.rs +++ /dev/null @@ -1,288 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::sync::Arc; -use std::time::Instant; - -use arrow_ipc::writer::IpcWriteOptions; -use arrow_ipc::CompressionType; -use databend_common_base::base::ProgressValues; -use databend_common_base::runtime::profile::Profile; -use databend_common_base::runtime::profile::ProfileStatisticsName; -use databend_common_catalog::table_context::TableContext; -use databend_common_exception::Result; -use databend_common_expression::arrow::serialize_column; -use databend_common_expression::types::ArgType; -use databend_common_expression::types::ArrayType; -use databend_common_expression::types::Int64Type; -use databend_common_expression::types::UInt64Type; -use databend_common_expression::types::ValueType; -use databend_common_expression::BlockMetaInfoDowncast; -use databend_common_expression::DataBlock; -use databend_common_expression::DataSchemaRef; -use databend_common_expression::FromData; -use databend_common_expression::PartitionedPayload; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; -use databend_common_pipeline_transforms::processors::BlockMetaTransform; -use databend_common_pipeline_transforms::processors::BlockMetaTransformer; -use databend_common_settings::FlightCompression; -use futures_util::future::BoxFuture; -use log::info; -use opendal::Operator; - -use super::SerializePayload; -use crate::pipelines::processors::transforms::aggregator::agg_spilling_aggregate_payload as local_agg_spilling_aggregate_payload; -use crate::pipelines::processors::transforms::aggregator::aggregate_exchange_injector::compute_block_number; -use crate::pipelines::processors::transforms::aggregator::aggregate_meta::AggregateMeta; -use crate::pipelines::processors::transforms::aggregator::exchange_defines; -use crate::pipelines::processors::transforms::aggregator::AggregateSerdeMeta; -use crate::pipelines::processors::transforms::aggregator::AggregatorParams; -use crate::pipelines::processors::transforms::aggregator::FlightSerialized; -use crate::pipelines::processors::transforms::aggregator::FlightSerializedMeta; -use crate::pipelines::processors::transforms::aggregator::SerializeAggregateStream; -use crate::servers::flight::v1::exchange::serde::serialize_block; -use crate::servers::flight::v1::exchange::ExchangeShuffleMeta; -use crate::sessions::QueryContext; -use crate::spillers::Spiller; -use crate::spillers::SpillerConfig; -use crate::spillers::SpillerType; - -pub struct TransformExchangeAggregateSerializer { - ctx: Arc, - local_pos: usize, - options: IpcWriteOptions, - - params: Arc, - spiller: Arc, -} - -impl TransformExchangeAggregateSerializer { - #[allow(clippy::too_many_arguments)] - pub fn try_create( - ctx: Arc, - input: Arc, - output: Arc, - - operator: Operator, - location_prefix: String, - params: Arc, - compression: Option, - _schema: DataSchemaRef, - local_pos: usize, - ) -> Result> { - let compression = match compression { - None => None, - Some(compression) => match compression { - FlightCompression::Lz4 => Some(CompressionType::LZ4_FRAME), - FlightCompression::Zstd => Some(CompressionType::ZSTD), - }, - }; - let config = SpillerConfig { - spiller_type: SpillerType::Aggregation, - location_prefix, - disk_spill: None, - use_parquet: ctx.get_settings().get_spilling_file_format()?.is_parquet(), - }; - - let spiller = Spiller::create(ctx.clone(), operator, config.clone())?; - Ok(BlockMetaTransformer::create( - input, - output, - TransformExchangeAggregateSerializer 
{ - ctx, - params, - local_pos, - spiller: spiller.into(), - options: IpcWriteOptions::default() - .try_with_compression(compression) - .unwrap(), - }, - )) - } -} - -impl BlockMetaTransform for TransformExchangeAggregateSerializer { - const NAME: &'static str = "TransformExchangeAggregateSerializer"; - - fn transform(&mut self, meta: ExchangeShuffleMeta) -> Result> { - let mut serialized_blocks = Vec::with_capacity(meta.blocks.len()); - for (index, mut block) in meta.blocks.into_iter().enumerate() { - if block.is_empty() && block.get_meta().is_none() { - serialized_blocks.push(FlightSerialized::DataBlock(block)); - continue; - } - - match AggregateMeta::downcast_from(block.take_meta().unwrap()) { - None => unreachable!(), - Some(AggregateMeta::Spilled(_)) => unreachable!(), - Some(AggregateMeta::Serialized(_)) => unreachable!(), - Some(AggregateMeta::BucketSpilled(_)) => unreachable!(), - Some(AggregateMeta::Partitioned { .. }) => unreachable!(), - Some(AggregateMeta::AggregateSpilling(payload)) => { - serialized_blocks.push(FlightSerialized::Future( - match index == self.local_pos { - true => local_agg_spilling_aggregate_payload( - self.ctx.clone(), - self.spiller.clone(), - payload, - )?, - false => exchange_agg_spilling_aggregate_payload( - self.ctx.clone(), - self.spiller.clone(), - payload, - )?, - }, - )); - } - - Some(AggregateMeta::AggregatePayload(p)) => { - let (bucket, max_partition_count) = (p.bucket, p.max_partition_count); - - if index == self.local_pos { - serialized_blocks.push(FlightSerialized::DataBlock( - block.add_meta(Some(Box::new(AggregateMeta::AggregatePayload(p))))?, - )); - continue; - } - - let block_number = compute_block_number(bucket, max_partition_count)?; - let stream = SerializeAggregateStream::create( - &self.params, - SerializePayload::AggregatePayload(p), - ); - let mut stream_blocks = stream.into_iter().collect::>>()?; - debug_assert!(!stream_blocks.is_empty()); - let mut c = DataBlock::concat(&stream_blocks)?; - if let Some(meta) = stream_blocks[0].take_meta() { - c.replace_meta(meta); - } - let c = serialize_block(block_number, c, &self.options)?; - serialized_blocks.push(FlightSerialized::DataBlock(c)); - } - }; - } - - Ok(vec![DataBlock::empty_with_meta( - FlightSerializedMeta::create(serialized_blocks), - )]) - } -} - -fn exchange_agg_spilling_aggregate_payload( - ctx: Arc, - spiller: Arc, - partitioned_payload: PartitionedPayload, -) -> Result>> { - let partition_count = partitioned_payload.partition_count(); - let mut write_size = 0; - let mut write_data = Vec::with_capacity(partition_count); - let mut buckets_column_data = Vec::with_capacity(partition_count); - let mut data_range_start_column_data = Vec::with_capacity(partition_count); - let mut data_range_end_column_data = Vec::with_capacity(partition_count); - let mut columns_layout_column_data = Vec::with_capacity(partition_count); - // Record how many rows are spilled. 
- let mut rows = 0; - - for (bucket, payload) in partitioned_payload.payloads.into_iter().enumerate() { - if payload.len() == 0 { - continue; - } - - let data_block = payload.aggregate_flush_all()?; - rows += data_block.num_rows(); - - let old_write_size = write_size; - let columns = data_block.columns().to_vec(); - let mut columns_data = Vec::with_capacity(columns.len()); - let mut columns_layout = Vec::with_capacity(columns.len()); - - for column in columns.into_iter() { - let column = column.into_column(data_block.num_rows()); - let column_data = serialize_column(&column); - write_size += column_data.len() as u64; - columns_layout.push(column_data.len() as u64); - columns_data.push(column_data); - } - - write_data.push(columns_data); - buckets_column_data.push(bucket as i64); - data_range_end_column_data.push(write_size); - columns_layout_column_data.push(columns_layout); - data_range_start_column_data.push(old_write_size); - } - - Ok(Box::pin(async move { - if !write_data.is_empty() { - let instant = Instant::now(); - let (location, write_bytes) = spiller - .spill_stream_aggregate_buffer(None, write_data) - .await?; - // perf - { - Profile::record_usize_profile(ProfileStatisticsName::RemoteSpillWriteCount, 1); - Profile::record_usize_profile( - ProfileStatisticsName::RemoteSpillWriteBytes, - write_bytes, - ); - Profile::record_usize_profile( - ProfileStatisticsName::RemoteSpillWriteTime, - instant.elapsed().as_millis() as usize, - ); - } - - { - { - let progress_val = ProgressValues { - rows, - bytes: write_bytes, - }; - ctx.get_aggregate_spill_progress().incr(&progress_val); - } - } - - info!( - "Write aggregate spill {} successfully, elapsed: {:?}", - location, - instant.elapsed() - ); - - let data_block = DataBlock::new_from_columns(vec![ - Int64Type::from_data(buckets_column_data), - UInt64Type::from_data(data_range_start_column_data), - UInt64Type::from_data(data_range_end_column_data), - ArrayType::upcast_column(ArrayType::::column_from_iter( - columns_layout_column_data - .into_iter() - .map(|x| UInt64Type::column_from_iter(x.into_iter(), &[])), - &[], - )), - ]); - - let data_block = data_block.add_meta(Some(AggregateSerdeMeta::create_agg_spilled( - -1, - location.clone(), - 0..0, - vec![], - partition_count, - )))?; - - let write_options = exchange_defines::spilled_write_options(); - return serialize_block(-1, data_block, &write_options); - } - - Ok(DataBlock::empty()) - })) -} diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_async_barrier.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_async_barrier.rs deleted file mode 100644 index 1628bc9af5beb..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_async_barrier.rs +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::sync::Arc; - -use databend_common_exception::ErrorCode; -use databend_common_exception::Result; -use databend_common_expression::BlockMetaInfoDowncast; -use databend_common_expression::DataBlock; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::ProcessorPtr; -use databend_common_pipeline_transforms::processors::AsyncTransform; -use databend_common_pipeline_transforms::processors::AsyncTransformer; - -use crate::pipelines::processors::transforms::aggregator::FlightSerialized; -use crate::pipelines::processors::transforms::aggregator::FlightSerializedMeta; -use crate::servers::flight::v1::exchange::ExchangeShuffleMeta; - -pub struct TransformExchangeAsyncBarrier; - -impl TransformExchangeAsyncBarrier { - pub fn try_create(input: Arc, output: Arc) -> Result { - Ok(ProcessorPtr::create(AsyncTransformer::create( - input, - output, - TransformExchangeAsyncBarrier {}, - ))) - } -} - -#[async_trait::async_trait] -impl AsyncTransform for TransformExchangeAsyncBarrier { - const NAME: &'static str = "TransformExchangeAsyncBarrier"; - - async fn transform(&mut self, mut data: DataBlock) -> Result { - if let Some(meta) = data - .take_meta() - .and_then(FlightSerializedMeta::downcast_from) - { - let mut futures = Vec::with_capacity(meta.serialized_blocks.len()); - - for serialized_block in meta.serialized_blocks { - futures.push(databend_common_base::runtime::spawn(async move { - match serialized_block { - FlightSerialized::DataBlock(v) => Ok(v), - FlightSerialized::Future(f) => f.await, - } - })); - } - - return match futures::future::try_join_all(futures).await { - Err(_) => Err(ErrorCode::TokioError("Cannot join tokio job")), - Ok(spilled_data) => Ok(DataBlock::empty_with_meta(ExchangeShuffleMeta::create( - spilled_data.into_iter().collect::>>()?, - ))), - }; - } - - Err(ErrorCode::Internal("")) - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_spill_reader.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_spill_reader.rs deleted file mode 100644 index fd03b09e2f3f7..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_spill_reader.rs +++ /dev/null @@ -1,314 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::any::Any; -use std::collections::VecDeque; -use std::sync::Arc; -use std::time::Duration; -use std::time::Instant; - -use databend_common_base::runtime::profile::Profile; -use databend_common_base::runtime::profile::ProfileStatisticsName; -use databend_common_exception::ErrorCode; -use databend_common_exception::Result; -use databend_common_expression::arrow::deserialize_column; -use databend_common_expression::BlockMetaInfoDowncast; -use databend_common_expression::BlockMetaInfoPtr; -use databend_common_expression::DataBlock; -use databend_common_pipeline_core::processors::Event; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; -use databend_common_pipeline_core::processors::ProcessorPtr; -use itertools::Itertools; -use log::info; -use opendal::Operator; -use tokio::sync::Semaphore; - -use crate::pipelines::processors::transforms::aggregator::AggregateMeta; -use crate::pipelines::processors::transforms::aggregator::BucketSpilledPayload; -use crate::pipelines::processors::transforms::aggregator::SerializedPayload; - -type DeserializingMeta = (AggregateMeta, VecDeque>); - -pub struct TransformSpillReader { - input: Arc, - output: Arc, - - operator: Operator, - semaphore: Arc, - deserialized_meta: Option, - reading_meta: Option, - deserializing_meta: Option, -} - -#[async_trait::async_trait] -impl Processor for TransformSpillReader { - fn name(&self) -> String { - String::from("TransformSpillReader") - } - - fn as_any(&mut self) -> &mut dyn Any { - self - } - - fn event(&mut self) -> Result { - if self.output.is_finished() { - self.input.finish(); - return Ok(Event::Finished); - } - - if !self.output.can_push() { - self.input.set_not_need_data(); - return Ok(Event::NeedConsume); - } - - if let Some(deserialized_meta) = self.deserialized_meta.take() { - self.output - .push_data(Ok(DataBlock::empty_with_meta(deserialized_meta))); - return Ok(Event::NeedConsume); - } - - if self.deserializing_meta.is_some() { - self.input.set_not_need_data(); - return Ok(Event::Sync); - } - - if self.reading_meta.is_some() { - self.input.set_not_need_data(); - return Ok(Event::Async); - } - - if self.input.has_data() { - let mut data_block = self.input.pull_data().unwrap()?; - - if let Some(block_meta) = data_block - .get_meta() - .and_then(AggregateMeta::downcast_ref_from) - { - if matches!(block_meta, AggregateMeta::BucketSpilled(_)) { - self.input.set_not_need_data(); - let block_meta = data_block.take_meta().unwrap(); - self.reading_meta = AggregateMeta::downcast_from(block_meta); - return Ok(Event::Async); - } - - if let AggregateMeta::Partitioned { data, .. 
} = block_meta { - if data - .iter() - .any(|meta| matches!(meta, AggregateMeta::BucketSpilled(_))) - { - self.input.set_not_need_data(); - let block_meta = data_block.take_meta().unwrap(); - self.reading_meta = AggregateMeta::downcast_from(block_meta); - return Ok(Event::Async); - } - } - } - - self.output.push_data(Ok(data_block)); - return Ok(Event::NeedConsume); - } - - if self.input.is_finished() { - self.output.finish(); - return Ok(Event::Finished); - } - - self.input.set_need_data(); - Ok(Event::NeedData) - } - - fn process(&mut self) -> Result<()> { - if let Some((meta, mut read_data)) = self.deserializing_meta.take() { - match meta { - AggregateMeta::Spilled(_) => unreachable!(), - AggregateMeta::AggregatePayload(_) => unreachable!(), - AggregateMeta::AggregateSpilling(_) => unreachable!(), - AggregateMeta::Serialized(_) => unreachable!(), - AggregateMeta::BucketSpilled(payload) => { - debug_assert!(read_data.len() == 1); - let data = read_data.pop_front().unwrap(); - - self.deserialized_meta = Some(Box::new(Self::deserialize(payload, data))); - } - AggregateMeta::Partitioned { bucket, data } => { - let mut new_data = Vec::with_capacity(data.len()); - - for meta in data { - if matches!(&meta, AggregateMeta::BucketSpilled(_)) { - if let AggregateMeta::BucketSpilled(payload) = meta { - let data = read_data.pop_front().unwrap(); - new_data.push(Self::deserialize(payload, data)); - } - - continue; - } - - new_data.push(meta); - } - - self.deserialized_meta = - Some(AggregateMeta::create_partitioned(bucket, new_data)); - } - } - } - - Ok(()) - } - - #[async_backtrace::framed] - async fn async_process(&mut self) -> Result<()> { - if let Some(block_meta) = self.reading_meta.take() { - match &block_meta { - AggregateMeta::Spilled(_) => unreachable!(), - AggregateMeta::AggregatePayload(_) => unreachable!(), - AggregateMeta::AggregateSpilling(_) => unreachable!(), - AggregateMeta::Serialized(_) => unreachable!(), - AggregateMeta::BucketSpilled(payload) => { - let _guard = self.semaphore.acquire().await; - let instant = Instant::now(); - let data = self - .operator - .read_with(&payload.location) - .range(payload.data_range.clone()) - .await? - .to_vec(); - - info!( - "Read aggregate spill {} successfully, elapsed: {:?}", - &payload.location, - instant.elapsed() - ); - - self.deserializing_meta = Some((block_meta, VecDeque::from(vec![data]))); - } - AggregateMeta::Partitioned { data, .. } => { - // For log progress. - let mut total_elapsed = Duration::default(); - let log_interval = 100; - let mut processed_count = 0; - - let mut read_data = Vec::with_capacity(data.len()); - for meta in data { - if let AggregateMeta::BucketSpilled(payload) = meta { - let location = payload.location.clone(); - let operator = self.operator.clone(); - let data_range = payload.data_range.clone(); - let semaphore = self.semaphore.clone(); - read_data.push(databend_common_base::runtime::spawn(async move { - let _guard = semaphore.acquire().await; - let instant = Instant::now(); - let data = operator - .read_with(&location) - .range(data_range) - .await? 
- .to_vec(); - - // perf - { - Profile::record_usize_profile( - ProfileStatisticsName::RemoteSpillReadCount, - 1, - ); - Profile::record_usize_profile( - ProfileStatisticsName::RemoteSpillReadBytes, - data.len(), - ); - Profile::record_usize_profile( - ProfileStatisticsName::RemoteSpillReadTime, - instant.elapsed().as_millis() as usize, - ); - } - - total_elapsed += instant.elapsed(); - processed_count += 1; - - // log the progress - if processed_count % log_interval == 0 { - info!( - "Read aggregate {}/{} spilled buckets, elapsed: {:?}", - processed_count, - data.len(), - total_elapsed - ); - } - - Ok(data) - })); - } - } - - match futures::future::try_join_all(read_data).await { - Err(_) => { - return Err(ErrorCode::TokioError("Cannot join tokio job")); - } - Ok(read_data) => { - let read_data: std::result::Result>, opendal::Error> = - read_data.into_iter().try_collect(); - - self.deserializing_meta = Some((block_meta, read_data?)); - } - }; - - if processed_count != 0 { - info!( - "Read {} aggregate spills successfully, total elapsed: {:?}", - processed_count, total_elapsed - ); - } - } - } - } - - Ok(()) - } -} - -impl TransformSpillReader { - pub fn create( - input: Arc, - output: Arc, - operator: Operator, - semaphore: Arc, - ) -> Result { - Ok(ProcessorPtr::create(Box::new(TransformSpillReader { - input, - output, - operator, - semaphore, - deserialized_meta: None, - reading_meta: None, - deserializing_meta: None, - }))) - } - - fn deserialize(payload: BucketSpilledPayload, data: Vec) -> AggregateMeta { - let mut begin = 0; - let mut columns = Vec::with_capacity(payload.columns_layout.len()); - - for column_layout in payload.columns_layout { - columns.push(deserialize_column(&data[begin..begin + column_layout as usize]).unwrap()); - begin += column_layout as usize; - } - - AggregateMeta::Serialized(SerializedPayload { - bucket: payload.bucket, - data_block: DataBlock::new_from_columns(columns), - max_partition_count: payload.max_partition_count, - }) - } -} - -pub type TransformAggregateSpillReader = TransformSpillReader; diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs index 048d7e6ed5a1c..4c044f74c2853 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs @@ -15,16 +15,21 @@ use std::sync::Arc; use bumpalo::Bump; +use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::AggregateHashTable; +use databend_common_expression::BlockMetaInfoDowncast; use databend_common_expression::DataBlock; use databend_common_expression::HashTableConfig; +use databend_common_expression::InputColumns; +use databend_common_expression::Payload; use databend_common_expression::PayloadFlushState; +use databend_common_expression::ProbeState; use databend_common_pipeline_core::processors::InputPort; use databend_common_pipeline_core::processors::OutputPort; use databend_common_pipeline_core::processors::Processor; -use databend_common_pipeline_transforms::processors::BlockMetaTransform; -use databend_common_pipeline_transforms::processors::BlockMetaTransformer; +use databend_common_pipeline_transforms::AccumulatingTransform; +use databend_common_pipeline_transforms::AccumulatingTransformer; use 
crate::pipelines::processors::transforms::aggregator::aggregate_meta::AggregateMeta; use crate::pipelines::processors::transforms::aggregator::AggregatorParams; @@ -32,108 +37,160 @@ use crate::pipelines::processors::transforms::aggregator::AggregatorParams; pub struct TransformFinalAggregate { params: Arc, flush_state: PayloadFlushState, + hash_table: AggregateHashTable, + + working_partition: isize, +} + +impl AccumulatingTransform for TransformFinalAggregate { + const NAME: &'static str = "TransformFinalAggregate"; + + fn transform(&mut self, mut data: DataBlock) -> Result> { + let Some(meta) = data.take_meta() else { + return Err(ErrorCode::Internal( + "Internal, TransformFinalAggregate only recv DataBlock with meta.", + )); + }; + + let Some(aggregate_meta) = AggregateMeta::downcast_from(meta) else { + return Err(ErrorCode::Internal( + "Internal, TransformFinalAggregate only recv DataBlock with meta.", + )); + }; + + let mut blocks = vec![]; + match aggregate_meta { + AggregateMeta::SpilledPayload(_) => unreachable!(), + AggregateMeta::FinalPartition(_) => unreachable!(), + AggregateMeta::InFlightPayload(payload) => { + debug_assert!(payload.partition >= self.working_partition); + debug_assert_eq!(payload.max_partition, payload.global_max_partition); + + if self.working_partition != payload.partition { + self.working_partition = payload.partition; + blocks.push(self.flush_result_blocks()?); + } + + if !data.is_empty() { + let payload = self.deserialize_flight(data)?; + self.hash_table + .combine_payload(&payload, &mut self.flush_state)?; + } + } + AggregateMeta::AggregatePayload(payload) => { + debug_assert!(payload.partition >= self.working_partition); + debug_assert_eq!(payload.max_partition, payload.global_max_partition); + + if self.working_partition != payload.partition { + self.working_partition = payload.partition; + blocks.push(self.flush_result_blocks()?); + } + + if payload.payload.len() != 0 { + self.hash_table + .combine_payload(&payload.payload, &mut self.flush_state)?; + } + } + }; + + Ok(blocks) + } + + fn on_finish(&mut self, output: bool) -> Result> { + if !output { + return Ok(vec![]); + } + + Ok(vec![self.flush_result_blocks()?]) + } } impl TransformFinalAggregate { pub fn try_create( input: Arc, output: Arc, - params: Arc, ) -> Result> { - Ok(BlockMetaTransformer::create( + let config = HashTableConfig::default().with_initial_radix_bits(0); + + let hash_table = AggregateHashTable::new( + params.group_data_types.clone(), + params.aggregate_functions.clone(), + config, + Arc::new(Bump::new()), + ); + + Ok(AccumulatingTransformer::create( input, output, TransformFinalAggregate { params, + hash_table, + working_partition: 0, flush_state: PayloadFlushState::default(), }, )) } - fn transform_agg_hashtable(&mut self, meta: AggregateMeta) -> Result { - let mut agg_hashtable: Option = None; - if let AggregateMeta::Partitioned { bucket, data } = meta { - for bucket_data in data { - match bucket_data { - AggregateMeta::Serialized(payload) => match agg_hashtable.as_mut() { - Some(ht) => { - debug_assert!(bucket == payload.bucket); - - let payload = payload.convert_to_partitioned_payload( - self.params.group_data_types.clone(), - self.params.aggregate_functions.clone(), - self.params.num_states(), - 0, - Arc::new(Bump::new()), - )?; - ht.combine_payloads(&payload, &mut self.flush_state)?; - } - None => { - debug_assert!(bucket == payload.bucket); - agg_hashtable = Some(payload.convert_to_aggregate_table( - self.params.group_data_types.clone(), - 
self.params.aggregate_functions.clone(), - self.params.num_states(), - 0, - Arc::new(Bump::new()), - true, - )?); - } - }, - AggregateMeta::AggregatePayload(payload) => match agg_hashtable.as_mut() { - Some(ht) => { - debug_assert!(bucket == payload.bucket); - ht.combine_payload(&payload.payload, &mut self.flush_state)?; - } - None => { - debug_assert!(bucket == payload.bucket); - let capacity = - AggregateHashTable::get_capacity_for_count(payload.payload.len()); - let mut hashtable = AggregateHashTable::new_with_capacity( - self.params.group_data_types.clone(), - self.params.aggregate_functions.clone(), - HashTableConfig::default().with_initial_radix_bits(0), - capacity, - Arc::new(Bump::new()), - ); - hashtable.combine_payload(&payload.payload, &mut self.flush_state)?; - agg_hashtable = Some(hashtable); - } - }, - _ => unreachable!(), - } - } - } + fn deserialize_flight(&mut self, data: DataBlock) -> Result { + let rows_num = data.num_rows(); + let group_len = self.params.group_data_types.len(); - if let Some(mut ht) = agg_hashtable { - let mut blocks = vec![]; - self.flush_state.clear(); - - loop { - if ht.merge_result(&mut self.flush_state)? { - let mut cols = self.flush_state.take_aggregate_results(); - cols.extend_from_slice(&self.flush_state.take_group_columns()); - blocks.push(DataBlock::new_from_columns(cols)); - } else { - break; - } - } + let mut state = ProbeState::default(); - if blocks.is_empty() { - return Ok(self.params.empty_result_block()); - } - return DataBlock::concat(&blocks); - } + // create single partition hash table for deserialize + let capacity = AggregateHashTable::get_capacity_for_count(rows_num); + let config = HashTableConfig::default().with_initial_radix_bits(0); + let mut hashtable = AggregateHashTable::new_directly( + self.params.group_data_types.clone(), + self.params.aggregate_functions.clone(), + config, + capacity, + Arc::new(Bump::new()), + false, + ); + + let num_states = self.params.num_states(); + let states_index: Vec = (0..num_states).collect(); + let agg_states = InputColumns::new_block_proxy(&states_index, &data); + + let group_index: Vec = (num_states..(num_states + group_len)).collect(); + let group_columns = InputColumns::new_block_proxy(&group_index, &data); - Ok(self.params.empty_result_block()) + let _ = hashtable.add_groups( + &mut state, + group_columns, + &[(&[]).into()], + agg_states, + rows_num, + )?; + + hashtable.payload.mark_min_cardinality(); + assert_eq!(hashtable.payload.payloads.len(), 1); + Ok(hashtable.payload.payloads.pop().unwrap()) } -} -impl BlockMetaTransform for TransformFinalAggregate { - const NAME: &'static str = "TransformFinalAggregate"; + fn flush_result_blocks(&mut self) -> Result { + let mut blocks = vec![]; + self.flush_state.clear(); + + while self.hash_table.merge_result(&mut self.flush_state)? 
{ + let mut cols = self.flush_state.take_aggregate_results(); + cols.extend_from_slice(&self.flush_state.take_group_columns()); + blocks.push(DataBlock::new_from_columns(cols)); + } - fn transform(&mut self, meta: AggregateMeta) -> Result> { - Ok(vec![self.transform_agg_hashtable(meta)?]) + let config = HashTableConfig::default().with_initial_radix_bits(0); + self.hash_table = AggregateHashTable::new( + self.params.group_data_types.clone(), + self.params.aggregate_functions.clone(), + config, + Arc::new(Bump::new()), + ); + + match blocks.is_empty() { + true => Ok(self.params.empty_result_block()), + false => DataBlock::concat(&blocks), + } } } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs index 404d963f8ba6e..e92159a9c15d3 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs @@ -17,15 +17,22 @@ use std::time::Instant; use std::vec; use bumpalo::Bump; +use byteorder::BigEndian; +use byteorder::WriteBytesExt; use databend_common_base::base::convert_byte_size; use databend_common_base::base::convert_number_size; use databend_common_catalog::plan::AggIndexMeta; +use databend_common_catalog::table_context::TableContext; +use databend_common_config::GlobalConfig; use databend_common_exception::Result; +use databend_common_expression::arrow::write_column; use databend_common_expression::AggregateHashTable; use databend_common_expression::BlockMetaInfoDowncast; use databend_common_expression::DataBlock; use databend_common_expression::HashTableConfig; use databend_common_expression::InputColumns; +use databend_common_expression::PartitionedPayload; +use databend_common_expression::Payload; use databend_common_expression::PayloadFlushState; use databend_common_expression::ProbeState; use databend_common_pipeline_core::processors::InputPort; @@ -34,11 +41,18 @@ use databend_common_pipeline_core::processors::Processor; use databend_common_pipeline_transforms::processors::AccumulatingTransform; use databend_common_pipeline_transforms::processors::AccumulatingTransformer; use databend_common_pipeline_transforms::MemorySettings; +use opendal::Operator; use crate::pipelines::memory_settings::MemorySettingsExt; use crate::pipelines::processors::transforms::aggregator::aggregate_meta::AggregateMeta; use crate::pipelines::processors::transforms::aggregator::AggregatorParams; +use crate::pipelines::processors::transforms::aggregator::SpilledPayload; use crate::sessions::QueryContext; +use crate::spillers::SpillWriter; +use crate::spillers::Spiller; +use crate::spillers::SpillerConfig; +use crate::spillers::SpillerType; + #[allow(clippy::enum_variant_names)] enum HashTable { MovedOut, @@ -61,6 +75,10 @@ pub struct TransformPartialAggregate { processed_bytes: usize, processed_rows: usize, settings: MemorySettings, + configure_peer_nodes: Vec, + spilling_state: Option, + spiller: Spiller, + output_blocks: Vec, } impl TransformPartialAggregate { @@ -68,8 +86,10 @@ impl TransformPartialAggregate { ctx: Arc, input: Arc, output: Arc, + operator: Operator, params: Arc, config: HashTableConfig, + location_prefix: String, ) -> Result> { let hash_table = { let arena = Arc::new(Bump::new()); @@ -92,11 +112,21 @@ impl TransformPartialAggregate { } }; + let config = SpillerConfig { + spiller_type: 
SpillerType::Aggregation, + location_prefix, + disk_spill: None, + use_parquet: ctx.get_settings().get_spilling_file_format()?.is_parquet(), + }; + + let spiller = Spiller::create(ctx.clone(), operator, config.clone())?; + Ok(AccumulatingTransformer::create( input, output, TransformPartialAggregate { params, + spiller, hash_table, probe_state: ProbeState::default(), settings: MemorySettings::from_aggregate_settings(&ctx)?, @@ -104,6 +134,9 @@ impl TransformPartialAggregate { first_block_start: None, processed_bytes: 0, processed_rows: 0, + configure_peer_nodes: vec![GlobalConfig::instance().query.node_id.clone()], + spilling_state: None, + output_blocks: vec![], }, )) } @@ -182,48 +215,39 @@ impl TransformPartialAggregate { } } } + + fn reset_hashtable(&mut self) { + let hashtable_spilling_state = self.spilling_state.as_mut().unwrap(); + + hashtable_spilling_state + .ht + .config + .update_current_max_radix_bits(); + + let config = hashtable_spilling_state + .ht + .config + .clone() + .with_initial_radix_bits(hashtable_spilling_state.ht.config.max_radix_bits); + + let aggrs = hashtable_spilling_state.ht.payload.aggrs.clone(); + let group_types = hashtable_spilling_state.ht.payload.group_types.clone(); + self.spilling_state = None; + self.hash_table = HashTable::AggregateHashTable(AggregateHashTable::new( + group_types, + aggrs, + config, + Arc::new(Bump::new()), + )); + } } +#[async_trait::async_trait] impl AccumulatingTransform for TransformPartialAggregate { const NAME: &'static str = "TransformPartialAggregate"; fn transform(&mut self, block: DataBlock) -> Result> { self.execute_one_block(block)?; - - if self.settings.check_spill() { - if let HashTable::AggregateHashTable(v) = std::mem::take(&mut self.hash_table) { - let group_types = v.payload.group_types.clone(); - let aggrs = v.payload.aggrs.clone(); - v.config.update_current_max_radix_bits(); - let config = v - .config - .clone() - .with_initial_radix_bits(v.config.max_radix_bits); - - let mut state = PayloadFlushState::default(); - - // repartition to max for normalization - let partitioned_payload = v - .payload - .repartition(1 << config.max_radix_bits, &mut state); - - let blocks = vec![DataBlock::empty_with_meta( - AggregateMeta::create_agg_spilling(partitioned_payload), - )]; - - let arena = Arc::new(Bump::new()); - self.hash_table = HashTable::AggregateHashTable(AggregateHashTable::new( - group_types, - aggrs, - config, - arena, - )); - return Ok(blocks); - } - - unreachable!() - } - Ok(vec![]) } @@ -235,7 +259,6 @@ impl AccumulatingTransform for TransformPartialAggregate { }, HashTable::AggregateHashTable(hashtable) => { let partition_count = hashtable.payload.partition_count(); - let mut blocks = Vec::with_capacity(partition_count); log::info!( "Aggregated {} to {} rows in {} sec(real: {}). 
({} rows/sec, {}/sec, {})", @@ -256,20 +279,336 @@ impl AccumulatingTransform for TransformPartialAggregate { convert_byte_size(self.processed_bytes as f64), ); - for (bucket, payload) in hashtable.payload.payloads.into_iter().enumerate() { - if payload.len() != 0 { - blocks.push(DataBlock::empty_with_meta( + if hashtable.len() != 0 { + for (partition, payload) in hashtable.payload.payloads.into_iter().enumerate() { + self.output_blocks.push(DataBlock::empty_with_meta( AggregateMeta::create_agg_payload( - bucket as isize, payload, + partition as isize, + partition_count, partition_count, ), )); } } - blocks + std::mem::take(&mut self.output_blocks) } }) } + + fn configure_peer_nodes(&mut self, nodes: &[String]) { + self.configure_peer_nodes = nodes.to_vec(); + } + + fn need_spill(&self) -> bool { + self.settings.check_spill() + } + + fn prepare_spill_payload(&mut self) -> Result { + if self.spilling_state.is_none() { + let HashTable::AggregateHashTable(ht) = std::mem::take(&mut self.hash_table) else { + return Ok(false); + }; + + if ht.len() == 0 { + self.hash_table = HashTable::AggregateHashTable(ht); + return Ok(false); + } + + let max_bucket = self.configure_peer_nodes.len(); + self.spilling_state = Some(HashtableSpillingState::create(ht, max_bucket)); + } + + if let Some(spilling_state) = self.spilling_state.as_mut() { + spilling_state.last_prepare_payload = spilling_state.serialize_partition_payload()?; + return Ok(true); + } + + Ok(false) + } + + async fn flush_spill_payload(&mut self) -> Result { + let spilling_state = self.spilling_state.as_mut().unwrap(); + + let max_bucket = spilling_state.max_bucket; + let max_partition = 1 << spilling_state.ht.config.max_radix_bits; + + if !spilling_state.data_payload.is_empty() { + if spilling_state.writer.is_none() { + let location = self.spiller.create_unique_location(); + spilling_state.writer = Some(self.spiller.create_aggregate_writer(location).await?); + } + + let writer = spilling_state.writer.as_mut().unwrap(); + + let mut flush_data = Vec::with_capacity(4 * 1024 * 1024); + std::mem::swap(&mut flush_data, &mut spilling_state.data_payload); + writer.write(flush_data).await?; + } + + if spilling_state.last_prepare_payload { + if let Some(writer) = spilling_state.writer.as_mut() { + let last_offset = spilling_state.last_flush_partition_offset; + if writer.write_bytes() > last_offset { + let spilled_payload = SpilledPayload { + partition: spilling_state.working_partition as isize, + location: writer.location(), + data_range: last_offset as u64..writer.write_bytes() as u64, + destination_node: self.configure_peer_nodes[spilling_state.working_bucket] + .clone(), + max_partition, + global_max_partition: max_partition, + }; + + self.output_blocks.push(DataBlock::empty_with_meta( + AggregateMeta::create_spilled_payload(spilled_payload), + )); + + spilling_state.last_flush_partition_offset = writer.write_bytes(); + } + } + + spilling_state.payload_idx = 0; + spilling_state.working_partition += 1; + if spilling_state.working_partition < max_partition { + return Ok(true); + } + + if let Some(writer) = spilling_state.writer.as_mut() { + writer.complete().await?; + spilling_state.writer = None; + spilling_state.last_flush_partition_offset = 0; + } + + spilling_state.payload_idx = 0; + spilling_state.working_bucket += 1; + spilling_state.working_partition = 0; + + if spilling_state.working_bucket < max_bucket { + return Ok(true); + } + + spilling_state.finished = true; + self.reset_hashtable(); + + return Ok(false); + } + + Ok(true) + } +} + +pub 
struct HashtableSpillingState { + ht: AggregateHashTable, + payload_idx: usize, + working_partition: usize, + partition_flush_state: PayloadFlushState, + + max_bucket: usize, + working_bucket: usize, + bucket_flush_state: PayloadFlushState, + + serialize_flush_state: PayloadFlushState, + + data_payload: Vec, + + finished: bool, + last_prepare_payload: bool, + writer: Option, + + last_flush_partition_offset: usize, +} + +impl HashtableSpillingState { + pub fn create(ht: AggregateHashTable, scatter_max_bucket: usize) -> Self { + HashtableSpillingState { + ht, + payload_idx: 0, + working_partition: 0, + partition_flush_state: PayloadFlushState::default(), + max_bucket: scatter_max_bucket, + working_bucket: 0, + bucket_flush_state: PayloadFlushState::default(), + serialize_flush_state: PayloadFlushState::default(), + data_payload: Vec::with_capacity(6 * 1024 * 1024), + writer: None, + finished: false, + last_prepare_payload: false, + last_flush_partition_offset: 0, + } + } + pub fn serialize_payload(&mut self, payload: Option) -> Result { + let payload = match payload.as_ref() { + Some(payload) => payload, + None => &self.ht.payload.payloads[self.working_partition], + }; + + if payload.len() == 0 { + return Ok(true); + } + + while let Some(data_block) = payload.aggregate_flush(&mut self.serialize_flush_state)? { + if data_block.num_rows() == 0 { + // next batch rows + continue; + } + + let columns = data_block.columns().to_vec(); + for column in columns.into_iter() { + let column = column.into_column(data_block.num_rows()); + + let offset = self.data_payload.len(); + + self.data_payload.write_u64::(0)?; + write_column(&column, &mut self.data_payload)?; + + // rewrite column length + let len = self.data_payload.len(); + let mut buffer = &mut self.data_payload[offset..]; + buffer.write_u64::((len - offset - size_of::()) as u64)?; + } + + if self.data_payload.len() >= 4 * 1024 * 1024 { + // flush data if >= 4MB + return Ok(false); + } + } + + self.serialize_flush_state.clear(); + Ok(true) + } + + pub fn serialize_scatter_payload(&mut self, raw_payload: Option) -> Result { + // If no need scatter + if self.max_bucket <= 1 { + return self.serialize_payload(raw_payload); + } + + // using if-else to avoid mutable borrow occurs here + if let Some(payload) = raw_payload { + while payload.scatter(&mut self.bucket_flush_state, self.max_bucket) { + let working_bucket = self.working_bucket; + let flush_state = &mut self.bucket_flush_state; + + let rows = flush_state.probe_state.partition_count[working_bucket]; + + if rows == 0 { + // next batch rows + continue; + } + + let sel = &flush_state.probe_state.partition_entries[working_bucket]; + + let mut scattered_payload = Payload::new( + payload.arena.clone(), + payload.group_types.clone(), + payload.aggrs.clone(), + payload.states_layout.clone(), + ); + + scattered_payload.state_move_out = true; + scattered_payload.copy_rows(sel, rows, &flush_state.addresses); + + if !self.serialize_payload(Some(scattered_payload))? 
{ + return Ok(false); + } + } + } else { + while self.ht.payload.payloads[self.working_partition] + .scatter(&mut self.bucket_flush_state, self.max_bucket) + { + let working_bucket = self.working_bucket; + let flush_state = &mut self.bucket_flush_state; + let rows = flush_state.probe_state.partition_count[working_bucket]; + + if rows == 0 { + // next batch rows + continue; + } + + let sel = &flush_state.probe_state.partition_entries[working_bucket]; + + let working_payload = &self.ht.payload.payloads[self.working_partition]; + let mut scattered_payload = Payload::new( + working_payload.arena.clone(), + working_payload.group_types.clone(), + working_payload.aggrs.clone(), + working_payload.states_layout.clone(), + ); + + scattered_payload.state_move_out = true; + scattered_payload.copy_rows(sel, rows, &flush_state.addresses); + + if !self.serialize_payload(Some(scattered_payload))? { + return Ok(false); + } + } + } + + self.bucket_flush_state.clear(); + Ok(true) + } + + pub fn serialize_partition_payload(&mut self) -> Result { + let max_partitions = 1 << self.ht.config.max_radix_bits; + + // If no need repartition + if self.ht.payload.partition_count() == max_partitions { + return self.serialize_scatter_payload(None); + } + + let mut partition_payload = PartitionedPayload::new( + self.ht.payload.group_types.clone(), + self.ht.payload.aggrs.clone(), + max_partitions as u64, + self.ht.payload.arenas.clone(), + ); + + for payload in &mut partition_payload.payloads { + payload.state_move_out = true; + } + + // repartition and get current partition payload + for idx in self.payload_idx..self.ht.payload.payloads.len() { + while partition_payload.gather_flush( + &self.ht.payload.payloads[idx], + &mut self.partition_flush_state, + ) { + let working_partition = self.working_partition; + let flush_state = &mut self.partition_flush_state; + + let rows = flush_state.probe_state.partition_count[working_partition]; + + if rows == 0 { + // next batch rows + continue; + } + + let address = &flush_state.addresses; + let selector = &flush_state.probe_state.partition_entries[working_partition]; + + let working_payload = &self.ht.payload.payloads[idx]; + let mut working_partition_payload = Payload::new( + working_payload.arena.clone(), + working_payload.group_types.clone(), + working_payload.aggrs.clone(), + working_payload.states_layout.clone(), + ); + + working_partition_payload.state_move_out = true; + working_partition_payload.copy_rows(selector, rows, address); + + if !self.serialize_scatter_payload(Some(working_partition_payload))? { + return Ok(false); + } + } + + self.payload_idx += 1; + self.partition_flush_state.clear(); + } + + self.partition_flush_state.clear(); + Ok(true) + } } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_align.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_align.rs new file mode 100644 index 0000000000000..eb95601641217 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_align.rs @@ -0,0 +1,405 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::collections::BTreeMap; +use std::collections::VecDeque; +use std::sync::Arc; + +use bumpalo::Bump; +use databend_common_exception::ErrorCode; +use databend_common_exception::Result; +use databend_common_expression::AggregateHashTable; +use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::DataBlock; +use databend_common_expression::HashTableConfig; +use databend_common_expression::InputColumns; +use databend_common_expression::PartitionedPayload; +use databend_common_expression::Payload; +use databend_common_expression::PayloadFlushState; +use databend_common_expression::ProbeState; +use databend_common_pipeline_core::processors::Event; +use databend_common_pipeline_core::processors::InputPort; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::processors::Processor; +use databend_common_pipeline_transforms::MemorySettings; + +use crate::pipelines::memory_settings::MemorySettingsExt; +use crate::pipelines::processors::transforms::aggregator::transform_partition_bucket::SINGLE_LEVEL_BUCKET_NUM; +use crate::pipelines::processors::transforms::aggregator::AggregateMeta; +use crate::pipelines::processors::transforms::aggregator::AggregatePayload; +use crate::pipelines::processors::transforms::aggregator::AggregatorParams; +use crate::sessions::QueryContext; + +pub struct TransformPartitionAlign { + input: Arc, + output: Arc, + + #[allow(dead_code)] + settings: MemorySettings, + params: Arc, + + max_partition: usize, + working_partition: isize, + partitions: Partitions, + + output_data: VecDeque, + input_data: Option<(AggregateMeta, DataBlock)>, +} + +impl TransformPartitionAlign { + pub fn create( + ctx: Arc, + params: Arc, + input: Arc, + output: Arc, + ) -> Result { + let settings = MemorySettings::from_aggregate_settings(&ctx)?; + Ok(TransformPartitionAlign { + input, + output, + params, + settings, + max_partition: 0, + working_partition: 0, + partitions: Partitions::create(), + input_data: None, + output_data: Default::default(), + }) + } + + fn ready_partition(&mut self) -> Option { + let storage_min_partition = self.partitions.min_partition()?; + + if storage_min_partition >= self.working_partition { + return None; + } + + Some(storage_min_partition) + } + + fn fetch_ready_partition(&mut self) -> Result<()> { + if let Some(ready_partition_id) = self.ready_partition() { + let ready_partition = self.partitions.take_partition(ready_partition_id); + + for (meta, data_block) in ready_partition { + self.output_data + .push_back(data_block.add_meta(Some(Box::new(meta)))?); + } + + self.output_data + .push_back(DataBlock::empty_with_meta(AggregateMeta::create_final( + vec![], + ))); + } + + Ok(()) + } + + fn unpark_block(&self, mut data_block: DataBlock) -> Result<(AggregateMeta, DataBlock)> { + let Some(meta) = data_block.take_meta() else { + return Err(ErrorCode::Internal( + "Internal, TransformPartitionBucket only recv DataBlock with meta.", + )); + }; + + let Some(meta) = AggregateMeta::downcast_from(meta) else { + return Err(ErrorCode::Internal( + "Internal, 
TransformPartitionBucket only recv AggregateMeta".to_string(), + )); + }; + + Ok((meta, data_block)) + } + + fn repartition(&mut self, meta: AggregateMeta, data_block: DataBlock) -> Result<()> { + match meta { + AggregateMeta::FinalPartition(_) => unreachable!(), + AggregateMeta::SpilledPayload(_payload) => unreachable!(), + AggregateMeta::InFlightPayload(payload) => { + if data_block.is_empty() { + return Ok(()); + } + + let payload = AggregatePayload { + partition: payload.partition, + max_partition: payload.max_partition, + payload: self.deserialize_flight(data_block)?, + global_max_partition: payload.global_max_partition, + }; + + let repartition = payload.global_max_partition; + let partitioned = self.partition_payload(payload, repartition); + + for payload in partitioned { + self.partitions + .add_data(AggregateMeta::AggregatePayload(payload), DataBlock::empty()); + } + } + AggregateMeta::AggregatePayload(payload) => { + if payload.payload.len() == 0 { + return Ok(()); + } + + let repartition = payload.global_max_partition; + let partitioned = self.partition_payload(payload, repartition); + for payload in partitioned { + self.partitions + .add_data(AggregateMeta::AggregatePayload(payload), DataBlock::empty()); + } + } + } + + Ok(()) + } + + fn deserialize_flight(&mut self, data: DataBlock) -> Result { + let rows_num = data.num_rows(); + let group_len = self.params.group_data_types.len(); + + let mut state = ProbeState::default(); + + // create single partition hash table for deserialize + let capacity = AggregateHashTable::get_capacity_for_count(rows_num); + let config = HashTableConfig::default().with_initial_radix_bits(0); + let mut hashtable = AggregateHashTable::new_directly( + self.params.group_data_types.clone(), + self.params.aggregate_functions.clone(), + config, + capacity, + Arc::new(Bump::new()), + false, + ); + + let num_states = self.params.num_states(); + let states_index: Vec = (0..num_states).collect(); + let agg_states = InputColumns::new_block_proxy(&states_index, &data); + + let group_index: Vec = (num_states..(num_states + group_len)).collect(); + let group_columns = InputColumns::new_block_proxy(&group_index, &data); + + let _ = hashtable.add_groups( + &mut state, + group_columns, + &[(&[]).into()], + agg_states, + rows_num, + )?; + + hashtable.payload.mark_min_cardinality(); + assert_eq!(hashtable.payload.payloads.len(), 1); + Ok(hashtable.payload.payloads.pop().unwrap()) + } + + fn partition_payload(&mut self, from: AggregatePayload, to: usize) -> Vec { + let mut partitioned = Vec::with_capacity(to); + let mut partitioned_payload = PartitionedPayload::new( + self.params.group_data_types.clone(), + self.params.aggregate_functions.clone(), + to as u64, + from.payload.arena.clone(), + ); + + let mut flush_state = PayloadFlushState::default(); + partitioned_payload.combine_single(from.payload, &mut flush_state, None); + + for (partition, payload) in partitioned_payload.payloads.into_iter().enumerate() { + partitioned.push(AggregatePayload { + payload, + partition: partition as isize, + max_partition: to, + global_max_partition: from.global_max_partition, + }); + } + + partitioned + } +} + +impl Processor for TransformPartitionAlign { + fn name(&self) -> String { + String::from("TransformPartitionAlign") + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if self.output.is_finished() { + self.input.finish(); + return Ok(Event::Finished); + } + + if !self.output.can_push() { + self.input.set_not_need_data(); + return 
Ok(Event::NeedConsume); + } + + if let Some(data_block) = self.output_data.pop_front() { + self.output.push_data(Ok(data_block)); + return Ok(Event::NeedConsume); + } + + if self.input.has_data() { + let data_block = self.input.pull_data().unwrap()?; + + let (meta, data_block) = self.unpark_block(data_block)?; + self.max_partition = meta.get_global_max_partition(); + + // need repartition + if meta.get_max_partition() != meta.get_global_max_partition() { + self.input_data = Some((meta, data_block)); + return Ok(Event::Sync); + } + + let partition = meta.get_sorting_partition(); + self.partitions.add_data(meta, data_block); + + if partition > SINGLE_LEVEL_BUCKET_NUM && partition != self.working_partition { + self.working_partition = partition; + } + } + + if self.input.is_finished() && self.working_partition as usize != self.max_partition { + self.working_partition = self.max_partition as isize; + } + + if self.output_data.is_empty() { + self.fetch_ready_partition()?; + } + + if let Some(data_block) = self.output_data.pop_front() { + self.output.push_data(Ok(data_block)); + return Ok(Event::NeedConsume); + } + + if self.input.is_finished() { + self.output.finish(); + return Ok(Event::Finished); + } + + self.input.set_need_data(); + Ok(Event::NeedData) + } + + fn process(&mut self) -> Result<()> { + if let Some((meta, data_block)) = self.input_data.take() { + self.repartition(meta, data_block)?; + } + + Ok(()) + } +} + +// #[async_trait::async_trait] +// impl AccumulatingTransform for TransformPartitionAlign { +// const NAME: &'static str = "TransformPartitionAlign"; +// +// fn transform(&mut self, data_block: DataBlock) -> Result> { +// let (meta, data_block) = self.unpark_block(data_block)?; +// self.max_partition = meta.get_global_max_partition(); +// +// // need repartition +// if meta.get_max_partition() != meta.get_global_max_partition() { +// self.repartition(meta, data_block)?; +// return Ok(vec![]); +// } +// +// let partition = meta.get_sorting_partition(); +// self.partitions.add_data(meta, data_block); +// +// if partition > SINGLE_LEVEL_BUCKET_NUM && partition != self.working_partition { +// self.fetch_ready_partition()?; +// self.working_partition = partition; +// // return Ok(ready_partition); +// } +// +// Ok(vec![]) +// } +// +// fn on_finish(&mut self, _output: bool) -> Result> { +// let remain_size = self +// .partitions +// .data +// .values() +// .map(|x| x.len()) +// .sum::(); +// +// let mut remain_partitions = Vec::with_capacity(remain_size + self.partitions.data.len()); +// self.working_partition = self.max_partition as isize; +// +// loop { +// let ready_partition = self.fetch_ready_partition()?; +// +// if !ready_partition.is_empty() { +// remain_partitions.extend(ready_partition); +// continue; +// } +// +// return Ok(remain_partitions); +// } +// } +// +// fn need_spill(&self) -> bool { +// self.settings.check_spill() +// } +// +// fn prepare_spill_payload(&mut self) -> Result { +// // self.partitions.data.f +// Ok(false) +// } +// +// async fn flush_spill_payload(&mut self) -> Result { +// Ok(false) +// } +// } + +#[derive(Debug)] +struct Partitions { + data: BTreeMap>, +} + +impl Partitions { + pub fn create() -> Partitions { + Partitions { + data: BTreeMap::new(), + } + } + + pub fn add_data(&mut self, meta: AggregateMeta, block: DataBlock) { + if matches!(&meta, AggregateMeta::AggregatePayload(v) if v.payload.len() == 0) + || matches!(&meta, AggregateMeta::InFlightPayload(_) if block.is_empty()) + { + return; + } + + match 
self.data.entry(meta.get_partition()) { + std::collections::btree_map::Entry::Vacant(v) => { + v.insert(vec![(meta, block)]); + } + std::collections::btree_map::Entry::Occupied(mut v) => { + v.get_mut().push((meta, block)); + } + }; + } + + pub fn min_partition(&self) -> Option { + self.data.keys().min().cloned() + } + + pub fn take_partition(&mut self, partition: isize) -> Vec<(AggregateMeta, DataBlock)> { + self.data.remove(&partition).unwrap_or_default() + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs new file mode 100644 index 0000000000000..142c20e452acd --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs @@ -0,0 +1,87 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use databend_common_catalog::table_context::TableContext; +use databend_common_exception::Result; +use databend_common_pipeline_core::processors::ProcessorPtr; +use databend_common_pipeline_core::Pipe; +use databend_common_pipeline_core::PipeItem; +use databend_common_pipeline_core::Pipeline; +use databend_common_storage::DataOperator; + +use super::TransformFinalAggregate; +use super::TransformPartitionRestore; +use crate::pipelines::processors::transforms::aggregator::transform_partition_align::TransformPartitionAlign; +use crate::pipelines::processors::transforms::aggregator::transform_partition_dispatch::TransformPartitionDispatch; +use crate::pipelines::processors::transforms::aggregator::transform_partition_exchange::ExchangePartition; +use crate::pipelines::processors::transforms::aggregator::transform_partition_resorting::ResortingPartition; +use crate::pipelines::processors::transforms::aggregator::AggregatorParams; +use crate::sessions::QueryContext; + +pub static SINGLE_LEVEL_BUCKET_NUM: isize = -1; + +pub fn build_final_aggregate( + ctx: Arc, + pipeline: &mut Pipeline, + params: Arc, +) -> Result<()> { + let settings = ctx.get_settings(); + let pipe_size = settings.get_max_threads()? as usize; + + // 1. resorting partition + pipeline.exchange(1, Arc::new(ResortingPartition::create()))?; + + // 2. align partitions + pipeline.add_transform(|input, output| { + Ok(ProcessorPtr::create(Box::new( + TransformPartitionAlign::create(ctx.clone(), params.clone(), input, output)?, + ))) + })?; + + // 3. dispatch partition + let processor = TransformPartitionDispatch::create(pipe_size); + let inputs_port = processor.get_inputs(); + let outputs_port = processor.get_outputs(); + pipeline.add_pipe(Pipe::create(inputs_port.len(), outputs_port.len(), vec![ + PipeItem::create( + ProcessorPtr::create(Box::new(processor)), + inputs_port, + outputs_port, + ), + ])); + + // 4. 
restore partition + let operator = DataOperator::instance().spill_operator(); + pipeline.add_transform(|input, output| { + TransformPartitionRestore::create(input, output, operator.clone(), params.clone()) + })?; + + // 5. exchange local + let pipe_size = pipeline.output_len(); + pipeline.exchange( + pipe_size, + ExchangePartition::create(pipe_size, params.clone()), + )?; + + // 6. final aggregate + pipeline.add_transform(|input, output| { + Ok(ProcessorPtr::create(TransformFinalAggregate::try_create( + input.clone(), + output.clone(), + params.clone(), + )?)) + }) +} diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_dispatch.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_dispatch.rs new file mode 100644 index 0000000000000..bfe7e87258e75 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_dispatch.rs @@ -0,0 +1,263 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::collections::VecDeque; +use std::sync::Arc; + +use databend_common_exception::ErrorCode; +use databend_common_exception::Result; +use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::DataBlock; +use databend_common_pipeline_core::processors::Event; +use databend_common_pipeline_core::processors::EventCause; +use databend_common_pipeline_core::processors::InputPort; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::processors::Processor; + +use crate::pipelines::processors::transforms::aggregator::AggregateMeta; + +#[derive(PartialEq)] +enum PortStatus { + Idle, + NeedData, + Finished, +} + +struct PortWithStatus { + pub status: PortStatus, + pub port: Arc, +} + +pub struct TransformPartitionDispatch { + initialized: bool, + + finished_outputs: usize, + waiting_outputs: VecDeque, + waiting_outputs_2: VecDeque, + + sync_final_partition: bool, + sent_final_partition: Vec, + synchronized_final_partition: Vec, + + current_data: Option, + + input: Arc, + outputs: Vec>, +} + +impl TransformPartitionDispatch { + pub fn create(outputs: usize) -> TransformPartitionDispatch { + let mut outputs_port = Vec::with_capacity(outputs); + + for _index in 0..outputs { + outputs_port.push(PortWithStatus { + status: PortStatus::Idle, + port: OutputPort::create(), + }); + } + + TransformPartitionDispatch { + initialized: false, + finished_outputs: 0, + outputs: outputs_port, + input: InputPort::create(), + waiting_outputs: VecDeque::with_capacity(outputs), + waiting_outputs_2: VecDeque::with_capacity(outputs), + current_data: None, + sync_final_partition: false, + sent_final_partition: vec![false; outputs], + synchronized_final_partition: vec![false; outputs], + } + } + + pub fn get_inputs(&self) -> Vec> { + vec![self.input.clone()] + } + + pub fn get_outputs(&self) -> Vec> { + self.outputs.iter().map(|x| x.port.clone()).collect() + } + + fn unpark_block(mut 
data_block: DataBlock) -> Result<(AggregateMeta, DataBlock)> { + let Some(meta) = data_block.take_meta() else { + return Err(ErrorCode::Internal( + "Internal, TransformPartitionBucket only recv DataBlock with meta.", + )); + }; + + let Some(meta) = AggregateMeta::downcast_from(meta) else { + return Err(ErrorCode::Internal( + "Internal, TransformPartitionBucket only recv AggregateMeta".to_string(), + )); + }; + + Ok((meta, data_block)) + } +} + +impl Processor for TransformPartitionDispatch { + fn name(&self) -> String { + String::from("TransformPartitionDispatch") + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + #[allow(clippy::collapsible_if)] + fn event_with_cause(&mut self, cause: EventCause) -> Result { + if let EventCause::Output(output_index) = &cause { + let output = &mut self.outputs[*output_index]; + + if output.port.is_finished() { + if output.status != PortStatus::Finished { + self.finished_outputs += 1; + output.status = PortStatus::Finished; + } + } else if output.port.can_push() { + if self.sync_final_partition { + if self.sent_final_partition[*output_index] { + output.status = PortStatus::Idle; + self.waiting_outputs_2.push_back(*output_index); + self.synchronized_final_partition[*output_index] = true; + } else { + self.sent_final_partition[*output_index] = true; + output.port.push_data(Ok(DataBlock::empty_with_meta( + AggregateMeta::create_final(vec![]), + ))); + } + } else if output.status != PortStatus::NeedData { + output.status = PortStatus::NeedData; + self.waiting_outputs.push_back(*output_index); + } + } + } + + if !self.initialized && !self.waiting_outputs.is_empty() { + self.initialized = true; + self.input.set_need_data(); + } + + if self.finished_outputs == self.outputs.len() { + self.input.finish(); + return Ok(Event::Finished); + } + + if !self.sync_final_partition && self.input.has_data() && self.current_data.is_none() { + let data_block = self.input.pull_data().unwrap()?; + let (meta, data_block) = Self::unpark_block(data_block)?; + + match meta { + AggregateMeta::FinalPartition(_) => { + self.sync_final_partition = true; + self.input.set_not_need_data(); + } + meta => { + self.input.set_need_data(); + self.current_data = Some(data_block.add_meta(Some(Box::new(meta)))?); + } + }; + } + + while self.sync_final_partition { + while let Some(output_index) = self.waiting_outputs.pop_front() { + if self.outputs[output_index].port.is_finished() { + self.synchronized_final_partition[output_index] = true; + + if self.outputs[output_index].status != PortStatus::Finished { + self.finished_outputs += 1; + self.outputs[output_index].status = PortStatus::Finished; + } + } + + self.outputs[output_index] + .port + .push_data(Ok(DataBlock::empty_with_meta(AggregateMeta::create_final( + vec![], + )))); + self.sent_final_partition[output_index] = true; + self.outputs[output_index].status = PortStatus::Idle; + } + + for (idx, synchronized) in self.synchronized_final_partition.iter().enumerate() { + if !synchronized && !self.outputs[idx].port.is_finished() { + return Ok(Event::NeedConsume); + } + } + + self.sync_final_partition = false; + self.sent_final_partition = vec![false; self.sent_final_partition.len()]; + self.synchronized_final_partition = vec![false; self.sent_final_partition.len()]; + std::mem::swap(&mut self.waiting_outputs, &mut self.waiting_outputs_2); + + if self.input.has_data() { + let data_block = self.input.pull_data().unwrap()?; + let (meta, data_block) = Self::unpark_block(data_block)?; + + match meta { + AggregateMeta::FinalPartition(_) => { + 
self.sync_final_partition = true; + self.input.set_not_need_data(); + continue; + } + meta => { + self.current_data = Some(data_block.add_meta(Some(Box::new(meta)))?); + } + }; + } + + self.input.set_need_data(); + break; + } + + while !self.waiting_outputs.is_empty() && self.current_data.is_some() { + let output_index = self.waiting_outputs.pop_front().unwrap(); + + // Port is finished when waiting. + if self.outputs[output_index].port.is_finished() { + if self.outputs[output_index].status != PortStatus::Finished { + self.finished_outputs += 1; + self.outputs[output_index].status = PortStatus::Finished; + } + + continue; + } + + if let Some(data_block) = self.current_data.take() { + self.outputs[output_index].port.push_data(Ok(data_block)); + self.outputs[output_index].status = PortStatus::Idle; + self.input.set_need_data(); + } + } + + if self.finished_outputs == self.outputs.len() { + self.input.finish(); + return Ok(Event::Finished); + } + + if self.input.is_finished() && self.current_data.is_none() { + for output in &self.outputs { + output.port.finish(); + } + + return Ok(Event::Finished); + } + + match self.waiting_outputs.is_empty() { + true => Ok(Event::NeedConsume), + false => Ok(Event::NeedData), + } + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_exchange.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_exchange.rs new file mode 100644 index 0000000000000..67a500714be7e --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_exchange.rs @@ -0,0 +1,241 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
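+
+// Local exchange for aggregate partitions: `partition` scatters each incoming
+// payload into `n` buckets by hashing group keys with a fixed seed, and
+// `merge_output` combines buckets that carry the same partition id.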
+ +use std::collections::btree_map::Entry; +use std::collections::BTreeMap; +use std::sync::Arc; + +use bumpalo::Bump; +use databend_common_exception::ErrorCode; +use databend_common_exception::Result; +use databend_common_expression::AggregateHashTable; +use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::DataBlock; +use databend_common_expression::HashTableConfig; +use databend_common_expression::InputColumns; +use databend_common_expression::Payload; +use databend_common_expression::PayloadFlushState; +use databend_common_expression::ProbeState; +use databend_common_pipeline_core::processors::Exchange; + +use crate::pipelines::processors::transforms::aggregator::AggregateMeta; +use crate::pipelines::processors::transforms::aggregator::AggregatePayload; +use crate::pipelines::processors::transforms::aggregator::AggregatorParams; +use crate::pipelines::processors::transforms::aggregator::InFlightPayload; + +const HASH_SEED: u64 = 9263883436177860930; + +pub struct ExchangePartition { + merge_window_size: usize, + params: Arc, +} + +impl ExchangePartition { + pub fn create(merge_window_size: usize, params: Arc) -> Arc { + Arc::new(ExchangePartition { + merge_window_size, + params, + }) + } +} + +impl ExchangePartition { + fn partition_aggregate(mut payload: AggregatePayload, n: usize) -> Result> { + if payload.payload.len() == 0 { + return Ok(vec![]); + } + + let mut repartition_payloads = Vec::with_capacity(n); + let group_types = payload.payload.group_types.clone(); + let aggrs = payload.payload.aggrs.clone(); + let mut state = PayloadFlushState::default(); + + for _ in 0..repartition_payloads.capacity() { + repartition_payloads.push(Payload::new( + payload.payload.arena.clone(), + group_types.clone(), + aggrs.clone(), + payload.payload.states_layout.clone(), + )); + } + + // scatter each page of the payload. + while payload + .payload + .scatter_with_seed::(&mut state, repartition_payloads.len()) + { + // copy to the corresponding bucket. 
+ for (idx, bucket) in repartition_payloads.iter_mut().enumerate() { + let count = state.probe_state.partition_count[idx]; + + if count > 0 { + let sel = &state.probe_state.partition_entries[idx]; + bucket.copy_rows(sel, count, &state.addresses); + } + } + } + + payload.payload.state_move_out = true; + + let mut partitions = Vec::with_capacity(repartition_payloads.len()); + + for repartition_payload in repartition_payloads { + partitions.push(DataBlock::empty_with_meta( + AggregateMeta::create_agg_payload( + repartition_payload, + payload.partition, + payload.max_partition, + payload.global_max_partition, + ), + )); + } + + Ok(partitions) + } + + fn partition_flight_payload( + &self, + payload: InFlightPayload, + block: DataBlock, + n: usize, + ) -> Result> { + let rows_num = block.num_rows(); + + if rows_num == 0 { + return Ok(vec![]); + } + + let group_len = self.params.group_data_types.len(); + + let mut state = ProbeState::default(); + + // create single partition hash table for deserialize + let capacity = AggregateHashTable::get_capacity_for_count(rows_num); + let config = HashTableConfig::default().with_initial_radix_bits(0); + let mut hashtable = AggregateHashTable::new_directly( + self.params.group_data_types.clone(), + self.params.aggregate_functions.clone(), + config, + capacity, + Arc::new(Bump::new()), + false, + ); + + let num_states = self.params.num_states(); + let states_index: Vec = (0..num_states).collect(); + let agg_states = InputColumns::new_block_proxy(&states_index, &block); + + let group_index: Vec = (num_states..(num_states + group_len)).collect(); + let group_columns = InputColumns::new_block_proxy(&group_index, &block); + + let _ = hashtable.add_groups( + &mut state, + group_columns, + &[(&[]).into()], + agg_states, + rows_num, + )?; + + hashtable.payload.mark_min_cardinality(); + assert_eq!(hashtable.payload.payloads.len(), 1); + + Self::partition_aggregate( + AggregatePayload { + partition: payload.partition, + payload: hashtable.payload.payloads.pop().unwrap(), + max_partition: payload.max_partition, + global_max_partition: payload.global_max_partition, + }, + n, + ) + } +} + +impl Exchange for ExchangePartition { + const NAME: &'static str = "AggregatePartitionExchange"; + const MULTIWAY_SORT: bool = false; + + fn partition(&self, mut data_block: DataBlock, n: usize) -> Result> { + let Some(meta) = data_block.take_meta() else { + return Err(ErrorCode::Internal( + "AggregatePartitionExchange only recv AggregateMeta", + )); + }; + + let Some(meta) = AggregateMeta::downcast_from(meta) else { + return Err(ErrorCode::Internal( + "AggregatePartitionExchange only recv AggregateMeta", + )); + }; + + match meta { + // already restore in upstream + AggregateMeta::SpilledPayload(_) => unreachable!(), + AggregateMeta::FinalPartition(_) => Ok(vec![]), + AggregateMeta::AggregatePayload(payload) => Self::partition_aggregate(payload, n), + AggregateMeta::InFlightPayload(payload) => { + self.partition_flight_payload(payload, data_block, n) + } + } + } + + fn output_window_size(&self) -> usize { + self.merge_window_size + } + + fn merge_output(&self, data_blocks: Vec) -> Result> { + let mut blocks = BTreeMap::::new(); + for mut data_block in data_blocks { + let Some(meta) = data_block.take_meta() else { + return Err(ErrorCode::Internal( + "Internal, ExchangePartition only recv DataBlock with meta.", + )); + }; + + let Some(aggregate_meta) = AggregateMeta::downcast_from(meta) else { + return Err(ErrorCode::Internal( + "Internal, ExchangePartition only recv DataBlock with 
meta.", + )); + }; + + let mut payload = match aggregate_meta { + AggregateMeta::SpilledPayload(_) => unreachable!(), + AggregateMeta::FinalPartition(_) => unreachable!(), + AggregateMeta::InFlightPayload(_) => unreachable!(), + AggregateMeta::AggregatePayload(payload) => payload, + }; + + match blocks.entry(payload.partition) { + Entry::Vacant(v) => { + v.insert(payload); + } + Entry::Occupied(mut v) => { + payload.payload.state_move_out = true; + v.get_mut() + .payload + .arena + .extend(payload.payload.arena.clone()); + v.get_mut().payload.combine(payload.payload); + } + } + } + + Ok(blocks + .into_values() + .map(|payload| { + DataBlock::empty_with_meta(Box::new(AggregateMeta::AggregatePayload(payload))) + }) + .collect()) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_resorting.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_resorting.rs new file mode 100644 index 0000000000000..00697bc99efe4 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_resorting.rs @@ -0,0 +1,107 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::cmp::Ordering; +use std::sync::atomic::AtomicUsize; +use std::sync::atomic::Ordering as AtomicOrdering; + +use databend_common_exception::Result; +use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::DataBlock; +use databend_common_pipeline_core::processors::Exchange; + +use crate::pipelines::processors::transforms::aggregator::AggregateMeta; + +pub struct ResortingPartition { + global_max_partition: AtomicUsize, +} + +impl ResortingPartition { + pub fn create() -> Self { + ResortingPartition { + global_max_partition: AtomicUsize::new(0), + } + } + + fn block_number(meta: &AggregateMeta) -> (isize, usize) { + (meta.get_sorting_partition(), meta.get_max_partition()) + } +} + +impl Exchange for ResortingPartition { + const NAME: &'static str = "PartitionResorting"; + const MULTIWAY_SORT: bool = true; + + fn partition(&self, mut data_block: DataBlock, n: usize) -> Result> { + debug_assert_eq!(n, 1); + + let Some(meta) = data_block.take_meta() else { + return Ok(vec![data_block]); + }; + + let Some(_) = AggregateMeta::downcast_ref_from(&meta) else { + return Ok(vec![data_block]); + }; + + let global_max_partition = self.global_max_partition.load(AtomicOrdering::SeqCst); + let mut meta = AggregateMeta::downcast_from(meta).unwrap(); + meta.set_global_max_partition(global_max_partition); + + Ok(vec![data_block.add_meta(Some(Box::new(meta)))?]) + } + + fn init_way( + &self, + _index: usize, + first_data: &DataBlock, + ) -> databend_common_exception::Result<()> { + let max_partition = match first_data.get_meta() { + None => 0, + Some(meta) => match AggregateMeta::downcast_ref_from(meta) { + None => 0, + Some(v) => v.get_global_max_partition(), + }, + }; + + self.global_max_partition + .fetch_max(max_partition, 
std::sync::atomic::Ordering::SeqCst); + Ok(()) + } + + fn sorting_function(left_block: &DataBlock, right_block: &DataBlock) -> Ordering { + let Some(left_meta) = left_block.get_meta() else { + return Ordering::Equal; + }; + let Some(left_meta) = AggregateMeta::downcast_ref_from(left_meta) else { + return Ordering::Equal; + }; + + let Some(right_meta) = right_block.get_meta() else { + return Ordering::Equal; + }; + let Some(right_meta) = AggregateMeta::downcast_ref_from(right_meta) else { + return Ordering::Equal; + }; + + let (l_partition, l_max_partition) = ResortingPartition::block_number(left_meta); + let (r_partition, r_max_partition) = ResortingPartition::block_number(right_meta); + + // ORDER BY max_partition asc, partition asc, idx asc + match l_max_partition.cmp(&r_max_partition) { + Ordering::Less => Ordering::Less, + Ordering::Greater => Ordering::Greater, + Ordering::Equal => l_partition.cmp(&r_partition), + } + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_restore.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_restore.rs new file mode 100644 index 0000000000000..6cc9cab78642a --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_restore.rs @@ -0,0 +1,196 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
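+
+// Restores spilled aggregate data: when a SpilledPayload meta arrives, the
+// processor asynchronously reads its byte range from the spill location and
+// deserializes the columns back into an in-flight payload block.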
+ +use std::any::Any; +use std::collections::VecDeque; +use std::sync::Arc; + +use byteorder::BigEndian; +use byteorder::ReadBytesExt; +use databend_common_exception::Result; +use databend_common_expression::arrow::deserialize_column; +use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::DataBlock; +use databend_common_pipeline_core::processors::Event; +use databend_common_pipeline_core::processors::InputPort; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::processors::Processor; +use databend_common_pipeline_core::processors::ProcessorPtr; +use opendal::Operator; + +use crate::pipelines::processors::transforms::aggregator::AggregateMeta; +use crate::pipelines::processors::transforms::aggregator::AggregatorParams; +use crate::pipelines::processors::transforms::aggregator::SpilledPayload; + +type DeserializingMeta = (AggregateMeta, VecDeque>); + +pub struct TransformPartitionRestore { + input: Arc, + output: Arc, + + operator: Operator, + params: Arc, + output_data: Option, + reading_meta: Option, + deserializing_meta: Option, +} + +#[async_trait::async_trait] +impl Processor for TransformPartitionRestore { + fn name(&self) -> String { + String::from("TransformPartitionRestore") + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if self.output.is_finished() { + self.input.finish(); + return Ok(Event::Finished); + } + + if !self.output.can_push() { + self.input.set_not_need_data(); + return Ok(Event::NeedConsume); + } + + if let Some(output_data) = self.output_data.take() { + self.output.push_data(Ok(output_data)); + return Ok(Event::NeedConsume); + } + + if self.deserializing_meta.is_some() { + self.input.set_not_need_data(); + return Ok(Event::Sync); + } + + if self.reading_meta.is_some() { + self.input.set_not_need_data(); + return Ok(Event::Async); + } + + if self.input.has_data() { + let mut data_block = self.input.pull_data().unwrap()?; + + if let Some(block_meta) = data_block + .get_meta() + .and_then(AggregateMeta::downcast_ref_from) + { + if matches!(block_meta, AggregateMeta::SpilledPayload(_)) { + self.input.set_not_need_data(); + let block_meta = data_block.take_meta().unwrap(); + self.reading_meta = AggregateMeta::downcast_from(block_meta); + return Ok(Event::Async); + } + } + + self.output.push_data(Ok(data_block)); + return Ok(Event::NeedConsume); + } + + if self.input.is_finished() { + self.output.finish(); + return Ok(Event::Finished); + } + + self.input.set_need_data(); + Ok(Event::NeedData) + } + + fn process(&mut self) -> Result<()> { + if let Some((meta, mut read_data)) = self.deserializing_meta.take() { + match meta { + AggregateMeta::SpilledPayload(payload) => { + debug_assert!(read_data.len() == 1); + let data = read_data.pop_front().unwrap(); + self.output_data = Some(self.deserialize(payload, data)?); + } + _ => unreachable!(), + } + } + + Ok(()) + } + + #[async_backtrace::framed] + async fn async_process(&mut self) -> Result<()> { + if let Some(block_meta) = self.reading_meta.take() { + match &block_meta { + AggregateMeta::SpilledPayload(payload) => { + let data = self + .operator + .read_with(&payload.location) + .range(payload.data_range.clone()) + .await? 
+ .to_vec(); + + self.deserializing_meta = Some((block_meta, VecDeque::from(vec![data]))); + } + _ => unreachable!(), + } + } + + Ok(()) + } +} + +impl TransformPartitionRestore { + pub fn create( + input: Arc, + output: Arc, + operator: Operator, + params: Arc, + ) -> Result { + Ok(ProcessorPtr::create(Box::new(TransformPartitionRestore { + input, + output, + operator, + params, + output_data: None, + reading_meta: None, + deserializing_meta: None, + }))) + } + + fn deserialize(&self, payload: SpilledPayload, data: Vec) -> Result { + let columns = self.params.group_data_types.len() + self.params.aggregate_functions.len(); + + let mut blocks = vec![]; + let mut cursor = data.as_slice(); + + while !cursor.is_empty() { + let mut block_columns = Vec::with_capacity(columns); + + for _idx in 0..columns { + let column_size = cursor.read_u64::().unwrap(); + let (left, right) = cursor.split_at(column_size as usize); + block_columns.push(deserialize_column(left).unwrap()); + cursor = right; + } + + let block1 = DataBlock::new_from_columns(block_columns); + blocks.push(block1); + } + + let block = DataBlock::concat(&blocks).unwrap(); + + block.add_meta(Some(AggregateMeta::create_in_flight_payload( + payload.partition, + payload.max_partition, + payload.global_max_partition, + ))) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_probe_state.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_probe_state.rs index 3cf2d0621770f..3feed32aadce9 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_probe_state.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_probe_state.rs @@ -541,6 +541,7 @@ impl HashJoinProbeState { } else { None }; + result_blocks.push(self.merge_eq_block( probe_block, build_block, diff --git a/src/query/service/src/pipelines/processors/transforms/range_join/merge_join_state.rs b/src/query/service/src/pipelines/processors/transforms/range_join/merge_join_state.rs index 10e0f7cd547c0..1e512b1a2c630 100644 --- a/src/query/service/src/pipelines/processors/transforms/range_join/merge_join_state.rs +++ b/src/query/service/src/pipelines/processors/transforms/range_join/merge_join_state.rs @@ -137,6 +137,7 @@ impl RangeJoinState { j += 1; } } + Ok(result_blocks) } diff --git a/src/query/service/src/pipelines/processors/transforms/range_join/range_join_state.rs b/src/query/service/src/pipelines/processors/transforms/range_join/range_join_state.rs index bdc79cd0df387..83a2aa153414f 100644 --- a/src/query/service/src/pipelines/processors/transforms/range_join/range_join_state.rs +++ b/src/query/service/src/pipelines/processors/transforms/range_join/range_join_state.rs @@ -92,16 +92,22 @@ impl RangeJoinState { } pub(crate) fn sink_right(&self, block: DataBlock) -> Result<()> { - // Sink block to right table - let mut right_table = self.right_table.write(); - right_table.push(block); + if !block.is_empty() || block.get_meta().is_some() { + // Sink block to right table + let mut right_table = self.right_table.write(); + right_table.push(block); + } + Ok(()) } pub(crate) fn sink_left(&self, block: DataBlock) -> Result<()> { - // Sink block to left table - let mut left_table = self.left_table.write(); - left_table.push(block); + if !block.is_empty() || block.get_meta().is_some() { + // Sink block to left table + let mut left_table = self.left_table.write(); + left_table.push(block); + } + Ok(()) } @@ -133,9 +139,11 @@ impl RangeJoinState { pub fn task_id(&self) -> 
Option { let task_id = self.finished_tasks.fetch_add(1, atomic::Ordering::SeqCst); + if task_id >= self.tasks.read().len() as u64 { return None; } + Some(task_id as usize) } @@ -176,6 +184,7 @@ impl RangeJoinState { let left_table = self.left_table.read(); // Right table is bigger than left table let mut right_table = self.right_table.write(); + if !left_table.is_empty() && !right_table.is_empty() && left_table.len() * right_table.len() < max_threads @@ -272,6 +281,7 @@ impl RangeJoinState { right_offset = 0; left_offset += left_block.num_rows(); } + Ok(()) } } diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_injector.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_injector.rs deleted file mode 100644 index 4aa65ba175a83..0000000000000 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_injector.rs +++ /dev/null @@ -1,159 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use databend_common_catalog::table_context::TableContext; -use databend_common_exception::Result; -use databend_common_pipeline_core::Pipeline; -use databend_common_settings::FlightCompression; - -use super::exchange_params::MergeExchangeParams; -use crate::servers::flight::v1::exchange::serde::TransformExchangeDeserializer; -use crate::servers::flight::v1::exchange::serde::TransformExchangeSerializer; -use crate::servers::flight::v1::exchange::serde::TransformScatterExchangeSerializer; -use crate::servers::flight::v1::exchange::DataExchange; -use crate::servers::flight::v1::exchange::ExchangeSorting; -use crate::servers::flight::v1::exchange::ShuffleExchangeParams; -use crate::servers::flight::v1::scatter::BroadcastFlightScatter; -use crate::servers::flight::v1::scatter::FlightScatter; -use crate::servers::flight::v1::scatter::HashFlightScatter; -use crate::sessions::QueryContext; - -pub trait ExchangeInjector: Send + Sync + 'static { - fn flight_scatter( - &self, - ctx: &Arc, - exchange: &DataExchange, - ) -> Result>>; - - fn exchange_sorting(&self) -> Option>; - - fn apply_merge_serializer( - &self, - params: &MergeExchangeParams, - compression: Option, - pipeline: &mut Pipeline, - ) -> Result<()>; - - fn apply_shuffle_serializer( - &self, - params: &ShuffleExchangeParams, - compression: Option, - pipeline: &mut Pipeline, - ) -> Result<()>; - - fn apply_merge_deserializer( - &self, - params: &MergeExchangeParams, - pipeline: &mut Pipeline, - ) -> Result<()>; - - fn apply_shuffle_deserializer( - &self, - params: &ShuffleExchangeParams, - pipeline: &mut Pipeline, - ) -> Result<()>; -} - -pub struct DefaultExchangeInjector; - -impl DefaultExchangeInjector { - pub fn create() -> Arc { - Arc::new(DefaultExchangeInjector {}) - } -} - -impl ExchangeInjector for DefaultExchangeInjector { - fn flight_scatter( - &self, - ctx: &Arc, - exchange: &DataExchange, - ) -> Result>> { - Ok(Arc::new(match exchange { - DataExchange::Merge(_) => unreachable!(), - DataExchange::Broadcast(exchange) => 
Box::new(BroadcastFlightScatter::try_create( - exchange.destination_ids.len(), - )?), - DataExchange::ShuffleDataExchange(exchange) => { - let local_id = &ctx.get_cluster().local_id; - let local_pos = exchange - .destination_ids - .iter() - .position(|x| x == local_id) - .unwrap(); - HashFlightScatter::try_create( - ctx.get_function_context()?, - exchange.shuffle_keys.clone(), - exchange.destination_ids.len(), - local_pos, - )? - } - })) - } - - fn exchange_sorting(&self) -> Option> { - None - } - - fn apply_merge_serializer( - &self, - params: &MergeExchangeParams, - compression: Option, - pipeline: &mut Pipeline, - ) -> Result<()> { - pipeline.add_transform(|input, output| { - TransformExchangeSerializer::create(input, output, params, compression) - }) - } - - fn apply_shuffle_serializer( - &self, - params: &ShuffleExchangeParams, - compression: Option, - pipeline: &mut Pipeline, - ) -> Result<()> { - pipeline.add_transform(|input, output| { - TransformScatterExchangeSerializer::create(input, output, compression, params) - }) - } - - fn apply_merge_deserializer( - &self, - params: &MergeExchangeParams, - pipeline: &mut Pipeline, - ) -> Result<()> { - pipeline.add_transform(|input, output| { - Ok(TransformExchangeDeserializer::create( - input, - output, - ¶ms.schema, - )) - }) - } - - fn apply_shuffle_deserializer( - &self, - params: &ShuffleExchangeParams, - pipeline: &mut Pipeline, - ) -> Result<()> { - pipeline.add_transform(|input, output| { - Ok(TransformExchangeDeserializer::create( - input, - output, - ¶ms.schema, - )) - }) - } -} diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs index d158ccf3c9b89..c3d399b7c1895 100644 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs +++ b/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs @@ -63,12 +63,12 @@ use crate::servers::flight::v1::actions::init_query_fragments; use crate::servers::flight::v1::actions::INIT_QUERY_FRAGMENTS; use crate::servers::flight::v1::actions::START_PREPARED_QUERY; use crate::servers::flight::v1::exchange::DataExchange; -use crate::servers::flight::v1::exchange::DefaultExchangeInjector; -use crate::servers::flight::v1::exchange::ExchangeInjector; use crate::servers::flight::v1::packets::Edge; use crate::servers::flight::v1::packets::QueryEnv; use crate::servers::flight::v1::packets::QueryFragment; use crate::servers::flight::v1::packets::QueryFragments; +use crate::servers::flight::v1::scatter::BroadcastFlightScatter; +use crate::servers::flight::v1::scatter::HashFlightScatter; use crate::servers::flight::FlightClient; use crate::servers::flight::FlightExchange; use crate::servers::flight::FlightReceiver; @@ -470,9 +470,7 @@ impl DataExchangeManager { None => Err(ErrorCode::Internal("Query not exists.")), Some(query_coordinator) => { assert!(query_coordinator.fragment_exchanges.is_empty()); - let injector = DefaultExchangeInjector::create(); - let mut build_res = - query_coordinator.subscribe_fragment(&ctx, fragment_id, injector)?; + let mut build_res = query_coordinator.subscribe_fragment(&ctx, fragment_id)?; let exchanges = std::mem::take(&mut query_coordinator.statistics_exchanges); let statistics_receiver = StatisticsReceiver::spawn_receiver(&ctx, exchanges)?; @@ -533,7 +531,6 @@ impl DataExchangeManager { &self, query_id: &str, fragment_id: usize, - injector: Arc, ) -> Result { let queries_coordinator_guard = self.queries_coordinator.lock(); let queries_coordinator = 
unsafe { &mut *queries_coordinator_guard.deref().get() }; @@ -548,7 +545,7 @@ impl DataExchangeManager { .query_ctx .clone(); - query_coordinator.subscribe_fragment(&query_ctx, fragment_id, injector) + query_coordinator.subscribe_fragment(&query_ctx, fragment_id) } } } @@ -735,7 +732,6 @@ impl QueryCoordinator { &mut self, ctx: &Arc, fragment_id: usize, - injector: Arc, ) -> Result { // Merge pipelines if exist locally pipeline if let Some(mut fragment_coordinator) = self.fragments_coordinator.remove(&fragment_id) { @@ -759,21 +755,14 @@ impl QueryCoordinator { fragment_coordinator .pipeline_build_res .as_ref() - .map(|x| x.exchange_injector.clone()) - .ok_or_else(|| { - ErrorCode::Internal("Pipeline build result is none, It's a bug") - })?, + .map(|x| x.enable_multiway_sort) + .unwrap_or(false), )?; let mut build_res = fragment_coordinator.pipeline_build_res.unwrap(); // Add exchange data transform. - ExchangeTransform::via( - ctx, - &exchange_params, - &mut build_res.main_pipeline, - injector, - )?; + ExchangeTransform::via(ctx, &exchange_params, &mut build_res.main_pipeline)?; return Ok(build_res); } @@ -821,10 +810,8 @@ impl QueryCoordinator { coordinator .pipeline_build_res .as_ref() - .map(|x| x.exchange_injector.clone()) - .ok_or_else(|| { - ErrorCode::Internal("Pipeline build result is none, It's a bug") - })?, + .map(|x| x.enable_multiway_sort) + .unwrap_or(false), )?, ); } @@ -916,13 +903,13 @@ impl FragmentCoordinator { pub fn create_exchange_params( &self, info: &QueryInfo, - exchange_injector: Arc, + enable_multiway_sort: bool, ) -> Result { if let Some(data_exchange) = &self.data_exchange { return match data_exchange { DataExchange::Merge(exchange) => { Ok(ExchangeParams::MergeExchange(MergeExchangeParams { - exchange_injector: exchange_injector.clone(), + enable_multiway_sort, schema: self.physical_plan.output_schema()?, fragment_id: self.fragment_id, query_id: info.query_id.to_string(), @@ -933,26 +920,30 @@ impl FragmentCoordinator { } DataExchange::Broadcast(exchange) => { Ok(ExchangeParams::ShuffleExchange(ShuffleExchangeParams { - exchange_injector: exchange_injector.clone(), + enable_multiway_sort, schema: self.physical_plan.output_schema()?, fragment_id: self.fragment_id, query_id: info.query_id.to_string(), executor_id: info.current_executor.to_string(), destination_ids: exchange.destination_ids.to_owned(), - shuffle_scatter: exchange_injector - .flight_scatter(&info.query_ctx, data_exchange)?, + shuffle_scatter: Arc::new(Box::new(BroadcastFlightScatter::try_create( + exchange.destination_ids.len(), + )?)), })) } DataExchange::ShuffleDataExchange(exchange) => { Ok(ExchangeParams::ShuffleExchange(ShuffleExchangeParams { - exchange_injector: exchange_injector.clone(), + enable_multiway_sort, schema: self.physical_plan.output_schema()?, fragment_id: self.fragment_id, query_id: info.query_id.to_string(), executor_id: info.current_executor.to_string(), destination_ids: exchange.destination_ids.to_owned(), - shuffle_scatter: exchange_injector - .flight_scatter(&info.query_ctx, data_exchange)?, + shuffle_scatter: Arc::new(HashFlightScatter::try_create( + &info.query_ctx, + exchange.shuffle_keys.clone(), + &exchange.destination_ids, + )?), })) } }; diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_params.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_params.rs index 799efe506affe..15607c0454f8d 100644 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_params.rs +++ 
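With the ExchangeInjector removed, create_exchange_params picks the scatter directly: broadcast exchanges get a BroadcastFlightScatter, shuffle exchanges a HashFlightScatter built from the shuffle keys, and the merge sink path further down uses a pass-through MergeFlightScatter. The sketch below is a self-contained illustration of those three strategies, not the databend implementation: `Block` stands in for `DataBlock`, and hashing is simplified to `DefaultHasher` over whole rows.

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

type Block = Vec<u64>; // stand-in for DataBlock

trait FlightScatter {
    fn execute(&self, block: Block) -> Vec<Block>;
}

/// Merge path: a single downstream, the block passes through unchanged.
struct MergeFlightScatter;
/// Broadcast exchange: one clone of the block per destination.
struct BroadcastFlightScatter { destinations: usize }
/// Shuffle exchange: route each row by hash(key) % destinations.
struct HashFlightScatter { destinations: usize }

impl FlightScatter for MergeFlightScatter {
    fn execute(&self, block: Block) -> Vec<Block> {
        vec![block]
    }
}

impl FlightScatter for BroadcastFlightScatter {
    fn execute(&self, block: Block) -> Vec<Block> {
        (0..self.destinations).map(|_| block.clone()).collect()
    }
}

impl FlightScatter for HashFlightScatter {
    fn execute(&self, block: Block) -> Vec<Block> {
        let mut outputs = vec![Block::new(); self.destinations];
        for row in block {
            let mut hasher = DefaultHasher::new();
            row.hash(&mut hasher);
            outputs[(hasher.finish() as usize) % self.destinations].push(row);
        }
        outputs
    }
}

fn main() {
    let scatter = HashFlightScatter { destinations: 3 };
    println!("{:?}", scatter.execute(vec![1, 2, 3, 4, 5]));
}
```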
b/src/query/service/src/servers/flight/v1/exchange/exchange_params.rs @@ -16,7 +16,6 @@ use std::sync::Arc; use databend_common_expression::DataSchemaRef; -use crate::servers::flight::v1::exchange::ExchangeInjector; use crate::servers::flight::v1::scatter::FlightScatter; #[derive(Clone)] @@ -27,7 +26,7 @@ pub struct ShuffleExchangeParams { pub schema: DataSchemaRef, pub destination_ids: Vec, pub shuffle_scatter: Arc>, - pub exchange_injector: Arc, + pub enable_multiway_sort: bool, } #[derive(Clone)] @@ -37,8 +36,8 @@ pub struct MergeExchangeParams { pub destination_id: String, pub schema: DataSchemaRef, pub ignore_exchange: bool, + pub enable_multiway_sort: bool, pub allow_adjust_parallelism: bool, - pub exchange_injector: Arc, } pub enum ExchangeParams { diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_sink.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_sink.rs index 73ed08eb1c021..ef606b21e3ac6 100644 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_sink.rs +++ b/src/query/service/src/servers/flight/v1/exchange/exchange_sink.rs @@ -16,20 +16,15 @@ use std::sync::Arc; use databend_common_exception::ErrorCode; use databend_common_exception::Result; -use databend_common_expression::BlockMetaInfoDowncast; -use databend_common_expression::DataBlock; -use databend_common_pipeline_core::processors::ProcessorPtr; use databend_common_pipeline_core::Pipe; -use databend_common_pipeline_core::PipeItem; use databend_common_pipeline_core::Pipeline; use super::exchange_params::ExchangeParams; use super::exchange_sink_writer::create_writer_item; -use super::exchange_sorting::ExchangeSorting; -use super::exchange_sorting::TransformExchangeSorting; use super::exchange_transform_shuffle::exchange_shuffle; -use super::serde::ExchangeSerializeMeta; use crate::clusters::ClusterHelper; +use crate::pipelines::processors::transforms::aggregator::FlightExchange; +use crate::servers::flight::v1::scatter::MergeFlightScatter; use crate::sessions::QueryContext; use crate::sessions::TableContext; @@ -55,29 +50,29 @@ impl ExchangeSink { ))); } - let exchange_injector = ¶ms.exchange_injector; + let settings = ctx.get_settings(); + let compression = settings.get_query_flight_compression()?; + + let nodes = vec![]; + match params.enable_multiway_sort { + true => pipeline.exchange( + 1, + FlightExchange::::create( + nodes, + compression, + Arc::new(Box::new(MergeFlightScatter)), + ), + )?, + false => pipeline.exchange( + 1, + FlightExchange::::create( + nodes, + compression, + Arc::new(Box::new(MergeFlightScatter)), + ), + )?, + }; - if !params.ignore_exchange { - let settings = ctx.get_settings(); - let compression = settings.get_query_flight_compression()?; - exchange_injector.apply_merge_serializer(params, compression, pipeline)?; - } - - if !params.ignore_exchange && exchange_injector.exchange_sorting().is_some() { - let output_len = pipeline.output_len(); - let sorting = SinkExchangeSorting::create(); - let transform = TransformExchangeSorting::create(output_len, sorting); - - let output = transform.get_output(); - let inputs = transform.get_inputs(); - pipeline.add_pipe(Pipe::create(output_len, 1, vec![PipeItem::create( - ProcessorPtr::create(Box::new(transform)), - inputs, - vec![output], - )])); - } - - pipeline.try_resize(1)?; assert_eq!(senders.len(), 1); pipeline.add_pipe(Pipe::create(1, 0, vec![create_writer_item( senders.remove(0), @@ -111,27 +106,3 @@ impl ExchangeSink { } } } - -struct SinkExchangeSorting; - -impl SinkExchangeSorting { - pub fn create() -> 
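MergeExchangeParams and ShuffleExchangeParams now carry a plain `enable_multiway_sort` flag, and ExchangeSink resolves it once into one of two monomorphized `FlightExchange` processors fed by a `MergeFlightScatter`. `FlightExchange` itself lives in the aggregator transforms and is not reproduced here; the toy code below only illustrates the runtime-bool-to-const-generic dispatch pattern the sink uses.

```rust
struct FlightExchange<const MULTIWAY_SORT: bool>;

impl<const MULTIWAY_SORT: bool> FlightExchange<MULTIWAY_SORT> {
    fn create() -> Self {
        FlightExchange::<MULTIWAY_SORT>
    }

    fn describe(&self) -> &'static str {
        // monomorphized per instantiation, so this branch is a compile-time constant
        if MULTIWAY_SORT {
            "multiway-sort exchange"
        } else {
            "plain exchange"
        }
    }
}

fn build_exchange(enable_multiway_sort: bool) -> &'static str {
    // the setting is read once; each arm builds a distinct concrete processor type
    match enable_multiway_sort {
        true => FlightExchange::<true>::create().describe(),
        false => FlightExchange::<false>::create().describe(),
    }
}

fn main() {
    assert_eq!(build_exchange(false), "plain exchange");
}
```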
Arc { - Arc::new(SinkExchangeSorting {}) - } -} - -impl ExchangeSorting for SinkExchangeSorting { - fn block_number(&self, data_block: &DataBlock) -> Result { - let block_meta = data_block.get_meta(); - let shuffle_meta = block_meta - .and_then(ExchangeSerializeMeta::downcast_ref_from) - .ok_or_else(|| { - ErrorCode::Internal(format!( - "Failed to downcast ExchangeSerializeMeta from BlockMeta: {:?}", - block_meta - )) - })?; - - Ok(shuffle_meta.block_number) - } -} diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_sink_writer.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_sink_writer.rs index abebc2ba6a254..704359391bc0a 100644 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_sink_writer.rs +++ b/src/query/service/src/servers/flight/v1/exchange/exchange_sink_writer.rs @@ -69,9 +69,10 @@ impl AsyncSink for ExchangeWriterSink { #[async_backtrace::framed] async fn consume(&mut self, mut data_block: DataBlock) -> Result { let serialize_meta = match data_block.take_meta() { - None => Err(ErrorCode::Internal( - "ExchangeWriterSink only recv ExchangeSerializeMeta.", - )), + None => Err(ErrorCode::Internal(format!( + "ExchangeWriterSink only recv ExchangeSerializeMeta. {:?}", + data_block + ))), Some(block_meta) => ExchangeSerializeMeta::downcast_from(block_meta).ok_or_else(|| { ErrorCode::Internal("ExchangeWriterSink only recv ExchangeSerializeMeta.") }), diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_sorting.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_sorting.rs deleted file mode 100644 index 8cc931d64641a..0000000000000 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_sorting.rs +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
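The file deleted below, exchange_sorting.rs, provided the N-input/one-output TransformExchangeSorting processor: it buffered at most one block per input and, once every live input had a block buffered, emitted the one with the smallest block number (the field now generalized into the partition metadata). Its core selection step, in isolation, with `String` standing in for `DataBlock`:

```rust
/// Pick and remove the buffered block with the smallest block number.
/// `buffer` holds at most one (block_number, block) pair per input port.
fn pick_min(buffer: &mut [Option<(isize, String)>]) -> Option<String> {
    let min_index = buffer
        .iter()
        .enumerate()
        .filter_map(|(index, slot)| slot.as_ref().map(|(number, _)| (index, *number)))
        .min_by_key(|&(_, number)| number)
        .map(|(index, _)| index)?;

    buffer[min_index].take().map(|(_, block)| block)
}

fn main() {
    let mut buffer = vec![Some((7, "b7".to_string())), None, Some((3, "a3".to_string()))];
    assert_eq!(pick_min(&mut buffer), Some("a3".to_string()));
}
```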
- -use std::any::Any; -use std::sync::Arc; - -use databend_common_exception::Result; -use databend_common_expression::DataBlock; -use databend_common_pipeline_core::processors::Event; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; - -pub trait ExchangeSorting: Send + Sync + 'static { - fn block_number(&self, data_block: &DataBlock) -> Result; -} - -// N input one output -pub struct TransformExchangeSorting { - inputs: Vec>, - output: Arc, - sorting: Arc, - - buffer_len: usize, - buffer: Vec>, -} - -impl TransformExchangeSorting { - pub fn create(inputs: usize, sorting: Arc) -> TransformExchangeSorting { - let output = OutputPort::create(); - let mut buffer = Vec::with_capacity(inputs); - let mut inputs_port = Vec::with_capacity(inputs); - - for _ in 0..inputs { - buffer.push(None); - inputs_port.push(InputPort::create()); - } - - TransformExchangeSorting { - output, - sorting, - buffer, - buffer_len: 0, - inputs: inputs_port, - } - } - - pub fn get_output(&self) -> Arc { - self.output.clone() - } - pub fn get_inputs(&self) -> Vec> { - self.inputs.clone() - } -} - -#[async_trait::async_trait] -impl Processor for TransformExchangeSorting { - fn name(&self) -> String { - String::from("TransformExchangeSorting") - } - - fn as_any(&mut self) -> &mut dyn Any { - self - } - - fn event(&mut self) -> Result { - if self.output.is_finished() { - for input in &self.inputs { - input.finish(); - } - - return Ok(Event::Finished); - } - - let mut unready_inputs = false; - let mut all_inputs_finished = true; - for (index, input) in self.inputs.iter().enumerate() { - if input.is_finished() { - continue; - } - - all_inputs_finished = false; - if self.buffer[index].is_none() { - if input.has_data() { - let data_block = input.pull_data().unwrap()?; - let block_number = self.sorting.block_number(&data_block)?; - self.buffer[index] = Some((block_number, data_block)); - self.buffer_len += 1; - input.set_need_data(); - continue; - } - - unready_inputs = true; - } - - input.set_need_data(); - } - - if !self.output.can_push() { - return Ok(Event::NeedConsume); - } - - if all_inputs_finished && self.buffer_len == 0 { - self.output.finish(); - return Ok(Event::Finished); - } - - if !unready_inputs { - let mut min_index = 0; - let mut min_value = isize::MAX; - for (index, buffer) in self.buffer.iter().enumerate() { - if let Some((block_number, _)) = buffer { - if *block_number < min_value { - min_index = index; - min_value = *block_number; - } - } - } - - if let Some((_, block)) = self.buffer[min_index].take() { - self.buffer_len -= 1; - self.output.push_data(Ok(block)); - return Ok(Event::NeedConsume); - } - } - - Ok(Event::NeedData) - } -} diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_source.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_source.rs index acdfb66de123e..2d4bcdef3d32a 100644 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_source.rs +++ b/src/query/service/src/servers/flight/v1/exchange/exchange_source.rs @@ -28,14 +28,13 @@ use super::exchange_params::ExchangeParams; use super::exchange_params::MergeExchangeParams; use super::exchange_source_reader::ExchangeSourceReader; use crate::clusters::ClusterHelper; -use crate::servers::flight::v1::exchange::ExchangeInjector; +use crate::pipelines::processors::transforms::aggregator::TransformAggregateDeserializer; use crate::sessions::QueryContext; /// Add Exchange Source to 
the pipeline. pub fn via_exchange_source( ctx: Arc, params: &MergeExchangeParams, - injector: Arc, pipeline: &mut Pipeline, ) -> Result<()> { // UpstreamTransform ---> DummyTransform ---> DummyTransform ---> DownstreamTransform @@ -93,5 +92,7 @@ pub fn via_exchange_source( pipeline.try_resize(last_output_len)?; } - injector.apply_merge_deserializer(params, pipeline) + pipeline.add_transform(|input, output| { + TransformAggregateDeserializer::try_create(input, output, ¶ms.schema) + }) } diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_transform.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_transform.rs index 47be1d1f473f8..0afb02e227455 100644 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_transform.rs +++ b/src/query/service/src/servers/flight/v1/exchange/exchange_transform.rs @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::HashMap; use std::sync::Arc; use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; -use databend_common_pipeline_core::processors::create_resize_item; use databend_common_pipeline_core::Pipe; use databend_common_pipeline_core::Pipeline; use databend_common_pipeline_transforms::processors::create_dummy_item; @@ -27,7 +27,7 @@ use super::exchange_source::via_exchange_source; use super::exchange_source_reader::create_reader_item; use super::exchange_transform_shuffle::exchange_shuffle; use crate::clusters::ClusterHelper; -use crate::servers::flight::v1::exchange::ExchangeInjector; +use crate::pipelines::processors::transforms::aggregator::TransformAggregateDeserializer; use crate::sessions::QueryContext; pub struct ExchangeTransform; @@ -37,11 +37,10 @@ impl ExchangeTransform { ctx: &Arc, params: &ExchangeParams, pipeline: &mut Pipeline, - injector: Arc, ) -> Result<()> { match params { ExchangeParams::MergeExchange(params) => { - via_exchange_source(ctx.clone(), params, injector, pipeline) + via_exchange_source(ctx.clone(), params, pipeline) } ExchangeParams::ShuffleExchange(params) => { exchange_shuffle(ctx, params, pipeline)?; @@ -58,8 +57,7 @@ impl ExchangeTransform { let senders = flight_senders.into_iter(); for (destination_id, sender) in params.destination_ids.iter().zip(senders) { items.push(match destination_id == ¶ms.executor_id { - true if max_threads == 1 => create_dummy_item(), - true => create_resize_item(1, max_threads), + true => create_dummy_item(), false => create_writer_item( sender, false, @@ -70,28 +68,52 @@ impl ExchangeTransform { }); } - let mut nodes_source = 0; let receivers = exchange_manager.get_flight_receiver(&exchange_params)?; + let nodes_source = receivers.len(); + + let mut lookup = params + .destination_ids + .iter() + .cloned() + .enumerate() + .map(|(x, y)| (y, x)) + .collect::>(); + + let mut nodes = Vec::with_capacity(nodes_source); + let mut reorder = Vec::with_capacity(nodes_source); + nodes.push(params.executor_id.clone()); + reorder.push(lookup.remove(¶ms.executor_id).unwrap()); + for (destination_id, receiver) in receivers { - if destination_id != params.executor_id { - nodes_source += 1; - items.push(create_reader_item( - receiver, - &destination_id, - ¶ms.executor_id, - params.fragment_id, - )); + if destination_id == params.executor_id { + continue; } - } - let new_outputs = max_threads + nodes_source; - pipeline.add_pipe(Pipe::create(len, new_outputs, items)); + nodes.push(destination_id.clone()); + 
reorder.push(lookup.remove(&destination_id).unwrap()); - if params.exchange_injector.exchange_sorting().is_none() { - pipeline.try_resize(max_threads)?; + items.push(create_reader_item( + receiver, + &destination_id, + ¶ms.executor_id, + params.fragment_id, + )); } - injector.apply_shuffle_deserializer(params, pipeline) + pipeline.add_pipe(Pipe::create(len, nodes_source, items)); + + match params.enable_multiway_sort { + true => pipeline.reorder_inputs(reorder), + false => pipeline.try_resize(max_threads)?, + }; + + pipeline.add_transform(|input, output| { + TransformAggregateDeserializer::try_create( + input.clone(), + output.clone(), + ¶ms.schema, + ) + }) } } } diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_transform_scatter.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_transform_scatter.rs deleted file mode 100644 index 0b69270eab8e3..0000000000000 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_transform_scatter.rs +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use databend_common_expression::DataBlock; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::ProcessorPtr; -use databend_common_pipeline_transforms::processors::Transform; -use databend_common_pipeline_transforms::processors::Transformer; - -use super::exchange_transform_shuffle::ExchangeShuffleMeta; -use crate::servers::flight::v1::scatter::FlightScatter; - -pub struct ScatterTransform { - scatter: Arc>, -} - -impl ScatterTransform { - pub fn create( - input: Arc, - output: Arc, - scatter: Arc>, - ) -> ProcessorPtr { - ProcessorPtr::create(Transformer::create(input, output, ScatterTransform { - scatter, - })) - } -} - -impl Transform for ScatterTransform { - const NAME: &'static str = "ScatterTransform"; - - fn transform(&mut self, data: DataBlock) -> databend_common_exception::Result { - let blocks = self.scatter.execute(data)?; - - Ok(DataBlock::empty_with_meta(ExchangeShuffleMeta::create( - blocks, - ))) - } -} diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_transform_shuffle.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_transform_shuffle.rs index 9c3242147c8d3..9cefcde59d441 100644 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_transform_shuffle.rs +++ b/src/query/service/src/servers/flight/v1/exchange/exchange_transform_shuffle.rs @@ -12,35 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. 
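ExchangeTransform now keys its post-reader handling on enable_multiway_sort: it records which destination slot each created reader corresponds to, puts the local executor first, and then either reorders the pipeline inputs back into destination order or falls back to try_resize(max_threads). The sketch below reproduces only that bookkeeping, iterating the destination ids directly instead of the receiver map used in the real code.

```rust
use std::collections::HashMap;

/// Returns, for each input in creation order (local executor first, then the
/// remaining destinations), its position in `destination_ids`.
fn compute_reorder(destination_ids: &[String], executor_id: &str) -> Vec<usize> {
    let mut lookup: HashMap<&str, usize> = destination_ids
        .iter()
        .enumerate()
        .map(|(pos, id)| (id.as_str(), pos))
        .collect();

    let mut reorder = Vec::with_capacity(destination_ids.len());
    reorder.push(lookup.remove(executor_id).expect("executor must be a destination"));
    for id in destination_ids {
        if id != executor_id {
            reorder.push(lookup.remove(id.as_str()).expect("unknown destination"));
        }
    }
    reorder
}

fn main() {
    let ids = vec!["a".to_string(), "b".to_string(), "c".to_string()];
    assert_eq!(compute_reorder(&ids, "b"), vec![1, 0, 2]);
}
```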
-use std::any::Any; -use std::collections::VecDeque; use std::fmt::Debug; use std::fmt::Formatter; use std::sync::Arc; use databend_common_catalog::table_context::TableContext; -use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::local_block_meta_serde; use databend_common_expression::BlockMetaInfo; -use databend_common_expression::BlockMetaInfoDowncast; use databend_common_expression::BlockMetaInfoPtr; use databend_common_expression::DataBlock; -use databend_common_pipeline_core::processors::Event; -use databend_common_pipeline_core::processors::EventCause; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; -use databend_common_pipeline_core::processors::ProcessorPtr; -use databend_common_pipeline_core::Pipe; -use databend_common_pipeline_core::PipeItem; use databend_common_pipeline_core::Pipeline; use super::exchange_params::ShuffleExchangeParams; -use super::exchange_sorting::ExchangeSorting; -use super::exchange_sorting::TransformExchangeSorting; -use super::exchange_transform_scatter::ScatterTransform; -use super::serde::ExchangeSerializeMeta; +use crate::pipelines::processors::transforms::aggregator::FlightExchange; use crate::sessions::QueryContext; pub struct ExchangeShuffleMeta { @@ -64,397 +49,39 @@ local_block_meta_serde!(ExchangeShuffleMeta); #[typetag::serde(name = "exchange_shuffle")] impl BlockMetaInfo for ExchangeShuffleMeta {} -struct OutputsBuffer { - inner: Vec>, -} - -impl OutputsBuffer { - pub fn create(capacity: usize, outputs: usize) -> OutputsBuffer { - OutputsBuffer { - inner: vec![capacity; outputs] - .into_iter() - .map(VecDeque::with_capacity) - .collect::>(), - } - } - - pub fn is_all_empty(&self) -> bool { - self.inner.iter().all(|x| x.is_empty()) - } - - pub fn is_empty(&self, index: usize) -> bool { - self.inner[index].is_empty() - } - - pub fn is_full(&self) -> bool { - self.inner.iter().any(|x| x.len() == x.capacity()) - } - - pub fn clear(&mut self, index: usize) { - self.inner[index].clear(); - } - - pub fn pop(&mut self, index: usize) -> Option { - self.inner[index].pop_front() - } - - pub fn push_back(&mut self, index: usize, block: DataBlock) -> usize { - self.inner[index].push_back(block); - self.inner[index].len() - } -} - -#[derive(PartialEq)] -enum PortStatus { - Idle, - HasData, - NeedData, - Finished, -} - -struct PortWithStatus { - pub status: PortStatus, - pub port: Arc, -} - -struct ExchangeShuffleTransform { - initialized: bool, - - finished_inputs: usize, - finished_outputs: usize, - - waiting_outputs: Vec, - waiting_inputs: VecDeque, - - buffer: OutputsBuffer, - inputs: Vec>, - outputs: Vec>, -} - -impl Processor for ExchangeShuffleTransform { - fn name(&self) -> String { - String::from("ExchangeShuffleTransform") - } - - fn as_any(&mut self) -> &mut dyn Any { - self - } - - fn event_with_cause(&mut self, cause: EventCause) -> Result { - if let EventCause::Output(output_index) = &cause { - let output = &mut self.outputs[*output_index]; - - if output.port.is_finished() { - if output.status != PortStatus::Finished { - self.finished_outputs += 1; - output.status = PortStatus::Finished; - } - - self.buffer.clear(*output_index); - - self.wakeup_inputs(); - self.wakeup_outputs(); - } else if output.port.can_push() { - if !self.buffer.is_empty(*output_index) { - let data_block = self.buffer.pop(*output_index).unwrap(); - output.status = PortStatus::Idle; - 
output.port.push_data(Ok(data_block)); - - self.wakeup_inputs(); - self.wakeup_outputs(); - } else if output.status != PortStatus::NeedData { - output.status = PortStatus::NeedData; - self.waiting_outputs.push(*output_index); - } - } - } - - if !self.initialized && !self.waiting_outputs.is_empty() { - self.initialized = true; - for input in &self.inputs { - input.port.set_need_data(); - } - } - - if self.finished_outputs == self.outputs.len() { - for input in &self.inputs { - input.port.finish(); - } - - return Ok(Event::Finished); - } - - if let EventCause::Input(input_index) = &cause { - let input = &mut self.inputs[*input_index]; - - if input.port.is_finished() { - if input.status != PortStatus::Finished { - self.finished_inputs += 1; - input.status = PortStatus::Finished; - } - - self.wakeup_outputs(); - self.wakeup_inputs(); - } else if input.port.has_data() { - if !self.buffer.is_full() { - self.take_input_data_into_buffer(*input_index); - - self.wakeup_outputs(); - self.wakeup_inputs(); - } else if input.status != PortStatus::HasData { - input.status = PortStatus::HasData; - self.waiting_inputs.push_back(*input_index); - } - } - } - - if self.finished_outputs == self.outputs.len() { - for input in &self.inputs { - input.port.finish(); - } - - return Ok(Event::Finished); - } - - if self.finished_inputs == self.inputs.len() { - for (index, output) in self.outputs.iter_mut().enumerate() { - if self.buffer.is_empty(index) && output.status != PortStatus::Finished { - self.finished_outputs += 1; - output.status = PortStatus::Finished; - output.port.finish(); - } - } - - if self.buffer.is_all_empty() { - return Ok(Event::Finished); - } - } - - match self.waiting_outputs.is_empty() { - true => Ok(Event::NeedConsume), - false => Ok(Event::NeedData), - } - } - - fn details_status(&self) -> Option { - #[derive(Debug)] - #[allow(dead_code)] - struct Display { - queue_status: Vec<(usize, usize)>, - inputs: usize, - finished_inputs: usize, - outputs: usize, - finished_outputs: usize, - - waiting_outputs: Vec, - waiting_inputs: VecDeque, - } - - let mut queue_status = vec![]; - for (idx, queue) in self.buffer.inner.iter().enumerate() { - queue_status.push((idx, queue.len())); - } - - Some(format!("{:?}", Display { - queue_status, - inputs: self.inputs.len(), - outputs: self.outputs.len(), - finished_inputs: self.finished_inputs, - finished_outputs: self.finished_outputs, - waiting_inputs: self.waiting_inputs.clone(), - waiting_outputs: self.waiting_outputs.clone(), - })) - } -} - -impl ExchangeShuffleTransform { - fn wakeup_inputs(&mut self) { - while !self.waiting_inputs.is_empty() && !self.buffer.is_full() { - let input_index = self.waiting_inputs.pop_front().unwrap(); - - self.take_input_data_into_buffer(input_index); - } - } - - fn wakeup_outputs(&mut self) { - let mut new_waiting_output = Vec::with_capacity(self.waiting_outputs.len()); - - for waiting_output in &self.waiting_outputs { - let output = &mut self.outputs[*waiting_output]; - - if output.port.is_finished() { - if output.status != PortStatus::Finished { - self.finished_outputs += 1; - output.status = PortStatus::Finished; - } - - self.buffer.clear(*waiting_output); - continue; - } - - if self.buffer.is_empty(*waiting_output) { - new_waiting_output.push(*waiting_output); - continue; - } - - let data_block = self.buffer.pop(*waiting_output).unwrap(); - output.status = PortStatus::Idle; - output.port.push_data(Ok(data_block)); - } - - self.waiting_outputs = new_waiting_output; - } - - fn take_input_data_into_buffer(&mut self, 
input_index: usize) { - let input = &mut self.inputs[input_index]; - - input.status = PortStatus::Idle; - let mut data_block = input.port.pull_data().unwrap().unwrap(); - - if let Some(block_meta) = data_block.take_meta() { - if let Some(shuffle_meta) = ExchangeShuffleMeta::downcast_from(block_meta) { - for (index, block) in shuffle_meta.blocks.into_iter().enumerate() { - if (!block.is_empty() || block.get_meta().is_some()) - && self.outputs[index].status != PortStatus::Finished - { - self.buffer.push_back(index, block); - } - } - } - } - - if input.port.is_finished() { - if input.status != PortStatus::Finished { - self.finished_inputs += 1; - input.status = PortStatus::Finished; - } - - return; - } - - input.port.set_need_data(); - } -} - -impl ExchangeShuffleTransform { - pub fn create(inputs: usize, outputs: usize, buffer: usize) -> ExchangeShuffleTransform { - let mut inputs_port = Vec::with_capacity(inputs); - let mut outputs_port = Vec::with_capacity(outputs); - - for _index in 0..inputs { - inputs_port.push(PortWithStatus { - status: PortStatus::Idle, - port: InputPort::create(), - }); - } - - for _index in 0..outputs { - outputs_port.push(PortWithStatus { - status: PortStatus::Idle, - port: OutputPort::create(), - }); - } - - ExchangeShuffleTransform { - initialized: false, - finished_inputs: 0, - finished_outputs: 0, - inputs: inputs_port, - outputs: outputs_port, - buffer: OutputsBuffer::create(buffer, outputs), - waiting_inputs: VecDeque::with_capacity(inputs), - waiting_outputs: Vec::with_capacity(outputs), - } - } - - pub fn get_inputs(&self) -> Vec> { - self.inputs.iter().map(|x| x.port.clone()).collect() - } - - pub fn get_outputs(&self) -> Vec> { - self.outputs.iter().map(|x| x.port.clone()).collect() - } -} - // Scatter the data block and push it to the corresponding output port pub fn exchange_shuffle( ctx: &Arc, params: &ShuffleExchangeParams, pipeline: &mut Pipeline, ) -> Result<()> { - // append scatter transform - pipeline.add_transform(|input, output| { - Ok(ScatterTransform::create( - input, - output, - params.shuffle_scatter.clone(), - )) - })?; - - let exchange_injector = ¶ms.exchange_injector; + if let Some(last_pipe) = pipeline.pipes.last() { + for item in &last_pipe.items { + item.processor.configure_peer_nodes(¶ms.destination_ids); + } + } let settings = ctx.get_settings(); let compression = settings.get_query_flight_compression()?; - exchange_injector.apply_shuffle_serializer(params, compression, pipeline)?; - - let output_len = pipeline.output_len(); - if let Some(exchange_sorting) = &exchange_injector.exchange_sorting() { - let sorting = ShuffleExchangeSorting::create(exchange_sorting.clone()); - let transform = TransformExchangeSorting::create(output_len, sorting); - let output = transform.get_output(); - let inputs = transform.get_inputs(); - pipeline.add_pipe(Pipe::create(output_len, 1, vec![PipeItem::create( - ProcessorPtr::create(Box::new(transform)), - inputs, - vec![output], - )])); - } - - let inputs_size = pipeline.output_len(); - let outputs_size = params.destination_ids.len(); - let transform = ExchangeShuffleTransform::create(inputs_size, outputs_size, output_len); - - let inputs = transform.get_inputs(); - let outputs = transform.get_outputs(); - pipeline.add_pipe(Pipe::create(inputs_size, outputs_size, vec![ - PipeItem::create(ProcessorPtr::create(Box::new(transform)), inputs, outputs), - ])); + match params.enable_multiway_sort { + true => pipeline.exchange( + params.destination_ids.len(), + FlightExchange::::create( + 
params.destination_ids.clone(), + compression, + params.shuffle_scatter.clone(), + ), + )?, + false => pipeline.exchange( + params.destination_ids.len(), + FlightExchange::::create( + params.destination_ids.clone(), + compression, + params.shuffle_scatter.clone(), + ), + )?, + }; Ok(()) } - -struct ShuffleExchangeSorting { - inner: Arc, -} - -impl ShuffleExchangeSorting { - pub fn create(inner: Arc) -> Arc { - Arc::new(ShuffleExchangeSorting { inner }) - } -} - -impl ExchangeSorting for ShuffleExchangeSorting { - fn block_number(&self, data_block: &DataBlock) -> Result { - let block_meta = data_block.get_meta(); - let shuffle_meta = block_meta - .and_then(ExchangeShuffleMeta::downcast_ref_from) - .unwrap(); - - for block in &shuffle_meta.blocks { - if let Some(block_meta) = block.get_meta() { - if let Some(block_meta) = ExchangeSerializeMeta::downcast_ref_from(block_meta) { - return Ok(block_meta.block_number); - } - } - - if !block.is_empty() || block.get_meta().is_some() { - return self.inner.block_number(block); - } - } - - Err(ErrorCode::Internal( - "Internal, ShuffleExchangeSorting only recv ExchangeSerializeMeta.", - )) - } -} diff --git a/src/query/service/src/servers/flight/v1/exchange/mod.rs b/src/query/service/src/servers/flight/v1/exchange/mod.rs index 194f2cbe1e3e5..ac51beb3bb7de 100644 --- a/src/query/service/src/servers/flight/v1/exchange/mod.rs +++ b/src/query/service/src/servers/flight/v1/exchange/mod.rs @@ -13,16 +13,13 @@ // limitations under the License. mod data_exchange; -mod exchange_injector; mod exchange_manager; mod exchange_params; mod exchange_sink; mod exchange_sink_writer; -mod exchange_sorting; mod exchange_source; mod exchange_source_reader; mod exchange_transform; -mod exchange_transform_scatter; mod exchange_transform_shuffle; mod statistics_receiver; mod statistics_sender; @@ -33,10 +30,7 @@ pub use data_exchange::BroadcastExchange; pub use data_exchange::DataExchange; pub use data_exchange::MergeExchange; pub use data_exchange::ShuffleDataExchange; -pub use exchange_injector::DefaultExchangeInjector; -pub use exchange_injector::ExchangeInjector; pub use exchange_manager::DataExchangeManager; pub use exchange_params::MergeExchangeParams; pub use exchange_params::ShuffleExchangeParams; -pub use exchange_sorting::ExchangeSorting; pub use exchange_transform_shuffle::ExchangeShuffleMeta; diff --git a/src/query/service/src/servers/flight/v1/exchange/serde/exchange_serializer.rs b/src/query/service/src/servers/flight/v1/exchange/serde/exchange_serializer.rs index 5a757f37ba299..2d58d9d9c707b 100644 --- a/src/query/service/src/servers/flight/v1/exchange/serde/exchange_serializer.rs +++ b/src/query/service/src/servers/flight/v1/exchange/serde/exchange_serializer.rs @@ -23,46 +23,38 @@ use arrow_flight::SchemaAsIpc; use arrow_ipc::writer::DictionaryTracker; use arrow_ipc::writer::IpcDataGenerator; use arrow_ipc::writer::IpcWriteOptions; -use arrow_ipc::CompressionType; use arrow_schema::ArrowError; use arrow_schema::Schema as ArrowSchema; use bytes::Bytes; -use databend_common_base::runtime::profile::Profile; -use databend_common_base::runtime::profile::ProfileStatisticsName; -use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::local_block_meta_serde; use databend_common_expression::BlockMetaInfo; use databend_common_expression::BlockMetaInfoPtr; use databend_common_expression::DataBlock; -use databend_common_io::prelude::bincode_serialize_into_buf; use databend_common_io::prelude::BinaryWrite; -use 
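exchange_shuffle no longer assembles its own scatter, serializer and shuffle processors: it tells the processors of the last pipe who their peer nodes are and then appends a FlightExchange parameterized by enable_multiway_sort, carrying the params' shuffle_scatter. A simplified stand-in for the peer-node propagation step; in the real code the call goes through item.processor on an immutable item, and the trait/struct names below exist only for this sketch.

```rust
trait PeerAware {
    fn configure_peer_nodes(&mut self, nodes: &[String]);
}

struct Pipe<P> {
    items: Vec<P>,
}

struct Pipeline<P> {
    pipes: Vec<Pipe<P>>,
}

/// Every processor in the last pipe learns the destination ids before the
/// flight exchange is appended.
fn configure_last_pipe<P: PeerAware>(pipeline: &mut Pipeline<P>, destination_ids: &[String]) {
    if let Some(last_pipe) = pipeline.pipes.last_mut() {
        for item in &mut last_pipe.items {
            item.configure_peer_nodes(destination_ids);
        }
    }
}
```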
databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::ProcessorPtr; -use databend_common_pipeline_transforms::processors::BlockMetaTransform; -use databend_common_pipeline_transforms::processors::BlockMetaTransformer; -use databend_common_pipeline_transforms::processors::Transform; -use databend_common_pipeline_transforms::processors::Transformer; -use databend_common_pipeline_transforms::processors::UnknownMode; -use databend_common_settings::FlightCompression; - -use crate::servers::flight::v1::exchange::ExchangeShuffleMeta; -use crate::servers::flight::v1::exchange::MergeExchangeParams; -use crate::servers::flight::v1::exchange::ShuffleExchangeParams; + use crate::servers::flight::v1::packets::DataPacket; use crate::servers::flight::v1::packets::FragmentData; pub struct ExchangeSerializeMeta { - pub block_number: isize, + pub partition: isize, + pub max_partition: usize, + pub global_max_partition: usize, pub packet: Vec, } impl ExchangeSerializeMeta { - pub fn create(block_number: isize, packet: Vec) -> BlockMetaInfoPtr { + pub fn create( + partition: isize, + max_partition: usize, + global_max_partition: usize, + packet: Vec, + ) -> BlockMetaInfoPtr { Box::new(ExchangeSerializeMeta { packet, - block_number, + partition, + max_partition, + global_max_partition, }) } } @@ -78,120 +70,25 @@ local_block_meta_serde!(ExchangeSerializeMeta); #[typetag::serde(name = "exchange_serialize")] impl BlockMetaInfo for ExchangeSerializeMeta {} -pub struct TransformExchangeSerializer { - options: IpcWriteOptions, -} - -impl TransformExchangeSerializer { - pub fn create( - input: Arc, - output: Arc, - _params: &MergeExchangeParams, - compression: Option, - ) -> Result { - let compression = match compression { - None => None, - Some(compression) => match compression { - FlightCompression::Lz4 => Some(CompressionType::LZ4_FRAME), - FlightCompression::Zstd => Some(CompressionType::ZSTD), - }, - }; - - Ok(ProcessorPtr::create(Transformer::create( - input, - output, - TransformExchangeSerializer { - options: IpcWriteOptions::default().try_with_compression(compression)?, - }, - ))) - } -} - -impl Transform for TransformExchangeSerializer { - const NAME: &'static str = "ExchangeSerializerTransform"; - - fn transform(&mut self, data_block: DataBlock) -> Result { - Profile::record_usize_profile(ProfileStatisticsName::ExchangeRows, data_block.num_rows()); - serialize_block(0, data_block, &self.options) - } -} - -pub struct TransformScatterExchangeSerializer { - local_pos: usize, - options: IpcWriteOptions, -} - -impl TransformScatterExchangeSerializer { - pub fn create( - input: Arc, - output: Arc, - compression: Option, - params: &ShuffleExchangeParams, - ) -> Result { - let local_id = ¶ms.executor_id; - let compression = match compression { - None => None, - Some(compression) => match compression { - FlightCompression::Lz4 => Some(CompressionType::LZ4_FRAME), - FlightCompression::Zstd => Some(CompressionType::ZSTD), - }, - }; - - Ok(ProcessorPtr::create(BlockMetaTransformer::create( - input, - output, - TransformScatterExchangeSerializer { - options: IpcWriteOptions::default().try_with_compression(compression)?, - local_pos: params - .destination_ids - .iter() - .position(|x| x == local_id) - .unwrap(), - }, - ))) - } -} - -impl BlockMetaTransform for TransformScatterExchangeSerializer { - const UNKNOWN_MODE: UnknownMode = UnknownMode::Error; - const NAME: &'static str = 
"TransformScatterExchangeSerializer"; - - fn transform(&mut self, meta: ExchangeShuffleMeta) -> Result> { - let mut new_blocks = Vec::with_capacity(meta.blocks.len()); - for (index, block) in meta.blocks.into_iter().enumerate() { - if block.is_empty() { - new_blocks.push(block); - continue; - } - - new_blocks.push(match self.local_pos == index { - true => block, - false => serialize_block(0, block, &self.options)?, - }); - } - - Ok(vec![DataBlock::empty_with_meta( - ExchangeShuffleMeta::create(new_blocks), - )]) - } -} - pub fn serialize_block( - block_num: isize, + partition: isize, + max_partition: usize, + global_max_partition: usize, data_block: DataBlock, options: &IpcWriteOptions, ) -> Result { if data_block.is_empty() && data_block.get_meta().is_none() { return Ok(DataBlock::empty_with_meta(ExchangeSerializeMeta::create( - block_num, + partition, + max_partition, + global_max_partition, vec![], ))); } let mut meta = vec![]; meta.write_scalar_own(data_block.num_rows() as u32)?; - bincode_serialize_into_buf(&mut meta, &data_block.get_meta()) - .map_err(|_| ErrorCode::BadBytes("block meta serialize error when exchange"))?; + serde_json::to_writer(&mut meta, &data_block.get_meta())?; let (_, dict, values) = match data_block.is_empty() { true => batches_to_flight_data_with_options( @@ -226,7 +123,10 @@ pub fn serialize_block( } Ok(DataBlock::empty_with_meta(ExchangeSerializeMeta::create( - block_num, packet, + partition, + max_partition, + global_max_partition, + packet, ))) } diff --git a/src/query/service/src/servers/flight/v1/exchange/serde/mod.rs b/src/query/service/src/servers/flight/v1/exchange/serde/mod.rs index 7349b2f46b0c1..ccf7abfb694b6 100644 --- a/src/query/service/src/servers/flight/v1/exchange/serde/mod.rs +++ b/src/query/service/src/servers/flight/v1/exchange/serde/mod.rs @@ -20,5 +20,3 @@ pub use exchange_deserializer::ExchangeDeserializeMeta; pub use exchange_deserializer::TransformExchangeDeserializer; pub use exchange_serializer::serialize_block; pub use exchange_serializer::ExchangeSerializeMeta; -pub use exchange_serializer::TransformExchangeSerializer; -pub use exchange_serializer::TransformScatterExchangeSerializer; diff --git a/src/query/service/src/servers/flight/v1/scatter/flight_scatter_hash.rs b/src/query/service/src/servers/flight/v1/scatter/flight_scatter_hash.rs index 118cf8b8519c7..68a4e8d163829 100644 --- a/src/query/service/src/servers/flight/v1/scatter/flight_scatter_hash.rs +++ b/src/query/service/src/servers/flight/v1/scatter/flight_scatter_hash.rs @@ -15,6 +15,7 @@ use std::collections::hash_map::DefaultHasher; use std::hash::Hasher; +use databend_common_catalog::table_context::TableContext; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::type_check::check_function; @@ -37,6 +38,7 @@ use databend_common_expression::Value; use databend_common_functions::BUILTIN_FUNCTIONS; use crate::servers::flight::v1::scatter::flight_scatter::FlightScatter; +use crate::sessions::QueryContext; #[derive(Clone)] pub struct HashFlightScatter { @@ -47,11 +49,15 @@ pub struct HashFlightScatter { impl HashFlightScatter { pub fn try_create( - func_ctx: FunctionContext, + ctx: &QueryContext, hash_keys: Vec, - scatter_size: usize, - local_pos: usize, + destination_ids: &[String], ) -> Result> { + let local_id = &ctx.get_cluster().local_id; + let func_ctx = ctx.get_function_context()?; + let scatter_size = destination_ids.len(); + let local_pos = destination_ids.iter().position(|x| x == local_id).unwrap(); + if 
hash_keys.len() == 1 { return OneHashKeyFlightScatter::try_create( func_ctx, diff --git a/src/query/service/src/servers/flight/v1/scatter/flight_scatter_merge.rs b/src/query/service/src/servers/flight/v1/scatter/flight_scatter_merge.rs new file mode 100644 index 0000000000000..02cebbfcbe938 --- /dev/null +++ b/src/query/service/src/servers/flight/v1/scatter/flight_scatter_merge.rs @@ -0,0 +1,37 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use databend_common_exception::Result; +use databend_common_expression::DataBlock; + +use crate::servers::flight::v1::scatter::flight_scatter::FlightScatter; + +pub struct MergeFlightScatter; + +// impl MergeFlightScatter { +// pub fn try_create(scattered_size: usize) -> Result { +// Ok(MergeFlightScatter { scattered_size }) +// } +// } + +impl FlightScatter for MergeFlightScatter { + fn execute(&self, data_block: DataBlock) -> Result> { + // let mut data_blocks = vec![]; + // for _ in 0..self.scattered_size { + // data_blocks.push(data_block.clone()); + // } + + Ok(vec![data_block]) + } +} diff --git a/src/query/service/src/servers/flight/v1/scatter/mod.rs b/src/query/service/src/servers/flight/v1/scatter/mod.rs index b5f5f900dab71..caaee700968e4 100644 --- a/src/query/service/src/servers/flight/v1/scatter/mod.rs +++ b/src/query/service/src/servers/flight/v1/scatter/mod.rs @@ -15,7 +15,9 @@ mod flight_scatter; mod flight_scatter_broadcast; mod flight_scatter_hash; +mod flight_scatter_merge; pub use flight_scatter::FlightScatter; pub use flight_scatter_broadcast::BroadcastFlightScatter; pub use flight_scatter_hash::HashFlightScatter; +pub use flight_scatter_merge::MergeFlightScatter; diff --git a/src/query/service/src/spillers/spiller.rs b/src/query/service/src/spillers/spiller.rs index 114c8dff71254..ae250a3e2acef 100644 --- a/src/query/service/src/spillers/spiller.rs +++ b/src/query/service/src/spillers/spiller.rs @@ -35,6 +35,7 @@ use databend_storages_common_cache::TempDir; use databend_storages_common_cache::TempPath; use opendal::Buffer; use opendal::Operator; +use opendal::Writer; use parking_lot::RwLock; use super::serialize::*; @@ -187,6 +188,17 @@ impl Spiller { format!("{}/{}", self.location_prefix, GlobalUniqName::unique()) } + pub async fn create_aggregate_writer(&self, location: String) -> Result { + let writer = self.operator.writer(&location).await?; + Ok(SpillWriter { + bytes: 0, + writer, + location, + ctx: self.ctx.clone(), + private_spilled_files: self.private_spilled_files.clone(), + }) + } + pub async fn spill_stream_aggregate_buffer( &self, location: Option, @@ -556,3 +568,42 @@ fn record_read_profile(location: &Location, start: &Instant, read_bytes: usize) } } } + +pub struct SpillWriter { + bytes: usize, + writer: Writer, + location: String, + ctx: Arc, + private_spilled_files: Arc>>, +} + +impl SpillWriter { + pub async fn write(&mut self, bytes: Vec) -> Result<()> { + self.bytes += bytes.len(); + Ok(self.writer.write(bytes).await?) 
+ } + + pub fn location(&self) -> String { + self.location.clone() + } + + pub fn write_bytes(&self) -> usize { + self.bytes + } + + pub async fn complete(&mut self) -> Result<()> { + self.writer.close().await?; + + self.ctx.add_spill_file( + Location::Remote(self.location.clone()), + Layout::Aggregate, + self.bytes, + ); + + self.private_spilled_files + .write() + .insert(Location::Remote(self.location.clone()), Layout::Aggregate); + + Ok(()) + } +} diff --git a/src/query/storages/system/src/query_log_table.rs b/src/query/storages/system/src/query_log_table.rs index c6c8e16946fdf..fc83a87d05cc4 100644 --- a/src/query/storages/system/src/query_log_table.rs +++ b/src/query/storages/system/src/query_log_table.rs @@ -179,7 +179,7 @@ pub struct QueryLogElement { // Transaction pub txn_state: String, pub txn_id: String, - pub peek_memory_usage: HashMap, + pub peak_memory_usage: HashMap, } impl SystemLogElement for QueryLogElement { @@ -575,7 +575,7 @@ impl SystemLogElement for QueryLogElement { columns.next().unwrap().push( Scalar::Variant( jsonb::Value::from(jsonb::Object::from_iter( - self.peek_memory_usage + self.peak_memory_usage .iter() .map(|(k, v)| (k.clone(), jsonb::Value::from(*v))), )) diff --git a/tests/sqllogictests/suites/query/cte/basic_r_cte.test b/tests/sqllogictests/suites/query/cte/basic_r_cte.test index 1d4ce93efcd9d..60b9d6bae97a0 100644 --- a/tests/sqllogictests/suites/query/cte/basic_r_cte.test +++ b/tests/sqllogictests/suites/query/cte/basic_r_cte.test @@ -254,7 +254,7 @@ select concat('城市',rn::varchar) city from t1 where rn<=5; statement ok insert into train -select concat('G',row_number()over()::varchar),c1.city,c2.city, n from city c1, city c2, (select 600 n union select 800 union select 1200 union select 1600) a ; +select concat('G',row_number()over()::varchar),c1_city,c2_city, n from (SELECT c1.city as c1_city,c2.city as c2_city, n FROM city c1, city c2, (select 600 n union select 800 union select 1200 union select 1600) a order by c1.city,c2.city, n); statement ok insert into passenger @@ -281,10 +281,8 @@ select from t0,(select 1 n union all select 2); ---- -261700 523200 210000 +224100 448000 210000 statement ok use default; -statement ok -drop database db; diff --git a/tests/sqllogictests/suites/query/window_function/window_bound.test b/tests/sqllogictests/suites/query/window_function/window_bound.test index b9f7b17571ed4..763e2bd51507a 100644 --- a/tests/sqllogictests/suites/query/window_function/window_bound.test +++ b/tests/sqllogictests/suites/query/window_function/window_bound.test @@ -267,23 +267,23 @@ SELECT a, DENSE_RANK() OVER (ORDER BY a ROWS BETWEEN UNBOUNDED PRECEDING AND UNB 6 4 7 5 -query I -SELECT DISTINCT lead((861, FALSE, FALSE), 9, (849, TRUE, TRUE)) OVER ( +query II +SELECT * FROM (SELECT DISTINCT lead((861, FALSE, FALSE), 9, (849, TRUE, TRUE)) OVER ( PARTITION BY 15560425903542832284, 965871850213131579 - ORDER BY 13746504519650342222, 5897530378272856518 ASC NULLS FIRST) -FROM range(100, 12000000, 467); + ORDER BY 13746504519650342222, 5897530378272856518 ASC NULLS FIRST) AS C +FROM range(100, 12000000, 467)) ORDER BY C.1; ---- -(861,0,0) (849,1,1) +(861,0,0) query II -SELECT DISTINCT lead((861, FALSE, FALSE), 9, (849, TRUE, TRUE)) OVER ( +SELECT * FROM (SELECT DISTINCT lead((861, FALSE, FALSE), 9, (849, TRUE, TRUE)) OVER ( PARTITION BY 15560425903542832284, 965871850213131579 - ORDER BY 13746504519650342222, 5897530378272856518 ASC NULLS FIRST) -FROM range(100, 120000000, 467); + ORDER BY 13746504519650342222, 5897530378272856518 ASC NULLS 
FIRST) AS C +FROM range(100, 120000000, 467)) ORDER BY C.1; ---- -(861,0,0) (849,1,1) +(861,0,0) statement ok DROP DATABASE test_window_bound;
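Returning to the spiller change above: the new SpillWriter streams aggregate spill bytes to a single remote object and, on complete(), registers the file with the query context and the private spill-file map. A hedged usage sketch against exactly the methods added in this patch; the surrounding Spiller value, the crate's Result alias, and the pre-built location string are assumed context rather than part of the diff.

```rust
// Assumes it runs inside the query crate, where `Spiller` and `Result` are in scope.
async fn spill_aggregate_chunks(
    spiller: &Spiller,
    location: String, // e.g. "<prefix>/<unique name>", produced by the spiller elsewhere
    chunks: Vec<Vec<u8>>,
) -> Result<(String, usize)> {
    let mut writer = spiller.create_aggregate_writer(location).await?;
    for chunk in chunks {
        writer.write(chunk).await?; // appends to the object and accumulates the byte count
    }
    writer.complete().await?; // closes the writer and records the spill file on the context
    Ok((writer.location(), writer.write_bytes()))
}
```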