diff --git a/Cargo.lock b/Cargo.lock index c2959a314380f..70cb8d79605f7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -635,7 +635,7 @@ source = "git+https://github.com/datafuse-extras/async-backtrace.git?rev=dea4553 dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -736,7 +736,7 @@ source = "git+https://github.com/datafuse-extras/async-recursion.git?rev=a353334 dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -758,7 +758,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -775,7 +775,7 @@ checksum = "d556ec1359574147ec0c4fc5eb525f3f23263a592b1a9c07e0a75b427de55c97" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -1346,7 +1346,7 @@ dependencies = [ "regex", "rustc-hash 1.1.0", "shlex", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -1534,7 +1534,7 @@ dependencies = [ "proc-macro-crate 3.1.0", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", "syn_derive", ] @@ -1676,7 +1676,7 @@ checksum = "1ee891b04274a59bd38b412188e24b849617b2e45a0fd8d057deb63e7403761b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -2138,7 +2138,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -2812,7 +2812,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edb49164822f3ee45b17acd4a208cfc1251410cf0cad9a833234c9890774dd9f" dependencies = [ "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -2855,7 +2855,7 @@ dependencies = [ "proc-macro2", "quote", "strsim 0.11.1", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -2866,7 +2866,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -3292,7 +3292,7 @@ dependencies = [ "hex", "hyper-util", "itertools 0.13.0", - "jiff 0.2.1", + "jiff 0.2.4", "jsonb", "lexical-core", "log", @@ -3339,7 +3339,7 @@ dependencies = [ "geozero", "goldenfile", "hex", - "jiff 0.2.1", + "jiff 0.2.4", "jsonb", "lexical-core", "match-template", @@ -3394,7 +3394,7 @@ dependencies = [ "jaq-interpret", "jaq-parse", "jaq-std", - "jiff 0.2.1", + "jiff 0.2.4", "jsonb", "lexical-core", "libm", @@ -3487,7 +3487,7 @@ dependencies = [ "geo", "geozero", "hex", - "jiff 0.2.1", + "jiff 0.2.4", "lexical-core", "micromarshal", "rmp-serde", @@ -4590,7 +4590,7 @@ dependencies = [ "databend-storages-common-cache", "futures", "itertools 0.13.0", - "jiff 0.2.1", + "jiff 0.2.4", "jsonb", "log", "once_cell", @@ -4920,7 +4920,7 @@ dependencies = [ "databend-common-exception", "databend-common-expression", "dtparse", - "jiff 0.2.1", + "jiff 0.2.4", "num-traits", ] @@ -5186,7 +5186,7 @@ dependencies = [ "hyper-util", "indicatif", "itertools 0.13.0", - "jiff 0.2.1", + "jiff 0.2.4", "jsonb", "jwt-simple", "log", @@ -5290,7 +5290,7 @@ dependencies = [ "derive-visitor", "ethnum", "itertools 0.13.0", - "jiff 0.2.1", + "jiff 0.2.4", "jsonb", "rand 0.8.5", "reqwest", @@ -5467,7 +5467,7 @@ dependencies = [ "enum-ordinalize", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -5569,7 +5569,7 @@ checksum = "0c8e41236d5a9f04da3072d7186a76aba734e7bfd2cd05f7877fde172b65fb11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -5707,7 +5707,7 @@ dependencies = [ 
"darling", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -5717,7 +5717,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b" dependencies = [ "derive_builder_core", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -5738,7 +5738,7 @@ dependencies = [ "convert_case 0.6.0", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", "unicode-xid", ] @@ -5815,7 +5815,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -5928,7 +5928,7 @@ dependencies = [ "enum-ordinalize", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -6012,7 +6012,7 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -6032,7 +6032,7 @@ checksum = "0d28318a75d4aead5c4db25382e8ef717932d0346600cacae6357eb5941bc5ff" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -6044,7 +6044,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -6065,7 +6065,7 @@ checksum = "de0d48a183585823424a4ce1aa132d174a6a81bd540895822eb4c8373a8e49e8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -6131,7 +6131,7 @@ checksum = "3bf679796c0322556351f287a51b49e48f7c4986e727b5dd78c972d30e2e16cc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -6283,7 +6283,7 @@ dependencies = [ "proc-macro-error2", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -6529,7 +6529,7 @@ checksum = "b0fa992f1656e1707946bbba340ad244f0814009ef8c0118eb7b658395f19a2e" dependencies = [ "frunk_proc_macro_helpers", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -6541,7 +6541,7 @@ dependencies = [ "frunk_core", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -6553,7 +6553,7 @@ dependencies = [ "frunk_core", "frunk_proc_macro_helpers", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -6651,7 +6651,7 @@ checksum = "5ac45ed0bddbd110eb68862768a194f88700f5b91c39931d2f432fab67a16d08" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -6716,7 +6716,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -7388,7 +7388,7 @@ checksum = "999ce923619f88194171a67fb3e6d613653b8d4d6078b529b15a765da0edcc17" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -8680,7 +8680,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -9139,10 +9139,11 @@ dependencies = [ [[package]] name = "jiff" -version = "0.2.1" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3590fea8e9e22d449600c9bbd481a8163bef223e4ff938e5f55899f8cf1adb93" +checksum = "d699bc6dfc879fb1bf9bdff0d4c56f0884fc6f0d0eb0fba397a6d00cd9a6b85e" dependencies = [ + "jiff-static", "jiff-tzdb", "jiff-tzdb-platform", "log", @@ -9152,6 +9153,17 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "jiff-static" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8d16e75759ee0aa64c57a56acbf43916987b20c77373cb7e808979e02b93c9f9" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "jiff-tzdb" version = "0.1.2" @@ -10092,7 +10104,7 @@ dependencies = [ "proc-macro-error 1.0.4", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", "termcolor", "thiserror 1.0.65", ] @@ -10270,7 +10282,7 @@ dependencies = [ "proc-macro-error 1.0.4", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -10465,7 +10477,7 @@ dependencies = [ "proc-macro-crate 3.1.0", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -10649,7 +10661,7 @@ dependencies = [ "proc-macro2", "quote", "semver", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -10691,7 +10703,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -11043,7 +11055,7 @@ dependencies = [ "regex", "regex-syntax 0.8.4", "structmeta", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -11195,7 +11207,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -11342,7 +11354,7 @@ dependencies = [ "proc-macro-crate 3.1.0", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -11476,7 +11488,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7" dependencies = [ "proc-macro2", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -11576,7 +11588,7 @@ dependencies = [ "proc-macro-error-attr2", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -11587,9 +11599,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.92" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" +checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84" dependencies = [ "unicode-ident", ] @@ -11665,7 +11677,7 @@ checksum = "440f724eba9f6996b75d63681b0a92b06947f1457076d503a4d2e2c8f56442b8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -11722,7 +11734,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.95", + "syn 2.0.100", "tempfile", ] @@ -11736,7 +11748,7 @@ dependencies = [ "itertools 0.14.0", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -11941,7 +11953,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -11954,7 +11966,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -12069,9 +12081,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.36" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" dependencies = [ "proc-macro2", ] @@ -12215,7 +12227,7 @@ checksum = "8b86292cf41ccfc96c5de7165c1c53d5b4ac540c5bab9d1857acbe9eba5f1a0b" dependencies = [ "proc-macro-hack", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -12268,7 +12280,7 @@ version = "0.1.1" source = 
"git+https://github.com/datafuse-extras/recursive.git?rev=6af35a1#6af35a1e59e7050f86ee19fbd0a79535d016c87d" dependencies = [ "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -12713,7 +12725,7 @@ dependencies = [ "proc-macro2", "quote", "rquickjs-core", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -13207,7 +13219,7 @@ checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -13249,7 +13261,7 @@ checksum = "6c64451ba24fc7a6a2d60fc75dd9c83c90903b19028d4eff35e88fc1e86564e9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -13319,7 +13331,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -13577,7 +13589,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -13789,7 +13801,7 @@ dependencies = [ "quote", "sqlx-core", "sqlx-macros-core", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -13812,7 +13824,7 @@ dependencies = [ "sqlx-mysql", "sqlx-postgres", "sqlx-sqlite", - "syn 2.0.95", + "syn 2.0.100", "tempfile", "tokio", "url", @@ -14016,7 +14028,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -14027,7 +14039,7 @@ checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -14068,7 +14080,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -14224,9 +14236,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.95" +version = "2.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46f71c0377baf4ef1cc3e3402ded576dccc315800fbc62dfc7fe04b009773b4a" +checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" dependencies = [ "proc-macro2", "quote", @@ -14253,7 +14265,7 @@ dependencies = [ "proc-macro-error 1.0.4", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -14279,7 +14291,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -14570,7 +14582,7 @@ checksum = "e71277381bd8b17eea2126a849dced540862c498398d4dd52405233a5d3cc643" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -14637,7 +14649,7 @@ checksum = "ae71770322cbd277e69d762a16c444af02aa0575ac0d174f0b9562d3b37f8602" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -14648,7 +14660,7 @@ checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -14808,7 +14820,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -15018,7 +15030,7 @@ dependencies = [ "prost-build", "prost-types", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -15100,7 +15112,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -15234,7 +15246,7 @@ checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8" 
dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -15245,7 +15257,7 @@ checksum = "560b82d656506509d43abe30e0ba64c56b1953ab3d4fe7ba5902747a7a3cedd5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -15281,7 +15293,7 @@ checksum = "70b20a22c42c8f1cd23ce5e34f165d4d37038f5b663ad20fb6adbdf029172483" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -15600,7 +15612,7 @@ checksum = "d674d135b4a8c1d7e813e2f8d1c9a58308aee4a680323066025e53132218bd91" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -15763,7 +15775,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", "wasm-bindgen-shared", ] @@ -15797,7 +15809,7 @@ checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -15972,7 +15984,7 @@ dependencies = [ "anyhow", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", "wasmtime-component-util", "wasmtime-wit-bindgen", "wit-parser", @@ -16088,7 +16100,7 @@ checksum = "df09be00c38f49172ca9936998938476e3f2df782673a39ae2ef9fb0838341b6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -16234,7 +16246,7 @@ dependencies = [ "proc-macro2", "quote", "shellexpand", - "syn 2.0.95", + "syn 2.0.100", "witx", ] @@ -16246,7 +16258,7 @@ checksum = "9b8eb1a5783540696c59cefbfc9e52570c2d5e62bd47bdf0bdcef29231879db2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", "wiggle-generate", ] @@ -16363,7 +16375,7 @@ checksum = "2bbd5b46c938e506ecbce286b6628a02171d56153ba733b6c741fc627ec9579b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -16374,7 +16386,7 @@ checksum = "053c4c462dc91d3b1504c6fe5a726dd15e216ba718e84a0e46a88fbe5ded3515" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -16833,7 +16845,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", "synstructure", ] @@ -16870,7 +16882,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -16881,7 +16893,7 @@ checksum = "6352c01d0edd5db859a63e2605f4ea3183ddbd15e2c4a9e7d32184df75e4f154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] @@ -16901,7 +16913,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", "synstructure", ] @@ -16930,7 +16942,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.100", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 1c9184126890d..d217d582b50a9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -587,9 +587,8 @@ result_large_err = "allow" [profile.release] debug = 1 lto = "thin" -overflow-checks = false -opt-level = "s" # defaults to be 3 -incremental = true +opt-level = "s" # defaults to be 3 +#incremental = true [profile.ci] inherits = "release" diff --git a/src/common/base/src/runtime/memory/mem_stat.rs b/src/common/base/src/runtime/memory/mem_stat.rs index 
295ff6e4927ff..d646d925750ce 100644 --- a/src/common/base/src/runtime/memory/mem_stat.rs +++ b/src/common/base/src/runtime/memory/mem_stat.rs @@ -40,14 +40,14 @@ pub struct MemStat { name: Option<String>, pub(crate) used: AtomicI64, - pub(crate) peek_used: AtomicI64, + pub(crate) peak_used: AtomicI64, /// The limit of max used memory for this tracker. /// /// Set to 0 to disable the limit. limit: AtomicI64, - parent_memory_stat: Vec<Arc<MemStat>>, + parent_memory_stat: Option<Arc<MemStat>>, } impl MemStat { @@ -56,17 +56,17 @@ impl MemStat { id: 0, name: None, used: AtomicI64::new(0), - peek_used: AtomicI64::new(0), + peak_used: AtomicI64::new(0), limit: AtomicI64::new(0), - parent_memory_stat: vec![], + parent_memory_stat: None, } } pub fn create(name: String) -> Arc<MemStat> { - MemStat::create_child(name, vec![]) + MemStat::create_child(name, None) } - pub fn create_child(name: String, parent_memory_stat: Vec<Arc<MemStat>>) -> Arc<MemStat> { + pub fn create_child(name: String, parent_memory_stat: Option<Arc<MemStat>>) -> Arc<MemStat> { let id = match GlobalSequence::next() { 0 => GlobalSequence::next(), id => id, @@ -76,16 +76,12 @@ impl MemStat { id, name: Some(name), used: AtomicI64::new(0), - peek_used: AtomicI64::new(0), + peak_used: AtomicI64::new(0), limit: AtomicI64::new(0), parent_memory_stat, }) } - pub fn get_parent_memory_stat(&self) -> Vec<Arc<MemStat>> { - self.parent_memory_stat.clone() - } - pub fn set_limit(&self, mut size: i64) { // It may cause the process unable to run if memory limit is too low. if size > 0 && size < MINIMUM_MEMORY_LIMIT { @@ -107,19 +103,15 @@ impl MemStat { let mut used = self.used.fetch_add(batch_memory_used, Ordering::Relaxed); used += batch_memory_used; - self.peek_used.fetch_max(used, Ordering::Relaxed); + self.peak_used.fetch_max(used, Ordering::Relaxed); - for (idx, parent_memory_stat) in self.parent_memory_stat.iter().enumerate() { + if let Some(parent_memory_stat) = self.parent_memory_stat.as_ref() { if let Err(cause) = parent_memory_stat .record_memory::<NEED_ROLLBACK>(batch_memory_used, current_memory_alloc) { if NEED_ROLLBACK { // We only roll back the memory that alloc failed self.used.fetch_sub(current_memory_alloc, Ordering::Relaxed); - - for index in 0..idx { - self.parent_memory_stat[index].rollback(current_memory_alloc); - } } return Err(cause); @@ -142,8 +134,8 @@ impl MemStat { pub fn rollback(&self, memory_usage: i64) { self.used.fetch_sub(memory_usage, Ordering::Relaxed); - for parent_memory_stat in &self.parent_memory_stat { - parent_memory_stat.rollback(memory_usage) + if let Some(parent_memory_stat) = &self.parent_memory_stat { + parent_memory_stat.rollback(memory_usage); } } @@ -171,7 +163,7 @@ impl MemStat { #[inline] pub fn get_peek_memory_usage(&self) -> i64 { - self.peek_used.load(Ordering::Relaxed) + self.peak_used.load(Ordering::Relaxed) } } @@ -268,7 +260,7 @@ mod tests { fn test_multiple_level_mem_stat() -> Result<()> { let mem_stat = MemStat::create("TEST".to_string()); let child_mem_stat = - MemStat::create_child("TEST_CHILD".to_string(), vec![mem_stat.clone()]); + MemStat::create_child("TEST_CHILD".to_string(), Some(mem_stat.clone())); mem_stat.record_memory::(1, 1).unwrap(); mem_stat.record_memory::(2, 2).unwrap(); @@ -292,7 +284,7 @@ mod tests { let mem_stat = MemStat::create("TEST".to_string()); mem_stat.set_limit(MINIMUM_MEMORY_LIMIT * 2); let child_mem_stat = - MemStat::create_child("TEST_CHILD".to_string(), vec![mem_stat.clone()]); + MemStat::create_child("TEST_CHILD".to_string(), Some(mem_stat.clone())); child_mem_stat.set_limit(MINIMUM_MEMORY_LIMIT); mem_stat.record_memory::(1, 1).unwrap(); @@ -322,7 +314,7 @@ mod
tests { let mem_stat = MemStat::create("TEST".to_string()); mem_stat.set_limit(MINIMUM_MEMORY_LIMIT); let child_mem_stat = - MemStat::create_child("TEST_CHILD".to_string(), vec![mem_stat.clone()]); + MemStat::create_child("TEST_CHILD".to_string(), Some(mem_stat.clone())); child_mem_stat.set_limit(MINIMUM_MEMORY_LIMIT * 2); assert!(child_mem_stat @@ -335,7 +327,7 @@ mod tests { let mem_stat = MemStat::create("TEST".to_string()); mem_stat.set_limit(MINIMUM_MEMORY_LIMIT * 2); let child_mem_stat = - MemStat::create_child("TEST_CHILD".to_string(), vec![mem_stat.clone()]); + MemStat::create_child("TEST_CHILD".to_string(), Some(mem_stat.clone())); child_mem_stat.set_limit(MINIMUM_MEMORY_LIMIT); assert!(child_mem_stat diff --git a/src/common/base/src/runtime/memory/stat_buffer_global.rs b/src/common/base/src/runtime/memory/stat_buffer_global.rs index 4eb20f411f296..cce85443b3054 100644 --- a/src/common/base/src/runtime/memory/stat_buffer_global.rs +++ b/src/common/base/src/runtime/memory/stat_buffer_global.rs @@ -90,7 +90,7 @@ impl GlobalStatBuffer { .used .fetch_add(memory_usage, Ordering::Relaxed); self.global_mem_stat - .peek_used + .peak_used .fetch_max(used + memory_usage, Ordering::Relaxed); return Ok(()); } @@ -126,7 +126,7 @@ impl GlobalStatBuffer { .used .fetch_add(memory_usage, Ordering::Relaxed); self.global_mem_stat - .peek_used + .peak_used .fetch_max(used + memory_usage, Ordering::Relaxed); return; } diff --git a/src/common/base/src/runtime/memory/stat_buffer_mem_stat.rs b/src/common/base/src/runtime/memory/stat_buffer_mem_stat.rs index 8c890598dd61b..71f035fb9d726 100644 --- a/src/common/base/src/runtime/memory/stat_buffer_mem_stat.rs +++ b/src/common/base/src/runtime/memory/stat_buffer_mem_stat.rs @@ -93,7 +93,7 @@ impl MemStatBuffer { if self.destroyed_thread_local_macro { let used = mem_stat.used.fetch_add(usage, Ordering::Relaxed); mem_stat - .peek_used + .peak_used .fetch_max(used + usage, Ordering::Relaxed); return Ok(()); } @@ -134,7 +134,7 @@ impl MemStatBuffer { if self.destroyed_thread_local_macro { let used = mem_stat.used.fetch_add(memory_usage, Ordering::Relaxed); mem_stat - .peek_used + .peak_used .fetch_max(used + memory_usage, Ordering::Relaxed); return; } diff --git a/src/query/expression/src/aggregate/aggregate_hashtable.rs b/src/query/expression/src/aggregate/aggregate_hashtable.rs index 3fae88f9f636b..0cf85106e336f 100644 --- a/src/query/expression/src/aggregate/aggregate_hashtable.rs +++ b/src/query/expression/src/aggregate/aggregate_hashtable.rs @@ -177,7 +177,7 @@ impl AggregateHashTable { row_count: usize, ) -> Result { state.row_count = row_count; - group_hash_columns(group_columns, &mut state.group_hashes); + group_hash_columns(group_columns, state.group_hashes.as_mut_slice()); let new_group_count = if self.direct_append { for idx in 0..row_count { @@ -337,7 +337,7 @@ impl AggregateHashTable { unsafe { row_match_columns( group_columns, - &state.addresses, + state.addresses.as_slice(), &mut state.group_compare_vector, &mut state.temp_vector, need_compare_count, diff --git a/src/query/expression/src/aggregate/mod.rs b/src/query/expression/src/aggregate/mod.rs index 6911a0efc3cf3..28a8a0a9eae0f 100644 --- a/src/query/expression/src/aggregate/mod.rs +++ b/src/query/expression/src/aggregate/mod.rs @@ -38,10 +38,10 @@ pub use payload::*; pub use payload_flush::*; pub use probe_state::*; -pub type SelectVector = [usize; BATCH_SIZE]; +pub type SelectVector = Vec; pub fn new_sel() -> SelectVector { - [0; BATCH_SIZE] + vec![0; BATCH_SIZE] } // A batch size to probe, 
flush, repartition, etc. diff --git a/src/query/expression/src/aggregate/partitioned_payload.rs b/src/query/expression/src/aggregate/partitioned_payload.rs index 5b27d6939f330..c5163811ffd90 100644 --- a/src/query/expression/src/aggregate/partitioned_payload.rs +++ b/src/query/expression/src/aggregate/partitioned_payload.rs @@ -16,6 +16,8 @@ use std::sync::Arc; use bumpalo::Bump; use itertools::Itertools; +use serde::Deserializer; +use serde::Serializer; use super::payload::Payload; use super::probe_state::ProbeState; @@ -50,6 +52,18 @@ pub struct PartitionedPayload { unsafe impl Send for PartitionedPayload {} unsafe impl Sync for PartitionedPayload {} +impl serde::Serialize for PartitionedPayload { + fn serialize(&self, _: S) -> Result { + unreachable!("PartitionedPayload must not be exchanged between multiple nodes.") + } +} + +impl<'de> serde::Deserialize<'de> for PartitionedPayload { + fn deserialize>(_: D) -> Result { + unreachable!("PartitionedPayload must not be exchanged between multiple nodes.") + } +} + impl PartitionedPayload { pub fn new( group_types: Vec, @@ -69,7 +83,7 @@ impl PartitionedPayload { let payloads = (0..partition_count) .map(|_| { Payload::new( - arenas[0].clone(), + arenas.clone(), group_types.clone(), aggrs.clone(), states_layout.clone(), @@ -116,9 +130,9 @@ impl PartitionedPayload { if self.payloads.len() == 1 { self.payloads[0].reserve_append_rows( &state.empty_vector, - &state.group_hashes, - &mut state.addresses, - &mut state.page_index, + state.group_hashes.as_slice(), + state.addresses.as_mut_slice(), + state.page_index.as_mut_slice(), new_group_rows, group_columns, ); @@ -143,9 +157,9 @@ impl PartitionedPayload { self.payloads[partition_index].reserve_append_rows( sel, - &state.group_hashes, - &mut state.addresses, - &mut state.page_index, + state.group_hashes.as_slice(), + state.addresses.as_mut_slice(), + state.page_index.as_mut_slice(), count, group_columns, ); diff --git a/src/query/expression/src/aggregate/payload.rs b/src/query/expression/src/aggregate/payload.rs index 788f187ed9699..28c6303aa3045 100644 --- a/src/query/expression/src/aggregate/payload.rs +++ b/src/query/expression/src/aggregate/payload.rs @@ -46,7 +46,7 @@ use crate::MAX_PAGE_SIZE; // [HASH] is the hash data of the groups // [STATE_ADDRS] is the state_addrs of the aggregate functions, 8 bytes each pub struct Payload { - pub arena: Arc, + pub arena: Vec>, // if true, the states are moved out of the payload into other payload, and will not be dropped pub state_move_out: bool, pub group_types: Vec, @@ -94,7 +94,7 @@ pub type Pages = Vec; impl Payload { pub fn new( - arena: Arc, + arena: Vec>, group_types: Vec, aggrs: Vec, states_layout: Option, @@ -267,7 +267,7 @@ impl Payload { unsafe { serialize_column_to_rowformat( - &self.arena, + &self.arena[0], col, select_vector, new_group_rows, @@ -297,7 +297,7 @@ impl Payload { // write states let (array_layout, padded_size) = layout.repeat(new_group_rows).unwrap(); // Bump only allocates but does not drop, so there is no use after free for any item. 
- let place = self.arena.alloc_layout(array_layout); + let place = self.arena[0].alloc_layout(array_layout); for (idx, place) in select_vector .iter() .take(new_group_rows) @@ -385,7 +385,11 @@ impl Payload { ); } - pub fn scatter(&self, state: &mut PayloadFlushState, partition_count: usize) -> bool { + pub fn scatter_with_seed( + &self, + state: &mut PayloadFlushState, + partitions: usize, + ) -> bool { if state.flush_page >= self.pages.len() { return false; } @@ -397,23 +401,27 @@ impl Payload { state.flush_page += 1; state.flush_page_row = 0; state.row_count = 0; - return self.scatter(state, partition_count); + return self.scatter_with_seed::(state, partitions); } let end = (state.flush_page_row + BATCH_SIZE).min(page.rows); let rows = end - state.flush_page_row; state.row_count = rows; - state.probe_state.reset_partitions(partition_count); + state.probe_state.reset_partitions(partitions); + + let mods: StrengthReducedU64 = StrengthReducedU64::new(partitions as u64); - let mods: StrengthReducedU64 = StrengthReducedU64::new(partition_count as u64); for idx in 0..rows { state.addresses[idx] = self.data_ptr(page, idx + state.flush_page_row); - let hash = unsafe { read::(state.addresses[idx].add(self.hash_offset) as _) }; + let mut hash = unsafe { read::(state.addresses[idx].add(self.hash_offset) as _) }; - let partition_idx = (hash % mods) as usize; + if SEED != 0 { + hash = Self::combine_hash(hash, SEED); + } + let partition_idx = (hash % mods) as usize; let sel = &mut state.probe_state.partition_entries[partition_idx]; sel[state.probe_state.partition_count[partition_idx]] = idx; state.probe_state.partition_count[partition_idx] += 1; @@ -422,6 +430,10 @@ impl Payload { true } + pub fn scatter(&self, state: &mut PayloadFlushState, partitions: usize) -> bool { + self.scatter_with_seed::<0>(state, partitions) + } + pub fn empty_block(&self, fake_rows: Option) -> DataBlock { let fake_rows = fake_rows.unwrap_or(0); let columns = (0..self.aggrs.len()) @@ -434,6 +446,18 @@ impl Payload { .collect_vec(); DataBlock::new_from_columns(columns) } + + #[allow(unused_parens)] + fn combine_hash(hash: u64, seed: u64) -> u64 { + static KMUL: u64 = 0x9ddfea08eb382d69; + + let mut a = (seed ^ hash).wrapping_mul(KMUL); + a ^= (a >> 47); + + let mut b = (hash ^ a).wrapping_mul(KMUL); + b ^= (b >> 47); + b.wrapping_mul(KMUL) + } } impl Drop for Payload { diff --git a/src/query/expression/src/aggregate/payload_flush.rs b/src/query/expression/src/aggregate/payload_flush.rs index 4fe9f35830227..2b3161c252950 100644 --- a/src/query/expression/src/aggregate/payload_flush.rs +++ b/src/query/expression/src/aggregate/payload_flush.rs @@ -164,22 +164,6 @@ impl Payload { Ok(Some(DataBlock::new_from_columns(cols))) } - pub fn group_by_flush_all(&self) -> Result { - let mut state = PayloadFlushState::default(); - let mut blocks = vec![]; - - while self.flush(&mut state) { - let cols = state.take_group_columns(); - blocks.push(DataBlock::new_from_columns(cols)); - } - - if blocks.is_empty() { - return Ok(self.empty_block(None)); - } - - DataBlock::concat(&blocks) - } - pub fn flush(&self, state: &mut PayloadFlushState) -> bool { if state.flush_page >= self.pages.len() { return false; diff --git a/src/query/expression/src/aggregate/payload_row.rs b/src/query/expression/src/aggregate/payload_row.rs index ce8b908e0b5ae..ee73c9142e30c 100644 --- a/src/query/expression/src/aggregate/payload_row.rs +++ b/src/query/expression/src/aggregate/payload_row.rs @@ -421,7 +421,7 @@ unsafe fn row_match_binary_column( } } - 
select_vector.clone_from_slice(temp_vector); + select_vector.clone_from_slice(temp_vector.as_slice()); *count = match_count; } @@ -502,7 +502,7 @@ unsafe fn row_match_string_column( } } - select_vector.clone_from_slice(temp_vector); + select_vector.clone_from_slice(temp_vector.as_slice()); *count = match_count; } @@ -567,7 +567,7 @@ unsafe fn row_match_column_type( } } - select_vector.clone_from_slice(temp_vector); + select_vector.clone_from_slice(temp_vector.as_slice()); *count = match_count; } @@ -604,6 +604,6 @@ unsafe fn row_match_generic_column( *no_match_count += 1; } } - select_vector.clone_from_slice(temp_vector); + select_vector.clone_from_slice(temp_vector.as_slice()); *count = match_count; } diff --git a/src/query/expression/src/aggregate/probe_state.rs b/src/query/expression/src/aggregate/probe_state.rs index 896c1ff46cca9..5b1cb702abb18 100644 --- a/src/query/expression/src/aggregate/probe_state.rs +++ b/src/query/expression/src/aggregate/probe_state.rs @@ -20,10 +20,10 @@ use crate::BATCH_SIZE; /// ProbeState is the state to probe HT /// It could be reuse during multiple probe process pub struct ProbeState { - pub group_hashes: [u64; BATCH_SIZE], - pub addresses: [*const u8; BATCH_SIZE], - pub page_index: [usize; BATCH_SIZE], - pub state_places: [StateAddr; BATCH_SIZE], + pub group_hashes: Vec, + pub addresses: Vec<*const u8>, + pub page_index: Vec, + pub state_places: Vec, pub group_compare_vector: SelectVector, pub no_match_vector: SelectVector, pub empty_vector: SelectVector, @@ -37,10 +37,10 @@ pub struct ProbeState { impl Default for ProbeState { fn default() -> Self { Self { - group_hashes: [0_u64; BATCH_SIZE], - addresses: [std::ptr::null::(); BATCH_SIZE], - page_index: [0; BATCH_SIZE], - state_places: [StateAddr::new(0); BATCH_SIZE], + group_hashes: vec![0_u64; BATCH_SIZE], + addresses: vec![std::ptr::null::(); BATCH_SIZE], + page_index: vec![0; BATCH_SIZE], + state_places: vec![StateAddr::new(0); BATCH_SIZE], group_compare_vector: new_sel(), no_match_vector: new_sel(), empty_vector: new_sel(), @@ -64,8 +64,8 @@ impl ProbeState { pub fn reset_partitions(&mut self, partition_count: usize) { if self.partition_entries.len() < partition_count { - self.partition_entries.resize(partition_count, new_sel()); self.partition_count.resize(partition_count, 0); + self.partition_entries.resize_with(partition_count, new_sel); } for i in 0..partition_count { diff --git a/src/query/expression/src/lib.rs b/src/query/expression/src/lib.rs index e402fe927d1a2..0de870bd811ff 100755 --- a/src/query/expression/src/lib.rs +++ b/src/query/expression/src/lib.rs @@ -43,6 +43,7 @@ #![feature(alloc_layout_extra)] #![feature(debug_closure_helpers)] #![feature(never_type)] +extern crate core; #[allow(dead_code)] mod block; diff --git a/src/query/pipeline/core/Cargo.toml b/src/query/pipeline/core/Cargo.toml index 64866ff49dc29..3464bd77c300d 100644 --- a/src/query/pipeline/core/Cargo.toml +++ b/src/query/pipeline/core/Cargo.toml @@ -17,6 +17,7 @@ futures = { workspace = true } log = { workspace = true } petgraph = { workspace = true } serde = { workspace = true } +typetag = { workspace = true } [dev-dependencies] serde = { workspace = true } diff --git a/src/query/pipeline/core/src/lib.rs b/src/query/pipeline/core/src/lib.rs index d064965129771..a8a59cadeb076 100644 --- a/src/query/pipeline/core/src/lib.rs +++ b/src/query/pipeline/core/src/lib.rs @@ -15,6 +15,8 @@ #![feature(once_cell_try)] #![feature(variant_count)] #![feature(associated_type_defaults)] +#![feature(adt_const_params)] 
+#![feature(let_chains)] #![allow(clippy::arc_with_non_send_sync)] #![allow(clippy::useless_asref)] diff --git a/src/query/pipeline/core/src/pipeline.rs b/src/query/pipeline/core/src/pipeline.rs index 8072b7b997b88..808849efa2bf7 100644 --- a/src/query/pipeline/core/src/pipeline.rs +++ b/src/query/pipeline/core/src/pipeline.rs @@ -20,6 +20,7 @@ use std::sync::atomic::Ordering; use std::sync::Arc; use std::time::Instant; +use databend_common_base::base::tokio::sync::Barrier; use databend_common_base::runtime::defer; use databend_common_base::runtime::drop_guard; use databend_common_exception::ErrorCode; @@ -32,10 +33,14 @@ use crate::finished_chain::ExecutionInfo; use crate::finished_chain::FinishedCallbackChain; use crate::pipe::Pipe; use crate::pipe::PipeItem; +use crate::processors::BatchExchangeProcessor; +use crate::processors::BatchMergePartitionProcessor; +use crate::processors::BatchPartitionProcessor; use crate::processors::DuplicateProcessor; use crate::processors::Exchange; use crate::processors::InputPort; use crate::processors::MergePartitionProcessor; +use crate::processors::OnePartitionProcessor; use crate::processors::OutputPort; use crate::processors::PartitionProcessor; use crate::processors::PlanScope; @@ -447,23 +452,43 @@ impl Pipeline { } } - pub fn exchange(&mut self, n: usize, exchange: Arc) { + pub fn exchange(&mut self, n: usize, exchange: Arc) -> Result<()> { + debug_assert_ne!(n, 0); + + if !T::MULTIWAY_SORT { + return self.batch_exchange(n, exchange); + } + if let Some(pipe) = self.pipes.last() { if pipe.output_length < 1 { - return; + return Ok(()); } let input_len = pipe.output_length; + let barrier = Arc::new(Barrier::new(input_len)); let mut items = Vec::with_capacity(input_len); - for _index in 0..input_len { + for index in 0..input_len { let input = InputPort::create(); - let outputs: Vec<_> = (0..n).map(|_| OutputPort::create()).collect(); - items.push(PipeItem::create( - PartitionProcessor::create(input.clone(), outputs.clone(), exchange.clone()), - vec![input], - outputs, - )); + let outputs = (0..n).map(|_| OutputPort::create()).collect::>(); + let partition_processor = match n { + 1 => OnePartitionProcessor::create( + input.clone(), + outputs[0].clone(), + exchange.clone(), + index, + barrier.clone(), + ), + _ => PartitionProcessor::create( + input.clone(), + outputs.clone(), + exchange.clone(), + index, + barrier.clone(), + ), + }; + + items.push(PipeItem::create(partition_processor, vec![input], outputs)); } // partition data block @@ -481,7 +506,7 @@ impl Pipeline { let output = OutputPort::create(); let inputs: Vec<_> = (0..input_len).map(|_| InputPort::create()).collect(); items.push(PipeItem::create( - MergePartitionProcessor::create( + MergePartitionProcessor::::create( inputs.clone(), output.clone(), exchange.clone(), @@ -492,8 +517,43 @@ impl Pipeline { } // merge partition - self.add_pipe(Pipe::create(input_len * n, n, items)) + self.add_pipe(Pipe::create(input_len * n, n, items)); } + + Ok(()) + } + + fn batch_exchange(&mut self, n: usize, exchange: Arc) -> Result<()> { + self.add_transform(|input, output| { + Ok(BatchPartitionProcessor::create( + input, + output, + n, + exchange.clone(), + )) + })?; + + let input_len = self.output_len(); + let inputs = (0..input_len) + .map(|_| InputPort::create()) + .collect::>(); + let outputs = (0..n).map(|_| OutputPort::create()).collect::>(); + + self.add_pipe(Pipe::create(input_len, n, vec![PipeItem::create( + BatchExchangeProcessor::create(inputs.clone(), outputs.clone(), exchange.clone()), + 
inputs, + outputs, + )])); + + self.add_transform(|input, output| { + Ok(BatchMergePartitionProcessor::create( + input, + output, + exchange.clone(), + )) + })?; + + Ok(()) } #[track_caller] diff --git a/src/query/pipeline/core/src/processors/mod.rs b/src/query/pipeline/core/src/processors/mod.rs index c3b0e1772a341..00023c709fd0d 100644 --- a/src/query/pipeline/core/src/processors/mod.rs +++ b/src/query/pipeline/core/src/processors/mod.rs @@ -37,7 +37,12 @@ pub use profile::PlanScope; pub use profile::PlanScopeGuard; pub use resize_processor::create_resize_item; pub use resize_processor::ResizeProcessor; +pub use shuffle_processor::BatchExchangeProcessor; +pub use shuffle_processor::BatchMergePartitionProcessor; +pub use shuffle_processor::BatchPartitionProcessor; pub use shuffle_processor::Exchange; pub use shuffle_processor::MergePartitionProcessor; +pub use shuffle_processor::MultiwayStrategy; +pub use shuffle_processor::OnePartitionProcessor; pub use shuffle_processor::PartitionProcessor; pub use shuffle_processor::ShuffleProcessor; diff --git a/src/query/pipeline/core/src/processors/processor.rs b/src/query/pipeline/core/src/processors/processor.rs index ce70053b80ded..d9e885a1b69ad 100644 --- a/src/query/pipeline/core/src/processors/processor.rs +++ b/src/query/pipeline/core/src/processors/processor.rs @@ -80,6 +80,22 @@ pub trait Processor: Send { Err(ErrorCode::Unimplemented("Unimplemented async_process.")) } + fn prepare_spill_payload(&mut self) -> Result { + Err(ErrorCode::Unimplemented( + "Unimplemented prepare_spill_payload", + )) + } + + async fn flush_spill_payload(&mut self) -> Result { + Err(ErrorCode::Unimplemented( + "Unimplemented flush_spill_payload", + )) + } + + fn configure_peer_nodes(&mut self, _nodes: &[String]) { + // do nothing by default + } + fn details_status(&self) -> Option { None } @@ -198,6 +214,10 @@ impl ProcessorPtr { .boxed() } + pub fn configure_peer_nodes(&self, nodes: &[String]) { + unsafe { (*self.inner.get()).configure_peer_nodes(nodes) } + } + /// # Safety pub unsafe fn details_status(&self) -> Option { (*self.inner.get()).details_status() diff --git a/src/query/pipeline/core/src/processors/shuffle_processor.rs b/src/query/pipeline/core/src/processors/shuffle_processor.rs index 2b57c3b3cc333..3ba0673135f34 100644 --- a/src/query/pipeline/core/src/processors/shuffle_processor.rs +++ b/src/query/pipeline/core/src/processors/shuffle_processor.rs @@ -13,9 +13,16 @@ // limitations under the License. 
use std::any::Any; +use std::cmp::Ordering; +use std::collections::VecDeque; use std::sync::Arc; +use databend_common_base::base::tokio::sync::Barrier; use databend_common_exception::Result; +use databend_common_expression::local_block_meta_serde; +use databend_common_expression::BlockMetaInfo; +use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::BlockMetaInfoPtr; use databend_common_expression::DataBlock; use crate::processors::Event; @@ -25,6 +32,7 @@ use crate::processors::OutputPort; use crate::processors::Processor; use crate::processors::ProcessorPtr; +#[derive(Eq, PartialEq)] pub enum MultiwayStrategy { Random, Custom, @@ -32,14 +40,43 @@ pub enum MultiwayStrategy { pub trait Exchange: Send + Sync + 'static { const NAME: &'static str; + const MULTIWAY_SORT: bool = false; const SKIP_EMPTY_DATA_BLOCK: bool = false; - const STRATEGY: MultiwayStrategy = MultiwayStrategy::Random; fn partition(&self, data_block: DataBlock, n: usize) -> Result>; - fn multiway_pick(&self, _partitions: &[Option]) -> Result { + fn init_way(&self, _index: usize, _first_data: &DataBlock) -> Result<()> { + Ok(()) + } + + fn sorting_function(_: &DataBlock, _: &DataBlock) -> Ordering { unimplemented!() } + + fn multiway_pick(&self, data_blocks: &mut [Option]) -> Option { + let position = + data_blocks + .iter() + .enumerate() + .filter_map(|(idx, x)| x.as_ref().map(|d| (idx, d))) + .min_by(|(left_idx, left_block), (right_idx, right_block)| { + match Self::sorting_function(left_block, right_block) { + Ordering::Less => Ordering::Less, + Ordering::Greater => Ordering::Greater, + Ordering::Equal => left_idx.cmp(right_idx), + } + }); + + position.map(|(idx, _)| idx) + } + + fn output_window_size(&self) -> usize { + 3 + } + + fn merge_output(&self, data_blocks: Vec) -> Result> { + Ok(data_blocks) + } } pub struct ShuffleProcessor { @@ -166,6 +203,11 @@ pub struct PartitionProcessor { exchange: Arc, input_data: Option, partitioned_data: Vec>, + + index: usize, + initialized: bool, + barrier: Arc, + hit: usize, } impl PartitionProcessor { @@ -173,18 +215,26 @@ impl PartitionProcessor { input: Arc, outputs: Vec>, exchange: Arc, + index: usize, + barrier: Arc, ) -> ProcessorPtr { let partitioned_data = vec![None; outputs.len()]; + let hit = index % outputs.len(); ProcessorPtr::create(Box::new(PartitionProcessor { input, outputs, exchange, partitioned_data, input_data: None, + initialized: !T::MULTIWAY_SORT, + index, + barrier, + hit, })) } } +#[async_trait::async_trait] impl Processor for PartitionProcessor { fn name(&self) -> String { format!("ShufflePartition({})", T::NAME) @@ -198,7 +248,15 @@ impl Processor for PartitionProcessor { let mut all_output_finished = true; let mut all_data_pushed_output = true; - for (index, output) in self.outputs.iter().enumerate() { + for _index in 0..self.outputs.len() { + let index = self.hit; + let output = &self.outputs[self.hit]; + self.hit += 1; + + if self.hit == self.outputs.len() { + self.hit = 0; + } + if output.is_finished() { self.partitioned_data[index].take(); continue; @@ -208,20 +266,25 @@ impl Processor for PartitionProcessor { if output.can_push() { if let Some(block) = self.partitioned_data[index].take() { - output.push_data(Ok(block)); - - continue; + if !block.is_empty() || block.get_meta().is_some() { + output.push_data(Ok(block)); + return Ok(Event::NeedConsume); + } } } - if self.partitioned_data[index].is_some() { + if !output.can_push() || self.partitioned_data[index].is_some() { all_data_pushed_output = false; } } if 
all_output_finished { self.input.finish(); - return Ok(Event::Finished); + + return match self.initialized { + true => Ok(Event::Finished), + false => Ok(Event::Async), + }; } if !all_data_pushed_output { @@ -229,9 +292,20 @@ impl Processor for PartitionProcessor { return Ok(Event::NeedConsume); } + if self.input_data.is_some() { + return match self.initialized { + true => Ok(Event::Sync), + false => Ok(Event::Async), + }; + } + if self.input.has_data() { self.input_data = Some(self.input.pull_data().unwrap()?); - return Ok(Event::Sync); + + return match self.initialized { + true => Ok(Event::Sync), + false => Ok(Event::Async), + }; } if self.input.is_finished() { @@ -239,7 +313,10 @@ impl Processor for PartitionProcessor { output.finish(); } - return Ok(Event::Finished); + return match self.initialized { + true => Ok(Event::Finished), + false => Ok(Event::Async), + }; } self.input.set_need_data(); @@ -254,25 +331,160 @@ impl Processor for PartitionProcessor { let partitioned = self.exchange.partition(block, self.outputs.len())?; - for (index, block) in partitioned.into_iter().enumerate() { - if block.is_empty() && block.get_meta().is_none() { - continue; - } + if partitioned.is_empty() { + return Ok(()); + } + assert_eq!(partitioned.len(), self.outputs.len()); + for (index, block) in partitioned.into_iter().enumerate() { self.partitioned_data[index] = Some(block); } } Ok(()) } + + async fn async_process(&mut self) -> Result<()> { + self.initialized = true; + if let Some(data_block) = self.input_data.as_ref() { + self.exchange.init_way(self.index, data_block)?; + } + + self.barrier.wait().await; + Ok(()) + } } -pub struct MergePartitionProcessor { +pub struct OnePartitionProcessor { + input: Arc, + output: Arc, + exchange: Arc, + input_data: Option, + + index: usize, + initialized: bool, + barrier: Arc, +} + +impl OnePartitionProcessor { + pub fn create( + input: Arc, + outputs: Arc, + exchange: Arc, + index: usize, + barrier: Arc, + ) -> ProcessorPtr { + ProcessorPtr::create(Box::new(OnePartitionProcessor { + input, + output: outputs, + exchange, + input_data: None, + initialized: !T::MULTIWAY_SORT, + index, + barrier, + })) + } +} + +#[async_trait::async_trait] +impl Processor for OnePartitionProcessor { + fn name(&self) -> String { + format!("ShufflePartition({})", T::NAME) + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if self.output.is_finished() { + self.input.finish(); + + return match self.initialized { + true => Ok(Event::Finished), + false => Ok(Event::Async), + }; + } + + if !self.output.can_push() { + self.input.set_not_need_data(); + return Ok(Event::NeedConsume); + } + + if self.input_data.is_some() { + if !self.initialized { + return Ok(Event::Async); + } + + let block = self.input_data.take().unwrap(); + let mut partitioned_data = self.exchange.partition(block, 1)?; + + if let Some(block) = partitioned_data.pop() { + debug_assert!(partitioned_data.is_empty()); + self.output.push_data(Ok(block)); + return Ok(Event::NeedConsume); + } + } + + if self.input.has_data() { + if !self.initialized { + self.input_data = Some(self.input.pull_data().unwrap()?); + return Ok(Event::Async); + } + + let data_block = self.input.pull_data().unwrap()?; + let mut partitioned_data = self.exchange.partition(data_block, 1)?; + + if let Some(block) = partitioned_data.pop() { + debug_assert!(partitioned_data.is_empty()); + self.output.push_data(Ok(block)); + return Ok(Event::NeedConsume); + } + } + if self.input.is_finished() { + 
self.output.finish(); + + return match self.initialized { + true => Ok(Event::Finished), + false => Ok(Event::Async), + }; + } + + self.input.set_need_data(); + Ok(Event::NeedData) + } + + async fn async_process(&mut self) -> Result<()> { + self.initialized = true; + if let Some(data_block) = self.input_data.as_ref() { + self.exchange.init_way(self.index, data_block)?; + } + + self.barrier.wait().await; + Ok(()) + } +} + +#[derive(Clone, PartialEq)] +enum PortStatus { + Idle, + HasData, + Finished, +} + +pub struct MergePartitionProcessor { output: Arc, inputs: Vec>, inputs_data: Vec>, + exchange: Arc, + + initialize: bool, + finished_inputs: usize, + waiting_inputs: VecDeque, + wakeup_inputs: VecDeque, + inputs_status: Vec, } impl MergePartitionProcessor { @@ -282,18 +494,30 @@ impl MergePartitionProcessor { exchange: Arc, ) -> ProcessorPtr { let inputs_data = vec![None; inputs.len()]; - ProcessorPtr::create(Box::new(MergePartitionProcessor { + let inputs_status = vec![PortStatus::Idle; inputs.len()]; + let waiting_inputs = VecDeque::with_capacity(inputs.len()); + let wakeup_inputs = VecDeque::with_capacity(inputs.len()); + + ProcessorPtr::create(Box::new(MergePartitionProcessor:: { output, inputs, - exchange, inputs_data, + exchange, + inputs_status, + waiting_inputs, + initialize: false, + finished_inputs: 0, + wakeup_inputs, })) } } impl Processor for MergePartitionProcessor { fn name(&self) -> String { - format!("ShuffleMergePartition({})", T::NAME) + match T::MULTIWAY_SORT { + true => format!("ShuffleSortMergePartition({})", T::NAME), + false => format!("ShuffleMergePartition({})", T::NAME), + } } fn as_any(&mut self) -> &mut dyn Any { @@ -314,8 +538,7 @@ impl Processor for MergePartitionProcessor { } let mut all_inputs_finished = true; - let mut need_pick_block_to_push = matches!(T::STRATEGY, MultiwayStrategy::Custom); - + let mut need_pick_block_to_push = true; for (index, input) in self.inputs.iter().enumerate() { if input.is_finished() { continue; @@ -323,19 +546,8 @@ impl Processor for MergePartitionProcessor { all_inputs_finished = false; - if input.has_data() { - match T::STRATEGY { - MultiwayStrategy::Random => { - if self.output.can_push() { - self.output.push_data(Ok(input.pull_data().unwrap()?)); - } - } - MultiwayStrategy::Custom => { - if self.inputs_data[index].is_none() { - self.inputs_data[index] = Some(input.pull_data().unwrap()?); - } - } - } + if input.has_data() && self.inputs_data[index].is_none() { + self.inputs_data[index] = Some(input.pull_data().unwrap()?); } if self.inputs_data[index].is_none() { @@ -345,20 +557,456 @@ impl Processor for MergePartitionProcessor { input.set_need_data(); } + if need_pick_block_to_push { + if let Some(pick_index) = self.exchange.multiway_pick(&mut self.inputs_data) { + if let Some(block) = self.inputs_data[pick_index].take() { + self.output.push_data(Ok(block)); + return Ok(Event::NeedConsume); + } + } + } + if all_inputs_finished { self.output.finish(); return Ok(Event::Finished); } - if need_pick_block_to_push { - let pick_index = self.exchange.multiway_pick(&self.inputs_data)?; + Ok(Event::NeedData) + } - if let Some(block) = self.inputs_data[pick_index].take() { - self.output.push_data(Ok(block)); + fn event_with_cause(&mut self, cause: EventCause) -> Result { + if T::MULTIWAY_SORT { + return self.event(); + } + + if let EventCause::Output(_) = cause { + if self.output.is_finished() { + for input in &self.inputs { + input.finish(); + } + + return Ok(Event::Finished); + } + + if !self.output.can_push() { + return 
Ok(Event::NeedConsume); + } + + while let Some(idx) = self.wakeup_inputs.pop_front() { + self.inputs[idx].set_need_data(); + } + } + + if !self.initialize && self.waiting_inputs.is_empty() { + self.initialize = true; + + for input in &self.inputs { + input.set_need_data(); + } + + return Ok(Event::NeedData); + } + + if let EventCause::Input(idx) = cause { + if self.inputs[idx].is_finished() && self.inputs_status[idx] != PortStatus::Finished { + self.finished_inputs += 1; + self.inputs_status[idx] = PortStatus::Finished; + } + + if self.inputs[idx].has_data() && self.inputs_status[idx] != PortStatus::HasData { + self.waiting_inputs.push_back(idx); + self.inputs_status[idx] = PortStatus::HasData; + } + } + + if self.finished_inputs == self.inputs.len() { + self.output.finish(); + return Ok(Event::Finished); + } + + while !self.waiting_inputs.is_empty() && self.output.can_push() { + let idx = self.waiting_inputs.pop_front().unwrap(); + self.output.push_data(self.inputs[idx].pull_data().unwrap()); + self.inputs_status[idx] = PortStatus::Idle; + + if self.inputs[idx].is_finished() { + if self.inputs_status[idx] != PortStatus::Finished { + self.finished_inputs += 1; + self.inputs_status[idx] = PortStatus::Finished; + } + + continue; + } + + self.wakeup_inputs.push_back(idx); + } + + match self.waiting_inputs.is_empty() { + true => Ok(Event::NeedData), + false => Ok(Event::NeedConsume), + } + } +} + +#[derive(Debug)] +pub struct ExchangeMeta { + data_blocks: Vec, +} + +local_block_meta_serde!(ExchangeMeta); + +#[typetag::serde(name = "LocalExchangeMeta")] +impl BlockMetaInfo for ExchangeMeta {} + +impl ExchangeMeta { + pub fn create(blocks: Vec) -> BlockMetaInfoPtr { + Box::new(ExchangeMeta { + data_blocks: blocks, + }) + } +} + +pub struct BatchPartitionProcessor { + input: Arc, + output: Arc, + + input_data: Option, + output_data: Option, + + exchange: Arc, + to_partition: usize, +} + +impl BatchPartitionProcessor { + pub fn create( + input: Arc, + output: Arc, + to_partition: usize, + exchange: Arc, + ) -> ProcessorPtr { + ProcessorPtr::create(Box::new(BatchPartitionProcessor { + input, + output, + exchange, + to_partition, + input_data: None, + output_data: None, + })) + } +} + +impl Processor for BatchPartitionProcessor { + fn name(&self) -> String { + String::from("PartitionProcessor") + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if self.output.is_finished() { + self.input.finish(); + return Ok(Event::Finished); + } + + if !self.output.can_push() { + self.input.set_not_need_data(); + return Ok(Event::NeedConsume); + } + + if let Some(data_block) = self.output_data.take() { + self.output.push_data(Ok(data_block)); + return Ok(Event::NeedConsume); + } + + if self.input.has_data() { + self.input_data = Some(self.input.pull_data().unwrap()?); + return Ok(Event::Sync); + } + + if self.input.is_finished() { + self.output.finish(); + return Ok(Event::Finished); + } + + self.input.set_need_data(); + Ok(Event::NeedData) + } + + fn process(&mut self) -> Result<()> { + if let Some(block) = self.input_data.take() { + if T::SKIP_EMPTY_DATA_BLOCK && block.is_empty() { + return Ok(()); + } + + let partitioned_data = self.exchange.partition(block, self.to_partition)?; + self.output_data = Some(DataBlock::empty_with_meta(ExchangeMeta::create( + partitioned_data, + ))); + } + + Ok(()) + } +} + +pub struct BatchExchangeProcessor { + input: Vec>, + output: Vec>, + + initialize: bool, + + finished_input_size: usize, + input_finish_status: Vec, + waiting_inputs: 
VecDeque, + + finished_output_size: usize, + pending_outputs: Vec, + output_finish_status: Vec, + + exchange: Arc, + matrix: Vec>, +} + +impl BatchExchangeProcessor { + pub fn create( + input: Vec>, + output: Vec>, + exchange: Arc, + ) -> ProcessorPtr { + let pending_outputs = vec![false; output.len()]; + let input_finish_status = vec![false; input.len()]; + let output_finish_status = vec![false; output.len()]; + + let mut matrix = Vec::with_capacity(output.len()); + + for _ in 0..output.capacity() { + matrix.push(VecDeque::new()); + } + + ProcessorPtr::create(Box::new(BatchExchangeProcessor { + input, + output, + matrix, + exchange, + pending_outputs, + input_finish_status, + output_finish_status, + + initialize: false, + finished_input_size: 0, + finished_output_size: 0, + waiting_inputs: VecDeque::new(), + })) + } +} + +impl Processor for BatchExchangeProcessor { + fn name(&self) -> String { + String::from("BatchExchangeProcessor") + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event_with_cause(&mut self, cause: EventCause) -> Result { + if let EventCause::Input(index) = cause { + if self.input[index].has_data() { + let mut data_block = self.input[index].pull_data().unwrap()?; + + let meta = data_block.take_meta().unwrap(); + let meta = ExchangeMeta::downcast_from(meta).unwrap(); + + for (idx, block) in meta.data_blocks.into_iter().enumerate() { + self.matrix[idx].push_back(block); + } + } + + if self.input[index].is_finished() { + if !self.input_finish_status[index] { + self.finished_input_size += 1; + self.input_finish_status[index] = true; + } + } else { + self.waiting_inputs.push_back(index); + } + } + + if let EventCause::Output(index) = cause { + if self.output[index].is_finished() && !self.output_finish_status[index] { + self.finished_output_size += 1; + self.output_finish_status[index] = true; + } + + if self.output[index].can_push() { + self.pending_outputs[index] = true; + } + } + + if !self.initialize { + self.initialize = true; + + for input in &self.input { + input.set_need_data(); + } + + return Ok(Event::NeedData); + } + + if self.finished_output_size == self.output.len() { + for input in &self.input { + input.finish(); + } + + return Ok(Event::Finished); + } + + let all_input_finished = self.finished_input_size == self.input.len(); + + let mut sent_all_data = true; + for (idx, data) in self.matrix.iter_mut().enumerate() { + if data.is_empty() || self.output_finish_status[idx] { + continue; + } + + sent_all_data = false; + if self.pending_outputs[idx] + && (all_input_finished || (data.len() >= self.exchange.output_window_size())) + { + self.pending_outputs[idx] = false; + let mut output_data = Vec::with_capacity(self.exchange.output_window_size()); + + for _index in 0..self.exchange.output_window_size() { + if let Some(data) = data.pop_front() { + output_data.push(data); + } + } + + self.output[idx].push_data(Ok(DataBlock::empty_with_meta(ExchangeMeta::create( + output_data, + )))); return Ok(Event::NeedConsume); } } + while let Some(index) = self.waiting_inputs.pop_front() { + if !self.input[index].is_finished() { + self.input[index].set_need_data(); + return Ok(Event::NeedData); + } else if !self.input_finish_status[index] { + self.input_finish_status[index] = true; + self.finished_input_size += 1; + } + } + + let all_input_finished = self.finished_input_size == self.input.len(); + if sent_all_data && all_input_finished { + for output in &self.output { + output.finish(); + } + + return Ok(Event::Finished); + } + + Ok(Event::NeedConsume) + } +} + +pub 
+pub struct BatchMergePartitionProcessor<T: Exchange> {
+    input: Arc<InputPort>,
+    output: Arc<OutputPort>,
+
+    input_data: Option<DataBlock>,
+    output_data: VecDeque<DataBlock>,
+
+    exchange: Arc<T>,
+}
+
+impl<T: Exchange> BatchMergePartitionProcessor<T> {
+    pub fn create(
+        input: Arc<InputPort>,
+        output: Arc<OutputPort>,
+        exchange: Arc<T>,
+    ) -> ProcessorPtr {
+        ProcessorPtr::create(Box::new(BatchMergePartitionProcessor {
+            input,
+            output,
+            input_data: None,
+            output_data: VecDeque::new(),
+            exchange,
+        }))
+    }
+}
+
+impl<T: Exchange> Processor for BatchMergePartitionProcessor<T> {
+    fn name(&self) -> String {
+        String::from("MergePartitionProcessor")
+    }
+
+    fn as_any(&mut self) -> &mut dyn Any {
+        self
+    }
+
+    fn event(&mut self) -> Result<Event> {
+        if self.output.is_finished() {
+            self.input.finish();
+            return Ok(Event::Finished);
+        }
+
+        if !self.output.can_push() {
+            self.input.set_not_need_data();
+            return Ok(Event::NeedConsume);
+        }
+
+        if let Some(data_block) = self.output_data.pop_front() {
+            self.output.push_data(Ok(data_block));
+            return Ok(Event::NeedConsume);
+        }
+
+        if self.input.has_data() {
+            self.input_data = Some(self.input.pull_data().unwrap()?);
+            return Ok(Event::Sync);
+        }
+
+        if self.input.is_finished() {
+            self.output.finish();
+            return Ok(Event::Finished);
+        }
+
+        self.input.set_need_data();
         Ok(Event::NeedData)
     }
+
+    fn process(&mut self) -> Result<()> {
+        if let Some(mut block) = self.input_data.take() {
+            let meta = block.take_meta().unwrap();
+            let meta = ExchangeMeta::downcast_from(meta).unwrap();
+            self.output_data
+                .extend(self.exchange.merge_output(meta.data_blocks)?);
+        }
+
+        Ok(())
+    }
 }
+
+// pub struct BatchSortingExchangeProcessor<T: Exchange> {
+//     exchange: Arc<T>,
+//
+//     inputs: Vec<Arc<InputPort>>,
+//     outputs: Vec<Arc<OutputPort>>,
+// }
+//
+// impl Processor for BatchSortingExchangeProcessor {
+//     fn name(&self) -> String {
+//         String::from("BatchSortingShuffleProcessor")
+//     }
+//
+//     fn as_any(&mut self) -> &mut dyn Any {
+//         self
+//     }
+//
+//     fn event_with_cause(&mut self, _cause: EventCause) -> Result<Event> {
+//         todo!()
+//     }
+// }
diff --git a/src/query/pipeline/transforms/src/processors/transforms/transform_accumulating.rs b/src/query/pipeline/transforms/src/processors/transforms/transform_accumulating.rs
index 1ffc9dcbcf2ea..dc46a6c6d759d 100644
--- a/src/query/pipeline/transforms/src/processors/transforms/transform_accumulating.rs
+++ b/src/query/pipeline/transforms/src/processors/transforms/transform_accumulating.rs
@@ -18,6 +18,7 @@ use std::marker::PhantomData;
 use std::sync::Arc;
 
 use databend_common_base::runtime::drop_guard;
+use databend_common_exception::ErrorCode;
 use databend_common_exception::Result;
 use databend_common_expression::BlockMetaInfo;
 use databend_common_expression::BlockMetaInfoDowncast;
@@ -27,9 +28,12 @@ use databend_common_pipeline_core::processors::InputPort;
 use databend_common_pipeline_core::processors::OutputPort;
 use databend_common_pipeline_core::processors::Processor;
 
+#[async_trait::async_trait]
 pub trait AccumulatingTransform: Send {
     const NAME: &'static str;
 
+    const SUPPORT_SPILL: bool = false;
+
     fn transform(&mut self, data: DataBlock) -> Result<Vec<DataBlock>>;
 
     fn on_finish(&mut self, _output: bool) -> Result<Vec<DataBlock>> {
@@ -37,6 +41,24 @@
     }
 
     fn interrupt(&self) {}
+
+    fn configure_peer_nodes(&mut self, _nodes: &[String]) {}
+
+    fn need_spill(&self) -> bool {
+        false
+    }
+
+    fn prepare_spill_payload(&mut self) -> Result<bool> {
+        Err(ErrorCode::Unimplemented(
+            "Unimplemented prepare_spill_payload",
+        ))
+    }
+
+    async fn flush_spill_payload(&mut self) -> Result<bool> {
+        Err(ErrorCode::Unimplemented(
+            "Unimplemented flush_spill_payload",
+        ))
+    }
 }
 
 pub struct
AccumulatingTransformer { @@ -47,6 +69,10 @@ pub struct AccumulatingTransformer { called_on_finish: bool, input_data: Option, output_data: VecDeque, + + has_spill: bool, + flush_spill_payload: bool, + prepare_spill_payload: bool, } impl AccumulatingTransformer { @@ -58,6 +84,9 @@ impl AccumulatingTransformer { input_data: None, output_data: VecDeque::with_capacity(1), called_on_finish: false, + has_spill: false, + flush_spill_payload: false, + prepare_spill_payload: false, }) } } @@ -93,6 +122,14 @@ impl Processor for AccumulatingTransformer Processor for AccumulatingTransformer Ok(Event::Sync), + true => { + // To avoid downstream out-of-memory, once a spill occurs, all data must be spilled entirely. + if self.has_spill { + self.has_spill = false; + self.prepare_spill_payload = true; + } + + Ok(Event::Sync) + } false => { self.output.finish(); Ok(Event::Finished) @@ -126,9 +171,21 @@ impl Processor for AccumulatingTransformer Result<()> { + if self.prepare_spill_payload { + self.prepare_spill_payload = false; + self.flush_spill_payload = self.prepare_spill_payload()?; + return Ok(()); + } + if let Some(data_block) = self.input_data.take() { self.output_data.extend(self.inner.transform(data_block)?); + self.prepare_spill_payload = self.inner.need_spill(); + self.has_spill |= self.prepare_spill_payload; return Ok(()); } @@ -140,8 +197,25 @@ impl Processor for AccumulatingTransformer Result<()> { + if self.flush_spill_payload { + self.flush_spill_payload = false; + self.prepare_spill_payload = self.flush_spill_payload().await?; + } + + Ok(()) + } + + fn prepare_spill_payload(&mut self) -> Result { + self.inner.prepare_spill_payload() + } + + async fn flush_spill_payload(&mut self) -> Result { + self.inner.flush_spill_payload().await + } + + fn configure_peer_nodes(&mut self, nodes: &[String]) { + self.inner.configure_peer_nodes(nodes) } } diff --git a/src/query/service/src/interpreters/common/query_log.rs b/src/query/service/src/interpreters/common/query_log.rs index 2896858421360..2e9dd024b642b 100644 --- a/src/query/service/src/interpreters/common/query_log.rs +++ b/src/query/service/src/interpreters/common/query_log.rs @@ -225,7 +225,7 @@ impl InterpreterQueryLog { has_profiles: false, txn_state, txn_id, - peek_memory_usage: HashMap::new(), + peak_memory_usage: HashMap::new(), }) } @@ -337,7 +337,7 @@ impl InterpreterQueryLog { let txn_id = guard.txn_id().to_string(); drop(guard); - let peek_memory_usage = ctx.get_node_peek_memory_usage(); + let peak_memory_usage = ctx.get_node_peek_memory_usage(); Self::write_log(QueryLogElement { log_type, @@ -402,7 +402,7 @@ impl InterpreterQueryLog { has_profiles, txn_state, txn_id, - peek_memory_usage, + peak_memory_usage, }) } } diff --git a/src/query/service/src/pipelines/builders/builder_aggregate.rs b/src/query/service/src/pipelines/builders/builder_aggregate.rs index fde91de7ca754..e2a81b426cfed 100644 --- a/src/query/service/src/pipelines/builders/builder_aggregate.rs +++ b/src/query/service/src/pipelines/builders/builder_aggregate.rs @@ -37,13 +37,11 @@ use databend_common_sql::IndexType; use databend_common_storage::DataOperator; use itertools::Itertools; -use crate::pipelines::processors::transforms::aggregator::build_partition_bucket; +use crate::pipelines::processors::transforms::aggregator::build_final_aggregate; use crate::pipelines::processors::transforms::aggregator::create_udaf_script_function; -use crate::pipelines::processors::transforms::aggregator::AggregateInjector; use 
crate::pipelines::processors::transforms::aggregator::AggregatorParams; use crate::pipelines::processors::transforms::aggregator::FinalSingleStateAggregator; use crate::pipelines::processors::transforms::aggregator::PartialSingleStateAggregator; -use crate::pipelines::processors::transforms::aggregator::TransformAggregateSpillWriter; use crate::pipelines::processors::transforms::aggregator::TransformExpandGroupingSets; use crate::pipelines::processors::transforms::aggregator::TransformPartialAggregate; use crate::pipelines::PipelineBuilder; @@ -153,36 +151,21 @@ impl PipelineBuilder { }); } + let location_prefix = self.ctx.query_id_spill_prefix(); + let operator = DataOperator::instance().spill_operator(); self.main_pipeline.add_transform(|input, output| { Ok(ProcessorPtr::create(TransformPartialAggregate::try_create( self.ctx.clone(), input, output, + operator.clone(), params.clone(), partial_agg_config.clone(), + location_prefix.clone(), )?)) })?; - // If cluster mode, spill write will be completed in exchange serialize, because we need scatter the block data first - if !self.is_exchange_neighbor { - let operator = DataOperator::instance().spill_operator(); - let location_prefix = self.ctx.query_id_spill_prefix(); - - self.main_pipeline.add_transform(|input, output| { - Ok(ProcessorPtr::create( - TransformAggregateSpillWriter::try_create( - self.ctx.clone(), - input, - output, - operator.clone(), - params.clone(), - location_prefix.clone(), - )?, - )) - })?; - } - - self.exchange_injector = AggregateInjector::create(self.ctx.clone(), params.clone()); + self.enable_multiway_sort = true; Ok(()) } @@ -215,15 +198,13 @@ impl PipelineBuilder { return Ok(()); } - let old_inject = self.exchange_injector.clone(); - let input: &PhysicalPlan = &aggregate.input; - if matches!(input, PhysicalPlan::ExchangeSource(_)) { - self.exchange_injector = AggregateInjector::create(self.ctx.clone(), params.clone()); - } + let old_value = self.enable_multiway_sort; + self.enable_multiway_sort |= matches!(input, PhysicalPlan::ExchangeSource(_)); + self.build_pipeline(&aggregate.input)?; - self.exchange_injector = old_inject; - build_partition_bucket(&mut self.main_pipeline, params.clone()) + self.enable_multiway_sort = old_value; + build_final_aggregate(self.ctx.clone(), &mut self.main_pipeline, params.clone()) } fn build_aggregator_params( diff --git a/src/query/service/src/pipelines/builders/builder_exchange.rs b/src/query/service/src/pipelines/builders/builder_exchange.rs index 6c27b81ae366e..26af09a82e82f 100644 --- a/src/query/service/src/pipelines/builders/builder_exchange.rs +++ b/src/query/service/src/pipelines/builders/builder_exchange.rs @@ -24,7 +24,6 @@ impl PipelineBuilder { let mut build_res = exchange_manager.get_fragment_source( &exchange_source.query_id, exchange_source.source_fragment_id, - self.exchange_injector.clone(), )?; // add profile diff --git a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs index dd9ab7edd4a38..cc3a2b2ab90ad 100644 --- a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs +++ b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs @@ -49,7 +49,7 @@ impl PipelineBuilder { self.main_pipeline.exchange( num_processors, HilbertPartitionExchange::create(partition.num_partitions), - ); + )?; let settings = self.ctx.get_settings(); let disk_bytes_limit = settings.get_window_partition_spilling_to_disk_bytes_limit()?; diff --git 
a/src/query/service/src/pipelines/builders/builder_window.rs b/src/query/service/src/pipelines/builders/builder_window.rs index ae0dffc1dc73b..6e531b5200c4b 100644 --- a/src/query/service/src/pipelines/builders/builder_window.rs +++ b/src/query/service/src/pipelines/builders/builder_window.rs @@ -184,12 +184,12 @@ impl PipelineBuilder { top_n.func, num_partitions as u64, ), - ) + )? } else { self.main_pipeline.exchange( num_processors, WindowPartitionExchange::create(partition_by.clone(), num_partitions), - ); + )?; } let disk_bytes_limit = settings.get_window_partition_spilling_to_disk_bytes_limit()?; diff --git a/src/query/service/src/pipelines/pipeline_build_res.rs b/src/query/service/src/pipelines/pipeline_build_res.rs index fd40f817e3cda..615caf8216153 100644 --- a/src/query/service/src/pipelines/pipeline_build_res.rs +++ b/src/query/service/src/pipelines/pipeline_build_res.rs @@ -24,8 +24,6 @@ use databend_common_pipeline_sources::OneBlockSource; use crate::interpreters::CreateTableInterpreter; use crate::pipelines::processors::transforms::HashJoinBuildState; -use crate::servers::flight::v1::exchange::DefaultExchangeInjector; -use crate::servers::flight::v1::exchange::ExchangeInjector; #[derive(Clone)] pub struct PipelineBuilderData { @@ -38,7 +36,7 @@ pub struct PipelineBuildResult { // Containing some sub queries pipelines, must be complete pipeline pub sources_pipelines: Vec, - pub exchange_injector: Arc, + pub enable_multiway_sort: bool, /// for local fragment data sharing pub builder_data: PipelineBuilderData, pub r_cte_scan_interpreters: Vec, @@ -49,7 +47,7 @@ impl PipelineBuildResult { PipelineBuildResult { main_pipeline: Pipeline::create(), sources_pipelines: vec![], - exchange_injector: DefaultExchangeInjector::create(), + enable_multiway_sort: false, builder_data: PipelineBuilderData { input_join_state: None, input_probe_schema: None, @@ -72,7 +70,7 @@ impl PipelineBuildResult { Ok(PipelineBuildResult { main_pipeline, sources_pipelines: vec![], - exchange_injector: DefaultExchangeInjector::create(), + enable_multiway_sort: false, builder_data: PipelineBuilderData { input_join_state: None, input_probe_schema: None, diff --git a/src/query/service/src/pipelines/pipeline_builder.rs b/src/query/service/src/pipelines/pipeline_builder.rs index 1763a9cc6dfc6..2563bf29cbf2a 100644 --- a/src/query/service/src/pipelines/pipeline_builder.rs +++ b/src/query/service/src/pipelines/pipeline_builder.rs @@ -33,8 +33,6 @@ use crate::interpreters::CreateTableInterpreter; use crate::pipelines::processors::transforms::HashJoinBuildState; use crate::pipelines::processors::HashJoinState; use crate::pipelines::PipelineBuildResult; -use crate::servers::flight::v1::exchange::DefaultExchangeInjector; -use crate::servers::flight::v1::exchange::ExchangeInjector; use crate::sessions::QueryContext; pub struct PipelineBuilder { @@ -49,7 +47,7 @@ pub struct PipelineBuilder { pub merge_into_probe_data_fields: Option>, pub join_state: Option>, - pub(crate) exchange_injector: Arc, + pub(crate) enable_multiway_sort: bool, pub hash_join_states: HashMap>, @@ -72,13 +70,13 @@ impl PipelineBuilder { settings, pipelines: vec![], main_pipeline: Pipeline::with_scopes(scopes), - exchange_injector: DefaultExchangeInjector::create(), merge_into_probe_data_fields: None, join_state: None, hash_join_states: HashMap::new(), r_cte_scan_interpreters: vec![], is_exchange_neighbor: false, contain_sink_processor: false, + enable_multiway_sort: false, } } @@ -105,12 +103,12 @@ impl PipelineBuilder { Ok(PipelineBuildResult { 
main_pipeline: self.main_pipeline, sources_pipelines: self.pipelines, - exchange_injector: self.exchange_injector, builder_data: PipelineBuilderData { input_join_state: self.join_state, input_probe_schema: self.merge_into_probe_data_fields, }, r_cte_scan_interpreters: self.r_cte_scan_interpreters, + enable_multiway_sort: self.enable_multiway_sort, }) } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs index 55688a4347259..5c91c2621275a 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs @@ -12,71 +12,27 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::cmp::Ordering; +use std::collections::HashMap; +use std::sync::atomic::AtomicUsize; +use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::Arc; -use bumpalo::Bump; -use databend_common_exception::ErrorCode; +use arrow_ipc::writer::IpcWriteOptions; +use arrow_ipc::CompressionType; +use databend_common_config::GlobalConfig; use databend_common_exception::Result; use databend_common_expression::BlockMetaInfoDowncast; use databend_common_expression::DataBlock; -use databend_common_expression::PartitionedPayload; use databend_common_expression::Payload; use databend_common_expression::PayloadFlushState; -use databend_common_pipeline_core::processors::ProcessorPtr; -use databend_common_pipeline_core::Pipeline; +use databend_common_pipeline_core::processors::Exchange; use databend_common_settings::FlightCompression; -use databend_common_storage::DataOperator; use crate::pipelines::processors::transforms::aggregator::aggregate_meta::AggregateMeta; -use crate::pipelines::processors::transforms::aggregator::serde::TransformExchangeAggregateSerializer; -use crate::pipelines::processors::transforms::aggregator::serde::TransformExchangeAsyncBarrier; -use crate::pipelines::processors::transforms::aggregator::AggregatorParams; -use crate::pipelines::processors::transforms::aggregator::TransformAggregateDeserializer; -use crate::pipelines::processors::transforms::aggregator::TransformAggregateSerializer; -use crate::pipelines::processors::transforms::aggregator::TransformAggregateSpillWriter; -use crate::servers::flight::v1::exchange::DataExchange; -use crate::servers::flight::v1::exchange::ExchangeInjector; -use crate::servers::flight::v1::exchange::ExchangeSorting; -use crate::servers::flight::v1::exchange::MergeExchangeParams; -use crate::servers::flight::v1::exchange::ShuffleExchangeParams; +use crate::servers::flight::v1::exchange::serde::serialize_block; +use crate::servers::flight::v1::exchange::serde::ExchangeSerializeMeta; use crate::servers::flight::v1::scatter::FlightScatter; -use crate::sessions::QueryContext; - -struct AggregateExchangeSorting {} - -pub fn compute_block_number(bucket: isize, max_partition_count: usize) -> Result { - Ok(max_partition_count as isize * 1000 + bucket) -} - -impl ExchangeSorting for AggregateExchangeSorting { - fn block_number(&self, data_block: &DataBlock) -> Result { - match data_block.get_meta() { - None => Ok(-1), - Some(block_meta_info) => match AggregateMeta::downcast_ref_from(block_meta_info) { - None => Err(ErrorCode::Internal(format!( - "Internal error, AggregateExchangeSorting only recv AggregateMeta {:?}", - 
serde_json::to_string(block_meta_info) - ))), - Some(meta_info) => match meta_info { - AggregateMeta::Partitioned { .. } => unreachable!(), - AggregateMeta::Serialized(v) => { - compute_block_number(v.bucket, v.max_partition_count) - } - AggregateMeta::AggregatePayload(v) => { - compute_block_number(v.bucket, v.max_partition_count) - } - AggregateMeta::AggregateSpilling(_) - | AggregateMeta::Spilled(_) - | AggregateMeta::BucketSpilled(_) => Ok(-1), - }, - }, - } - } -} - -struct HashTableHashScatter { - buckets: usize, -} fn scatter_payload(mut payload: Payload, buckets: usize) -> Result> { let mut buckets = Vec::with_capacity(buckets); @@ -112,222 +68,228 @@ fn scatter_payload(mut payload: Payload, buckets: usize) -> Result> Ok(buckets) } -fn scatter_partitioned_payload( - partitioned_payload: PartitionedPayload, - buckets: usize, -) -> Result> { - let mut buckets = Vec::with_capacity(buckets); +pub struct FlightExchange { + local_id: String, + node_list: Vec, + node_list_lookup: HashMap, - let group_types = partitioned_payload.group_types.clone(); - let aggrs = partitioned_payload.aggrs.clone(); - let partition_count = partitioned_payload.partition_count() as u64; - let mut state = PayloadFlushState::default(); - - for _ in 0..buckets.capacity() { - buckets.push(PartitionedPayload::new( - group_types.clone(), - aggrs.clone(), - partition_count, - partitioned_payload.arenas.clone(), - )); - } + options: IpcWriteOptions, + global_max_partition: Arc, + shuffle_scatter: Arc>, +} - let mut payloads = Vec::with_capacity(buckets.len()); +impl FlightExchange { + pub fn create( + node_list: Vec, + compression: Option, + shuffle_scatter: Arc>, + ) -> Arc { + let compression = match compression { + None => None, + Some(compression) => match compression { + FlightCompression::Lz4 => Some(CompressionType::LZ4_FRAME), + FlightCompression::Zstd => Some(CompressionType::ZSTD), + }, + }; - for _ in 0..payloads.capacity() { - payloads.push(Payload::new( - Arc::new(Bump::new()), - group_types.clone(), - aggrs.clone(), - partitioned_payload.states_layout.clone(), - )); + let node_list_lookup = node_list + .iter() + .cloned() + .enumerate() + .map(|(x, y)| (y, x)) + .collect::>(); + + Arc::new(FlightExchange { + local_id: GlobalConfig::instance().query.node_id.clone(), + node_list, + node_list_lookup, + options: IpcWriteOptions::default() + .try_with_compression(compression) + .unwrap(), + shuffle_scatter, + global_max_partition: Arc::new(AtomicUsize::new(0)), + }) } +} + +impl FlightExchange { + fn default_partition(&self, data_block: DataBlock) -> Result> { + if self.node_list.is_empty() { + let data_block = serialize_block(0, 0, 0, data_block, &self.options)?; + return Ok(vec![data_block]); + } - for mut payload in partitioned_payload.payloads.into_iter() { - // scatter each page of the payload. - while payload.scatter(&mut state, buckets.len()) { - // copy to the corresponding bucket. 
- for (idx, bucket) in payloads.iter_mut().enumerate() { - let count = state.probe_state.partition_count[idx]; + let data_blocks = self.shuffle_scatter.execute(data_block)?; - if count > 0 { - let sel = &state.probe_state.partition_entries[idx]; - bucket.copy_rows(sel, count, &state.addresses); - } + let mut blocks = Vec::with_capacity(data_blocks.len()); + for (idx, data_block) in data_blocks.into_iter().enumerate() { + if self.node_list[idx] == self.local_id { + blocks.push(data_block); + continue; } + + blocks.push(serialize_block(0, 0, 0, data_block, &self.options)?); } - state.clear(); - payload.state_move_out = true; - } - for (idx, payload) in payloads.into_iter().enumerate() { - buckets[idx].combine_single(payload, &mut state, None); + Ok(blocks) } - - Ok(buckets) } -impl FlightScatter for HashTableHashScatter { - fn execute(&self, mut data_block: DataBlock) -> Result> { - if let Some(block_meta) = data_block.take_meta() { - if let Some(block_meta) = AggregateMeta::downcast_from(block_meta) { - let mut blocks = Vec::with_capacity(self.buckets); - match block_meta { - AggregateMeta::Spilled(_) => unreachable!(), - AggregateMeta::BucketSpilled(_) => unreachable!(), - AggregateMeta::Serialized(_) => unreachable!(), - AggregateMeta::Partitioned { .. } => unreachable!(), - AggregateMeta::AggregateSpilling(payload) => { - for p in scatter_partitioned_payload(payload, self.buckets)? { - blocks.push(DataBlock::empty_with_meta( - AggregateMeta::create_agg_spilling(p), - )); - } - } - AggregateMeta::AggregatePayload(p) => { - for payload in scatter_payload(p.payload, self.buckets)? { - blocks.push(DataBlock::empty_with_meta( - AggregateMeta::create_agg_payload( - p.bucket, - payload, - p.max_partition_count, - ), - )); +impl Exchange for FlightExchange { + const NAME: &'static str = "AggregateExchange"; + const MULTIWAY_SORT: bool = MULTIWAY_SORT; + + fn partition(&self, mut data_block: DataBlock, n: usize) -> Result> { + let Some(meta) = data_block.take_meta() else { + // only exchange data + if data_block.is_empty() { + return Ok(vec![]); + } + + return self.default_partition(data_block); + }; + + let Some(_) = AggregateMeta::downcast_ref_from(&meta) else { + return self.default_partition(data_block.add_meta(Some(meta))?); + }; + + assert!(MULTIWAY_SORT); + assert_eq!(self.node_list_lookup.len(), n); + match AggregateMeta::downcast_from(meta).unwrap() { + AggregateMeta::FinalPartition(_) => unreachable!(), + AggregateMeta::InFlightPayload(_) => unreachable!(), + AggregateMeta::SpilledPayload(v) => { + let mut blocks = Vec::with_capacity(n); + let global_max_partition = self.global_max_partition.load(AtomicOrdering::SeqCst); + for node_id in &self.node_list { + let mut node_data_block = match *node_id == v.destination_node { + true => DataBlock::empty_with_meta(AggregateMeta::create_spilled_payload( + v.clone(), + )), + false => { + DataBlock::empty_with_meta(AggregateMeta::create_in_flight_payload( + v.get_sorting_partition(), + v.max_partition, + global_max_partition, + )) } + }; + + if *node_id != self.local_id { + node_data_block = serialize_block( + v.get_sorting_partition(), + v.max_partition, + global_max_partition, + node_data_block, + &self.options, + )? 
} - }; - return Ok(blocks); - } - } + blocks.push(node_data_block); + } - Err(ErrorCode::Internal( - "Internal, HashTableHashScatter only recv AggregateMeta", - )) - } -} + Ok(blocks) + } + AggregateMeta::AggregatePayload(p) => { + if p.payload.len() == 0 { + return Ok(vec![]); + } -pub struct AggregateInjector { - ctx: Arc, - aggregator_params: Arc, -} + let mut blocks = Vec::with_capacity(n); + let global_max_partition = self.global_max_partition.load(AtomicOrdering::SeqCst); + for (idx, payload) in scatter_payload(p.payload, n)?.into_iter().enumerate() { + if self.node_list[idx] == self.local_id { + blocks.push(DataBlock::empty_with_meta( + AggregateMeta::create_agg_payload( + payload, + p.partition, + p.max_partition, + global_max_partition, + ), + )); + + continue; + } -impl AggregateInjector { - pub fn create( - ctx: Arc, - params: Arc, - ) -> Arc { - Arc::new(AggregateInjector { - ctx, - aggregator_params: params, - }) - } -} + let data_block = match payload.len() == 0 { + true => DataBlock::empty(), + false => payload.aggregate_flush_all()?, + }; + + let data_block = + data_block.add_meta(Some(AggregateMeta::create_in_flight_payload( + p.partition, + p.max_partition, + global_max_partition, + )))?; + + let data_block = serialize_block( + p.partition, + p.max_partition, + global_max_partition, + data_block, + &self.options, + )?; + blocks.push(data_block); + } -impl ExchangeInjector for AggregateInjector { - fn flight_scatter( - &self, - _: &Arc, - exchange: &DataExchange, - ) -> Result>> { - match exchange { - DataExchange::Merge(_) => unreachable!(), - DataExchange::Broadcast(_) => unreachable!(), - DataExchange::ShuffleDataExchange(exchange) => { - Ok(Arc::new(Box::new(HashTableHashScatter { - buckets: exchange.destination_ids.len(), - }))) + Ok(blocks) } } } - fn exchange_sorting(&self) -> Option> { - Some(Arc::new(AggregateExchangeSorting {})) - } + fn init_way(&self, _index: usize, block: &DataBlock) -> Result<()> { + let max_partition = match block.get_meta() { + None => 0, + Some(meta) => match AggregateMeta::downcast_ref_from(meta) { + None => 0, + Some(v) => v.get_max_partition(), + }, + }; - fn apply_merge_serializer( - &self, - _: &MergeExchangeParams, - _compression: Option, - pipeline: &mut Pipeline, - ) -> Result<()> { - let params = self.aggregator_params.clone(); - - let operator = DataOperator::instance().spill_operator(); - let location_prefix = self.ctx.query_id_spill_prefix(); - - pipeline.add_transform(|input, output| { - Ok(ProcessorPtr::create( - TransformAggregateSpillWriter::try_create( - self.ctx.clone(), - input, - output, - operator.clone(), - params.clone(), - location_prefix.clone(), - )?, - )) - })?; - - pipeline.add_transform(|input, output| { - TransformAggregateSerializer::try_create(input, output, params.clone()) - }) + self.global_max_partition + .fetch_max(max_partition, std::sync::atomic::Ordering::SeqCst); + Ok(()) } - fn apply_shuffle_serializer( - &self, - shuffle_params: &ShuffleExchangeParams, - compression: Option, - pipeline: &mut Pipeline, - ) -> Result<()> { - let params = self.aggregator_params.clone(); - let operator = DataOperator::instance().spill_operator(); - let location_prefix = self.ctx.query_id_spill_prefix(); - - let schema = shuffle_params.schema.clone(); - let local_id = &shuffle_params.executor_id; - let local_pos = shuffle_params - .destination_ids - .iter() - .position(|x| x == local_id) - .unwrap(); - - pipeline.add_transform(|input, output| { - Ok(ProcessorPtr::create( - 
TransformExchangeAggregateSerializer::try_create( - self.ctx.clone(), - input, - output, - operator.clone(), - location_prefix.clone(), - params.clone(), - compression, - schema.clone(), - local_pos, - )?, - )) - })?; - - pipeline.add_transform(TransformExchangeAsyncBarrier::try_create) - } + fn sorting_function(left_block: &DataBlock, right_block: &DataBlock) -> Ordering { + let Some(left_meta) = left_block.get_meta() else { + return Ordering::Equal; + }; - fn apply_merge_deserializer( - &self, - params: &MergeExchangeParams, - pipeline: &mut Pipeline, - ) -> Result<()> { - pipeline.add_transform(|input, output| { - TransformAggregateDeserializer::try_create(input, output, ¶ms.schema) - }) - } + let (l_partition, l_max_partition) = + match ExchangeSerializeMeta::downcast_ref_from(left_meta) { + Some(meta) => (meta.partition, meta.max_partition), + None => { + let Some(meta) = AggregateMeta::downcast_ref_from(left_meta) else { + return Ordering::Equal; + }; - fn apply_shuffle_deserializer( - &self, - params: &ShuffleExchangeParams, - pipeline: &mut Pipeline, - ) -> Result<()> { - pipeline.add_transform(|input, output| { - TransformAggregateDeserializer::try_create(input, output, ¶ms.schema) - }) + (meta.get_sorting_partition(), meta.get_max_partition()) + } + }; + + let Some(right_meta) = right_block.get_meta() else { + return Ordering::Equal; + }; + + let (r_partition, r_max_partition) = + match ExchangeSerializeMeta::downcast_ref_from(right_meta) { + Some(meta) => (meta.partition, meta.max_partition), + None => { + let Some(meta) = AggregateMeta::downcast_ref_from(right_meta) else { + return Ordering::Equal; + }; + + (meta.get_sorting_partition(), meta.get_max_partition()) + } + }; + + // ORDER BY max_partition asc, partition asc + match l_max_partition.cmp(&r_max_partition) { + Ordering::Less => Ordering::Less, + Ordering::Greater => Ordering::Greater, + Ordering::Equal => l_partition.cmp(&r_partition), + } } } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_meta.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_meta.rs index 2ae3cc620b928..3baee05794962 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_meta.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_meta.rs @@ -19,6 +19,7 @@ use std::sync::Arc; use bumpalo::Bump; use databend_common_exception::Result; +use databend_common_expression::local_block_meta_serde; use databend_common_expression::types::DataType; use databend_common_expression::AggregateFunction; use databend_common_expression::AggregateHashTable; @@ -36,7 +37,8 @@ pub struct SerializedPayload { pub bucket: isize, pub data_block: DataBlock, // use for new agg_hashtable - pub max_partition_count: usize, + pub max_partition: usize, + pub global_max_partition: usize, } impl SerializedPayload { @@ -106,114 +108,158 @@ impl SerializedPayload { } } -pub struct BucketSpilledPayload { - pub bucket: isize, +#[derive(serde::Serialize, serde::Deserialize, Clone, Debug)] +pub struct SpilledPayload { + pub partition: isize, pub location: String, pub data_range: Range, - pub columns_layout: Vec, - pub max_partition_count: usize, + pub destination_node: String, + pub max_partition: usize, + pub global_max_partition: usize, +} + +impl SpilledPayload { + pub fn get_sorting_partition(&self) -> isize { + -(self.max_partition as isize - self.partition) + } } pub struct AggregatePayload { - pub bucket: isize, + pub partition: isize, pub payload: 
Payload, // use for new agg_hashtable - pub max_partition_count: usize, + pub max_partition: usize, + pub global_max_partition: usize, +} + +#[derive(serde::Serialize, serde::Deserialize)] +pub struct InFlightPayload { + pub partition: isize, + pub max_partition: usize, + pub global_max_partition: usize, +} + +pub struct FinalPayload { + pub data: Vec, } +#[derive(serde::Serialize, serde::Deserialize)] pub enum AggregateMeta { - Serialized(SerializedPayload), + SpilledPayload(SpilledPayload), AggregatePayload(AggregatePayload), - AggregateSpilling(PartitionedPayload), - BucketSpilled(BucketSpilledPayload), - Spilled(Vec), - - Partitioned { bucket: isize, data: Vec }, + InFlightPayload(InFlightPayload), + FinalPartition(FinalPayload), } impl AggregateMeta { pub fn create_agg_payload( - bucket: isize, payload: Payload, - max_partition_count: usize, + partition: isize, + max_partition: usize, + global_max_partition: usize, ) -> BlockMetaInfoPtr { Box::new(AggregateMeta::AggregatePayload(AggregatePayload { - bucket, payload, - max_partition_count, + partition, + max_partition, + global_max_partition, })) } - pub fn create_agg_spilling(payload: PartitionedPayload) -> BlockMetaInfoPtr { - Box::new(AggregateMeta::AggregateSpilling(payload)) - } - - pub fn create_serialized( - bucket: isize, - block: DataBlock, - max_partition_count: usize, + pub fn create_in_flight_payload( + partition: isize, + max_partition: usize, + global_max_partition: usize, ) -> BlockMetaInfoPtr { - Box::new(AggregateMeta::Serialized(SerializedPayload { - bucket, - data_block: block, - max_partition_count, + Box::new(AggregateMeta::InFlightPayload(InFlightPayload { + partition, + max_partition, + global_max_partition, })) } - pub fn create_spilled(buckets_payload: Vec) -> BlockMetaInfoPtr { - Box::new(AggregateMeta::Spilled(buckets_payload)) + pub fn create_spilled_payload(payload: SpilledPayload) -> BlockMetaInfoPtr { + Box::new(AggregateMeta::SpilledPayload(payload)) } - pub fn create_bucket_spilled(payload: BucketSpilledPayload) -> BlockMetaInfoPtr { - Box::new(AggregateMeta::BucketSpilled(payload)) + pub fn create_final(blocks: Vec) -> BlockMetaInfoPtr { + Box::new(AggregateMeta::FinalPartition(FinalPayload { data: blocks })) } - pub fn create_partitioned(bucket: isize, data: Vec) -> BlockMetaInfoPtr { - Box::new(AggregateMeta::Partitioned { data, bucket }) + pub fn get_global_max_partition(&self) -> usize { + match self { + AggregateMeta::SpilledPayload(v) => v.global_max_partition, + AggregateMeta::AggregatePayload(v) => v.global_max_partition, + AggregateMeta::InFlightPayload(v) => v.global_max_partition, + AggregateMeta::FinalPartition(_) => unreachable!(), + } } -} -impl serde::Serialize for AggregateMeta { - fn serialize(&self, _: S) -> std::result::Result - where S: serde::Serializer { - unreachable!("AggregateMeta does not support exchanging between multiple nodes") + pub fn get_partition(&self) -> isize { + match self { + AggregateMeta::SpilledPayload(v) => v.partition, + AggregateMeta::AggregatePayload(v) => v.partition, + AggregateMeta::InFlightPayload(v) => v.partition, + AggregateMeta::FinalPartition(_) => unreachable!(), + } } -} -impl<'de> serde::Deserialize<'de> for AggregateMeta { - fn deserialize(_: D) -> std::result::Result - where D: serde::Deserializer<'de> { - unreachable!("AggregateMeta does not support exchanging between multiple nodes") + pub fn get_sorting_partition(&self) -> isize { + match self { + AggregateMeta::AggregatePayload(v) => v.partition, + AggregateMeta::InFlightPayload(v) => 
v.partition, + AggregateMeta::SpilledPayload(v) => v.get_sorting_partition(), + AggregateMeta::FinalPartition(_) => unreachable!(), + } + } + + pub fn get_max_partition(&self) -> usize { + match self { + AggregateMeta::SpilledPayload(v) => v.max_partition, + AggregateMeta::AggregatePayload(v) => v.max_partition, + AggregateMeta::InFlightPayload(v) => v.max_partition, + AggregateMeta::FinalPartition(_) => unreachable!(), + } + } + + pub fn set_global_max_partition(&mut self, global_max_partition: usize) { + match self { + AggregateMeta::SpilledPayload(v) => { + v.global_max_partition = global_max_partition; + } + AggregateMeta::AggregatePayload(v) => { + v.global_max_partition = global_max_partition; + } + AggregateMeta::InFlightPayload(v) => { + v.global_max_partition = global_max_partition; + } + AggregateMeta::FinalPartition(_) => unreachable!(), + } } } impl Debug for AggregateMeta { fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { match self { - AggregateMeta::Partitioned { .. } => { - f.debug_struct("AggregateMeta::Partitioned").finish() + AggregateMeta::FinalPartition(_) => { + f.debug_struct("AggregateMeta::FinalPartition").finish() + } + AggregateMeta::SpilledPayload(_) => { + f.debug_struct("Aggregate::SpilledPayload").finish() } - AggregateMeta::Serialized { .. } => { - f.debug_struct("AggregateMeta::Serialized").finish() + AggregateMeta::InFlightPayload(_) => { + f.debug_struct("Aggregate:InFlightPayload").finish() } - AggregateMeta::Spilled(_) => f.debug_struct("Aggregate::Spilled").finish(), - AggregateMeta::BucketSpilled(_) => f.debug_struct("Aggregate::BucketSpilled").finish(), AggregateMeta::AggregatePayload(_) => { f.debug_struct("AggregateMeta:AggregatePayload").finish() } - AggregateMeta::AggregateSpilling(_) => { - f.debug_struct("AggregateMeta:AggregateSpilling").finish() - } } } } -impl BlockMetaInfo for AggregateMeta { - fn typetag_deserialize(&self) { - unimplemented!("AggregateMeta does not support exchanging between multiple nodes") - } +#[typetag::serde(name = "AggregateMeta")] +impl BlockMetaInfo for AggregateMeta {} - fn typetag_name(&self) -> &'static str { - unimplemented!("AggregateMeta does not support exchanging between multiple nodes") - } -} +local_block_meta_serde!(FinalPayload); +local_block_meta_serde!(AggregatePayload); +local_block_meta_serde!(SerializedPayload); diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/mod.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/mod.rs index bdd17a88364fc..94f2d0ec7bba8 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/mod.rs @@ -15,21 +15,26 @@ mod aggregate_exchange_injector; mod aggregate_meta; mod aggregator_params; -mod new_transform_partition_bucket; mod serde; mod transform_aggregate_expand; mod transform_aggregate_final; mod transform_aggregate_partial; +mod transform_partition_align; +mod transform_partition_bucket; +mod transform_partition_dispatch; +mod transform_partition_exchange; +mod transform_partition_resorting; +mod transform_partition_restore; mod transform_single_key; mod udaf_script; -pub use aggregate_exchange_injector::AggregateInjector; +pub use aggregate_exchange_injector::FlightExchange; pub use aggregate_meta::*; pub use aggregator_params::AggregatorParams; -pub use new_transform_partition_bucket::build_partition_bucket; pub use transform_aggregate_expand::TransformExpandGroupingSets; pub use 
transform_aggregate_final::TransformFinalAggregate; pub use transform_aggregate_partial::TransformPartialAggregate; +pub use transform_partition_bucket::build_final_aggregate; pub use transform_single_key::FinalSingleStateAggregator; pub use transform_single_key::PartialSingleStateAggregator; pub use udaf_script::*; diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/new_transform_partition_bucket.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/new_transform_partition_bucket.rs deleted file mode 100644 index 5c5cddc4258fd..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/new_transform_partition_bucket.rs +++ /dev/null @@ -1,612 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; -use std::collections::btree_map::Entry; -use std::collections::BTreeMap; -use std::sync::Arc; - -use bumpalo::Bump; -use databend_common_exception::ErrorCode; -use databend_common_exception::Result; -use databend_common_expression::BlockMetaInfoDowncast; -use databend_common_expression::DataBlock; -use databend_common_expression::PartitionedPayload; -use databend_common_expression::PayloadFlushState; -use databend_common_pipeline_core::processors::Event; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; -use databend_common_pipeline_core::processors::ProcessorPtr; -use databend_common_pipeline_core::Pipe; -use databend_common_pipeline_core::PipeItem; -use databend_common_pipeline_core::Pipeline; -use databend_common_storage::DataOperator; -use tokio::sync::Semaphore; - -use super::AggregatePayload; -use super::TransformAggregateSpillReader; -use super::TransformFinalAggregate; -use crate::pipelines::processors::transforms::aggregator::aggregate_meta::AggregateMeta; -use crate::pipelines::processors::transforms::aggregator::aggregate_meta::SerializedPayload; -use crate::pipelines::processors::transforms::aggregator::AggregatorParams; - -static SINGLE_LEVEL_BUCKET_NUM: isize = -1; -static MAX_PARTITION_COUNT: usize = 128; - -struct InputPortState { - port: Arc, - bucket: isize, - max_partition_count: usize, -} -pub struct NewTransformPartitionBucket { - output: Arc, - inputs: Vec, - params: Arc, - working_bucket: isize, - pushing_bucket: isize, - initialized_all_inputs: bool, - all_inputs_init: bool, - buckets_blocks: BTreeMap>, - flush_state: PayloadFlushState, - unpartitioned_blocks: Vec, - max_partition_count: usize, -} - -impl NewTransformPartitionBucket { - pub fn create(input_nums: usize, params: Arc) -> Result { - let mut inputs = Vec::with_capacity(input_nums); - - for _index in 0..input_nums { - inputs.push(InputPortState { - bucket: -1, - port: InputPort::create(), - max_partition_count: 0, - }); - } - - Ok(NewTransformPartitionBucket { - params, - inputs, - working_bucket: 0, - pushing_bucket: 0, - output: OutputPort::create(), - 
buckets_blocks: BTreeMap::new(), - unpartitioned_blocks: vec![], - flush_state: PayloadFlushState::default(), - initialized_all_inputs: false, - all_inputs_init: false, - max_partition_count: 0, - }) - } - - pub fn get_inputs(&self) -> Vec> { - let mut inputs = Vec::with_capacity(self.inputs.len()); - - for input_state in &self.inputs { - inputs.push(input_state.port.clone()); - } - - inputs - } - - pub fn get_output(&self) -> Arc { - self.output.clone() - } - - fn initialize_all_inputs(&mut self) -> Result { - self.initialized_all_inputs = true; - // in a cluster where partitions are only 8 and 128, - // we need to pull all data where the partition equals 8 until the partition changes to 128 or there is no data available. - if self.params.cluster_aggregator { - for index in 0..self.inputs.len() { - if self.inputs[index].port.is_finished() { - continue; - } - - // We pull all the data that are not the max_partition_count and all spill data - if self.inputs[index].max_partition_count == MAX_PARTITION_COUNT - && self.inputs[index].bucket > SINGLE_LEVEL_BUCKET_NUM - { - continue; - } - - if !self.inputs[index].port.has_data() { - self.inputs[index].port.set_need_data(); - self.initialized_all_inputs = false; - continue; - } - - let data_block = self.inputs[index].port.pull_data().unwrap()?; - - ( - self.inputs[index].bucket, - self.inputs[index].max_partition_count, - ) = self.add_bucket(data_block)?; - - // we need pull all spill data in init, and data less than max partition - if self.inputs[index].bucket <= SINGLE_LEVEL_BUCKET_NUM - || self.inputs[index].max_partition_count < MAX_PARTITION_COUNT - { - self.inputs[index].port.set_need_data(); - self.initialized_all_inputs = false; - } - } - } else { - // in singleton, the partition is 8, 32, 128. - // We pull the first data to ensure the max partition, - // and then pull all data that is less than the max partition - let mut refresh_index = 0; - for index in 0..self.inputs.len() { - if self.inputs[index].port.is_finished() { - continue; - } - - // We pull all the data that are not the max_partition_count - if self.inputs[index].max_partition_count > 0 - && self.inputs[index].bucket > SINGLE_LEVEL_BUCKET_NUM - && self.inputs[index].max_partition_count == self.max_partition_count - { - continue; - } - - if !self.inputs[index].port.has_data() { - self.inputs[index].port.set_need_data(); - self.initialized_all_inputs = false; - continue; - } - - let data_block = self.inputs[index].port.pull_data().unwrap()?; - - let before_max_partition_count = self.max_partition_count; - ( - self.inputs[index].bucket, - self.inputs[index].max_partition_count, - ) = self.add_bucket(data_block)?; - - // we need pull all spill data in init, and data less than max partition - if self.inputs[index].bucket <= SINGLE_LEVEL_BUCKET_NUM - || self.inputs[index].max_partition_count < self.max_partition_count - { - self.inputs[index].port.set_need_data(); - self.initialized_all_inputs = false; - } - - // max partition count change - if before_max_partition_count > 0 - && before_max_partition_count != self.max_partition_count - { - // set need data for inputs which is less than the max partition - for i in refresh_index..index { - if !self.inputs[i].port.is_finished() - && !self.inputs[i].port.has_data() - && self.inputs[i].max_partition_count != self.max_partition_count - { - self.inputs[i].port.set_need_data(); - self.initialized_all_inputs = false; - } - } - refresh_index = index; - } - } - } - - if self.initialized_all_inputs { - self.all_inputs_init = true; - } - - 
Ok(self.initialized_all_inputs) - } - - #[allow(unused_assignments)] - fn add_bucket(&mut self, mut data_block: DataBlock) -> Result<(isize, usize)> { - let (mut bucket, mut partition_count) = (0, 0); - let mut is_empty_block = false; - if let Some(block_meta) = data_block.get_meta() { - if let Some(block_meta) = AggregateMeta::downcast_ref_from(block_meta) { - (bucket, partition_count) = match block_meta { - AggregateMeta::Partitioned { .. } => unreachable!(), - AggregateMeta::AggregateSpilling(_) => unreachable!(), - AggregateMeta::BucketSpilled(_) => { - let meta = data_block.take_meta().unwrap(); - - if let Some(AggregateMeta::BucketSpilled(payload)) = - AggregateMeta::downcast_from(meta) - { - let bucket = payload.bucket; - let partition_count = payload.max_partition_count; - self.max_partition_count = - self.max_partition_count.max(partition_count); - - let data_block = DataBlock::empty_with_meta( - AggregateMeta::create_bucket_spilled(payload), - ); - match self.buckets_blocks.entry(bucket) { - Entry::Vacant(v) => { - v.insert(vec![data_block]); - } - Entry::Occupied(mut v) => { - v.get_mut().push(data_block); - } - }; - - return Ok((SINGLE_LEVEL_BUCKET_NUM, partition_count)); - } - unreachable!() - } - AggregateMeta::Spilled(_) => { - let meta = data_block.take_meta().unwrap(); - - if let Some(AggregateMeta::Spilled(buckets_payload)) = - AggregateMeta::downcast_from(meta) - { - let partition_count = if !buckets_payload.is_empty() { - buckets_payload[0].max_partition_count - } else { - MAX_PARTITION_COUNT - }; - self.max_partition_count = - self.max_partition_count.max(partition_count); - - for bucket_payload in buckets_payload { - let bucket = bucket_payload.bucket; - let data_block = DataBlock::empty_with_meta( - AggregateMeta::create_bucket_spilled(bucket_payload), - ); - match self.buckets_blocks.entry(bucket) { - Entry::Vacant(v) => { - v.insert(vec![data_block]); - } - Entry::Occupied(mut v) => { - v.get_mut().push(data_block); - } - }; - } - - return Ok((SINGLE_LEVEL_BUCKET_NUM, partition_count)); - } - unreachable!() - } - AggregateMeta::Serialized(payload) => { - is_empty_block = payload.data_block.is_empty(); - self.max_partition_count = - self.max_partition_count.max(payload.max_partition_count); - - (payload.bucket, payload.max_partition_count) - } - AggregateMeta::AggregatePayload(payload) => { - is_empty_block = payload.payload.len() == 0; - self.max_partition_count = - self.max_partition_count.max(payload.max_partition_count); - - (payload.bucket, payload.max_partition_count) - } - }; - } else { - return Err(ErrorCode::Internal(format!( - "Internal, TransformPartitionBucket only recv AggregateMeta, but got {:?}", - block_meta - ))); - } - } else { - return Err(ErrorCode::Internal( - "Internal, TransformPartitionBucket only recv DataBlock with meta.", - )); - } - - if !is_empty_block { - if self.all_inputs_init { - if partition_count != self.max_partition_count { - return Err(ErrorCode::Internal( - "Internal, the partition count does not equal the max partition count on TransformPartitionBucket. 
- ", - )); - } - match self.buckets_blocks.entry(bucket) { - Entry::Vacant(v) => { - v.insert(vec![data_block]); - } - Entry::Occupied(mut v) => { - v.get_mut().push(data_block); - } - }; - } else { - self.unpartitioned_blocks.push(data_block); - } - } - - Ok((bucket, partition_count)) - } - - fn try_push_data_block(&mut self) -> bool { - while self.pushing_bucket < self.working_bucket { - if let Some(bucket_blocks) = self.buckets_blocks.remove(&self.pushing_bucket) { - let data_block = Self::convert_blocks(self.pushing_bucket, bucket_blocks); - self.output.push_data(Ok(data_block)); - self.pushing_bucket += 1; - return true; - } - - self.pushing_bucket += 1; - } - - false - } - - fn partition_block(&mut self, payload: SerializedPayload) -> Result>> { - // already is max partition - if payload.max_partition_count == self.max_partition_count { - let bucket = payload.bucket; - let data_block = - DataBlock::empty_with_meta(Box::new(AggregateMeta::Serialized(payload))); - match self.buckets_blocks.entry(bucket) { - Entry::Vacant(v) => { - v.insert(vec![data_block]); - } - Entry::Occupied(mut v) => { - v.get_mut().push(data_block); - } - }; - return Ok(vec![]); - } - - // need repartition - let mut blocks = Vec::with_capacity(self.max_partition_count); - let p = payload.convert_to_partitioned_payload( - self.params.group_data_types.clone(), - self.params.aggregate_functions.clone(), - self.params.num_states(), - 0, - Arc::new(Bump::new()), - )?; - - let mut partitioned_payload = PartitionedPayload::new( - self.params.group_data_types.clone(), - self.params.aggregate_functions.clone(), - self.max_partition_count as u64, - p.arenas.clone(), - ); - partitioned_payload.combine(p, &mut self.flush_state); - - for (bucket, payload) in partitioned_payload.payloads.into_iter().enumerate() { - blocks.push(Some(DataBlock::empty_with_meta( - AggregateMeta::create_agg_payload( - bucket as isize, - payload, - self.max_partition_count, - ), - ))); - } - - Ok(blocks) - } - - fn partition_payload(&mut self, payload: AggregatePayload) -> Result>> { - // already is max partition - if payload.max_partition_count == self.max_partition_count { - let bucket = payload.bucket; - let data_block = - DataBlock::empty_with_meta(Box::new(AggregateMeta::AggregatePayload(payload))); - match self.buckets_blocks.entry(bucket) { - Entry::Vacant(v) => { - v.insert(vec![data_block]); - } - Entry::Occupied(mut v) => { - v.get_mut().push(data_block); - } - }; - return Ok(vec![]); - } - - // need repartition - let mut blocks = Vec::with_capacity(self.max_partition_count); - let mut partitioned_payload = PartitionedPayload::new( - self.params.group_data_types.clone(), - self.params.aggregate_functions.clone(), - self.max_partition_count as u64, - vec![payload.payload.arena.clone()], - ); - - partitioned_payload.combine_single(payload.payload, &mut self.flush_state, None); - - for (bucket, payload) in partitioned_payload.payloads.into_iter().enumerate() { - blocks.push(Some(DataBlock::empty_with_meta( - AggregateMeta::create_agg_payload( - bucket as isize, - payload, - self.max_partition_count, - ), - ))); - } - - Ok(blocks) - } - - fn convert_blocks(bucket: isize, data_blocks: Vec) -> DataBlock { - let mut data = Vec::with_capacity(data_blocks.len()); - for mut data_block in data_blocks.into_iter() { - if let Some(block_meta) = data_block.take_meta() { - if let Some(block_meta) = AggregateMeta::downcast_from(block_meta) { - data.push(block_meta); - } - } - } - - DataBlock::empty_with_meta(AggregateMeta::create_partitioned(bucket, 
data)) - } -} - -#[async_trait::async_trait] -impl Processor for NewTransformPartitionBucket { - fn name(&self) -> String { - String::from("TransformPartitionBucket") - } - - fn as_any(&mut self) -> &mut dyn Any { - self - } - - fn event(&mut self) -> Result { - if self.output.is_finished() { - for input_state in &self.inputs { - input_state.port.finish(); - } - - self.buckets_blocks.clear(); - return Ok(Event::Finished); - } - - // We pull the first unsplitted data block - if !self.initialized_all_inputs && !self.initialize_all_inputs()? { - return Ok(Event::NeedData); - } - - if !self.unpartitioned_blocks.is_empty() { - // Split data blocks if it's unsplitted. - return Ok(Event::Sync); - } - - if !self.output.can_push() { - for input_state in &self.inputs { - input_state.port.set_not_need_data(); - } - - return Ok(Event::NeedConsume); - } - - let pushed_data_block = self.try_push_data_block(); - - loop { - // Try to pull the next data or until the port is closed - let mut all_inputs_is_finished = true; - let mut all_port_prepared_data = true; - for index in 0..self.inputs.len() { - if self.inputs[index].port.is_finished() { - continue; - } - - all_inputs_is_finished = false; - if self.inputs[index].bucket > self.working_bucket { - continue; - } - - if !self.inputs[index].port.has_data() { - all_port_prepared_data = false; - self.inputs[index].port.set_need_data(); - continue; - } - - let data_block = self.inputs[index].port.pull_data().unwrap()?; - (self.inputs[index].bucket, _) = self.add_bucket(data_block)?; - - if self.inputs[index].bucket <= self.working_bucket { - all_port_prepared_data = false; - self.inputs[index].port.set_need_data(); - } - } - - if all_inputs_is_finished { - break; - } - - if !all_port_prepared_data { - return Ok(Event::NeedData); - } - - self.working_bucket += 1; - } - - if pushed_data_block || self.try_push_data_block() { - return Ok(Event::NeedConsume); - } - - if let Some((bucket, bucket_blocks)) = self.buckets_blocks.pop_first() { - let data_block = Self::convert_blocks(bucket, bucket_blocks); - self.output.push_data(Ok(data_block)); - return Ok(Event::NeedConsume); - } - - self.output.finish(); - Ok(Event::Finished) - } - - fn process(&mut self) -> Result<()> { - let block_meta = self - .unpartitioned_blocks - .pop() - .and_then(|mut block| block.take_meta()) - .and_then(AggregateMeta::downcast_from); - - if let Some(agg_block_meta) = block_meta { - let data_blocks = match agg_block_meta { - AggregateMeta::Spilled(_) => unreachable!(), - AggregateMeta::Partitioned { .. 
} => unreachable!(), - AggregateMeta::AggregateSpilling(_) => unreachable!(), - AggregateMeta::BucketSpilled(_) => unreachable!(), - AggregateMeta::Serialized(payload) => self.partition_block(payload)?, - AggregateMeta::AggregatePayload(payload) => self.partition_payload(payload)?, - }; - - for (bucket, block) in data_blocks.into_iter().enumerate() { - if let Some(data_block) = block { - match self.buckets_blocks.entry(bucket as isize) { - Entry::Vacant(v) => { - v.insert(vec![data_block]); - } - Entry::Occupied(mut v) => { - v.get_mut().push(data_block); - } - }; - } - } - } - - Ok(()) - } -} - -pub fn build_partition_bucket( - pipeline: &mut Pipeline, - params: Arc, -) -> Result<()> { - let input_nums = pipeline.output_len(); - let transform = NewTransformPartitionBucket::create(input_nums, params.clone())?; - - let output = transform.get_output(); - let inputs_port = transform.get_inputs(); - - pipeline.add_pipe(Pipe::create(inputs_port.len(), 1, vec![PipeItem::create( - ProcessorPtr::create(Box::new(transform)), - inputs_port, - vec![output], - )])); - - pipeline.try_resize(input_nums)?; - - let semaphore = Arc::new(Semaphore::new(params.max_spill_io_requests)); - let operator = DataOperator::instance().spill_operator(); - pipeline.add_transform(|input, output| { - let operator = operator.clone(); - TransformAggregateSpillReader::create(input, output, operator, semaphore.clone()) - })?; - - pipeline.add_transform(|input, output| { - Ok(ProcessorPtr::create(TransformFinalAggregate::try_create( - input, - output, - params.clone(), - )?)) - })?; - Ok(()) -} diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/mod.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/mod.rs index 76a55b10e85b3..16152f57bef77 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/mod.rs @@ -12,21 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -mod serde_meta; -mod transform_aggregate_serializer; -mod transform_aggregate_spill_writer; mod transform_deserializer; -mod transform_exchange_aggregate_serializer; -mod transform_exchange_async_barrier; -mod transform_spill_reader; -pub use serde_meta::*; -pub use transform_aggregate_serializer::*; -pub use transform_aggregate_spill_writer::*; pub use transform_deserializer::*; -pub use transform_exchange_aggregate_serializer::*; -pub use transform_exchange_async_barrier::*; -pub use transform_spill_reader::*; + +pub use crate::pipelines::processors::transforms::aggregator::transform_partition_restore::*; pub mod exchange_defines { use arrow_ipc::writer::IpcWriteOptions; diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/serde_meta.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/serde_meta.rs deleted file mode 100644 index b83cf2c97c90e..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/serde_meta.rs +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::ops::Range; - -use databend_common_expression::BlockMetaInfo; -use databend_common_expression::BlockMetaInfoDowncast; -use databend_common_expression::BlockMetaInfoPtr; - -pub const BUCKET_TYPE: usize = 1; -pub const SPILLED_TYPE: usize = 2; - -// Cannot change to enum, because bincode cannot deserialize custom enum -#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq)] -pub struct AggregateSerdeMeta { - pub typ: usize, - pub bucket: isize, - pub location: Option, - pub data_range: Option>, - pub columns_layout: Vec, - // use for new agg hashtable - pub max_partition_count: usize, - pub is_empty: bool, -} - -impl AggregateSerdeMeta { - pub fn create_agg_payload( - bucket: isize, - max_partition_count: usize, - is_empty: bool, - ) -> BlockMetaInfoPtr { - Box::new(AggregateSerdeMeta { - typ: BUCKET_TYPE, - bucket, - location: None, - data_range: None, - columns_layout: vec![], - max_partition_count, - is_empty, - }) - } - - pub fn create_spilled( - bucket: isize, - location: String, - data_range: Range, - columns_layout: Vec, - is_empty: bool, - ) -> BlockMetaInfoPtr { - Box::new(AggregateSerdeMeta { - typ: SPILLED_TYPE, - bucket, - columns_layout, - location: Some(location), - data_range: Some(data_range), - max_partition_count: 0, - is_empty, - }) - } - - pub fn create_agg_spilled( - bucket: isize, - location: String, - data_range: Range, - columns_layout: Vec, - max_partition_count: usize, - ) -> BlockMetaInfoPtr { - Box::new(AggregateSerdeMeta { - typ: SPILLED_TYPE, - bucket, - columns_layout, - location: Some(location), - data_range: Some(data_range), - max_partition_count, - is_empty: false, - }) - } -} - -#[typetag::serde(name = "aggregate_serde")] -impl BlockMetaInfo for AggregateSerdeMeta { - fn equals(&self, info: &Box) -> bool { - AggregateSerdeMeta::downcast_ref_from(info).is_some_and(|other| self == other) - } - - fn clone_self(&self) -> Box { - Box::new(self.clone()) - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_aggregate_serializer.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_aggregate_serializer.rs deleted file mode 100644 index 096485fa98fcc..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_aggregate_serializer.rs +++ /dev/null @@ -1,260 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::any::Any; -use std::fmt::Formatter; -use std::pin::Pin; -use std::sync::Arc; - -use databend_common_exception::Result; -use databend_common_expression::local_block_meta_serde; -use databend_common_expression::BlockMetaInfo; -use databend_common_expression::BlockMetaInfoDowncast; -use databend_common_expression::BlockMetaInfoPtr; -use databend_common_expression::DataBlock; -use databend_common_expression::PayloadFlushState; -use databend_common_pipeline_core::processors::Event; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; -use databend_common_pipeline_core::processors::ProcessorPtr; -use futures::future::BoxFuture; - -use crate::pipelines::processors::transforms::aggregator::AggregateMeta; -use crate::pipelines::processors::transforms::aggregator::AggregatePayload; -use crate::pipelines::processors::transforms::aggregator::AggregateSerdeMeta; -use crate::pipelines::processors::transforms::aggregator::AggregatorParams; -pub struct TransformAggregateSerializer { - params: Arc, - - input: Arc, - output: Arc, - output_data: Option, - input_data: Option, -} - -impl TransformAggregateSerializer { - pub fn try_create( - input: Arc, - output: Arc, - params: Arc, - ) -> Result { - Ok(ProcessorPtr::create(Box::new( - TransformAggregateSerializer { - input, - output, - params, - input_data: None, - output_data: None, - }, - ))) - } -} - -impl Processor for TransformAggregateSerializer { - fn name(&self) -> String { - String::from("TransformAggregateSerializer") - } - - fn as_any(&mut self) -> &mut dyn Any { - self - } - - fn event(&mut self) -> Result { - if self.output.is_finished() { - self.input.finish(); - return Ok(Event::Finished); - } - - if !self.output.can_push() { - self.input.set_not_need_data(); - return Ok(Event::NeedConsume); - } - - if let Some(output_data) = self.output_data.take() { - self.output.push_data(Ok(output_data)); - return Ok(Event::NeedConsume); - } - - if self.input_data.is_some() { - return Ok(Event::Sync); - } - - if self.input.has_data() { - let data_block = self.input.pull_data().unwrap()?; - return self.transform_input_data(data_block); - } - - if self.input.is_finished() { - self.output.finish(); - return Ok(Event::Finished); - } - - self.input.set_need_data(); - Ok(Event::NeedData) - } - - fn process(&mut self) -> Result<()> { - if let Some(stream) = &mut self.input_data { - self.output_data = Option::transpose(stream.next())?; - - if self.output_data.is_none() { - self.input_data = None; - } - } - - Ok(()) - } -} - -impl TransformAggregateSerializer { - fn transform_input_data(&mut self, mut data_block: DataBlock) -> Result { - debug_assert!(data_block.is_empty()); - if let Some(block_meta) = data_block.take_meta() { - if let Some(block_meta) = AggregateMeta::downcast_from(block_meta) { - match block_meta { - AggregateMeta::Spilled(_) => unreachable!(), - AggregateMeta::Serialized(_) => unreachable!(), - AggregateMeta::BucketSpilled(_) => unreachable!(), - AggregateMeta::Partitioned { .. 
} => unreachable!(), - AggregateMeta::AggregateSpilling(_) => unreachable!(), - AggregateMeta::AggregatePayload(p) => { - self.input_data = Some(SerializeAggregateStream::create( - &self.params, - SerializePayload::AggregatePayload(p), - )); - return Ok(Event::Sync); - } - } - } - } - - unreachable!() - } -} - -pub enum SerializePayload { - AggregatePayload(AggregatePayload), -} - -pub enum FlightSerialized { - DataBlock(DataBlock), - Future(BoxFuture<'static, Result>), -} - -unsafe impl Sync for FlightSerialized {} - -pub struct FlightSerializedMeta { - pub serialized_blocks: Vec, -} - -impl FlightSerializedMeta { - pub fn create(blocks: Vec) -> BlockMetaInfoPtr { - Box::new(FlightSerializedMeta { - serialized_blocks: blocks, - }) - } -} - -impl std::fmt::Debug for FlightSerializedMeta { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - f.debug_struct("FlightSerializedMeta").finish() - } -} - -local_block_meta_serde!(FlightSerializedMeta); - -#[typetag::serde(name = "exchange_shuffle")] -impl BlockMetaInfo for FlightSerializedMeta {} - -pub struct SerializeAggregateStream { - _params: Arc, - pub payload: Pin>, - flush_state: PayloadFlushState, - end_iter: bool, - nums: usize, -} - -unsafe impl Send for SerializeAggregateStream {} - -unsafe impl Sync for SerializeAggregateStream {} - -impl SerializeAggregateStream { - pub fn create(params: &Arc, payload: SerializePayload) -> Self { - let payload = Box::pin(payload); - - SerializeAggregateStream { - payload, - flush_state: PayloadFlushState::default(), - _params: params.clone(), - end_iter: false, - nums: 0, - } - } -} - -impl Iterator for SerializeAggregateStream { - type Item = Result; - - fn next(&mut self) -> Option { - Result::transpose(self.next_impl()) - } -} - -impl SerializeAggregateStream { - fn next_impl(&mut self) -> Result> { - if self.end_iter { - return Ok(None); - } - - match self.payload.as_ref().get_ref() { - SerializePayload::AggregatePayload(p) => { - let block = p.payload.aggregate_flush(&mut self.flush_state)?; - - if block.is_none() { - self.end_iter = true; - } - - match block { - Some(block) => { - self.nums += 1; - Ok(Some(block.add_meta(Some( - AggregateSerdeMeta::create_agg_payload( - p.bucket, - p.max_partition_count, - false, - ), - ))?)) - } - None => { - // always return at least one block - if self.nums == 0 { - self.nums += 1; - let block = p.payload.empty_block(Some(1)); - Ok(Some(block.add_meta(Some( - AggregateSerdeMeta::create_agg_payload( - p.bucket, - p.max_partition_count, - true, - ), - ))?)) - } else { - Ok(None) - } - } - } - } - } - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_aggregate_spill_writer.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_aggregate_spill_writer.rs deleted file mode 100644 index 744945849d45a..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_aggregate_spill_writer.rs +++ /dev/null @@ -1,264 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; -use std::sync::Arc; -use std::time::Instant; - -use databend_common_base::base::ProgressValues; -use databend_common_base::runtime::profile::Profile; -use databend_common_base::runtime::profile::ProfileStatisticsName; -use databend_common_catalog::table_context::TableContext; -use databend_common_exception::ErrorCode; -use databend_common_exception::Result; -use databend_common_expression::arrow::serialize_column; -use databend_common_expression::BlockMetaInfoDowncast; -use databend_common_expression::DataBlock; -use databend_common_expression::PartitionedPayload; -use databend_common_pipeline_core::processors::Event; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; -use futures_util::future::BoxFuture; -use log::info; -use opendal::Operator; - -use crate::pipelines::processors::transforms::aggregator::AggregateMeta; -use crate::pipelines::processors::transforms::aggregator::AggregatorParams; -use crate::pipelines::processors::transforms::aggregator::BucketSpilledPayload; -use crate::sessions::QueryContext; -use crate::spillers::Spiller; -use crate::spillers::SpillerConfig; -use crate::spillers::SpillerType; - -pub struct TransformAggregateSpillWriter { - ctx: Arc, - input: Arc, - output: Arc, - _params: Arc, - - spiller: Arc, - spilled_block: Option, - spilling_meta: Option, - spilling_future: Option>>, -} - -impl TransformAggregateSpillWriter { - pub fn try_create( - ctx: Arc, - input: Arc, - output: Arc, - operator: Operator, - params: Arc, - location_prefix: String, - ) -> Result> { - let config = SpillerConfig { - spiller_type: SpillerType::Aggregation, - location_prefix, - disk_spill: None, - use_parquet: ctx.get_settings().get_spilling_file_format()?.is_parquet(), - }; - - let spiller = Spiller::create(ctx.clone(), operator, config.clone())?; - Ok(Box::new(TransformAggregateSpillWriter { - ctx, - input, - output, - _params: params, - spiller: Arc::new(spiller), - spilled_block: None, - spilling_meta: None, - spilling_future: None, - })) - } -} - -#[async_trait::async_trait] -impl Processor for TransformAggregateSpillWriter { - fn name(&self) -> String { - String::from("TransformAggregateSpillWriter") - } - - fn as_any(&mut self) -> &mut dyn Any { - self - } - - fn event(&mut self) -> Result { - if self.output.is_finished() { - self.input.finish(); - return Ok(Event::Finished); - } - - if !self.output.can_push() { - self.input.set_not_need_data(); - return Ok(Event::NeedConsume); - } - - if self.spilling_future.is_some() { - self.input.set_not_need_data(); - return Ok(Event::Async); - } - - while let Some(spilled_block) = self.spilled_block.take() { - if !spilled_block.is_empty() || spilled_block.get_meta().is_some() { - self.output.push_data(Ok(spilled_block)); - return Ok(Event::NeedConsume); - } - } - - if self.spilling_meta.is_some() { - self.input.set_not_need_data(); - return Ok(Event::Sync); - } - - if self.input.has_data() { - let mut data_block = self.input.pull_data().unwrap()?; - - if let Some(block_meta) = data_block - .get_meta() - .and_then(AggregateMeta::downcast_ref_from) - { - if matches!(block_meta, AggregateMeta::AggregateSpilling(_)) { - self.input.set_not_need_data(); - let block_meta = data_block.take_meta().unwrap(); - self.spilling_meta = AggregateMeta::downcast_from(block_meta); - return Ok(Event::Sync); 
- } - } - - self.output.push_data(Ok(data_block)); - return Ok(Event::NeedConsume); - } - - if self.input.is_finished() { - self.output.finish(); - return Ok(Event::Finished); - } - - self.input.set_need_data(); - Ok(Event::NeedData) - } - - fn process(&mut self) -> Result<()> { - if let Some(spilling_meta) = self.spilling_meta.take() { - match spilling_meta { - AggregateMeta::AggregateSpilling(payload) => { - self.spilling_future = Some(agg_spilling_aggregate_payload( - self.ctx.clone(), - self.spiller.clone(), - payload, - )?); - - return Ok(()); - } - _ => { - return Err(ErrorCode::Internal("")); - } - } - } - - Ok(()) - } - - #[async_backtrace::framed] - async fn async_process(&mut self) -> Result<()> { - if let Some(spilling_future) = self.spilling_future.take() { - self.spilled_block = Some(spilling_future.await?); - } - - Ok(()) - } -} - -pub fn agg_spilling_aggregate_payload( - ctx: Arc, - spiller: Arc, - partitioned_payload: PartitionedPayload, -) -> Result>> { - let mut write_size = 0; - let partition_count = partitioned_payload.partition_count(); - let mut write_data = Vec::with_capacity(partition_count); - let mut spilled_buckets_payloads = Vec::with_capacity(partition_count); - // Record how many rows are spilled. - let mut rows = 0; - let location = spiller.create_unique_location(); - for (bucket, payload) in partitioned_payload.payloads.into_iter().enumerate() { - if payload.len() == 0 { - continue; - } - - let data_block = payload.aggregate_flush_all()?; - rows += data_block.num_rows(); - - let begin = write_size; - let columns = data_block.columns().to_vec(); - let mut columns_data = Vec::with_capacity(columns.len()); - let mut columns_layout = Vec::with_capacity(columns.len()); - for column in columns.into_iter() { - let column = column.into_column(data_block.num_rows()); - let column_data = serialize_column(&column); - write_size += column_data.len() as u64; - columns_layout.push(column_data.len() as u64); - columns_data.push(column_data); - } - - write_data.push(columns_data); - spilled_buckets_payloads.push(BucketSpilledPayload { - bucket: bucket as isize, - location: location.clone(), - data_range: begin..write_size, - columns_layout, - max_partition_count: partition_count, - }); - } - - Ok(Box::pin(async move { - let instant = Instant::now(); - if !write_data.is_empty() { - let (location, write_bytes) = spiller - .spill_stream_aggregate_buffer(Some(location), write_data) - .await?; - // perf - { - Profile::record_usize_profile(ProfileStatisticsName::RemoteSpillWriteCount, 1); - Profile::record_usize_profile( - ProfileStatisticsName::RemoteSpillWriteBytes, - write_bytes, - ); - Profile::record_usize_profile( - ProfileStatisticsName::RemoteSpillWriteTime, - instant.elapsed().as_millis() as usize, - ); - } - - { - let progress_val = ProgressValues { - rows, - bytes: write_bytes, - }; - ctx.get_aggregate_spill_progress().incr(&progress_val); - } - - info!( - "Write aggregate spill {} successfully, elapsed: {:?}", - location, - instant.elapsed() - ); - } - - Ok(DataBlock::empty_with_meta(AggregateMeta::create_spilled( - spilled_buckets_payloads, - ))) - })) -} diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_deserializer.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_deserializer.rs index f07f37e77305b..ddd0bd38d4a0f 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_deserializer.rs +++ 
b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_deserializer.rs @@ -17,14 +17,9 @@ use std::sync::Arc; use arrow_schema::Schema as ArrowSchema; use databend_common_exception::ErrorCode; use databend_common_exception::Result; -use databend_common_expression::types::ArrayType; -use databend_common_expression::types::NumberType; -use databend_common_expression::types::UInt64Type; -use databend_common_expression::types::ValueType; -use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::BlockMetaInfoPtr; use databend_common_expression::DataBlock; use databend_common_expression::DataSchemaRef; -use databend_common_io::prelude::bincode_deserialize_from_slice; use databend_common_io::prelude::BinaryRead; use databend_common_pipeline_core::processors::InputPort; use databend_common_pipeline_core::processors::OutputPort; @@ -33,11 +28,6 @@ use databend_common_pipeline_transforms::processors::BlockMetaTransform; use databend_common_pipeline_transforms::processors::BlockMetaTransformer; use databend_common_pipeline_transforms::processors::UnknownMode; -use crate::pipelines::processors::transforms::aggregator::exchange_defines; -use crate::pipelines::processors::transforms::aggregator::AggregateMeta; -use crate::pipelines::processors::transforms::aggregator::AggregateSerdeMeta; -use crate::pipelines::processors::transforms::aggregator::BucketSpilledPayload; -use crate::pipelines::processors::transforms::aggregator::BUCKET_TYPE; use crate::servers::flight::v1::exchange::serde::deserialize_block; use crate::servers::flight::v1::exchange::serde::ExchangeDeserializeMeta; use crate::servers::flight::v1::packets::DataPacket; @@ -69,8 +59,9 @@ impl TransformDeserializer { fn recv_data(&self, dict: Vec<DataPacket>, fragment_data: FragmentData) -> Result<DataBlock> { const ROW_HEADER_SIZE: usize = std::mem::size_of::<u32>(); - let meta = bincode_deserialize_from_slice(&fragment_data.get_meta()[ROW_HEADER_SIZE..]) - .map_err(|_| ErrorCode::BadBytes("block meta deserialize error when exchange"))?; + let meta: Option<BlockMetaInfoPtr> = + serde_json::from_slice(&fragment_data.get_meta()[ROW_HEADER_SIZE..]) + .map_err(|_| ErrorCode::BadBytes("block meta deserialize error when exchange"))?; let mut row_count_meta = &fragment_data.get_meta()[..ROW_HEADER_SIZE]; let row_count: u32 = row_count_meta.read_scalar()?; @@ -79,91 +70,8 @@ impl TransformDeserializer { return Ok(DataBlock::new_with_meta(vec![], 0, meta)); } - let data_block = match &meta { - None => { - deserialize_block(dict, fragment_data, &self.schema, self.arrow_schema.clone())? - } - Some(meta) => match AggregateSerdeMeta::downcast_ref_from(meta) { - None => { - deserialize_block(dict, fragment_data, &self.schema, self.arrow_schema.clone())?
- } - Some(meta) => { - return match meta.typ == BUCKET_TYPE { - true => { - let mut block = deserialize_block( - dict, - fragment_data, - &self.schema, - self.arrow_schema.clone(), - )?; - - if meta.is_empty { - block = block.slice(0..0); - } - - Ok(DataBlock::empty_with_meta( - AggregateMeta::create_serialized( - meta.bucket, - block, - meta.max_partition_count, - ), - )) - } - false => { - let data_schema = Arc::new(exchange_defines::spilled_schema()); - let arrow_schema = Arc::new(exchange_defines::spilled_arrow_schema()); - let data_block = deserialize_block( - dict, - fragment_data, - &data_schema, - arrow_schema.clone(), - )?; - - let columns = data_block - .columns() - .iter() - .map(|c| c.value.clone().into_column()) - .try_collect::>() - .unwrap(); - - let buckets = - NumberType::::try_downcast_column(&columns[0]).unwrap(); - let data_range_start = - NumberType::::try_downcast_column(&columns[1]).unwrap(); - let data_range_end = - NumberType::::try_downcast_column(&columns[2]).unwrap(); - let columns_layout = - ArrayType::::try_downcast_column(&columns[3]).unwrap(); - - let columns_layout_data = columns_layout.values().as_slice(); - let columns_layout_offsets = columns_layout.offsets(); - - let mut buckets_payload = Vec::with_capacity(data_block.num_rows()); - for index in 0..data_block.num_rows() { - unsafe { - buckets_payload.push(BucketSpilledPayload { - bucket: *buckets.get_unchecked(index) as isize, - location: meta.location.clone().unwrap(), - data_range: *data_range_start.get_unchecked(index) - ..*data_range_end.get_unchecked(index), - columns_layout: columns_layout_data[columns_layout_offsets - [index] - as usize - ..columns_layout_offsets[index + 1] as usize] - .to_vec(), - max_partition_count: meta.max_partition_count, - }); - } - } - - Ok(DataBlock::empty_with_meta(AggregateMeta::create_spilled( - buckets_payload, - ))) - } - }; - } - }, - }; + let data_block = + deserialize_block(dict, fragment_data, &self.schema, self.arrow_schema.clone())?; match data_block.num_columns() == 0 { true => Ok(DataBlock::new_with_meta(vec![], row_count as usize, meta)), diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_aggregate_serializer.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_aggregate_serializer.rs deleted file mode 100644 index 1274bd7fc94d9..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_aggregate_serializer.rs +++ /dev/null @@ -1,288 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::sync::Arc; -use std::time::Instant; - -use arrow_ipc::writer::IpcWriteOptions; -use arrow_ipc::CompressionType; -use databend_common_base::base::ProgressValues; -use databend_common_base::runtime::profile::Profile; -use databend_common_base::runtime::profile::ProfileStatisticsName; -use databend_common_catalog::table_context::TableContext; -use databend_common_exception::Result; -use databend_common_expression::arrow::serialize_column; -use databend_common_expression::types::ArgType; -use databend_common_expression::types::ArrayType; -use databend_common_expression::types::Int64Type; -use databend_common_expression::types::UInt64Type; -use databend_common_expression::types::ValueType; -use databend_common_expression::BlockMetaInfoDowncast; -use databend_common_expression::DataBlock; -use databend_common_expression::DataSchemaRef; -use databend_common_expression::FromData; -use databend_common_expression::PartitionedPayload; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; -use databend_common_pipeline_transforms::processors::BlockMetaTransform; -use databend_common_pipeline_transforms::processors::BlockMetaTransformer; -use databend_common_settings::FlightCompression; -use futures_util::future::BoxFuture; -use log::info; -use opendal::Operator; - -use super::SerializePayload; -use crate::pipelines::processors::transforms::aggregator::agg_spilling_aggregate_payload as local_agg_spilling_aggregate_payload; -use crate::pipelines::processors::transforms::aggregator::aggregate_exchange_injector::compute_block_number; -use crate::pipelines::processors::transforms::aggregator::aggregate_meta::AggregateMeta; -use crate::pipelines::processors::transforms::aggregator::exchange_defines; -use crate::pipelines::processors::transforms::aggregator::AggregateSerdeMeta; -use crate::pipelines::processors::transforms::aggregator::AggregatorParams; -use crate::pipelines::processors::transforms::aggregator::FlightSerialized; -use crate::pipelines::processors::transforms::aggregator::FlightSerializedMeta; -use crate::pipelines::processors::transforms::aggregator::SerializeAggregateStream; -use crate::servers::flight::v1::exchange::serde::serialize_block; -use crate::servers::flight::v1::exchange::ExchangeShuffleMeta; -use crate::sessions::QueryContext; -use crate::spillers::Spiller; -use crate::spillers::SpillerConfig; -use crate::spillers::SpillerType; - -pub struct TransformExchangeAggregateSerializer { - ctx: Arc, - local_pos: usize, - options: IpcWriteOptions, - - params: Arc, - spiller: Arc, -} - -impl TransformExchangeAggregateSerializer { - #[allow(clippy::too_many_arguments)] - pub fn try_create( - ctx: Arc, - input: Arc, - output: Arc, - - operator: Operator, - location_prefix: String, - params: Arc, - compression: Option, - _schema: DataSchemaRef, - local_pos: usize, - ) -> Result> { - let compression = match compression { - None => None, - Some(compression) => match compression { - FlightCompression::Lz4 => Some(CompressionType::LZ4_FRAME), - FlightCompression::Zstd => Some(CompressionType::ZSTD), - }, - }; - let config = SpillerConfig { - spiller_type: SpillerType::Aggregation, - location_prefix, - disk_spill: None, - use_parquet: ctx.get_settings().get_spilling_file_format()?.is_parquet(), - }; - - let spiller = Spiller::create(ctx.clone(), operator, config.clone())?; - Ok(BlockMetaTransformer::create( - input, - output, - TransformExchangeAggregateSerializer 
{ - ctx, - params, - local_pos, - spiller: spiller.into(), - options: IpcWriteOptions::default() - .try_with_compression(compression) - .unwrap(), - }, - )) - } -} - -impl BlockMetaTransform for TransformExchangeAggregateSerializer { - const NAME: &'static str = "TransformExchangeAggregateSerializer"; - - fn transform(&mut self, meta: ExchangeShuffleMeta) -> Result> { - let mut serialized_blocks = Vec::with_capacity(meta.blocks.len()); - for (index, mut block) in meta.blocks.into_iter().enumerate() { - if block.is_empty() && block.get_meta().is_none() { - serialized_blocks.push(FlightSerialized::DataBlock(block)); - continue; - } - - match AggregateMeta::downcast_from(block.take_meta().unwrap()) { - None => unreachable!(), - Some(AggregateMeta::Spilled(_)) => unreachable!(), - Some(AggregateMeta::Serialized(_)) => unreachable!(), - Some(AggregateMeta::BucketSpilled(_)) => unreachable!(), - Some(AggregateMeta::Partitioned { .. }) => unreachable!(), - Some(AggregateMeta::AggregateSpilling(payload)) => { - serialized_blocks.push(FlightSerialized::Future( - match index == self.local_pos { - true => local_agg_spilling_aggregate_payload( - self.ctx.clone(), - self.spiller.clone(), - payload, - )?, - false => exchange_agg_spilling_aggregate_payload( - self.ctx.clone(), - self.spiller.clone(), - payload, - )?, - }, - )); - } - - Some(AggregateMeta::AggregatePayload(p)) => { - let (bucket, max_partition_count) = (p.bucket, p.max_partition_count); - - if index == self.local_pos { - serialized_blocks.push(FlightSerialized::DataBlock( - block.add_meta(Some(Box::new(AggregateMeta::AggregatePayload(p))))?, - )); - continue; - } - - let block_number = compute_block_number(bucket, max_partition_count)?; - let stream = SerializeAggregateStream::create( - &self.params, - SerializePayload::AggregatePayload(p), - ); - let mut stream_blocks = stream.into_iter().collect::>>()?; - debug_assert!(!stream_blocks.is_empty()); - let mut c = DataBlock::concat(&stream_blocks)?; - if let Some(meta) = stream_blocks[0].take_meta() { - c.replace_meta(meta); - } - let c = serialize_block(block_number, c, &self.options)?; - serialized_blocks.push(FlightSerialized::DataBlock(c)); - } - }; - } - - Ok(vec![DataBlock::empty_with_meta( - FlightSerializedMeta::create(serialized_blocks), - )]) - } -} - -fn exchange_agg_spilling_aggregate_payload( - ctx: Arc, - spiller: Arc, - partitioned_payload: PartitionedPayload, -) -> Result>> { - let partition_count = partitioned_payload.partition_count(); - let mut write_size = 0; - let mut write_data = Vec::with_capacity(partition_count); - let mut buckets_column_data = Vec::with_capacity(partition_count); - let mut data_range_start_column_data = Vec::with_capacity(partition_count); - let mut data_range_end_column_data = Vec::with_capacity(partition_count); - let mut columns_layout_column_data = Vec::with_capacity(partition_count); - // Record how many rows are spilled. 
- let mut rows = 0; - - for (bucket, payload) in partitioned_payload.payloads.into_iter().enumerate() { - if payload.len() == 0 { - continue; - } - - let data_block = payload.aggregate_flush_all()?; - rows += data_block.num_rows(); - - let old_write_size = write_size; - let columns = data_block.columns().to_vec(); - let mut columns_data = Vec::with_capacity(columns.len()); - let mut columns_layout = Vec::with_capacity(columns.len()); - - for column in columns.into_iter() { - let column = column.into_column(data_block.num_rows()); - let column_data = serialize_column(&column); - write_size += column_data.len() as u64; - columns_layout.push(column_data.len() as u64); - columns_data.push(column_data); - } - - write_data.push(columns_data); - buckets_column_data.push(bucket as i64); - data_range_end_column_data.push(write_size); - columns_layout_column_data.push(columns_layout); - data_range_start_column_data.push(old_write_size); - } - - Ok(Box::pin(async move { - if !write_data.is_empty() { - let instant = Instant::now(); - let (location, write_bytes) = spiller - .spill_stream_aggregate_buffer(None, write_data) - .await?; - // perf - { - Profile::record_usize_profile(ProfileStatisticsName::RemoteSpillWriteCount, 1); - Profile::record_usize_profile( - ProfileStatisticsName::RemoteSpillWriteBytes, - write_bytes, - ); - Profile::record_usize_profile( - ProfileStatisticsName::RemoteSpillWriteTime, - instant.elapsed().as_millis() as usize, - ); - } - - { - { - let progress_val = ProgressValues { - rows, - bytes: write_bytes, - }; - ctx.get_aggregate_spill_progress().incr(&progress_val); - } - } - - info!( - "Write aggregate spill {} successfully, elapsed: {:?}", - location, - instant.elapsed() - ); - - let data_block = DataBlock::new_from_columns(vec![ - Int64Type::from_data(buckets_column_data), - UInt64Type::from_data(data_range_start_column_data), - UInt64Type::from_data(data_range_end_column_data), - ArrayType::upcast_column(ArrayType::::column_from_iter( - columns_layout_column_data - .into_iter() - .map(|x| UInt64Type::column_from_iter(x.into_iter(), &[])), - &[], - )), - ]); - - let data_block = data_block.add_meta(Some(AggregateSerdeMeta::create_agg_spilled( - -1, - location.clone(), - 0..0, - vec![], - partition_count, - )))?; - - let write_options = exchange_defines::spilled_write_options(); - return serialize_block(-1, data_block, &write_options); - } - - Ok(DataBlock::empty()) - })) -} diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_async_barrier.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_async_barrier.rs deleted file mode 100644 index 1628bc9af5beb..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_async_barrier.rs +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::sync::Arc; - -use databend_common_exception::ErrorCode; -use databend_common_exception::Result; -use databend_common_expression::BlockMetaInfoDowncast; -use databend_common_expression::DataBlock; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::ProcessorPtr; -use databend_common_pipeline_transforms::processors::AsyncTransform; -use databend_common_pipeline_transforms::processors::AsyncTransformer; - -use crate::pipelines::processors::transforms::aggregator::FlightSerialized; -use crate::pipelines::processors::transforms::aggregator::FlightSerializedMeta; -use crate::servers::flight::v1::exchange::ExchangeShuffleMeta; - -pub struct TransformExchangeAsyncBarrier; - -impl TransformExchangeAsyncBarrier { - pub fn try_create(input: Arc, output: Arc) -> Result { - Ok(ProcessorPtr::create(AsyncTransformer::create( - input, - output, - TransformExchangeAsyncBarrier {}, - ))) - } -} - -#[async_trait::async_trait] -impl AsyncTransform for TransformExchangeAsyncBarrier { - const NAME: &'static str = "TransformExchangeAsyncBarrier"; - - async fn transform(&mut self, mut data: DataBlock) -> Result { - if let Some(meta) = data - .take_meta() - .and_then(FlightSerializedMeta::downcast_from) - { - let mut futures = Vec::with_capacity(meta.serialized_blocks.len()); - - for serialized_block in meta.serialized_blocks { - futures.push(databend_common_base::runtime::spawn(async move { - match serialized_block { - FlightSerialized::DataBlock(v) => Ok(v), - FlightSerialized::Future(f) => f.await, - } - })); - } - - return match futures::future::try_join_all(futures).await { - Err(_) => Err(ErrorCode::TokioError("Cannot join tokio job")), - Ok(spilled_data) => Ok(DataBlock::empty_with_meta(ExchangeShuffleMeta::create( - spilled_data.into_iter().collect::>>()?, - ))), - }; - } - - Err(ErrorCode::Internal("")) - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_spill_reader.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_spill_reader.rs deleted file mode 100644 index fd03b09e2f3f7..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_spill_reader.rs +++ /dev/null @@ -1,314 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::any::Any; -use std::collections::VecDeque; -use std::sync::Arc; -use std::time::Duration; -use std::time::Instant; - -use databend_common_base::runtime::profile::Profile; -use databend_common_base::runtime::profile::ProfileStatisticsName; -use databend_common_exception::ErrorCode; -use databend_common_exception::Result; -use databend_common_expression::arrow::deserialize_column; -use databend_common_expression::BlockMetaInfoDowncast; -use databend_common_expression::BlockMetaInfoPtr; -use databend_common_expression::DataBlock; -use databend_common_pipeline_core::processors::Event; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; -use databend_common_pipeline_core::processors::ProcessorPtr; -use itertools::Itertools; -use log::info; -use opendal::Operator; -use tokio::sync::Semaphore; - -use crate::pipelines::processors::transforms::aggregator::AggregateMeta; -use crate::pipelines::processors::transforms::aggregator::BucketSpilledPayload; -use crate::pipelines::processors::transforms::aggregator::SerializedPayload; - -type DeserializingMeta = (AggregateMeta, VecDeque>); - -pub struct TransformSpillReader { - input: Arc, - output: Arc, - - operator: Operator, - semaphore: Arc, - deserialized_meta: Option, - reading_meta: Option, - deserializing_meta: Option, -} - -#[async_trait::async_trait] -impl Processor for TransformSpillReader { - fn name(&self) -> String { - String::from("TransformSpillReader") - } - - fn as_any(&mut self) -> &mut dyn Any { - self - } - - fn event(&mut self) -> Result { - if self.output.is_finished() { - self.input.finish(); - return Ok(Event::Finished); - } - - if !self.output.can_push() { - self.input.set_not_need_data(); - return Ok(Event::NeedConsume); - } - - if let Some(deserialized_meta) = self.deserialized_meta.take() { - self.output - .push_data(Ok(DataBlock::empty_with_meta(deserialized_meta))); - return Ok(Event::NeedConsume); - } - - if self.deserializing_meta.is_some() { - self.input.set_not_need_data(); - return Ok(Event::Sync); - } - - if self.reading_meta.is_some() { - self.input.set_not_need_data(); - return Ok(Event::Async); - } - - if self.input.has_data() { - let mut data_block = self.input.pull_data().unwrap()?; - - if let Some(block_meta) = data_block - .get_meta() - .and_then(AggregateMeta::downcast_ref_from) - { - if matches!(block_meta, AggregateMeta::BucketSpilled(_)) { - self.input.set_not_need_data(); - let block_meta = data_block.take_meta().unwrap(); - self.reading_meta = AggregateMeta::downcast_from(block_meta); - return Ok(Event::Async); - } - - if let AggregateMeta::Partitioned { data, .. 
} = block_meta { - if data - .iter() - .any(|meta| matches!(meta, AggregateMeta::BucketSpilled(_))) - { - self.input.set_not_need_data(); - let block_meta = data_block.take_meta().unwrap(); - self.reading_meta = AggregateMeta::downcast_from(block_meta); - return Ok(Event::Async); - } - } - } - - self.output.push_data(Ok(data_block)); - return Ok(Event::NeedConsume); - } - - if self.input.is_finished() { - self.output.finish(); - return Ok(Event::Finished); - } - - self.input.set_need_data(); - Ok(Event::NeedData) - } - - fn process(&mut self) -> Result<()> { - if let Some((meta, mut read_data)) = self.deserializing_meta.take() { - match meta { - AggregateMeta::Spilled(_) => unreachable!(), - AggregateMeta::AggregatePayload(_) => unreachable!(), - AggregateMeta::AggregateSpilling(_) => unreachable!(), - AggregateMeta::Serialized(_) => unreachable!(), - AggregateMeta::BucketSpilled(payload) => { - debug_assert!(read_data.len() == 1); - let data = read_data.pop_front().unwrap(); - - self.deserialized_meta = Some(Box::new(Self::deserialize(payload, data))); - } - AggregateMeta::Partitioned { bucket, data } => { - let mut new_data = Vec::with_capacity(data.len()); - - for meta in data { - if matches!(&meta, AggregateMeta::BucketSpilled(_)) { - if let AggregateMeta::BucketSpilled(payload) = meta { - let data = read_data.pop_front().unwrap(); - new_data.push(Self::deserialize(payload, data)); - } - - continue; - } - - new_data.push(meta); - } - - self.deserialized_meta = - Some(AggregateMeta::create_partitioned(bucket, new_data)); - } - } - } - - Ok(()) - } - - #[async_backtrace::framed] - async fn async_process(&mut self) -> Result<()> { - if let Some(block_meta) = self.reading_meta.take() { - match &block_meta { - AggregateMeta::Spilled(_) => unreachable!(), - AggregateMeta::AggregatePayload(_) => unreachable!(), - AggregateMeta::AggregateSpilling(_) => unreachable!(), - AggregateMeta::Serialized(_) => unreachable!(), - AggregateMeta::BucketSpilled(payload) => { - let _guard = self.semaphore.acquire().await; - let instant = Instant::now(); - let data = self - .operator - .read_with(&payload.location) - .range(payload.data_range.clone()) - .await? - .to_vec(); - - info!( - "Read aggregate spill {} successfully, elapsed: {:?}", - &payload.location, - instant.elapsed() - ); - - self.deserializing_meta = Some((block_meta, VecDeque::from(vec![data]))); - } - AggregateMeta::Partitioned { data, .. } => { - // For log progress. - let mut total_elapsed = Duration::default(); - let log_interval = 100; - let mut processed_count = 0; - - let mut read_data = Vec::with_capacity(data.len()); - for meta in data { - if let AggregateMeta::BucketSpilled(payload) = meta { - let location = payload.location.clone(); - let operator = self.operator.clone(); - let data_range = payload.data_range.clone(); - let semaphore = self.semaphore.clone(); - read_data.push(databend_common_base::runtime::spawn(async move { - let _guard = semaphore.acquire().await; - let instant = Instant::now(); - let data = operator - .read_with(&location) - .range(data_range) - .await? 
- .to_vec(); - - // perf - { - Profile::record_usize_profile( - ProfileStatisticsName::RemoteSpillReadCount, - 1, - ); - Profile::record_usize_profile( - ProfileStatisticsName::RemoteSpillReadBytes, - data.len(), - ); - Profile::record_usize_profile( - ProfileStatisticsName::RemoteSpillReadTime, - instant.elapsed().as_millis() as usize, - ); - } - - total_elapsed += instant.elapsed(); - processed_count += 1; - - // log the progress - if processed_count % log_interval == 0 { - info!( - "Read aggregate {}/{} spilled buckets, elapsed: {:?}", - processed_count, - data.len(), - total_elapsed - ); - } - - Ok(data) - })); - } - } - - match futures::future::try_join_all(read_data).await { - Err(_) => { - return Err(ErrorCode::TokioError("Cannot join tokio job")); - } - Ok(read_data) => { - let read_data: std::result::Result>, opendal::Error> = - read_data.into_iter().try_collect(); - - self.deserializing_meta = Some((block_meta, read_data?)); - } - }; - - if processed_count != 0 { - info!( - "Read {} aggregate spills successfully, total elapsed: {:?}", - processed_count, total_elapsed - ); - } - } - } - } - - Ok(()) - } -} - -impl TransformSpillReader { - pub fn create( - input: Arc, - output: Arc, - operator: Operator, - semaphore: Arc, - ) -> Result { - Ok(ProcessorPtr::create(Box::new(TransformSpillReader { - input, - output, - operator, - semaphore, - deserialized_meta: None, - reading_meta: None, - deserializing_meta: None, - }))) - } - - fn deserialize(payload: BucketSpilledPayload, data: Vec) -> AggregateMeta { - let mut begin = 0; - let mut columns = Vec::with_capacity(payload.columns_layout.len()); - - for column_layout in payload.columns_layout { - columns.push(deserialize_column(&data[begin..begin + column_layout as usize]).unwrap()); - begin += column_layout as usize; - } - - AggregateMeta::Serialized(SerializedPayload { - bucket: payload.bucket, - data_block: DataBlock::new_from_columns(columns), - max_partition_count: payload.max_partition_count, - }) - } -} - -pub type TransformAggregateSpillReader = TransformSpillReader; diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs index 048d7e6ed5a1c..4c044f74c2853 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs @@ -15,16 +15,21 @@ use std::sync::Arc; use bumpalo::Bump; +use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::AggregateHashTable; +use databend_common_expression::BlockMetaInfoDowncast; use databend_common_expression::DataBlock; use databend_common_expression::HashTableConfig; +use databend_common_expression::InputColumns; +use databend_common_expression::Payload; use databend_common_expression::PayloadFlushState; +use databend_common_expression::ProbeState; use databend_common_pipeline_core::processors::InputPort; use databend_common_pipeline_core::processors::OutputPort; use databend_common_pipeline_core::processors::Processor; -use databend_common_pipeline_transforms::processors::BlockMetaTransform; -use databend_common_pipeline_transforms::processors::BlockMetaTransformer; +use databend_common_pipeline_transforms::AccumulatingTransform; +use databend_common_pipeline_transforms::AccumulatingTransformer; use 
crate::pipelines::processors::transforms::aggregator::aggregate_meta::AggregateMeta; use crate::pipelines::processors::transforms::aggregator::AggregatorParams; @@ -32,108 +37,160 @@ use crate::pipelines::processors::transforms::aggregator::AggregatorParams; pub struct TransformFinalAggregate { params: Arc, flush_state: PayloadFlushState, + hash_table: AggregateHashTable, + + working_partition: isize, +} + +impl AccumulatingTransform for TransformFinalAggregate { + const NAME: &'static str = "TransformFinalAggregate"; + + fn transform(&mut self, mut data: DataBlock) -> Result> { + let Some(meta) = data.take_meta() else { + return Err(ErrorCode::Internal( + "Internal, TransformFinalAggregate only recv DataBlock with meta.", + )); + }; + + let Some(aggregate_meta) = AggregateMeta::downcast_from(meta) else { + return Err(ErrorCode::Internal( + "Internal, TransformFinalAggregate only recv DataBlock with meta.", + )); + }; + + let mut blocks = vec![]; + match aggregate_meta { + AggregateMeta::SpilledPayload(_) => unreachable!(), + AggregateMeta::FinalPartition(_) => unreachable!(), + AggregateMeta::InFlightPayload(payload) => { + debug_assert!(payload.partition >= self.working_partition); + debug_assert_eq!(payload.max_partition, payload.global_max_partition); + + if self.working_partition != payload.partition { + self.working_partition = payload.partition; + blocks.push(self.flush_result_blocks()?); + } + + if !data.is_empty() { + let payload = self.deserialize_flight(data)?; + self.hash_table + .combine_payload(&payload, &mut self.flush_state)?; + } + } + AggregateMeta::AggregatePayload(payload) => { + debug_assert!(payload.partition >= self.working_partition); + debug_assert_eq!(payload.max_partition, payload.global_max_partition); + + if self.working_partition != payload.partition { + self.working_partition = payload.partition; + blocks.push(self.flush_result_blocks()?); + } + + if payload.payload.len() != 0 { + self.hash_table + .combine_payload(&payload.payload, &mut self.flush_state)?; + } + } + }; + + Ok(blocks) + } + + fn on_finish(&mut self, output: bool) -> Result> { + if !output { + return Ok(vec![]); + } + + Ok(vec![self.flush_result_blocks()?]) + } } impl TransformFinalAggregate { pub fn try_create( input: Arc, output: Arc, - params: Arc, ) -> Result> { - Ok(BlockMetaTransformer::create( + let config = HashTableConfig::default().with_initial_radix_bits(0); + + let hash_table = AggregateHashTable::new( + params.group_data_types.clone(), + params.aggregate_functions.clone(), + config, + Arc::new(Bump::new()), + ); + + Ok(AccumulatingTransformer::create( input, output, TransformFinalAggregate { params, + hash_table, + working_partition: 0, flush_state: PayloadFlushState::default(), }, )) } - fn transform_agg_hashtable(&mut self, meta: AggregateMeta) -> Result { - let mut agg_hashtable: Option = None; - if let AggregateMeta::Partitioned { bucket, data } = meta { - for bucket_data in data { - match bucket_data { - AggregateMeta::Serialized(payload) => match agg_hashtable.as_mut() { - Some(ht) => { - debug_assert!(bucket == payload.bucket); - - let payload = payload.convert_to_partitioned_payload( - self.params.group_data_types.clone(), - self.params.aggregate_functions.clone(), - self.params.num_states(), - 0, - Arc::new(Bump::new()), - )?; - ht.combine_payloads(&payload, &mut self.flush_state)?; - } - None => { - debug_assert!(bucket == payload.bucket); - agg_hashtable = Some(payload.convert_to_aggregate_table( - self.params.group_data_types.clone(), - 
self.params.aggregate_functions.clone(), - self.params.num_states(), - 0, - Arc::new(Bump::new()), - true, - )?); - } - }, - AggregateMeta::AggregatePayload(payload) => match agg_hashtable.as_mut() { - Some(ht) => { - debug_assert!(bucket == payload.bucket); - ht.combine_payload(&payload.payload, &mut self.flush_state)?; - } - None => { - debug_assert!(bucket == payload.bucket); - let capacity = - AggregateHashTable::get_capacity_for_count(payload.payload.len()); - let mut hashtable = AggregateHashTable::new_with_capacity( - self.params.group_data_types.clone(), - self.params.aggregate_functions.clone(), - HashTableConfig::default().with_initial_radix_bits(0), - capacity, - Arc::new(Bump::new()), - ); - hashtable.combine_payload(&payload.payload, &mut self.flush_state)?; - agg_hashtable = Some(hashtable); - } - }, - _ => unreachable!(), - } - } - } + fn deserialize_flight(&mut self, data: DataBlock) -> Result { + let rows_num = data.num_rows(); + let group_len = self.params.group_data_types.len(); - if let Some(mut ht) = agg_hashtable { - let mut blocks = vec![]; - self.flush_state.clear(); - - loop { - if ht.merge_result(&mut self.flush_state)? { - let mut cols = self.flush_state.take_aggregate_results(); - cols.extend_from_slice(&self.flush_state.take_group_columns()); - blocks.push(DataBlock::new_from_columns(cols)); - } else { - break; - } - } + let mut state = ProbeState::default(); - if blocks.is_empty() { - return Ok(self.params.empty_result_block()); - } - return DataBlock::concat(&blocks); - } + // create single partition hash table for deserialize + let capacity = AggregateHashTable::get_capacity_for_count(rows_num); + let config = HashTableConfig::default().with_initial_radix_bits(0); + let mut hashtable = AggregateHashTable::new_directly( + self.params.group_data_types.clone(), + self.params.aggregate_functions.clone(), + config, + capacity, + Arc::new(Bump::new()), + false, + ); + + let num_states = self.params.num_states(); + let states_index: Vec = (0..num_states).collect(); + let agg_states = InputColumns::new_block_proxy(&states_index, &data); + + let group_index: Vec = (num_states..(num_states + group_len)).collect(); + let group_columns = InputColumns::new_block_proxy(&group_index, &data); - Ok(self.params.empty_result_block()) + let _ = hashtable.add_groups( + &mut state, + group_columns, + &[(&[]).into()], + agg_states, + rows_num, + )?; + + hashtable.payload.mark_min_cardinality(); + assert_eq!(hashtable.payload.payloads.len(), 1); + Ok(hashtable.payload.payloads.pop().unwrap()) } -} -impl BlockMetaTransform for TransformFinalAggregate { - const NAME: &'static str = "TransformFinalAggregate"; + fn flush_result_blocks(&mut self) -> Result { + let mut blocks = vec![]; + self.flush_state.clear(); + + while self.hash_table.merge_result(&mut self.flush_state)? 
{ + let mut cols = self.flush_state.take_aggregate_results(); + cols.extend_from_slice(&self.flush_state.take_group_columns()); + blocks.push(DataBlock::new_from_columns(cols)); + } - fn transform(&mut self, meta: AggregateMeta) -> Result> { - Ok(vec![self.transform_agg_hashtable(meta)?]) + let config = HashTableConfig::default().with_initial_radix_bits(0); + self.hash_table = AggregateHashTable::new( + self.params.group_data_types.clone(), + self.params.aggregate_functions.clone(), + config, + Arc::new(Bump::new()), + ); + + match blocks.is_empty() { + true => Ok(self.params.empty_result_block()), + false => DataBlock::concat(&blocks), + } } } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs index 404d963f8ba6e..e92159a9c15d3 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs @@ -17,15 +17,22 @@ use std::time::Instant; use std::vec; use bumpalo::Bump; +use byteorder::BigEndian; +use byteorder::WriteBytesExt; use databend_common_base::base::convert_byte_size; use databend_common_base::base::convert_number_size; use databend_common_catalog::plan::AggIndexMeta; +use databend_common_catalog::table_context::TableContext; +use databend_common_config::GlobalConfig; use databend_common_exception::Result; +use databend_common_expression::arrow::write_column; use databend_common_expression::AggregateHashTable; use databend_common_expression::BlockMetaInfoDowncast; use databend_common_expression::DataBlock; use databend_common_expression::HashTableConfig; use databend_common_expression::InputColumns; +use databend_common_expression::PartitionedPayload; +use databend_common_expression::Payload; use databend_common_expression::PayloadFlushState; use databend_common_expression::ProbeState; use databend_common_pipeline_core::processors::InputPort; @@ -34,11 +41,18 @@ use databend_common_pipeline_core::processors::Processor; use databend_common_pipeline_transforms::processors::AccumulatingTransform; use databend_common_pipeline_transforms::processors::AccumulatingTransformer; use databend_common_pipeline_transforms::MemorySettings; +use opendal::Operator; use crate::pipelines::memory_settings::MemorySettingsExt; use crate::pipelines::processors::transforms::aggregator::aggregate_meta::AggregateMeta; use crate::pipelines::processors::transforms::aggregator::AggregatorParams; +use crate::pipelines::processors::transforms::aggregator::SpilledPayload; use crate::sessions::QueryContext; +use crate::spillers::SpillWriter; +use crate::spillers::Spiller; +use crate::spillers::SpillerConfig; +use crate::spillers::SpillerType; + #[allow(clippy::enum_variant_names)] enum HashTable { MovedOut, @@ -61,6 +75,10 @@ pub struct TransformPartialAggregate { processed_bytes: usize, processed_rows: usize, settings: MemorySettings, + configure_peer_nodes: Vec, + spilling_state: Option, + spiller: Spiller, + output_blocks: Vec, } impl TransformPartialAggregate { @@ -68,8 +86,10 @@ impl TransformPartialAggregate { ctx: Arc, input: Arc, output: Arc, + operator: Operator, params: Arc, config: HashTableConfig, + location_prefix: String, ) -> Result> { let hash_table = { let arena = Arc::new(Bump::new()); @@ -92,11 +112,21 @@ impl TransformPartialAggregate { } }; + let config = SpillerConfig { + spiller_type: 
SpillerType::Aggregation, + location_prefix, + disk_spill: None, + use_parquet: ctx.get_settings().get_spilling_file_format()?.is_parquet(), + }; + + let spiller = Spiller::create(ctx.clone(), operator, config.clone())?; + Ok(AccumulatingTransformer::create( input, output, TransformPartialAggregate { params, + spiller, hash_table, probe_state: ProbeState::default(), settings: MemorySettings::from_aggregate_settings(&ctx)?, @@ -104,6 +134,9 @@ impl TransformPartialAggregate { first_block_start: None, processed_bytes: 0, processed_rows: 0, + configure_peer_nodes: vec![GlobalConfig::instance().query.node_id.clone()], + spilling_state: None, + output_blocks: vec![], }, )) } @@ -182,48 +215,39 @@ impl TransformPartialAggregate { } } } + + fn reset_hashtable(&mut self) { + let hashtable_spilling_state = self.spilling_state.as_mut().unwrap(); + + hashtable_spilling_state + .ht + .config + .update_current_max_radix_bits(); + + let config = hashtable_spilling_state + .ht + .config + .clone() + .with_initial_radix_bits(hashtable_spilling_state.ht.config.max_radix_bits); + + let aggrs = hashtable_spilling_state.ht.payload.aggrs.clone(); + let group_types = hashtable_spilling_state.ht.payload.group_types.clone(); + self.spilling_state = None; + self.hash_table = HashTable::AggregateHashTable(AggregateHashTable::new( + group_types, + aggrs, + config, + Arc::new(Bump::new()), + )); + } } +#[async_trait::async_trait] impl AccumulatingTransform for TransformPartialAggregate { const NAME: &'static str = "TransformPartialAggregate"; fn transform(&mut self, block: DataBlock) -> Result> { self.execute_one_block(block)?; - - if self.settings.check_spill() { - if let HashTable::AggregateHashTable(v) = std::mem::take(&mut self.hash_table) { - let group_types = v.payload.group_types.clone(); - let aggrs = v.payload.aggrs.clone(); - v.config.update_current_max_radix_bits(); - let config = v - .config - .clone() - .with_initial_radix_bits(v.config.max_radix_bits); - - let mut state = PayloadFlushState::default(); - - // repartition to max for normalization - let partitioned_payload = v - .payload - .repartition(1 << config.max_radix_bits, &mut state); - - let blocks = vec![DataBlock::empty_with_meta( - AggregateMeta::create_agg_spilling(partitioned_payload), - )]; - - let arena = Arc::new(Bump::new()); - self.hash_table = HashTable::AggregateHashTable(AggregateHashTable::new( - group_types, - aggrs, - config, - arena, - )); - return Ok(blocks); - } - - unreachable!() - } - Ok(vec![]) } @@ -235,7 +259,6 @@ impl AccumulatingTransform for TransformPartialAggregate { }, HashTable::AggregateHashTable(hashtable) => { let partition_count = hashtable.payload.partition_count(); - let mut blocks = Vec::with_capacity(partition_count); log::info!( "Aggregated {} to {} rows in {} sec(real: {}). 
({} rows/sec, {}/sec, {})", @@ -256,20 +279,336 @@ impl AccumulatingTransform for TransformPartialAggregate { convert_byte_size(self.processed_bytes as f64), ); - for (bucket, payload) in hashtable.payload.payloads.into_iter().enumerate() { - if payload.len() != 0 { - blocks.push(DataBlock::empty_with_meta( + if hashtable.len() != 0 { + for (partition, payload) in hashtable.payload.payloads.into_iter().enumerate() { + self.output_blocks.push(DataBlock::empty_with_meta( AggregateMeta::create_agg_payload( - bucket as isize, payload, + partition as isize, + partition_count, partition_count, ), )); } } - blocks + std::mem::take(&mut self.output_blocks) } }) } + + fn configure_peer_nodes(&mut self, nodes: &[String]) { + self.configure_peer_nodes = nodes.to_vec(); + } + + fn need_spill(&self) -> bool { + self.settings.check_spill() + } + + fn prepare_spill_payload(&mut self) -> Result { + if self.spilling_state.is_none() { + let HashTable::AggregateHashTable(ht) = std::mem::take(&mut self.hash_table) else { + return Ok(false); + }; + + if ht.len() == 0 { + self.hash_table = HashTable::AggregateHashTable(ht); + return Ok(false); + } + + let max_bucket = self.configure_peer_nodes.len(); + self.spilling_state = Some(HashtableSpillingState::create(ht, max_bucket)); + } + + if let Some(spilling_state) = self.spilling_state.as_mut() { + spilling_state.last_prepare_payload = spilling_state.serialize_partition_payload()?; + return Ok(true); + } + + Ok(false) + } + + async fn flush_spill_payload(&mut self) -> Result { + let spilling_state = self.spilling_state.as_mut().unwrap(); + + let max_bucket = spilling_state.max_bucket; + let max_partition = 1 << spilling_state.ht.config.max_radix_bits; + + if !spilling_state.data_payload.is_empty() { + if spilling_state.writer.is_none() { + let location = self.spiller.create_unique_location(); + spilling_state.writer = Some(self.spiller.create_aggregate_writer(location).await?); + } + + let writer = spilling_state.writer.as_mut().unwrap(); + + let mut flush_data = Vec::with_capacity(4 * 1024 * 1024); + std::mem::swap(&mut flush_data, &mut spilling_state.data_payload); + writer.write(flush_data).await?; + } + + if spilling_state.last_prepare_payload { + if let Some(writer) = spilling_state.writer.as_mut() { + let last_offset = spilling_state.last_flush_partition_offset; + if writer.write_bytes() > last_offset { + let spilled_payload = SpilledPayload { + partition: spilling_state.working_partition as isize, + location: writer.location(), + data_range: last_offset as u64..writer.write_bytes() as u64, + destination_node: self.configure_peer_nodes[spilling_state.working_bucket] + .clone(), + max_partition, + global_max_partition: max_partition, + }; + + self.output_blocks.push(DataBlock::empty_with_meta( + AggregateMeta::create_spilled_payload(spilled_payload), + )); + + spilling_state.last_flush_partition_offset = writer.write_bytes(); + } + } + + spilling_state.payload_idx = 0; + spilling_state.working_partition += 1; + if spilling_state.working_partition < max_partition { + return Ok(true); + } + + if let Some(writer) = spilling_state.writer.as_mut() { + writer.complete().await?; + spilling_state.writer = None; + spilling_state.last_flush_partition_offset = 0; + } + + spilling_state.payload_idx = 0; + spilling_state.working_bucket += 1; + spilling_state.working_partition = 0; + + if spilling_state.working_bucket < max_bucket { + return Ok(true); + } + + spilling_state.finished = true; + self.reset_hashtable(); + + return Ok(false); + } + + Ok(true) + } +} + +pub 
struct HashtableSpillingState { + ht: AggregateHashTable, + payload_idx: usize, + working_partition: usize, + partition_flush_state: PayloadFlushState, + + max_bucket: usize, + working_bucket: usize, + bucket_flush_state: PayloadFlushState, + + serialize_flush_state: PayloadFlushState, + + data_payload: Vec, + + finished: bool, + last_prepare_payload: bool, + writer: Option, + + last_flush_partition_offset: usize, +} + +impl HashtableSpillingState { + pub fn create(ht: AggregateHashTable, scatter_max_bucket: usize) -> Self { + HashtableSpillingState { + ht, + payload_idx: 0, + working_partition: 0, + partition_flush_state: PayloadFlushState::default(), + max_bucket: scatter_max_bucket, + working_bucket: 0, + bucket_flush_state: PayloadFlushState::default(), + serialize_flush_state: PayloadFlushState::default(), + data_payload: Vec::with_capacity(6 * 1024 * 1024), + writer: None, + finished: false, + last_prepare_payload: false, + last_flush_partition_offset: 0, + } + } + pub fn serialize_payload(&mut self, payload: Option) -> Result { + let payload = match payload.as_ref() { + Some(payload) => payload, + None => &self.ht.payload.payloads[self.working_partition], + }; + + if payload.len() == 0 { + return Ok(true); + } + + while let Some(data_block) = payload.aggregate_flush(&mut self.serialize_flush_state)? { + if data_block.num_rows() == 0 { + // next batch rows + continue; + } + + let columns = data_block.columns().to_vec(); + for column in columns.into_iter() { + let column = column.into_column(data_block.num_rows()); + + let offset = self.data_payload.len(); + + self.data_payload.write_u64::(0)?; + write_column(&column, &mut self.data_payload)?; + + // rewrite column length + let len = self.data_payload.len(); + let mut buffer = &mut self.data_payload[offset..]; + buffer.write_u64::((len - offset - size_of::()) as u64)?; + } + + if self.data_payload.len() >= 4 * 1024 * 1024 { + // flush data if >= 4MB + return Ok(false); + } + } + + self.serialize_flush_state.clear(); + Ok(true) + } + + pub fn serialize_scatter_payload(&mut self, raw_payload: Option) -> Result { + // If no need scatter + if self.max_bucket <= 1 { + return self.serialize_payload(raw_payload); + } + + // using if-else to avoid mutable borrow occurs here + if let Some(payload) = raw_payload { + while payload.scatter(&mut self.bucket_flush_state, self.max_bucket) { + let working_bucket = self.working_bucket; + let flush_state = &mut self.bucket_flush_state; + + let rows = flush_state.probe_state.partition_count[working_bucket]; + + if rows == 0 { + // next batch rows + continue; + } + + let sel = &flush_state.probe_state.partition_entries[working_bucket]; + + let mut scattered_payload = Payload::new( + payload.arena.clone(), + payload.group_types.clone(), + payload.aggrs.clone(), + payload.states_layout.clone(), + ); + + scattered_payload.state_move_out = true; + scattered_payload.copy_rows(sel, rows, &flush_state.addresses); + + if !self.serialize_payload(Some(scattered_payload))? 
{ + return Ok(false); + } + } + } else { + while self.ht.payload.payloads[self.working_partition] + .scatter(&mut self.bucket_flush_state, self.max_bucket) + { + let working_bucket = self.working_bucket; + let flush_state = &mut self.bucket_flush_state; + let rows = flush_state.probe_state.partition_count[working_bucket]; + + if rows == 0 { + // next batch rows + continue; + } + + let sel = &flush_state.probe_state.partition_entries[working_bucket]; + + let working_payload = &self.ht.payload.payloads[self.working_partition]; + let mut scattered_payload = Payload::new( + working_payload.arena.clone(), + working_payload.group_types.clone(), + working_payload.aggrs.clone(), + working_payload.states_layout.clone(), + ); + + scattered_payload.state_move_out = true; + scattered_payload.copy_rows(sel, rows, &flush_state.addresses); + + if !self.serialize_payload(Some(scattered_payload))? { + return Ok(false); + } + } + } + + self.bucket_flush_state.clear(); + Ok(true) + } + + pub fn serialize_partition_payload(&mut self) -> Result { + let max_partitions = 1 << self.ht.config.max_radix_bits; + + // If no need repartition + if self.ht.payload.partition_count() == max_partitions { + return self.serialize_scatter_payload(None); + } + + let mut partition_payload = PartitionedPayload::new( + self.ht.payload.group_types.clone(), + self.ht.payload.aggrs.clone(), + max_partitions as u64, + self.ht.payload.arenas.clone(), + ); + + for payload in &mut partition_payload.payloads { + payload.state_move_out = true; + } + + // repartition and get current partition payload + for idx in self.payload_idx..self.ht.payload.payloads.len() { + while partition_payload.gather_flush( + &self.ht.payload.payloads[idx], + &mut self.partition_flush_state, + ) { + let working_partition = self.working_partition; + let flush_state = &mut self.partition_flush_state; + + let rows = flush_state.probe_state.partition_count[working_partition]; + + if rows == 0 { + // next batch rows + continue; + } + + let address = &flush_state.addresses; + let selector = &flush_state.probe_state.partition_entries[working_partition]; + + let working_payload = &self.ht.payload.payloads[idx]; + let mut working_partition_payload = Payload::new( + working_payload.arena.clone(), + working_payload.group_types.clone(), + working_payload.aggrs.clone(), + working_payload.states_layout.clone(), + ); + + working_partition_payload.state_move_out = true; + working_partition_payload.copy_rows(selector, rows, address); + + if !self.serialize_scatter_payload(Some(working_partition_payload))? { + return Ok(false); + } + } + + self.payload_idx += 1; + self.partition_flush_state.clear(); + } + + self.partition_flush_state.clear(); + Ok(true) + } } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_align.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_align.rs new file mode 100644 index 0000000000000..eb95601641217 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_align.rs @@ -0,0 +1,405 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::collections::BTreeMap; +use std::collections::VecDeque; +use std::sync::Arc; + +use bumpalo::Bump; +use databend_common_exception::ErrorCode; +use databend_common_exception::Result; +use databend_common_expression::AggregateHashTable; +use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::DataBlock; +use databend_common_expression::HashTableConfig; +use databend_common_expression::InputColumns; +use databend_common_expression::PartitionedPayload; +use databend_common_expression::Payload; +use databend_common_expression::PayloadFlushState; +use databend_common_expression::ProbeState; +use databend_common_pipeline_core::processors::Event; +use databend_common_pipeline_core::processors::InputPort; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::processors::Processor; +use databend_common_pipeline_transforms::MemorySettings; + +use crate::pipelines::memory_settings::MemorySettingsExt; +use crate::pipelines::processors::transforms::aggregator::transform_partition_bucket::SINGLE_LEVEL_BUCKET_NUM; +use crate::pipelines::processors::transforms::aggregator::AggregateMeta; +use crate::pipelines::processors::transforms::aggregator::AggregatePayload; +use crate::pipelines::processors::transforms::aggregator::AggregatorParams; +use crate::sessions::QueryContext; + +pub struct TransformPartitionAlign { + input: Arc, + output: Arc, + + #[allow(dead_code)] + settings: MemorySettings, + params: Arc, + + max_partition: usize, + working_partition: isize, + partitions: Partitions, + + output_data: VecDeque, + input_data: Option<(AggregateMeta, DataBlock)>, +} + +impl TransformPartitionAlign { + pub fn create( + ctx: Arc, + params: Arc, + input: Arc, + output: Arc, + ) -> Result { + let settings = MemorySettings::from_aggregate_settings(&ctx)?; + Ok(TransformPartitionAlign { + input, + output, + params, + settings, + max_partition: 0, + working_partition: 0, + partitions: Partitions::create(), + input_data: None, + output_data: Default::default(), + }) + } + + fn ready_partition(&mut self) -> Option { + let storage_min_partition = self.partitions.min_partition()?; + + if storage_min_partition >= self.working_partition { + return None; + } + + Some(storage_min_partition) + } + + fn fetch_ready_partition(&mut self) -> Result<()> { + if let Some(ready_partition_id) = self.ready_partition() { + let ready_partition = self.partitions.take_partition(ready_partition_id); + + for (meta, data_block) in ready_partition { + self.output_data + .push_back(data_block.add_meta(Some(Box::new(meta)))?); + } + + self.output_data + .push_back(DataBlock::empty_with_meta(AggregateMeta::create_final( + vec![], + ))); + } + + Ok(()) + } + + fn unpark_block(&self, mut data_block: DataBlock) -> Result<(AggregateMeta, DataBlock)> { + let Some(meta) = data_block.take_meta() else { + return Err(ErrorCode::Internal( + "Internal, TransformPartitionBucket only recv DataBlock with meta.", + )); + }; + + let Some(meta) = AggregateMeta::downcast_from(meta) else { + return Err(ErrorCode::Internal( + "Internal, 
TransformPartitionBucket only recv AggregateMeta".to_string(), + )); + }; + + Ok((meta, data_block)) + } + + fn repartition(&mut self, meta: AggregateMeta, data_block: DataBlock) -> Result<()> { + match meta { + AggregateMeta::FinalPartition(_) => unreachable!(), + AggregateMeta::SpilledPayload(_payload) => unreachable!(), + AggregateMeta::InFlightPayload(payload) => { + if data_block.is_empty() { + return Ok(()); + } + + let payload = AggregatePayload { + partition: payload.partition, + max_partition: payload.max_partition, + payload: self.deserialize_flight(data_block)?, + global_max_partition: payload.global_max_partition, + }; + + let repartition = payload.global_max_partition; + let partitioned = self.partition_payload(payload, repartition); + + for payload in partitioned { + self.partitions + .add_data(AggregateMeta::AggregatePayload(payload), DataBlock::empty()); + } + } + AggregateMeta::AggregatePayload(payload) => { + if payload.payload.len() == 0 { + return Ok(()); + } + + let repartition = payload.global_max_partition; + let partitioned = self.partition_payload(payload, repartition); + for payload in partitioned { + self.partitions + .add_data(AggregateMeta::AggregatePayload(payload), DataBlock::empty()); + } + } + } + + Ok(()) + } + + fn deserialize_flight(&mut self, data: DataBlock) -> Result { + let rows_num = data.num_rows(); + let group_len = self.params.group_data_types.len(); + + let mut state = ProbeState::default(); + + // create single partition hash table for deserialize + let capacity = AggregateHashTable::get_capacity_for_count(rows_num); + let config = HashTableConfig::default().with_initial_radix_bits(0); + let mut hashtable = AggregateHashTable::new_directly( + self.params.group_data_types.clone(), + self.params.aggregate_functions.clone(), + config, + capacity, + Arc::new(Bump::new()), + false, + ); + + let num_states = self.params.num_states(); + let states_index: Vec = (0..num_states).collect(); + let agg_states = InputColumns::new_block_proxy(&states_index, &data); + + let group_index: Vec = (num_states..(num_states + group_len)).collect(); + let group_columns = InputColumns::new_block_proxy(&group_index, &data); + + let _ = hashtable.add_groups( + &mut state, + group_columns, + &[(&[]).into()], + agg_states, + rows_num, + )?; + + hashtable.payload.mark_min_cardinality(); + assert_eq!(hashtable.payload.payloads.len(), 1); + Ok(hashtable.payload.payloads.pop().unwrap()) + } + + fn partition_payload(&mut self, from: AggregatePayload, to: usize) -> Vec { + let mut partitioned = Vec::with_capacity(to); + let mut partitioned_payload = PartitionedPayload::new( + self.params.group_data_types.clone(), + self.params.aggregate_functions.clone(), + to as u64, + from.payload.arena.clone(), + ); + + let mut flush_state = PayloadFlushState::default(); + partitioned_payload.combine_single(from.payload, &mut flush_state, None); + + for (partition, payload) in partitioned_payload.payloads.into_iter().enumerate() { + partitioned.push(AggregatePayload { + payload, + partition: partition as isize, + max_partition: to, + global_max_partition: from.global_max_partition, + }); + } + + partitioned + } +} + +impl Processor for TransformPartitionAlign { + fn name(&self) -> String { + String::from("TransformPartitionAlign") + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if self.output.is_finished() { + self.input.finish(); + return Ok(Event::Finished); + } + + if !self.output.can_push() { + self.input.set_not_need_data(); + return 
Ok(Event::NeedConsume); + } + + if let Some(data_block) = self.output_data.pop_front() { + self.output.push_data(Ok(data_block)); + return Ok(Event::NeedConsume); + } + + if self.input.has_data() { + let data_block = self.input.pull_data().unwrap()?; + + let (meta, data_block) = self.unpark_block(data_block)?; + self.max_partition = meta.get_global_max_partition(); + + // need repartition + if meta.get_max_partition() != meta.get_global_max_partition() { + self.input_data = Some((meta, data_block)); + return Ok(Event::Sync); + } + + let partition = meta.get_sorting_partition(); + self.partitions.add_data(meta, data_block); + + if partition > SINGLE_LEVEL_BUCKET_NUM && partition != self.working_partition { + self.working_partition = partition; + } + } + + if self.input.is_finished() && self.working_partition as usize != self.max_partition { + self.working_partition = self.max_partition as isize; + } + + if self.output_data.is_empty() { + self.fetch_ready_partition()?; + } + + if let Some(data_block) = self.output_data.pop_front() { + self.output.push_data(Ok(data_block)); + return Ok(Event::NeedConsume); + } + + if self.input.is_finished() { + self.output.finish(); + return Ok(Event::Finished); + } + + self.input.set_need_data(); + Ok(Event::NeedData) + } + + fn process(&mut self) -> Result<()> { + if let Some((meta, data_block)) = self.input_data.take() { + self.repartition(meta, data_block)?; + } + + Ok(()) + } +} + +// #[async_trait::async_trait] +// impl AccumulatingTransform for TransformPartitionAlign { +// const NAME: &'static str = "TransformPartitionAlign"; +// +// fn transform(&mut self, data_block: DataBlock) -> Result> { +// let (meta, data_block) = self.unpark_block(data_block)?; +// self.max_partition = meta.get_global_max_partition(); +// +// // need repartition +// if meta.get_max_partition() != meta.get_global_max_partition() { +// self.repartition(meta, data_block)?; +// return Ok(vec![]); +// } +// +// let partition = meta.get_sorting_partition(); +// self.partitions.add_data(meta, data_block); +// +// if partition > SINGLE_LEVEL_BUCKET_NUM && partition != self.working_partition { +// self.fetch_ready_partition()?; +// self.working_partition = partition; +// // return Ok(ready_partition); +// } +// +// Ok(vec![]) +// } +// +// fn on_finish(&mut self, _output: bool) -> Result> { +// let remain_size = self +// .partitions +// .data +// .values() +// .map(|x| x.len()) +// .sum::(); +// +// let mut remain_partitions = Vec::with_capacity(remain_size + self.partitions.data.len()); +// self.working_partition = self.max_partition as isize; +// +// loop { +// let ready_partition = self.fetch_ready_partition()?; +// +// if !ready_partition.is_empty() { +// remain_partitions.extend(ready_partition); +// continue; +// } +// +// return Ok(remain_partitions); +// } +// } +// +// fn need_spill(&self) -> bool { +// self.settings.check_spill() +// } +// +// fn prepare_spill_payload(&mut self) -> Result { +// // self.partitions.data.f +// Ok(false) +// } +// +// async fn flush_spill_payload(&mut self) -> Result { +// Ok(false) +// } +// } + +#[derive(Debug)] +struct Partitions { + data: BTreeMap>, +} + +impl Partitions { + pub fn create() -> Partitions { + Partitions { + data: BTreeMap::new(), + } + } + + pub fn add_data(&mut self, meta: AggregateMeta, block: DataBlock) { + if matches!(&meta, AggregateMeta::AggregatePayload(v) if v.payload.len() == 0) + || matches!(&meta, AggregateMeta::InFlightPayload(_) if block.is_empty()) + { + return; + } + + match 
self.data.entry(meta.get_partition()) { + std::collections::btree_map::Entry::Vacant(v) => { + v.insert(vec![(meta, block)]); + } + std::collections::btree_map::Entry::Occupied(mut v) => { + v.get_mut().push((meta, block)); + } + }; + } + + pub fn min_partition(&self) -> Option { + self.data.keys().min().cloned() + } + + pub fn take_partition(&mut self, partition: isize) -> Vec<(AggregateMeta, DataBlock)> { + self.data.remove(&partition).unwrap_or_default() + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs new file mode 100644 index 0000000000000..142c20e452acd --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs @@ -0,0 +1,87 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use databend_common_catalog::table_context::TableContext; +use databend_common_exception::Result; +use databend_common_pipeline_core::processors::ProcessorPtr; +use databend_common_pipeline_core::Pipe; +use databend_common_pipeline_core::PipeItem; +use databend_common_pipeline_core::Pipeline; +use databend_common_storage::DataOperator; + +use super::TransformFinalAggregate; +use super::TransformPartitionRestore; +use crate::pipelines::processors::transforms::aggregator::transform_partition_align::TransformPartitionAlign; +use crate::pipelines::processors::transforms::aggregator::transform_partition_dispatch::TransformPartitionDispatch; +use crate::pipelines::processors::transforms::aggregator::transform_partition_exchange::ExchangePartition; +use crate::pipelines::processors::transforms::aggregator::transform_partition_resorting::ResortingPartition; +use crate::pipelines::processors::transforms::aggregator::AggregatorParams; +use crate::sessions::QueryContext; + +pub static SINGLE_LEVEL_BUCKET_NUM: isize = -1; + +pub fn build_final_aggregate( + ctx: Arc, + pipeline: &mut Pipeline, + params: Arc, +) -> Result<()> { + let settings = ctx.get_settings(); + let pipe_size = settings.get_max_threads()? as usize; + + // 1. resorting partition + pipeline.exchange(1, Arc::new(ResortingPartition::create()))?; + + // 2. align partitions + pipeline.add_transform(|input, output| { + Ok(ProcessorPtr::create(Box::new( + TransformPartitionAlign::create(ctx.clone(), params.clone(), input, output)?, + ))) + })?; + + // 3. dispatch partition + let processor = TransformPartitionDispatch::create(pipe_size); + let inputs_port = processor.get_inputs(); + let outputs_port = processor.get_outputs(); + pipeline.add_pipe(Pipe::create(inputs_port.len(), outputs_port.len(), vec![ + PipeItem::create( + ProcessorPtr::create(Box::new(processor)), + inputs_port, + outputs_port, + ), + ])); + + // 4. 
restore partition + let operator = DataOperator::instance().spill_operator(); + pipeline.add_transform(|input, output| { + TransformPartitionRestore::create(input, output, operator.clone(), params.clone()) + })?; + + // 5. exchange local + let pipe_size = pipeline.output_len(); + pipeline.exchange( + pipe_size, + ExchangePartition::create(pipe_size, params.clone()), + )?; + + // 6. final aggregate + pipeline.add_transform(|input, output| { + Ok(ProcessorPtr::create(TransformFinalAggregate::try_create( + input.clone(), + output.clone(), + params.clone(), + )?)) + }) +} diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_dispatch.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_dispatch.rs new file mode 100644 index 0000000000000..bfe7e87258e75 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_dispatch.rs @@ -0,0 +1,263 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::collections::VecDeque; +use std::sync::Arc; + +use databend_common_exception::ErrorCode; +use databend_common_exception::Result; +use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::DataBlock; +use databend_common_pipeline_core::processors::Event; +use databend_common_pipeline_core::processors::EventCause; +use databend_common_pipeline_core::processors::InputPort; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::processors::Processor; + +use crate::pipelines::processors::transforms::aggregator::AggregateMeta; + +#[derive(PartialEq)] +enum PortStatus { + Idle, + NeedData, + Finished, +} + +struct PortWithStatus { + pub status: PortStatus, + pub port: Arc, +} + +pub struct TransformPartitionDispatch { + initialized: bool, + + finished_outputs: usize, + waiting_outputs: VecDeque, + waiting_outputs_2: VecDeque, + + sync_final_partition: bool, + sent_final_partition: Vec, + synchronized_final_partition: Vec, + + current_data: Option, + + input: Arc, + outputs: Vec>, +} + +impl TransformPartitionDispatch { + pub fn create(outputs: usize) -> TransformPartitionDispatch { + let mut outputs_port = Vec::with_capacity(outputs); + + for _index in 0..outputs { + outputs_port.push(PortWithStatus { + status: PortStatus::Idle, + port: OutputPort::create(), + }); + } + + TransformPartitionDispatch { + initialized: false, + finished_outputs: 0, + outputs: outputs_port, + input: InputPort::create(), + waiting_outputs: VecDeque::with_capacity(outputs), + waiting_outputs_2: VecDeque::with_capacity(outputs), + current_data: None, + sync_final_partition: false, + sent_final_partition: vec![false; outputs], + synchronized_final_partition: vec![false; outputs], + } + } + + pub fn get_inputs(&self) -> Vec> { + vec![self.input.clone()] + } + + pub fn get_outputs(&self) -> Vec> { + self.outputs.iter().map(|x| x.port.clone()).collect() + } + + fn unpark_block(mut 
data_block: DataBlock) -> Result<(AggregateMeta, DataBlock)> { + let Some(meta) = data_block.take_meta() else { + return Err(ErrorCode::Internal( + "Internal, TransformPartitionBucket only recv DataBlock with meta.", + )); + }; + + let Some(meta) = AggregateMeta::downcast_from(meta) else { + return Err(ErrorCode::Internal( + "Internal, TransformPartitionBucket only recv AggregateMeta".to_string(), + )); + }; + + Ok((meta, data_block)) + } +} + +impl Processor for TransformPartitionDispatch { + fn name(&self) -> String { + String::from("TransformPartitionDispatch") + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + #[allow(clippy::collapsible_if)] + fn event_with_cause(&mut self, cause: EventCause) -> Result { + if let EventCause::Output(output_index) = &cause { + let output = &mut self.outputs[*output_index]; + + if output.port.is_finished() { + if output.status != PortStatus::Finished { + self.finished_outputs += 1; + output.status = PortStatus::Finished; + } + } else if output.port.can_push() { + if self.sync_final_partition { + if self.sent_final_partition[*output_index] { + output.status = PortStatus::Idle; + self.waiting_outputs_2.push_back(*output_index); + self.synchronized_final_partition[*output_index] = true; + } else { + self.sent_final_partition[*output_index] = true; + output.port.push_data(Ok(DataBlock::empty_with_meta( + AggregateMeta::create_final(vec![]), + ))); + } + } else if output.status != PortStatus::NeedData { + output.status = PortStatus::NeedData; + self.waiting_outputs.push_back(*output_index); + } + } + } + + if !self.initialized && !self.waiting_outputs.is_empty() { + self.initialized = true; + self.input.set_need_data(); + } + + if self.finished_outputs == self.outputs.len() { + self.input.finish(); + return Ok(Event::Finished); + } + + if !self.sync_final_partition && self.input.has_data() && self.current_data.is_none() { + let data_block = self.input.pull_data().unwrap()?; + let (meta, data_block) = Self::unpark_block(data_block)?; + + match meta { + AggregateMeta::FinalPartition(_) => { + self.sync_final_partition = true; + self.input.set_not_need_data(); + } + meta => { + self.input.set_need_data(); + self.current_data = Some(data_block.add_meta(Some(Box::new(meta)))?); + } + }; + } + + while self.sync_final_partition { + while let Some(output_index) = self.waiting_outputs.pop_front() { + if self.outputs[output_index].port.is_finished() { + self.synchronized_final_partition[output_index] = true; + + if self.outputs[output_index].status != PortStatus::Finished { + self.finished_outputs += 1; + self.outputs[output_index].status = PortStatus::Finished; + } + } + + self.outputs[output_index] + .port + .push_data(Ok(DataBlock::empty_with_meta(AggregateMeta::create_final( + vec![], + )))); + self.sent_final_partition[output_index] = true; + self.outputs[output_index].status = PortStatus::Idle; + } + + for (idx, synchronized) in self.synchronized_final_partition.iter().enumerate() { + if !synchronized && !self.outputs[idx].port.is_finished() { + return Ok(Event::NeedConsume); + } + } + + self.sync_final_partition = false; + self.sent_final_partition = vec![false; self.sent_final_partition.len()]; + self.synchronized_final_partition = vec![false; self.sent_final_partition.len()]; + std::mem::swap(&mut self.waiting_outputs, &mut self.waiting_outputs_2); + + if self.input.has_data() { + let data_block = self.input.pull_data().unwrap()?; + let (meta, data_block) = Self::unpark_block(data_block)?; + + match meta { + AggregateMeta::FinalPartition(_) => { + 
self.sync_final_partition = true; + self.input.set_not_need_data(); + continue; + } + meta => { + self.current_data = Some(data_block.add_meta(Some(Box::new(meta)))?); + } + }; + } + + self.input.set_need_data(); + break; + } + + while !self.waiting_outputs.is_empty() && self.current_data.is_some() { + let output_index = self.waiting_outputs.pop_front().unwrap(); + + // Port is finished when waiting. + if self.outputs[output_index].port.is_finished() { + if self.outputs[output_index].status != PortStatus::Finished { + self.finished_outputs += 1; + self.outputs[output_index].status = PortStatus::Finished; + } + + continue; + } + + if let Some(data_block) = self.current_data.take() { + self.outputs[output_index].port.push_data(Ok(data_block)); + self.outputs[output_index].status = PortStatus::Idle; + self.input.set_need_data(); + } + } + + if self.finished_outputs == self.outputs.len() { + self.input.finish(); + return Ok(Event::Finished); + } + + if self.input.is_finished() && self.current_data.is_none() { + for output in &self.outputs { + output.port.finish(); + } + + return Ok(Event::Finished); + } + + match self.waiting_outputs.is_empty() { + true => Ok(Event::NeedConsume), + false => Ok(Event::NeedData), + } + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_exchange.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_exchange.rs new file mode 100644 index 0000000000000..67a500714be7e --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_exchange.rs @@ -0,0 +1,241 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
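+
+// Local exchange for aggregate partitions: `partition` scatters each incoming
+// payload into `n` buckets by hashing group keys with a fixed seed, and
+// `merge_output` combines buckets that carry the same partition id.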
+ +use std::collections::btree_map::Entry; +use std::collections::BTreeMap; +use std::sync::Arc; + +use bumpalo::Bump; +use databend_common_exception::ErrorCode; +use databend_common_exception::Result; +use databend_common_expression::AggregateHashTable; +use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::DataBlock; +use databend_common_expression::HashTableConfig; +use databend_common_expression::InputColumns; +use databend_common_expression::Payload; +use databend_common_expression::PayloadFlushState; +use databend_common_expression::ProbeState; +use databend_common_pipeline_core::processors::Exchange; + +use crate::pipelines::processors::transforms::aggregator::AggregateMeta; +use crate::pipelines::processors::transforms::aggregator::AggregatePayload; +use crate::pipelines::processors::transforms::aggregator::AggregatorParams; +use crate::pipelines::processors::transforms::aggregator::InFlightPayload; + +const HASH_SEED: u64 = 9263883436177860930; + +pub struct ExchangePartition { + merge_window_size: usize, + params: Arc, +} + +impl ExchangePartition { + pub fn create(merge_window_size: usize, params: Arc) -> Arc { + Arc::new(ExchangePartition { + merge_window_size, + params, + }) + } +} + +impl ExchangePartition { + fn partition_aggregate(mut payload: AggregatePayload, n: usize) -> Result> { + if payload.payload.len() == 0 { + return Ok(vec![]); + } + + let mut repartition_payloads = Vec::with_capacity(n); + let group_types = payload.payload.group_types.clone(); + let aggrs = payload.payload.aggrs.clone(); + let mut state = PayloadFlushState::default(); + + for _ in 0..repartition_payloads.capacity() { + repartition_payloads.push(Payload::new( + payload.payload.arena.clone(), + group_types.clone(), + aggrs.clone(), + payload.payload.states_layout.clone(), + )); + } + + // scatter each page of the payload. + while payload + .payload + .scatter_with_seed::(&mut state, repartition_payloads.len()) + { + // copy to the corresponding bucket. 
+ for (idx, bucket) in repartition_payloads.iter_mut().enumerate() { + let count = state.probe_state.partition_count[idx]; + + if count > 0 { + let sel = &state.probe_state.partition_entries[idx]; + bucket.copy_rows(sel, count, &state.addresses); + } + } + } + + payload.payload.state_move_out = true; + + let mut partitions = Vec::with_capacity(repartition_payloads.len()); + + for repartition_payload in repartition_payloads { + partitions.push(DataBlock::empty_with_meta( + AggregateMeta::create_agg_payload( + repartition_payload, + payload.partition, + payload.max_partition, + payload.global_max_partition, + ), + )); + } + + Ok(partitions) + } + + fn partition_flight_payload( + &self, + payload: InFlightPayload, + block: DataBlock, + n: usize, + ) -> Result> { + let rows_num = block.num_rows(); + + if rows_num == 0 { + return Ok(vec![]); + } + + let group_len = self.params.group_data_types.len(); + + let mut state = ProbeState::default(); + + // create single partition hash table for deserialize + let capacity = AggregateHashTable::get_capacity_for_count(rows_num); + let config = HashTableConfig::default().with_initial_radix_bits(0); + let mut hashtable = AggregateHashTable::new_directly( + self.params.group_data_types.clone(), + self.params.aggregate_functions.clone(), + config, + capacity, + Arc::new(Bump::new()), + false, + ); + + let num_states = self.params.num_states(); + let states_index: Vec = (0..num_states).collect(); + let agg_states = InputColumns::new_block_proxy(&states_index, &block); + + let group_index: Vec = (num_states..(num_states + group_len)).collect(); + let group_columns = InputColumns::new_block_proxy(&group_index, &block); + + let _ = hashtable.add_groups( + &mut state, + group_columns, + &[(&[]).into()], + agg_states, + rows_num, + )?; + + hashtable.payload.mark_min_cardinality(); + assert_eq!(hashtable.payload.payloads.len(), 1); + + Self::partition_aggregate( + AggregatePayload { + partition: payload.partition, + payload: hashtable.payload.payloads.pop().unwrap(), + max_partition: payload.max_partition, + global_max_partition: payload.global_max_partition, + }, + n, + ) + } +} + +impl Exchange for ExchangePartition { + const NAME: &'static str = "AggregatePartitionExchange"; + const MULTIWAY_SORT: bool = false; + + fn partition(&self, mut data_block: DataBlock, n: usize) -> Result> { + let Some(meta) = data_block.take_meta() else { + return Err(ErrorCode::Internal( + "AggregatePartitionExchange only recv AggregateMeta", + )); + }; + + let Some(meta) = AggregateMeta::downcast_from(meta) else { + return Err(ErrorCode::Internal( + "AggregatePartitionExchange only recv AggregateMeta", + )); + }; + + match meta { + // already restore in upstream + AggregateMeta::SpilledPayload(_) => unreachable!(), + AggregateMeta::FinalPartition(_) => Ok(vec![]), + AggregateMeta::AggregatePayload(payload) => Self::partition_aggregate(payload, n), + AggregateMeta::InFlightPayload(payload) => { + self.partition_flight_payload(payload, data_block, n) + } + } + } + + fn output_window_size(&self) -> usize { + self.merge_window_size + } + + fn merge_output(&self, data_blocks: Vec) -> Result> { + let mut blocks = BTreeMap::::new(); + for mut data_block in data_blocks { + let Some(meta) = data_block.take_meta() else { + return Err(ErrorCode::Internal( + "Internal, ExchangePartition only recv DataBlock with meta.", + )); + }; + + let Some(aggregate_meta) = AggregateMeta::downcast_from(meta) else { + return Err(ErrorCode::Internal( + "Internal, ExchangePartition only recv DataBlock with 
meta.", + )); + }; + + let mut payload = match aggregate_meta { + AggregateMeta::SpilledPayload(_) => unreachable!(), + AggregateMeta::FinalPartition(_) => unreachable!(), + AggregateMeta::InFlightPayload(_) => unreachable!(), + AggregateMeta::AggregatePayload(payload) => payload, + }; + + match blocks.entry(payload.partition) { + Entry::Vacant(v) => { + v.insert(payload); + } + Entry::Occupied(mut v) => { + payload.payload.state_move_out = true; + v.get_mut() + .payload + .arena + .extend(payload.payload.arena.clone()); + v.get_mut().payload.combine(payload.payload); + } + } + } + + Ok(blocks + .into_values() + .map(|payload| { + DataBlock::empty_with_meta(Box::new(AggregateMeta::AggregatePayload(payload))) + }) + .collect()) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_resorting.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_resorting.rs new file mode 100644 index 0000000000000..00697bc99efe4 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_resorting.rs @@ -0,0 +1,107 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::cmp::Ordering; +use std::sync::atomic::AtomicUsize; +use std::sync::atomic::Ordering as AtomicOrdering; + +use databend_common_exception::Result; +use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::DataBlock; +use databend_common_pipeline_core::processors::Exchange; + +use crate::pipelines::processors::transforms::aggregator::AggregateMeta; + +pub struct ResortingPartition { + global_max_partition: AtomicUsize, +} + +impl ResortingPartition { + pub fn create() -> Self { + ResortingPartition { + global_max_partition: AtomicUsize::new(0), + } + } + + fn block_number(meta: &AggregateMeta) -> (isize, usize) { + (meta.get_sorting_partition(), meta.get_max_partition()) + } +} + +impl Exchange for ResortingPartition { + const NAME: &'static str = "PartitionResorting"; + const MULTIWAY_SORT: bool = true; + + fn partition(&self, mut data_block: DataBlock, n: usize) -> Result> { + debug_assert_eq!(n, 1); + + let Some(meta) = data_block.take_meta() else { + return Ok(vec![data_block]); + }; + + let Some(_) = AggregateMeta::downcast_ref_from(&meta) else { + return Ok(vec![data_block]); + }; + + let global_max_partition = self.global_max_partition.load(AtomicOrdering::SeqCst); + let mut meta = AggregateMeta::downcast_from(meta).unwrap(); + meta.set_global_max_partition(global_max_partition); + + Ok(vec![data_block.add_meta(Some(Box::new(meta)))?]) + } + + fn init_way( + &self, + _index: usize, + first_data: &DataBlock, + ) -> databend_common_exception::Result<()> { + let max_partition = match first_data.get_meta() { + None => 0, + Some(meta) => match AggregateMeta::downcast_ref_from(meta) { + None => 0, + Some(v) => v.get_global_max_partition(), + }, + }; + + self.global_max_partition + .fetch_max(max_partition, 
std::sync::atomic::Ordering::SeqCst); + Ok(()) + } + + fn sorting_function(left_block: &DataBlock, right_block: &DataBlock) -> Ordering { + let Some(left_meta) = left_block.get_meta() else { + return Ordering::Equal; + }; + let Some(left_meta) = AggregateMeta::downcast_ref_from(left_meta) else { + return Ordering::Equal; + }; + + let Some(right_meta) = right_block.get_meta() else { + return Ordering::Equal; + }; + let Some(right_meta) = AggregateMeta::downcast_ref_from(right_meta) else { + return Ordering::Equal; + }; + + let (l_partition, l_max_partition) = ResortingPartition::block_number(left_meta); + let (r_partition, r_max_partition) = ResortingPartition::block_number(right_meta); + + // ORDER BY max_partition asc, partition asc, idx asc + match l_max_partition.cmp(&r_max_partition) { + Ordering::Less => Ordering::Less, + Ordering::Greater => Ordering::Greater, + Ordering::Equal => l_partition.cmp(&r_partition), + } + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_restore.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_restore.rs new file mode 100644 index 0000000000000..6cc9cab78642a --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_restore.rs @@ -0,0 +1,196 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
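+
+// Restores spilled aggregate data: when a SpilledPayload meta arrives, the
+// processor asynchronously reads its byte range from the spill location and
+// deserializes the columns back into an in-flight payload block.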
+ +use std::any::Any; +use std::collections::VecDeque; +use std::sync::Arc; + +use byteorder::BigEndian; +use byteorder::ReadBytesExt; +use databend_common_exception::Result; +use databend_common_expression::arrow::deserialize_column; +use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::DataBlock; +use databend_common_pipeline_core::processors::Event; +use databend_common_pipeline_core::processors::InputPort; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::processors::Processor; +use databend_common_pipeline_core::processors::ProcessorPtr; +use opendal::Operator; + +use crate::pipelines::processors::transforms::aggregator::AggregateMeta; +use crate::pipelines::processors::transforms::aggregator::AggregatorParams; +use crate::pipelines::processors::transforms::aggregator::SpilledPayload; + +type DeserializingMeta = (AggregateMeta, VecDeque>); + +pub struct TransformPartitionRestore { + input: Arc, + output: Arc, + + operator: Operator, + params: Arc, + output_data: Option, + reading_meta: Option, + deserializing_meta: Option, +} + +#[async_trait::async_trait] +impl Processor for TransformPartitionRestore { + fn name(&self) -> String { + String::from("TransformPartitionRestore") + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if self.output.is_finished() { + self.input.finish(); + return Ok(Event::Finished); + } + + if !self.output.can_push() { + self.input.set_not_need_data(); + return Ok(Event::NeedConsume); + } + + if let Some(output_data) = self.output_data.take() { + self.output.push_data(Ok(output_data)); + return Ok(Event::NeedConsume); + } + + if self.deserializing_meta.is_some() { + self.input.set_not_need_data(); + return Ok(Event::Sync); + } + + if self.reading_meta.is_some() { + self.input.set_not_need_data(); + return Ok(Event::Async); + } + + if self.input.has_data() { + let mut data_block = self.input.pull_data().unwrap()?; + + if let Some(block_meta) = data_block + .get_meta() + .and_then(AggregateMeta::downcast_ref_from) + { + if matches!(block_meta, AggregateMeta::SpilledPayload(_)) { + self.input.set_not_need_data(); + let block_meta = data_block.take_meta().unwrap(); + self.reading_meta = AggregateMeta::downcast_from(block_meta); + return Ok(Event::Async); + } + } + + self.output.push_data(Ok(data_block)); + return Ok(Event::NeedConsume); + } + + if self.input.is_finished() { + self.output.finish(); + return Ok(Event::Finished); + } + + self.input.set_need_data(); + Ok(Event::NeedData) + } + + fn process(&mut self) -> Result<()> { + if let Some((meta, mut read_data)) = self.deserializing_meta.take() { + match meta { + AggregateMeta::SpilledPayload(payload) => { + debug_assert!(read_data.len() == 1); + let data = read_data.pop_front().unwrap(); + self.output_data = Some(self.deserialize(payload, data)?); + } + _ => unreachable!(), + } + } + + Ok(()) + } + + #[async_backtrace::framed] + async fn async_process(&mut self) -> Result<()> { + if let Some(block_meta) = self.reading_meta.take() { + match &block_meta { + AggregateMeta::SpilledPayload(payload) => { + let data = self + .operator + .read_with(&payload.location) + .range(payload.data_range.clone()) + .await? 
+ .to_vec(); + + self.deserializing_meta = Some((block_meta, VecDeque::from(vec![data]))); + } + _ => unreachable!(), + } + } + + Ok(()) + } +} + +impl TransformPartitionRestore { + pub fn create( + input: Arc, + output: Arc, + operator: Operator, + params: Arc, + ) -> Result { + Ok(ProcessorPtr::create(Box::new(TransformPartitionRestore { + input, + output, + operator, + params, + output_data: None, + reading_meta: None, + deserializing_meta: None, + }))) + } + + fn deserialize(&self, payload: SpilledPayload, data: Vec) -> Result { + let columns = self.params.group_data_types.len() + self.params.aggregate_functions.len(); + + let mut blocks = vec![]; + let mut cursor = data.as_slice(); + + while !cursor.is_empty() { + let mut block_columns = Vec::with_capacity(columns); + + for _idx in 0..columns { + let column_size = cursor.read_u64::().unwrap(); + let (left, right) = cursor.split_at(column_size as usize); + block_columns.push(deserialize_column(left).unwrap()); + cursor = right; + } + + let block1 = DataBlock::new_from_columns(block_columns); + blocks.push(block1); + } + + let block = DataBlock::concat(&blocks).unwrap(); + + block.add_meta(Some(AggregateMeta::create_in_flight_payload( + payload.partition, + payload.max_partition, + payload.global_max_partition, + ))) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_probe_state.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_probe_state.rs index 3cf2d0621770f..3feed32aadce9 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_probe_state.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_probe_state.rs @@ -541,6 +541,7 @@ impl HashJoinProbeState { } else { None }; + result_blocks.push(self.merge_eq_block( probe_block, build_block, diff --git a/src/query/service/src/pipelines/processors/transforms/range_join/merge_join_state.rs b/src/query/service/src/pipelines/processors/transforms/range_join/merge_join_state.rs index 10e0f7cd547c0..1e512b1a2c630 100644 --- a/src/query/service/src/pipelines/processors/transforms/range_join/merge_join_state.rs +++ b/src/query/service/src/pipelines/processors/transforms/range_join/merge_join_state.rs @@ -137,6 +137,7 @@ impl RangeJoinState { j += 1; } } + Ok(result_blocks) } diff --git a/src/query/service/src/pipelines/processors/transforms/range_join/range_join_state.rs b/src/query/service/src/pipelines/processors/transforms/range_join/range_join_state.rs index bdc79cd0df387..83a2aa153414f 100644 --- a/src/query/service/src/pipelines/processors/transforms/range_join/range_join_state.rs +++ b/src/query/service/src/pipelines/processors/transforms/range_join/range_join_state.rs @@ -92,16 +92,22 @@ impl RangeJoinState { } pub(crate) fn sink_right(&self, block: DataBlock) -> Result<()> { - // Sink block to right table - let mut right_table = self.right_table.write(); - right_table.push(block); + if !block.is_empty() || block.get_meta().is_some() { + // Sink block to right table + let mut right_table = self.right_table.write(); + right_table.push(block); + } + Ok(()) } pub(crate) fn sink_left(&self, block: DataBlock) -> Result<()> { - // Sink block to left table - let mut left_table = self.left_table.write(); - left_table.push(block); + if !block.is_empty() || block.get_meta().is_some() { + // Sink block to left table + let mut left_table = self.left_table.write(); + left_table.push(block); + } + Ok(()) } @@ -133,9 +139,11 @@ impl RangeJoinState { pub fn task_id(&self) -> 
Option { let task_id = self.finished_tasks.fetch_add(1, atomic::Ordering::SeqCst); + if task_id >= self.tasks.read().len() as u64 { return None; } + Some(task_id as usize) } @@ -176,6 +184,7 @@ impl RangeJoinState { let left_table = self.left_table.read(); // Right table is bigger than left table let mut right_table = self.right_table.write(); + if !left_table.is_empty() && !right_table.is_empty() && left_table.len() * right_table.len() < max_threads @@ -272,6 +281,7 @@ impl RangeJoinState { right_offset = 0; left_offset += left_block.num_rows(); } + Ok(()) } } diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_injector.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_injector.rs deleted file mode 100644 index 4aa65ba175a83..0000000000000 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_injector.rs +++ /dev/null @@ -1,159 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use databend_common_catalog::table_context::TableContext; -use databend_common_exception::Result; -use databend_common_pipeline_core::Pipeline; -use databend_common_settings::FlightCompression; - -use super::exchange_params::MergeExchangeParams; -use crate::servers::flight::v1::exchange::serde::TransformExchangeDeserializer; -use crate::servers::flight::v1::exchange::serde::TransformExchangeSerializer; -use crate::servers::flight::v1::exchange::serde::TransformScatterExchangeSerializer; -use crate::servers::flight::v1::exchange::DataExchange; -use crate::servers::flight::v1::exchange::ExchangeSorting; -use crate::servers::flight::v1::exchange::ShuffleExchangeParams; -use crate::servers::flight::v1::scatter::BroadcastFlightScatter; -use crate::servers::flight::v1::scatter::FlightScatter; -use crate::servers::flight::v1::scatter::HashFlightScatter; -use crate::sessions::QueryContext; - -pub trait ExchangeInjector: Send + Sync + 'static { - fn flight_scatter( - &self, - ctx: &Arc, - exchange: &DataExchange, - ) -> Result>>; - - fn exchange_sorting(&self) -> Option>; - - fn apply_merge_serializer( - &self, - params: &MergeExchangeParams, - compression: Option, - pipeline: &mut Pipeline, - ) -> Result<()>; - - fn apply_shuffle_serializer( - &self, - params: &ShuffleExchangeParams, - compression: Option, - pipeline: &mut Pipeline, - ) -> Result<()>; - - fn apply_merge_deserializer( - &self, - params: &MergeExchangeParams, - pipeline: &mut Pipeline, - ) -> Result<()>; - - fn apply_shuffle_deserializer( - &self, - params: &ShuffleExchangeParams, - pipeline: &mut Pipeline, - ) -> Result<()>; -} - -pub struct DefaultExchangeInjector; - -impl DefaultExchangeInjector { - pub fn create() -> Arc { - Arc::new(DefaultExchangeInjector {}) - } -} - -impl ExchangeInjector for DefaultExchangeInjector { - fn flight_scatter( - &self, - ctx: &Arc, - exchange: &DataExchange, - ) -> Result>> { - Ok(Arc::new(match exchange { - DataExchange::Merge(_) => unreachable!(), - DataExchange::Broadcast(exchange) => 
Box::new(BroadcastFlightScatter::try_create( - exchange.destination_ids.len(), - )?), - DataExchange::ShuffleDataExchange(exchange) => { - let local_id = &ctx.get_cluster().local_id; - let local_pos = exchange - .destination_ids - .iter() - .position(|x| x == local_id) - .unwrap(); - HashFlightScatter::try_create( - ctx.get_function_context()?, - exchange.shuffle_keys.clone(), - exchange.destination_ids.len(), - local_pos, - )? - } - })) - } - - fn exchange_sorting(&self) -> Option> { - None - } - - fn apply_merge_serializer( - &self, - params: &MergeExchangeParams, - compression: Option, - pipeline: &mut Pipeline, - ) -> Result<()> { - pipeline.add_transform(|input, output| { - TransformExchangeSerializer::create(input, output, params, compression) - }) - } - - fn apply_shuffle_serializer( - &self, - params: &ShuffleExchangeParams, - compression: Option, - pipeline: &mut Pipeline, - ) -> Result<()> { - pipeline.add_transform(|input, output| { - TransformScatterExchangeSerializer::create(input, output, compression, params) - }) - } - - fn apply_merge_deserializer( - &self, - params: &MergeExchangeParams, - pipeline: &mut Pipeline, - ) -> Result<()> { - pipeline.add_transform(|input, output| { - Ok(TransformExchangeDeserializer::create( - input, - output, - ¶ms.schema, - )) - }) - } - - fn apply_shuffle_deserializer( - &self, - params: &ShuffleExchangeParams, - pipeline: &mut Pipeline, - ) -> Result<()> { - pipeline.add_transform(|input, output| { - Ok(TransformExchangeDeserializer::create( - input, - output, - ¶ms.schema, - )) - }) - } -} diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs index d158ccf3c9b89..c3d399b7c1895 100644 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs +++ b/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs @@ -63,12 +63,12 @@ use crate::servers::flight::v1::actions::init_query_fragments; use crate::servers::flight::v1::actions::INIT_QUERY_FRAGMENTS; use crate::servers::flight::v1::actions::START_PREPARED_QUERY; use crate::servers::flight::v1::exchange::DataExchange; -use crate::servers::flight::v1::exchange::DefaultExchangeInjector; -use crate::servers::flight::v1::exchange::ExchangeInjector; use crate::servers::flight::v1::packets::Edge; use crate::servers::flight::v1::packets::QueryEnv; use crate::servers::flight::v1::packets::QueryFragment; use crate::servers::flight::v1::packets::QueryFragments; +use crate::servers::flight::v1::scatter::BroadcastFlightScatter; +use crate::servers::flight::v1::scatter::HashFlightScatter; use crate::servers::flight::FlightClient; use crate::servers::flight::FlightExchange; use crate::servers::flight::FlightReceiver; @@ -470,9 +470,7 @@ impl DataExchangeManager { None => Err(ErrorCode::Internal("Query not exists.")), Some(query_coordinator) => { assert!(query_coordinator.fragment_exchanges.is_empty()); - let injector = DefaultExchangeInjector::create(); - let mut build_res = - query_coordinator.subscribe_fragment(&ctx, fragment_id, injector)?; + let mut build_res = query_coordinator.subscribe_fragment(&ctx, fragment_id)?; let exchanges = std::mem::take(&mut query_coordinator.statistics_exchanges); let statistics_receiver = StatisticsReceiver::spawn_receiver(&ctx, exchanges)?; @@ -533,7 +531,6 @@ impl DataExchangeManager { &self, query_id: &str, fragment_id: usize, - injector: Arc, ) -> Result { let queries_coordinator_guard = self.queries_coordinator.lock(); let queries_coordinator = 
unsafe { &mut *queries_coordinator_guard.deref().get() }; @@ -548,7 +545,7 @@ impl DataExchangeManager { .query_ctx .clone(); - query_coordinator.subscribe_fragment(&query_ctx, fragment_id, injector) + query_coordinator.subscribe_fragment(&query_ctx, fragment_id) } } } @@ -735,7 +732,6 @@ impl QueryCoordinator { &mut self, ctx: &Arc, fragment_id: usize, - injector: Arc, ) -> Result { // Merge pipelines if exist locally pipeline if let Some(mut fragment_coordinator) = self.fragments_coordinator.remove(&fragment_id) { @@ -759,21 +755,14 @@ impl QueryCoordinator { fragment_coordinator .pipeline_build_res .as_ref() - .map(|x| x.exchange_injector.clone()) - .ok_or_else(|| { - ErrorCode::Internal("Pipeline build result is none, It's a bug") - })?, + .map(|x| x.enable_multiway_sort) + .unwrap_or(false), )?; let mut build_res = fragment_coordinator.pipeline_build_res.unwrap(); // Add exchange data transform. - ExchangeTransform::via( - ctx, - &exchange_params, - &mut build_res.main_pipeline, - injector, - )?; + ExchangeTransform::via(ctx, &exchange_params, &mut build_res.main_pipeline)?; return Ok(build_res); } @@ -821,10 +810,8 @@ impl QueryCoordinator { coordinator .pipeline_build_res .as_ref() - .map(|x| x.exchange_injector.clone()) - .ok_or_else(|| { - ErrorCode::Internal("Pipeline build result is none, It's a bug") - })?, + .map(|x| x.enable_multiway_sort) + .unwrap_or(false), )?, ); } @@ -916,13 +903,13 @@ impl FragmentCoordinator { pub fn create_exchange_params( &self, info: &QueryInfo, - exchange_injector: Arc, + enable_multiway_sort: bool, ) -> Result { if let Some(data_exchange) = &self.data_exchange { return match data_exchange { DataExchange::Merge(exchange) => { Ok(ExchangeParams::MergeExchange(MergeExchangeParams { - exchange_injector: exchange_injector.clone(), + enable_multiway_sort, schema: self.physical_plan.output_schema()?, fragment_id: self.fragment_id, query_id: info.query_id.to_string(), @@ -933,26 +920,30 @@ impl FragmentCoordinator { } DataExchange::Broadcast(exchange) => { Ok(ExchangeParams::ShuffleExchange(ShuffleExchangeParams { - exchange_injector: exchange_injector.clone(), + enable_multiway_sort, schema: self.physical_plan.output_schema()?, fragment_id: self.fragment_id, query_id: info.query_id.to_string(), executor_id: info.current_executor.to_string(), destination_ids: exchange.destination_ids.to_owned(), - shuffle_scatter: exchange_injector - .flight_scatter(&info.query_ctx, data_exchange)?, + shuffle_scatter: Arc::new(Box::new(BroadcastFlightScatter::try_create( + exchange.destination_ids.len(), + )?)), })) } DataExchange::ShuffleDataExchange(exchange) => { Ok(ExchangeParams::ShuffleExchange(ShuffleExchangeParams { - exchange_injector: exchange_injector.clone(), + enable_multiway_sort, schema: self.physical_plan.output_schema()?, fragment_id: self.fragment_id, query_id: info.query_id.to_string(), executor_id: info.current_executor.to_string(), destination_ids: exchange.destination_ids.to_owned(), - shuffle_scatter: exchange_injector - .flight_scatter(&info.query_ctx, data_exchange)?, + shuffle_scatter: Arc::new(HashFlightScatter::try_create( + &info.query_ctx, + exchange.shuffle_keys.clone(), + &exchange.destination_ids, + )?), })) } }; diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_params.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_params.rs index 799efe506affe..15607c0454f8d 100644 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_params.rs +++ 
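With the ExchangeInjector removed, create_exchange_params picks the scatter directly: broadcast exchanges get a BroadcastFlightScatter, shuffle exchanges a HashFlightScatter built from the shuffle keys, and the merge sink path further down uses a pass-through MergeFlightScatter. The sketch below is a self-contained illustration of those three strategies, not the databend implementation: `Block` stands in for `DataBlock`, and hashing is simplified to `DefaultHasher` over whole rows.

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

type Block = Vec<u64>; // stand-in for DataBlock

trait FlightScatter {
    fn execute(&self, block: Block) -> Vec<Block>;
}

/// Merge path: a single downstream, the block passes through unchanged.
struct MergeFlightScatter;
/// Broadcast exchange: one clone of the block per destination.
struct BroadcastFlightScatter { destinations: usize }
/// Shuffle exchange: route each row by hash(key) % destinations.
struct HashFlightScatter { destinations: usize }

impl FlightScatter for MergeFlightScatter {
    fn execute(&self, block: Block) -> Vec<Block> {
        vec![block]
    }
}

impl FlightScatter for BroadcastFlightScatter {
    fn execute(&self, block: Block) -> Vec<Block> {
        (0..self.destinations).map(|_| block.clone()).collect()
    }
}

impl FlightScatter for HashFlightScatter {
    fn execute(&self, block: Block) -> Vec<Block> {
        let mut outputs = vec![Block::new(); self.destinations];
        for row in block {
            let mut hasher = DefaultHasher::new();
            row.hash(&mut hasher);
            outputs[(hasher.finish() as usize) % self.destinations].push(row);
        }
        outputs
    }
}

fn main() {
    let scatter = HashFlightScatter { destinations: 3 };
    println!("{:?}", scatter.execute(vec![1, 2, 3, 4, 5]));
}
```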
b/src/query/service/src/servers/flight/v1/exchange/exchange_params.rs @@ -16,7 +16,6 @@ use std::sync::Arc; use databend_common_expression::DataSchemaRef; -use crate::servers::flight::v1::exchange::ExchangeInjector; use crate::servers::flight::v1::scatter::FlightScatter; #[derive(Clone)] @@ -27,7 +26,7 @@ pub struct ShuffleExchangeParams { pub schema: DataSchemaRef, pub destination_ids: Vec, pub shuffle_scatter: Arc>, - pub exchange_injector: Arc, + pub enable_multiway_sort: bool, } #[derive(Clone)] @@ -37,8 +36,8 @@ pub struct MergeExchangeParams { pub destination_id: String, pub schema: DataSchemaRef, pub ignore_exchange: bool, + pub enable_multiway_sort: bool, pub allow_adjust_parallelism: bool, - pub exchange_injector: Arc, } pub enum ExchangeParams { diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_sink.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_sink.rs index 73ed08eb1c021..ef606b21e3ac6 100644 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_sink.rs +++ b/src/query/service/src/servers/flight/v1/exchange/exchange_sink.rs @@ -16,20 +16,15 @@ use std::sync::Arc; use databend_common_exception::ErrorCode; use databend_common_exception::Result; -use databend_common_expression::BlockMetaInfoDowncast; -use databend_common_expression::DataBlock; -use databend_common_pipeline_core::processors::ProcessorPtr; use databend_common_pipeline_core::Pipe; -use databend_common_pipeline_core::PipeItem; use databend_common_pipeline_core::Pipeline; use super::exchange_params::ExchangeParams; use super::exchange_sink_writer::create_writer_item; -use super::exchange_sorting::ExchangeSorting; -use super::exchange_sorting::TransformExchangeSorting; use super::exchange_transform_shuffle::exchange_shuffle; -use super::serde::ExchangeSerializeMeta; use crate::clusters::ClusterHelper; +use crate::pipelines::processors::transforms::aggregator::FlightExchange; +use crate::servers::flight::v1::scatter::MergeFlightScatter; use crate::sessions::QueryContext; use crate::sessions::TableContext; @@ -55,29 +50,29 @@ impl ExchangeSink { ))); } - let exchange_injector = ¶ms.exchange_injector; + let settings = ctx.get_settings(); + let compression = settings.get_query_flight_compression()?; + + let nodes = vec![]; + match params.enable_multiway_sort { + true => pipeline.exchange( + 1, + FlightExchange::::create( + nodes, + compression, + Arc::new(Box::new(MergeFlightScatter)), + ), + )?, + false => pipeline.exchange( + 1, + FlightExchange::::create( + nodes, + compression, + Arc::new(Box::new(MergeFlightScatter)), + ), + )?, + }; - if !params.ignore_exchange { - let settings = ctx.get_settings(); - let compression = settings.get_query_flight_compression()?; - exchange_injector.apply_merge_serializer(params, compression, pipeline)?; - } - - if !params.ignore_exchange && exchange_injector.exchange_sorting().is_some() { - let output_len = pipeline.output_len(); - let sorting = SinkExchangeSorting::create(); - let transform = TransformExchangeSorting::create(output_len, sorting); - - let output = transform.get_output(); - let inputs = transform.get_inputs(); - pipeline.add_pipe(Pipe::create(output_len, 1, vec![PipeItem::create( - ProcessorPtr::create(Box::new(transform)), - inputs, - vec![output], - )])); - } - - pipeline.try_resize(1)?; assert_eq!(senders.len(), 1); pipeline.add_pipe(Pipe::create(1, 0, vec![create_writer_item( senders.remove(0), @@ -111,27 +106,3 @@ impl ExchangeSink { } } } - -struct SinkExchangeSorting; - -impl SinkExchangeSorting { - pub fn create() -> 
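MergeExchangeParams and ShuffleExchangeParams now carry a plain `enable_multiway_sort` flag, and ExchangeSink resolves it once into one of two monomorphized `FlightExchange` processors fed by a `MergeFlightScatter`. `FlightExchange` itself lives in the aggregator transforms and is not reproduced here; the toy code below only illustrates the runtime-bool-to-const-generic dispatch pattern the sink uses.

```rust
struct FlightExchange<const MULTIWAY_SORT: bool>;

impl<const MULTIWAY_SORT: bool> FlightExchange<MULTIWAY_SORT> {
    fn create() -> Self {
        FlightExchange::<MULTIWAY_SORT>
    }

    fn describe(&self) -> &'static str {
        // monomorphized per instantiation, so this branch is a compile-time constant
        if MULTIWAY_SORT {
            "multiway-sort exchange"
        } else {
            "plain exchange"
        }
    }
}

fn build_exchange(enable_multiway_sort: bool) -> &'static str {
    // the setting is read once; each arm builds a distinct concrete processor type
    match enable_multiway_sort {
        true => FlightExchange::<true>::create().describe(),
        false => FlightExchange::<false>::create().describe(),
    }
}

fn main() {
    assert_eq!(build_exchange(false), "plain exchange");
}
```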
Arc { - Arc::new(SinkExchangeSorting {}) - } -} - -impl ExchangeSorting for SinkExchangeSorting { - fn block_number(&self, data_block: &DataBlock) -> Result { - let block_meta = data_block.get_meta(); - let shuffle_meta = block_meta - .and_then(ExchangeSerializeMeta::downcast_ref_from) - .ok_or_else(|| { - ErrorCode::Internal(format!( - "Failed to downcast ExchangeSerializeMeta from BlockMeta: {:?}", - block_meta - )) - })?; - - Ok(shuffle_meta.block_number) - } -} diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_sink_writer.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_sink_writer.rs index abebc2ba6a254..704359391bc0a 100644 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_sink_writer.rs +++ b/src/query/service/src/servers/flight/v1/exchange/exchange_sink_writer.rs @@ -69,9 +69,10 @@ impl AsyncSink for ExchangeWriterSink { #[async_backtrace::framed] async fn consume(&mut self, mut data_block: DataBlock) -> Result { let serialize_meta = match data_block.take_meta() { - None => Err(ErrorCode::Internal( - "ExchangeWriterSink only recv ExchangeSerializeMeta.", - )), + None => Err(ErrorCode::Internal(format!( + "ExchangeWriterSink only recv ExchangeSerializeMeta. {:?}", + data_block + ))), Some(block_meta) => ExchangeSerializeMeta::downcast_from(block_meta).ok_or_else(|| { ErrorCode::Internal("ExchangeWriterSink only recv ExchangeSerializeMeta.") }), diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_sorting.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_sorting.rs deleted file mode 100644 index 8cc931d64641a..0000000000000 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_sorting.rs +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
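The file deleted below, exchange_sorting.rs, provided the N-input/one-output TransformExchangeSorting processor: it buffered at most one block per input and, once every live input had a block buffered, emitted the one with the smallest block number (the field now generalized into the partition metadata). Its core selection step, in isolation, with `String` standing in for `DataBlock`:

```rust
/// Pick and remove the buffered block with the smallest block number.
/// `buffer` holds at most one (block_number, block) pair per input port.
fn pick_min(buffer: &mut [Option<(isize, String)>]) -> Option<String> {
    let min_index = buffer
        .iter()
        .enumerate()
        .filter_map(|(index, slot)| slot.as_ref().map(|(number, _)| (index, *number)))
        .min_by_key(|&(_, number)| number)
        .map(|(index, _)| index)?;

    buffer[min_index].take().map(|(_, block)| block)
}

fn main() {
    let mut buffer = vec![Some((7, "b7".to_string())), None, Some((3, "a3".to_string()))];
    assert_eq!(pick_min(&mut buffer), Some("a3".to_string()));
}
```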
- -use std::any::Any; -use std::sync::Arc; - -use databend_common_exception::Result; -use databend_common_expression::DataBlock; -use databend_common_pipeline_core::processors::Event; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; - -pub trait ExchangeSorting: Send + Sync + 'static { - fn block_number(&self, data_block: &DataBlock) -> Result; -} - -// N input one output -pub struct TransformExchangeSorting { - inputs: Vec>, - output: Arc, - sorting: Arc, - - buffer_len: usize, - buffer: Vec>, -} - -impl TransformExchangeSorting { - pub fn create(inputs: usize, sorting: Arc) -> TransformExchangeSorting { - let output = OutputPort::create(); - let mut buffer = Vec::with_capacity(inputs); - let mut inputs_port = Vec::with_capacity(inputs); - - for _ in 0..inputs { - buffer.push(None); - inputs_port.push(InputPort::create()); - } - - TransformExchangeSorting { - output, - sorting, - buffer, - buffer_len: 0, - inputs: inputs_port, - } - } - - pub fn get_output(&self) -> Arc { - self.output.clone() - } - pub fn get_inputs(&self) -> Vec> { - self.inputs.clone() - } -} - -#[async_trait::async_trait] -impl Processor for TransformExchangeSorting { - fn name(&self) -> String { - String::from("TransformExchangeSorting") - } - - fn as_any(&mut self) -> &mut dyn Any { - self - } - - fn event(&mut self) -> Result { - if self.output.is_finished() { - for input in &self.inputs { - input.finish(); - } - - return Ok(Event::Finished); - } - - let mut unready_inputs = false; - let mut all_inputs_finished = true; - for (index, input) in self.inputs.iter().enumerate() { - if input.is_finished() { - continue; - } - - all_inputs_finished = false; - if self.buffer[index].is_none() { - if input.has_data() { - let data_block = input.pull_data().unwrap()?; - let block_number = self.sorting.block_number(&data_block)?; - self.buffer[index] = Some((block_number, data_block)); - self.buffer_len += 1; - input.set_need_data(); - continue; - } - - unready_inputs = true; - } - - input.set_need_data(); - } - - if !self.output.can_push() { - return Ok(Event::NeedConsume); - } - - if all_inputs_finished && self.buffer_len == 0 { - self.output.finish(); - return Ok(Event::Finished); - } - - if !unready_inputs { - let mut min_index = 0; - let mut min_value = isize::MAX; - for (index, buffer) in self.buffer.iter().enumerate() { - if let Some((block_number, _)) = buffer { - if *block_number < min_value { - min_index = index; - min_value = *block_number; - } - } - } - - if let Some((_, block)) = self.buffer[min_index].take() { - self.buffer_len -= 1; - self.output.push_data(Ok(block)); - return Ok(Event::NeedConsume); - } - } - - Ok(Event::NeedData) - } -} diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_source.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_source.rs index acdfb66de123e..2d4bcdef3d32a 100644 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_source.rs +++ b/src/query/service/src/servers/flight/v1/exchange/exchange_source.rs @@ -28,14 +28,13 @@ use super::exchange_params::ExchangeParams; use super::exchange_params::MergeExchangeParams; use super::exchange_source_reader::ExchangeSourceReader; use crate::clusters::ClusterHelper; -use crate::servers::flight::v1::exchange::ExchangeInjector; +use crate::pipelines::processors::transforms::aggregator::TransformAggregateDeserializer; use crate::sessions::QueryContext; /// Add Exchange Source to 
the pipeline. pub fn via_exchange_source( ctx: Arc, params: &MergeExchangeParams, - injector: Arc, pipeline: &mut Pipeline, ) -> Result<()> { // UpstreamTransform ---> DummyTransform ---> DummyTransform ---> DownstreamTransform @@ -93,5 +92,7 @@ pub fn via_exchange_source( pipeline.try_resize(last_output_len)?; } - injector.apply_merge_deserializer(params, pipeline) + pipeline.add_transform(|input, output| { + TransformAggregateDeserializer::try_create(input, output, ¶ms.schema) + }) } diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_transform.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_transform.rs index 47be1d1f473f8..0afb02e227455 100644 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_transform.rs +++ b/src/query/service/src/servers/flight/v1/exchange/exchange_transform.rs @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::HashMap; use std::sync::Arc; use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; -use databend_common_pipeline_core::processors::create_resize_item; use databend_common_pipeline_core::Pipe; use databend_common_pipeline_core::Pipeline; use databend_common_pipeline_transforms::processors::create_dummy_item; @@ -27,7 +27,7 @@ use super::exchange_source::via_exchange_source; use super::exchange_source_reader::create_reader_item; use super::exchange_transform_shuffle::exchange_shuffle; use crate::clusters::ClusterHelper; -use crate::servers::flight::v1::exchange::ExchangeInjector; +use crate::pipelines::processors::transforms::aggregator::TransformAggregateDeserializer; use crate::sessions::QueryContext; pub struct ExchangeTransform; @@ -37,11 +37,10 @@ impl ExchangeTransform { ctx: &Arc, params: &ExchangeParams, pipeline: &mut Pipeline, - injector: Arc, ) -> Result<()> { match params { ExchangeParams::MergeExchange(params) => { - via_exchange_source(ctx.clone(), params, injector, pipeline) + via_exchange_source(ctx.clone(), params, pipeline) } ExchangeParams::ShuffleExchange(params) => { exchange_shuffle(ctx, params, pipeline)?; @@ -58,8 +57,7 @@ impl ExchangeTransform { let senders = flight_senders.into_iter(); for (destination_id, sender) in params.destination_ids.iter().zip(senders) { items.push(match destination_id == ¶ms.executor_id { - true if max_threads == 1 => create_dummy_item(), - true => create_resize_item(1, max_threads), + true => create_dummy_item(), false => create_writer_item( sender, false, @@ -70,28 +68,52 @@ impl ExchangeTransform { }); } - let mut nodes_source = 0; let receivers = exchange_manager.get_flight_receiver(&exchange_params)?; + let nodes_source = receivers.len(); + + let mut lookup = params + .destination_ids + .iter() + .cloned() + .enumerate() + .map(|(x, y)| (y, x)) + .collect::>(); + + let mut nodes = Vec::with_capacity(nodes_source); + let mut reorder = Vec::with_capacity(nodes_source); + nodes.push(params.executor_id.clone()); + reorder.push(lookup.remove(¶ms.executor_id).unwrap()); + for (destination_id, receiver) in receivers { - if destination_id != params.executor_id { - nodes_source += 1; - items.push(create_reader_item( - receiver, - &destination_id, - ¶ms.executor_id, - params.fragment_id, - )); + if destination_id == params.executor_id { + continue; } - } - let new_outputs = max_threads + nodes_source; - pipeline.add_pipe(Pipe::create(len, new_outputs, items)); + nodes.push(destination_id.clone()); + 
reorder.push(lookup.remove(&destination_id).unwrap()); - if params.exchange_injector.exchange_sorting().is_none() { - pipeline.try_resize(max_threads)?; + items.push(create_reader_item( + receiver, + &destination_id, + ¶ms.executor_id, + params.fragment_id, + )); } - injector.apply_shuffle_deserializer(params, pipeline) + pipeline.add_pipe(Pipe::create(len, nodes_source, items)); + + match params.enable_multiway_sort { + true => pipeline.reorder_inputs(reorder), + false => pipeline.try_resize(max_threads)?, + }; + + pipeline.add_transform(|input, output| { + TransformAggregateDeserializer::try_create( + input.clone(), + output.clone(), + ¶ms.schema, + ) + }) } } } diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_transform_scatter.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_transform_scatter.rs deleted file mode 100644 index 0b69270eab8e3..0000000000000 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_transform_scatter.rs +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use databend_common_expression::DataBlock; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::ProcessorPtr; -use databend_common_pipeline_transforms::processors::Transform; -use databend_common_pipeline_transforms::processors::Transformer; - -use super::exchange_transform_shuffle::ExchangeShuffleMeta; -use crate::servers::flight::v1::scatter::FlightScatter; - -pub struct ScatterTransform { - scatter: Arc>, -} - -impl ScatterTransform { - pub fn create( - input: Arc, - output: Arc, - scatter: Arc>, - ) -> ProcessorPtr { - ProcessorPtr::create(Transformer::create(input, output, ScatterTransform { - scatter, - })) - } -} - -impl Transform for ScatterTransform { - const NAME: &'static str = "ScatterTransform"; - - fn transform(&mut self, data: DataBlock) -> databend_common_exception::Result { - let blocks = self.scatter.execute(data)?; - - Ok(DataBlock::empty_with_meta(ExchangeShuffleMeta::create( - blocks, - ))) - } -} diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_transform_shuffle.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_transform_shuffle.rs index 9c3242147c8d3..9cefcde59d441 100644 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_transform_shuffle.rs +++ b/src/query/service/src/servers/flight/v1/exchange/exchange_transform_shuffle.rs @@ -12,35 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. 
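ExchangeTransform now keys its post-reader handling on enable_multiway_sort: it records which destination slot each created reader corresponds to, puts the local executor first, and then either reorders the pipeline inputs back into destination order or falls back to try_resize(max_threads). The sketch below reproduces only that bookkeeping, iterating the destination ids directly instead of the receiver map used in the real code.

```rust
use std::collections::HashMap;

/// Returns, for each input in creation order (local executor first, then the
/// remaining destinations), its position in `destination_ids`.
fn compute_reorder(destination_ids: &[String], executor_id: &str) -> Vec<usize> {
    let mut lookup: HashMap<&str, usize> = destination_ids
        .iter()
        .enumerate()
        .map(|(pos, id)| (id.as_str(), pos))
        .collect();

    let mut reorder = Vec::with_capacity(destination_ids.len());
    reorder.push(lookup.remove(executor_id).expect("executor must be a destination"));
    for id in destination_ids {
        if id != executor_id {
            reorder.push(lookup.remove(id.as_str()).expect("unknown destination"));
        }
    }
    reorder
}

fn main() {
    let ids = vec!["a".to_string(), "b".to_string(), "c".to_string()];
    assert_eq!(compute_reorder(&ids, "b"), vec![1, 0, 2]);
}
```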
-use std::any::Any; -use std::collections::VecDeque; use std::fmt::Debug; use std::fmt::Formatter; use std::sync::Arc; use databend_common_catalog::table_context::TableContext; -use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::local_block_meta_serde; use databend_common_expression::BlockMetaInfo; -use databend_common_expression::BlockMetaInfoDowncast; use databend_common_expression::BlockMetaInfoPtr; use databend_common_expression::DataBlock; -use databend_common_pipeline_core::processors::Event; -use databend_common_pipeline_core::processors::EventCause; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; -use databend_common_pipeline_core::processors::ProcessorPtr; -use databend_common_pipeline_core::Pipe; -use databend_common_pipeline_core::PipeItem; use databend_common_pipeline_core::Pipeline; use super::exchange_params::ShuffleExchangeParams; -use super::exchange_sorting::ExchangeSorting; -use super::exchange_sorting::TransformExchangeSorting; -use super::exchange_transform_scatter::ScatterTransform; -use super::serde::ExchangeSerializeMeta; +use crate::pipelines::processors::transforms::aggregator::FlightExchange; use crate::sessions::QueryContext; pub struct ExchangeShuffleMeta { @@ -64,397 +49,39 @@ local_block_meta_serde!(ExchangeShuffleMeta); #[typetag::serde(name = "exchange_shuffle")] impl BlockMetaInfo for ExchangeShuffleMeta {} -struct OutputsBuffer { - inner: Vec>, -} - -impl OutputsBuffer { - pub fn create(capacity: usize, outputs: usize) -> OutputsBuffer { - OutputsBuffer { - inner: vec![capacity; outputs] - .into_iter() - .map(VecDeque::with_capacity) - .collect::>(), - } - } - - pub fn is_all_empty(&self) -> bool { - self.inner.iter().all(|x| x.is_empty()) - } - - pub fn is_empty(&self, index: usize) -> bool { - self.inner[index].is_empty() - } - - pub fn is_full(&self) -> bool { - self.inner.iter().any(|x| x.len() == x.capacity()) - } - - pub fn clear(&mut self, index: usize) { - self.inner[index].clear(); - } - - pub fn pop(&mut self, index: usize) -> Option { - self.inner[index].pop_front() - } - - pub fn push_back(&mut self, index: usize, block: DataBlock) -> usize { - self.inner[index].push_back(block); - self.inner[index].len() - } -} - -#[derive(PartialEq)] -enum PortStatus { - Idle, - HasData, - NeedData, - Finished, -} - -struct PortWithStatus { - pub status: PortStatus, - pub port: Arc, -} - -struct ExchangeShuffleTransform { - initialized: bool, - - finished_inputs: usize, - finished_outputs: usize, - - waiting_outputs: Vec, - waiting_inputs: VecDeque, - - buffer: OutputsBuffer, - inputs: Vec>, - outputs: Vec>, -} - -impl Processor for ExchangeShuffleTransform { - fn name(&self) -> String { - String::from("ExchangeShuffleTransform") - } - - fn as_any(&mut self) -> &mut dyn Any { - self - } - - fn event_with_cause(&mut self, cause: EventCause) -> Result { - if let EventCause::Output(output_index) = &cause { - let output = &mut self.outputs[*output_index]; - - if output.port.is_finished() { - if output.status != PortStatus::Finished { - self.finished_outputs += 1; - output.status = PortStatus::Finished; - } - - self.buffer.clear(*output_index); - - self.wakeup_inputs(); - self.wakeup_outputs(); - } else if output.port.can_push() { - if !self.buffer.is_empty(*output_index) { - let data_block = self.buffer.pop(*output_index).unwrap(); - output.status = PortStatus::Idle; - 
output.port.push_data(Ok(data_block)); - - self.wakeup_inputs(); - self.wakeup_outputs(); - } else if output.status != PortStatus::NeedData { - output.status = PortStatus::NeedData; - self.waiting_outputs.push(*output_index); - } - } - } - - if !self.initialized && !self.waiting_outputs.is_empty() { - self.initialized = true; - for input in &self.inputs { - input.port.set_need_data(); - } - } - - if self.finished_outputs == self.outputs.len() { - for input in &self.inputs { - input.port.finish(); - } - - return Ok(Event::Finished); - } - - if let EventCause::Input(input_index) = &cause { - let input = &mut self.inputs[*input_index]; - - if input.port.is_finished() { - if input.status != PortStatus::Finished { - self.finished_inputs += 1; - input.status = PortStatus::Finished; - } - - self.wakeup_outputs(); - self.wakeup_inputs(); - } else if input.port.has_data() { - if !self.buffer.is_full() { - self.take_input_data_into_buffer(*input_index); - - self.wakeup_outputs(); - self.wakeup_inputs(); - } else if input.status != PortStatus::HasData { - input.status = PortStatus::HasData; - self.waiting_inputs.push_back(*input_index); - } - } - } - - if self.finished_outputs == self.outputs.len() { - for input in &self.inputs { - input.port.finish(); - } - - return Ok(Event::Finished); - } - - if self.finished_inputs == self.inputs.len() { - for (index, output) in self.outputs.iter_mut().enumerate() { - if self.buffer.is_empty(index) && output.status != PortStatus::Finished { - self.finished_outputs += 1; - output.status = PortStatus::Finished; - output.port.finish(); - } - } - - if self.buffer.is_all_empty() { - return Ok(Event::Finished); - } - } - - match self.waiting_outputs.is_empty() { - true => Ok(Event::NeedConsume), - false => Ok(Event::NeedData), - } - } - - fn details_status(&self) -> Option { - #[derive(Debug)] - #[allow(dead_code)] - struct Display { - queue_status: Vec<(usize, usize)>, - inputs: usize, - finished_inputs: usize, - outputs: usize, - finished_outputs: usize, - - waiting_outputs: Vec, - waiting_inputs: VecDeque, - } - - let mut queue_status = vec![]; - for (idx, queue) in self.buffer.inner.iter().enumerate() { - queue_status.push((idx, queue.len())); - } - - Some(format!("{:?}", Display { - queue_status, - inputs: self.inputs.len(), - outputs: self.outputs.len(), - finished_inputs: self.finished_inputs, - finished_outputs: self.finished_outputs, - waiting_inputs: self.waiting_inputs.clone(), - waiting_outputs: self.waiting_outputs.clone(), - })) - } -} - -impl ExchangeShuffleTransform { - fn wakeup_inputs(&mut self) { - while !self.waiting_inputs.is_empty() && !self.buffer.is_full() { - let input_index = self.waiting_inputs.pop_front().unwrap(); - - self.take_input_data_into_buffer(input_index); - } - } - - fn wakeup_outputs(&mut self) { - let mut new_waiting_output = Vec::with_capacity(self.waiting_outputs.len()); - - for waiting_output in &self.waiting_outputs { - let output = &mut self.outputs[*waiting_output]; - - if output.port.is_finished() { - if output.status != PortStatus::Finished { - self.finished_outputs += 1; - output.status = PortStatus::Finished; - } - - self.buffer.clear(*waiting_output); - continue; - } - - if self.buffer.is_empty(*waiting_output) { - new_waiting_output.push(*waiting_output); - continue; - } - - let data_block = self.buffer.pop(*waiting_output).unwrap(); - output.status = PortStatus::Idle; - output.port.push_data(Ok(data_block)); - } - - self.waiting_outputs = new_waiting_output; - } - - fn take_input_data_into_buffer(&mut self, 
input_index: usize) { - let input = &mut self.inputs[input_index]; - - input.status = PortStatus::Idle; - let mut data_block = input.port.pull_data().unwrap().unwrap(); - - if let Some(block_meta) = data_block.take_meta() { - if let Some(shuffle_meta) = ExchangeShuffleMeta::downcast_from(block_meta) { - for (index, block) in shuffle_meta.blocks.into_iter().enumerate() { - if (!block.is_empty() || block.get_meta().is_some()) - && self.outputs[index].status != PortStatus::Finished - { - self.buffer.push_back(index, block); - } - } - } - } - - if input.port.is_finished() { - if input.status != PortStatus::Finished { - self.finished_inputs += 1; - input.status = PortStatus::Finished; - } - - return; - } - - input.port.set_need_data(); - } -} - -impl ExchangeShuffleTransform { - pub fn create(inputs: usize, outputs: usize, buffer: usize) -> ExchangeShuffleTransform { - let mut inputs_port = Vec::with_capacity(inputs); - let mut outputs_port = Vec::with_capacity(outputs); - - for _index in 0..inputs { - inputs_port.push(PortWithStatus { - status: PortStatus::Idle, - port: InputPort::create(), - }); - } - - for _index in 0..outputs { - outputs_port.push(PortWithStatus { - status: PortStatus::Idle, - port: OutputPort::create(), - }); - } - - ExchangeShuffleTransform { - initialized: false, - finished_inputs: 0, - finished_outputs: 0, - inputs: inputs_port, - outputs: outputs_port, - buffer: OutputsBuffer::create(buffer, outputs), - waiting_inputs: VecDeque::with_capacity(inputs), - waiting_outputs: Vec::with_capacity(outputs), - } - } - - pub fn get_inputs(&self) -> Vec> { - self.inputs.iter().map(|x| x.port.clone()).collect() - } - - pub fn get_outputs(&self) -> Vec> { - self.outputs.iter().map(|x| x.port.clone()).collect() - } -} - // Scatter the data block and push it to the corresponding output port pub fn exchange_shuffle( ctx: &Arc, params: &ShuffleExchangeParams, pipeline: &mut Pipeline, ) -> Result<()> { - // append scatter transform - pipeline.add_transform(|input, output| { - Ok(ScatterTransform::create( - input, - output, - params.shuffle_scatter.clone(), - )) - })?; - - let exchange_injector = ¶ms.exchange_injector; + if let Some(last_pipe) = pipeline.pipes.last() { + for item in &last_pipe.items { + item.processor.configure_peer_nodes(¶ms.destination_ids); + } + } let settings = ctx.get_settings(); let compression = settings.get_query_flight_compression()?; - exchange_injector.apply_shuffle_serializer(params, compression, pipeline)?; - - let output_len = pipeline.output_len(); - if let Some(exchange_sorting) = &exchange_injector.exchange_sorting() { - let sorting = ShuffleExchangeSorting::create(exchange_sorting.clone()); - let transform = TransformExchangeSorting::create(output_len, sorting); - let output = transform.get_output(); - let inputs = transform.get_inputs(); - pipeline.add_pipe(Pipe::create(output_len, 1, vec![PipeItem::create( - ProcessorPtr::create(Box::new(transform)), - inputs, - vec![output], - )])); - } - - let inputs_size = pipeline.output_len(); - let outputs_size = params.destination_ids.len(); - let transform = ExchangeShuffleTransform::create(inputs_size, outputs_size, output_len); - - let inputs = transform.get_inputs(); - let outputs = transform.get_outputs(); - pipeline.add_pipe(Pipe::create(inputs_size, outputs_size, vec![ - PipeItem::create(ProcessorPtr::create(Box::new(transform)), inputs, outputs), - ])); + match params.enable_multiway_sort { + true => pipeline.exchange( + params.destination_ids.len(), + FlightExchange::::create( + 
params.destination_ids.clone(), + compression, + params.shuffle_scatter.clone(), + ), + )?, + false => pipeline.exchange( + params.destination_ids.len(), + FlightExchange::::create( + params.destination_ids.clone(), + compression, + params.shuffle_scatter.clone(), + ), + )?, + }; Ok(()) } - -struct ShuffleExchangeSorting { - inner: Arc, -} - -impl ShuffleExchangeSorting { - pub fn create(inner: Arc) -> Arc { - Arc::new(ShuffleExchangeSorting { inner }) - } -} - -impl ExchangeSorting for ShuffleExchangeSorting { - fn block_number(&self, data_block: &DataBlock) -> Result { - let block_meta = data_block.get_meta(); - let shuffle_meta = block_meta - .and_then(ExchangeShuffleMeta::downcast_ref_from) - .unwrap(); - - for block in &shuffle_meta.blocks { - if let Some(block_meta) = block.get_meta() { - if let Some(block_meta) = ExchangeSerializeMeta::downcast_ref_from(block_meta) { - return Ok(block_meta.block_number); - } - } - - if !block.is_empty() || block.get_meta().is_some() { - return self.inner.block_number(block); - } - } - - Err(ErrorCode::Internal( - "Internal, ShuffleExchangeSorting only recv ExchangeSerializeMeta.", - )) - } -} diff --git a/src/query/service/src/servers/flight/v1/exchange/mod.rs b/src/query/service/src/servers/flight/v1/exchange/mod.rs index 194f2cbe1e3e5..ac51beb3bb7de 100644 --- a/src/query/service/src/servers/flight/v1/exchange/mod.rs +++ b/src/query/service/src/servers/flight/v1/exchange/mod.rs @@ -13,16 +13,13 @@ // limitations under the License. mod data_exchange; -mod exchange_injector; mod exchange_manager; mod exchange_params; mod exchange_sink; mod exchange_sink_writer; -mod exchange_sorting; mod exchange_source; mod exchange_source_reader; mod exchange_transform; -mod exchange_transform_scatter; mod exchange_transform_shuffle; mod statistics_receiver; mod statistics_sender; @@ -33,10 +30,7 @@ pub use data_exchange::BroadcastExchange; pub use data_exchange::DataExchange; pub use data_exchange::MergeExchange; pub use data_exchange::ShuffleDataExchange; -pub use exchange_injector::DefaultExchangeInjector; -pub use exchange_injector::ExchangeInjector; pub use exchange_manager::DataExchangeManager; pub use exchange_params::MergeExchangeParams; pub use exchange_params::ShuffleExchangeParams; -pub use exchange_sorting::ExchangeSorting; pub use exchange_transform_shuffle::ExchangeShuffleMeta; diff --git a/src/query/service/src/servers/flight/v1/exchange/serde/exchange_serializer.rs b/src/query/service/src/servers/flight/v1/exchange/serde/exchange_serializer.rs index 5a757f37ba299..2d58d9d9c707b 100644 --- a/src/query/service/src/servers/flight/v1/exchange/serde/exchange_serializer.rs +++ b/src/query/service/src/servers/flight/v1/exchange/serde/exchange_serializer.rs @@ -23,46 +23,38 @@ use arrow_flight::SchemaAsIpc; use arrow_ipc::writer::DictionaryTracker; use arrow_ipc::writer::IpcDataGenerator; use arrow_ipc::writer::IpcWriteOptions; -use arrow_ipc::CompressionType; use arrow_schema::ArrowError; use arrow_schema::Schema as ArrowSchema; use bytes::Bytes; -use databend_common_base::runtime::profile::Profile; -use databend_common_base::runtime::profile::ProfileStatisticsName; -use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::local_block_meta_serde; use databend_common_expression::BlockMetaInfo; use databend_common_expression::BlockMetaInfoPtr; use databend_common_expression::DataBlock; -use databend_common_io::prelude::bincode_serialize_into_buf; use databend_common_io::prelude::BinaryWrite; -use 
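exchange_shuffle no longer assembles its own scatter, serializer and shuffle processors: it tells the processors of the last pipe who their peer nodes are and then appends a FlightExchange parameterized by enable_multiway_sort, carrying the params' shuffle_scatter. A simplified stand-in for the peer-node propagation step; in the real code the call goes through item.processor on an immutable item, and the trait/struct names below exist only for this sketch.

```rust
trait PeerAware {
    fn configure_peer_nodes(&mut self, nodes: &[String]);
}

struct Pipe<P> {
    items: Vec<P>,
}

struct Pipeline<P> {
    pipes: Vec<Pipe<P>>,
}

/// Every processor in the last pipe learns the destination ids before the
/// flight exchange is appended.
fn configure_last_pipe<P: PeerAware>(pipeline: &mut Pipeline<P>, destination_ids: &[String]) {
    if let Some(last_pipe) = pipeline.pipes.last_mut() {
        for item in &mut last_pipe.items {
            item.configure_peer_nodes(destination_ids);
        }
    }
}
```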
databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::ProcessorPtr; -use databend_common_pipeline_transforms::processors::BlockMetaTransform; -use databend_common_pipeline_transforms::processors::BlockMetaTransformer; -use databend_common_pipeline_transforms::processors::Transform; -use databend_common_pipeline_transforms::processors::Transformer; -use databend_common_pipeline_transforms::processors::UnknownMode; -use databend_common_settings::FlightCompression; - -use crate::servers::flight::v1::exchange::ExchangeShuffleMeta; -use crate::servers::flight::v1::exchange::MergeExchangeParams; -use crate::servers::flight::v1::exchange::ShuffleExchangeParams; + use crate::servers::flight::v1::packets::DataPacket; use crate::servers::flight::v1::packets::FragmentData; pub struct ExchangeSerializeMeta { - pub block_number: isize, + pub partition: isize, + pub max_partition: usize, + pub global_max_partition: usize, pub packet: Vec, } impl ExchangeSerializeMeta { - pub fn create(block_number: isize, packet: Vec) -> BlockMetaInfoPtr { + pub fn create( + partition: isize, + max_partition: usize, + global_max_partition: usize, + packet: Vec, + ) -> BlockMetaInfoPtr { Box::new(ExchangeSerializeMeta { packet, - block_number, + partition, + max_partition, + global_max_partition, }) } } @@ -78,120 +70,25 @@ local_block_meta_serde!(ExchangeSerializeMeta); #[typetag::serde(name = "exchange_serialize")] impl BlockMetaInfo for ExchangeSerializeMeta {} -pub struct TransformExchangeSerializer { - options: IpcWriteOptions, -} - -impl TransformExchangeSerializer { - pub fn create( - input: Arc, - output: Arc, - _params: &MergeExchangeParams, - compression: Option, - ) -> Result { - let compression = match compression { - None => None, - Some(compression) => match compression { - FlightCompression::Lz4 => Some(CompressionType::LZ4_FRAME), - FlightCompression::Zstd => Some(CompressionType::ZSTD), - }, - }; - - Ok(ProcessorPtr::create(Transformer::create( - input, - output, - TransformExchangeSerializer { - options: IpcWriteOptions::default().try_with_compression(compression)?, - }, - ))) - } -} - -impl Transform for TransformExchangeSerializer { - const NAME: &'static str = "ExchangeSerializerTransform"; - - fn transform(&mut self, data_block: DataBlock) -> Result { - Profile::record_usize_profile(ProfileStatisticsName::ExchangeRows, data_block.num_rows()); - serialize_block(0, data_block, &self.options) - } -} - -pub struct TransformScatterExchangeSerializer { - local_pos: usize, - options: IpcWriteOptions, -} - -impl TransformScatterExchangeSerializer { - pub fn create( - input: Arc, - output: Arc, - compression: Option, - params: &ShuffleExchangeParams, - ) -> Result { - let local_id = ¶ms.executor_id; - let compression = match compression { - None => None, - Some(compression) => match compression { - FlightCompression::Lz4 => Some(CompressionType::LZ4_FRAME), - FlightCompression::Zstd => Some(CompressionType::ZSTD), - }, - }; - - Ok(ProcessorPtr::create(BlockMetaTransformer::create( - input, - output, - TransformScatterExchangeSerializer { - options: IpcWriteOptions::default().try_with_compression(compression)?, - local_pos: params - .destination_ids - .iter() - .position(|x| x == local_id) - .unwrap(), - }, - ))) - } -} - -impl BlockMetaTransform for TransformScatterExchangeSerializer { - const UNKNOWN_MODE: UnknownMode = UnknownMode::Error; - const NAME: &'static str = 
"TransformScatterExchangeSerializer"; - - fn transform(&mut self, meta: ExchangeShuffleMeta) -> Result> { - let mut new_blocks = Vec::with_capacity(meta.blocks.len()); - for (index, block) in meta.blocks.into_iter().enumerate() { - if block.is_empty() { - new_blocks.push(block); - continue; - } - - new_blocks.push(match self.local_pos == index { - true => block, - false => serialize_block(0, block, &self.options)?, - }); - } - - Ok(vec![DataBlock::empty_with_meta( - ExchangeShuffleMeta::create(new_blocks), - )]) - } -} - pub fn serialize_block( - block_num: isize, + partition: isize, + max_partition: usize, + global_max_partition: usize, data_block: DataBlock, options: &IpcWriteOptions, ) -> Result { if data_block.is_empty() && data_block.get_meta().is_none() { return Ok(DataBlock::empty_with_meta(ExchangeSerializeMeta::create( - block_num, + partition, + max_partition, + global_max_partition, vec![], ))); } let mut meta = vec![]; meta.write_scalar_own(data_block.num_rows() as u32)?; - bincode_serialize_into_buf(&mut meta, &data_block.get_meta()) - .map_err(|_| ErrorCode::BadBytes("block meta serialize error when exchange"))?; + serde_json::to_writer(&mut meta, &data_block.get_meta())?; let (_, dict, values) = match data_block.is_empty() { true => batches_to_flight_data_with_options( @@ -226,7 +123,10 @@ pub fn serialize_block( } Ok(DataBlock::empty_with_meta(ExchangeSerializeMeta::create( - block_num, packet, + partition, + max_partition, + global_max_partition, + packet, ))) } diff --git a/src/query/service/src/servers/flight/v1/exchange/serde/mod.rs b/src/query/service/src/servers/flight/v1/exchange/serde/mod.rs index 7349b2f46b0c1..ccf7abfb694b6 100644 --- a/src/query/service/src/servers/flight/v1/exchange/serde/mod.rs +++ b/src/query/service/src/servers/flight/v1/exchange/serde/mod.rs @@ -20,5 +20,3 @@ pub use exchange_deserializer::ExchangeDeserializeMeta; pub use exchange_deserializer::TransformExchangeDeserializer; pub use exchange_serializer::serialize_block; pub use exchange_serializer::ExchangeSerializeMeta; -pub use exchange_serializer::TransformExchangeSerializer; -pub use exchange_serializer::TransformScatterExchangeSerializer; diff --git a/src/query/service/src/servers/flight/v1/scatter/flight_scatter_hash.rs b/src/query/service/src/servers/flight/v1/scatter/flight_scatter_hash.rs index 118cf8b8519c7..68a4e8d163829 100644 --- a/src/query/service/src/servers/flight/v1/scatter/flight_scatter_hash.rs +++ b/src/query/service/src/servers/flight/v1/scatter/flight_scatter_hash.rs @@ -15,6 +15,7 @@ use std::collections::hash_map::DefaultHasher; use std::hash::Hasher; +use databend_common_catalog::table_context::TableContext; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::type_check::check_function; @@ -37,6 +38,7 @@ use databend_common_expression::Value; use databend_common_functions::BUILTIN_FUNCTIONS; use crate::servers::flight::v1::scatter::flight_scatter::FlightScatter; +use crate::sessions::QueryContext; #[derive(Clone)] pub struct HashFlightScatter { @@ -47,11 +49,15 @@ pub struct HashFlightScatter { impl HashFlightScatter { pub fn try_create( - func_ctx: FunctionContext, + ctx: &QueryContext, hash_keys: Vec, - scatter_size: usize, - local_pos: usize, + destination_ids: &[String], ) -> Result> { + let local_id = &ctx.get_cluster().local_id; + let func_ctx = ctx.get_function_context()?; + let scatter_size = destination_ids.len(); + let local_pos = destination_ids.iter().position(|x| x == local_id).unwrap(); + if 
hash_keys.len() == 1 { return OneHashKeyFlightScatter::try_create( func_ctx, diff --git a/src/query/service/src/servers/flight/v1/scatter/flight_scatter_merge.rs b/src/query/service/src/servers/flight/v1/scatter/flight_scatter_merge.rs new file mode 100644 index 0000000000000..02cebbfcbe938 --- /dev/null +++ b/src/query/service/src/servers/flight/v1/scatter/flight_scatter_merge.rs @@ -0,0 +1,37 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use databend_common_exception::Result; +use databend_common_expression::DataBlock; + +use crate::servers::flight::v1::scatter::flight_scatter::FlightScatter; + +pub struct MergeFlightScatter; + +// impl MergeFlightScatter { +// pub fn try_create(scattered_size: usize) -> Result { +// Ok(MergeFlightScatter { scattered_size }) +// } +// } + +impl FlightScatter for MergeFlightScatter { + fn execute(&self, data_block: DataBlock) -> Result> { + // let mut data_blocks = vec![]; + // for _ in 0..self.scattered_size { + // data_blocks.push(data_block.clone()); + // } + + Ok(vec![data_block]) + } +} diff --git a/src/query/service/src/servers/flight/v1/scatter/mod.rs b/src/query/service/src/servers/flight/v1/scatter/mod.rs index b5f5f900dab71..caaee700968e4 100644 --- a/src/query/service/src/servers/flight/v1/scatter/mod.rs +++ b/src/query/service/src/servers/flight/v1/scatter/mod.rs @@ -15,7 +15,9 @@ mod flight_scatter; mod flight_scatter_broadcast; mod flight_scatter_hash; +mod flight_scatter_merge; pub use flight_scatter::FlightScatter; pub use flight_scatter_broadcast::BroadcastFlightScatter; pub use flight_scatter_hash::HashFlightScatter; +pub use flight_scatter_merge::MergeFlightScatter; diff --git a/src/query/service/src/spillers/spiller.rs b/src/query/service/src/spillers/spiller.rs index 114c8dff71254..ae250a3e2acef 100644 --- a/src/query/service/src/spillers/spiller.rs +++ b/src/query/service/src/spillers/spiller.rs @@ -35,6 +35,7 @@ use databend_storages_common_cache::TempDir; use databend_storages_common_cache::TempPath; use opendal::Buffer; use opendal::Operator; +use opendal::Writer; use parking_lot::RwLock; use super::serialize::*; @@ -187,6 +188,17 @@ impl Spiller { format!("{}/{}", self.location_prefix, GlobalUniqName::unique()) } + pub async fn create_aggregate_writer(&self, location: String) -> Result { + let writer = self.operator.writer(&location).await?; + Ok(SpillWriter { + bytes: 0, + writer, + location, + ctx: self.ctx.clone(), + private_spilled_files: self.private_spilled_files.clone(), + }) + } + pub async fn spill_stream_aggregate_buffer( &self, location: Option, @@ -556,3 +568,42 @@ fn record_read_profile(location: &Location, start: &Instant, read_bytes: usize) } } } + +pub struct SpillWriter { + bytes: usize, + writer: Writer, + location: String, + ctx: Arc, + private_spilled_files: Arc>>, +} + +impl SpillWriter { + pub async fn write(&mut self, bytes: Vec) -> Result<()> { + self.bytes += bytes.len(); + Ok(self.writer.write(bytes).await?) 
+ } + + pub fn location(&self) -> String { + self.location.clone() + } + + pub fn write_bytes(&self) -> usize { + self.bytes + } + + pub async fn complete(&mut self) -> Result<()> { + self.writer.close().await?; + + self.ctx.add_spill_file( + Location::Remote(self.location.clone()), + Layout::Aggregate, + self.bytes, + ); + + self.private_spilled_files + .write() + .insert(Location::Remote(self.location.clone()), Layout::Aggregate); + + Ok(()) + } +} diff --git a/src/query/storages/system/src/query_log_table.rs b/src/query/storages/system/src/query_log_table.rs index c6c8e16946fdf..fc83a87d05cc4 100644 --- a/src/query/storages/system/src/query_log_table.rs +++ b/src/query/storages/system/src/query_log_table.rs @@ -179,7 +179,7 @@ pub struct QueryLogElement { // Transaction pub txn_state: String, pub txn_id: String, - pub peek_memory_usage: HashMap, + pub peak_memory_usage: HashMap, } impl SystemLogElement for QueryLogElement { @@ -575,7 +575,7 @@ impl SystemLogElement for QueryLogElement { columns.next().unwrap().push( Scalar::Variant( jsonb::Value::from(jsonb::Object::from_iter( - self.peek_memory_usage + self.peak_memory_usage .iter() .map(|(k, v)| (k.clone(), jsonb::Value::from(*v))), )) diff --git a/tests/sqllogictests/suites/query/cte/basic_r_cte.test b/tests/sqllogictests/suites/query/cte/basic_r_cte.test index 1d4ce93efcd9d..60b9d6bae97a0 100644 --- a/tests/sqllogictests/suites/query/cte/basic_r_cte.test +++ b/tests/sqllogictests/suites/query/cte/basic_r_cte.test @@ -254,7 +254,7 @@ select concat('城市',rn::varchar) city from t1 where rn<=5; statement ok insert into train -select concat('G',row_number()over()::varchar),c1.city,c2.city, n from city c1, city c2, (select 600 n union select 800 union select 1200 union select 1600) a ; +select concat('G',row_number()over()::varchar),c1_city,c2_city, n from (SELECT c1.city as c1_city,c2.city as c2_city, n FROM city c1, city c2, (select 600 n union select 800 union select 1200 union select 1600) a order by c1.city,c2.city, n); statement ok insert into passenger @@ -281,10 +281,8 @@ select from t0,(select 1 n union all select 2); ---- -261700 523200 210000 +224100 448000 210000 statement ok use default; -statement ok -drop database db; diff --git a/tests/sqllogictests/suites/query/window_function/window_bound.test b/tests/sqllogictests/suites/query/window_function/window_bound.test index b9f7b17571ed4..763e2bd51507a 100644 --- a/tests/sqllogictests/suites/query/window_function/window_bound.test +++ b/tests/sqllogictests/suites/query/window_function/window_bound.test @@ -267,23 +267,23 @@ SELECT a, DENSE_RANK() OVER (ORDER BY a ROWS BETWEEN UNBOUNDED PRECEDING AND UNB 6 4 7 5 -query I -SELECT DISTINCT lead((861, FALSE, FALSE), 9, (849, TRUE, TRUE)) OVER ( +query II +SELECT * FROM (SELECT DISTINCT lead((861, FALSE, FALSE), 9, (849, TRUE, TRUE)) OVER ( PARTITION BY 15560425903542832284, 965871850213131579 - ORDER BY 13746504519650342222, 5897530378272856518 ASC NULLS FIRST) -FROM range(100, 12000000, 467); + ORDER BY 13746504519650342222, 5897530378272856518 ASC NULLS FIRST) AS C +FROM range(100, 12000000, 467)) ORDER BY C.1; ---- -(861,0,0) (849,1,1) +(861,0,0) query II -SELECT DISTINCT lead((861, FALSE, FALSE), 9, (849, TRUE, TRUE)) OVER ( +SELECT * FROM (SELECT DISTINCT lead((861, FALSE, FALSE), 9, (849, TRUE, TRUE)) OVER ( PARTITION BY 15560425903542832284, 965871850213131579 - ORDER BY 13746504519650342222, 5897530378272856518 ASC NULLS FIRST) -FROM range(100, 120000000, 467); + ORDER BY 13746504519650342222, 5897530378272856518 ASC NULLS 
FIRST) AS C +FROM range(100, 120000000, 467)) ORDER BY C.1; ---- -(861,0,0) (849,1,1) +(861,0,0) statement ok DROP DATABASE test_window_bound;
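Returning to the spiller change above: the new SpillWriter streams aggregate spill bytes to a single remote object and, on complete(), registers the file with the query context and the private spill-file map. A hedged usage sketch against exactly the methods added in this patch; the surrounding Spiller value, the crate's Result alias, and the pre-built location string are assumed context rather than part of the diff.

```rust
// Assumes it runs inside the query crate, where `Spiller` and `Result` are in scope.
async fn spill_aggregate_chunks(
    spiller: &Spiller,
    location: String, // e.g. "<prefix>/<unique name>", produced by the spiller elsewhere
    chunks: Vec<Vec<u8>>,
) -> Result<(String, usize)> {
    let mut writer = spiller.create_aggregate_writer(location).await?;
    for chunk in chunks {
        writer.write(chunk).await?; // appends to the object and accumulates the byte count
    }
    writer.complete().await?; // closes the writer and records the spill file on the context
    Ok((writer.location(), writer.write_bytes()))
}
```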