diff --git a/testbed/scripts/submit_slurm_job_rebuttal_ginsburg.py b/testbed/scripts/submit_slurm_job_rebuttal_ginsburg.py
new file mode 100644
index 00000000..50c58d40
--- /dev/null
+++ b/testbed/scripts/submit_slurm_job_rebuttal_ginsburg.py
@@ -0,0 +1,108 @@
+import os
+from pathlib import Path
+
+template = """#!/bin/sh
+#
+#SBATCH --account=stats
+#SBATCH -c 4
+#SBATCH --time=11:59:00
+#SBATCH --mem-per-cpu=12gb
+#SBATCH --export=ALL
+
+module load anaconda
+
+cd /burg/stats/users/ag3750/treeffuser
+
+source .venv/bin/activate
+"""
+
+path = Path(__file__).resolve().parent
+parent_path = path.parent
+main_script_path = parent_path / "src" / "testbed" / "__main__.py"
+
+seeds = [0]
+evaluation_mode = "bayes_opt"
+split_idx = list(range(10))
+model_names = [
+    # "ngboost",
+    # "ibug",
+    # "ibug_kde",
+    "treeffuser",
+    # "deep_ensemble",
+    # "quantile_regression_tree",
+    # "mc_dropout",
+    # "card",
+    "ppm_lightgbm",
+    "ppm_xgboost",
+    # "ppm_mlp",
+]
+
+
+datasets = [
+    "bike",
+    "boston",
+    # "communities",  # contains NaN
+    # "energy",  # y is 2d
+    "facebook",
+    "kin8nm",
+    # "msd",  # very big X (463715, 90)
+    "naval",
+    "news",
+    "power",
+    "protein",
+    "superconductor",
+    # "wave",  # very big
+    "wine",
+    "yacht",
+    "movies",
+]
+
+
+def get_cmd(
+    model,
+    seed,
+    split_idx,
+    dataset,
+    evaluation_mode,
+):
+    tmp = (
+        f"python {main_script_path}"
+        f" --models {model}"
+        f" --seed {seed}"
+        f" --split_idx {split_idx}"
+        f" --datasets {dataset}"
+        f" --evaluation_mode {evaluation_mode}"
+        f" --wandb_project ale-ppm"
+        f" --n_iter_bayes_opt 25"
+    )
+    return tmp
+
+
+def get_slurm_script(
+    model,
+    seed,
+    split_idx,
+    dataset,
+    evaluation_mode,
+):
+    cmd = get_cmd(model, seed, split_idx, dataset, evaluation_mode)
+    return f"{template}\n{cmd}"
+
+
+jobs_scripts_path = Path("jobs_scripts")
+jobs_scripts_path.mkdir(parents=True, exist_ok=True)
+
+scripts = []
+for model in model_names:
+    for seed in seeds:
+        for split in split_idx:
+            for dataset in datasets:
+                script = get_slurm_script(model, seed, split, dataset, evaluation_mode)
+                scripts.append(script)
+
+for i, script in enumerate(scripts):
+    slurm_script_path = jobs_scripts_path / f"job_{i}.sh"
+    with slurm_script_path.open("w") as f:
+        f.write(script)
+    cmd = f"sbatch {slurm_script_path}"
+    os.system(cmd)  # noqa S605
diff --git a/testbed/src/testbed/__main__.py b/testbed/src/testbed/__main__.py
index 9e7d279d..d5002179 100644
--- a/testbed/src/testbed/__main__.py
+++ b/testbed/src/testbed/__main__.py
@@ -112,6 +112,11 @@ def get_model(
 
         return IBugXGBoost
 
+    if model_name == "ibug_kde":
+        from testbed.models.ibug_kde_ import IBugXGBoostKDE
+
+        return IBugXGBoostKDE
+
     if model_name == "drf":
         from testbed.models.drf_ import DistributionalRandomForest
 
diff --git a/testbed/src/testbed/data/uci/kin8nm/preprocess.py b/testbed/src/testbed/data/uci/kin8nm/preprocess.py
new file mode 100644
index 00000000..80ee1020
--- /dev/null
+++ b/testbed/src/testbed/data/uci/kin8nm/preprocess.py
@@ -0,0 +1,41 @@
+import argparse
+import zipfile
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from testbed.data.utils import _assign_k_splits
+
+
+def main(path_raw_dataset_dir: Path):
+    # Process the raw data archive data.zip
+    raw_data_path = path_raw_dataset_dir / "data.zip"
+    # unzip the data
+    with zipfile.ZipFile(raw_data_path, "r") as zip_ref:
+        zip_ref.extractall(path_raw_dataset_dir)
+    raw_data_path = path_raw_dataset_dir / "data.csv"
+
+    df = pd.read_csv(raw_data_path)
+    y = df["y"].values
+    x = df.drop(columns=["y"]).values
+
+    categorical = []
+
+    k_fold_splits = _assign_k_splits(x.shape[0], 10, 0)
+
+    # Save preprocessed data
+    np.save(
+        path_raw_dataset_dir.parent / "data.npy",
+        {
+            "x": x,
+            "y": y,
+            "categorical": categorical,
+            "k_fold_splits": k_fold_splits,
+        },
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("path", type=Path)
+    args = parser.parse_args()
+    main(args.path)
diff --git a/testbed/src/testbed/data/uci/kin8nm/raw/data.zip b/testbed/src/testbed/data/uci/kin8nm/raw/data.zip
new file mode 100644
index 00000000..d61d9b2c
Binary files /dev/null and b/testbed/src/testbed/data/uci/kin8nm/raw/data.zip differ
diff --git a/testbed/src/testbed/models/ibug_.py b/testbed/src/testbed/models/ibug_.py
index 4ec859bf..4198bb97 100644
--- a/testbed/src/testbed/models/ibug_.py
+++ b/testbed/src/testbed/models/ibug_.py
@@ -4,6 +4,7 @@
 from numpy import ndarray
 from sklearn.model_selection import train_test_split
 from skopt.space import Integer
+from skopt.space import Real
 from xgboost import XGBRegressor
 
 from testbed.models import ProbabilisticModel
@@ -14,10 +15,13 @@ class IBugXGBoost(ProbabilisticModel):
     IBug wrapper for XGBoost.
     """
 
-    def __init__(self, k=100, seed=0):
+    def __init__(self, k=100, n_estimators=100, max_depth=2, learning_rate=0.1, seed=0):
         super().__init__(seed)
         self.model = None
         self.k = k
+        self.n_estimators = n_estimators
+        self.max_depth = max_depth
+        self.learning_rate = learning_rate
 
     def fit(
         self,
@@ -42,6 +46,9 @@ def fit(
 
         # train GBRT model
         self.gbrt_model = XGBRegressor(
+            n_estimators=self.n_estimators,
+            max_depth=self.max_depth,
+            learning_rate=self.learning_rate,
             reg_alpha=0,
             scale_pos_weight=1,
             base_score=0.5,
@@ -77,5 +84,8 @@ def search_space() -> dict:
         Return the search space for parameters of the model.
         """
         return {
-            "k": Integer(20, 150),
+            "k": Integer(20, 250),  # ibug runs CV on k by default
+            "n_estimators": Integer(10, 1000),
+            "learning_rate": Real(0.01, 0.5, prior="log-uniform"),
+            "max_depth": Integer(1, 100),
         }
diff --git a/testbed/src/testbed/models/ibug_kde_.py b/testbed/src/testbed/models/ibug_kde_.py
new file mode 100644
index 00000000..10bb0bdb
--- /dev/null
+++ b/testbed/src/testbed/models/ibug_kde_.py
@@ -0,0 +1,152 @@
+import numpy as np
+from ibug import IBUGWrapper
+from jaxtyping import Float
+from numpy import ndarray
+from sklearn.model_selection import train_test_split
+from sklearn.neighbors import KernelDensity
+from skopt.space import Integer
+from skopt.space import Real
+from xgboost import XGBRegressor
+
+from testbed.models import ProbabilisticModel
+
+_BANDWIDTH_SEARCH_SPACE = np.logspace(-4, 3, 10)
+
+
+class IBugXGBoostKDE(ProbabilisticModel):
+    """
+    IBug wrapper for XGBoost with KDE-based sampling.
+    """
+
+    def __init__(self, k=100, n_estimators=100, max_depth=2, learning_rate=0.1, seed=0):
+        super().__init__(seed)
+        self.model = None
+        self.best_bandwidth = None
+
+        self.k = k
+        self.n_estimators = n_estimators
+        self.max_depth = max_depth
+        self.learning_rate = learning_rate
+
+        # For scaling
+        self.mu = None
+        self.sigma = None
+
+    def fit(
+        self,
+        X: Float[ndarray, "batch x_dim"],
+        y: Float[ndarray, "batch y_dim"],
+    ) -> ProbabilisticModel:
+        """
+        Fit the model to the data.
+ """ + self.mu = np.mean(y, axis=0) + self.sigma = np.std(y, axis=0) + + # Normalize y + y = (y - self.mu) / self.sigma + + if y.shape[1] > 1: + raise ValueError("IBugXGBoost only accepts 1 dimensional y values.") + + y = y[:, 0] + self.k = min(self.k, len(X)) # make sure k is not larger than the number of instances + + X_train, X_val, y_train, y_val = train_test_split( + X, + y, + test_size=0.1, + random_state=self.seed, + ) + + # train GBRT model + self.gbrt_model = XGBRegressor( + n_estimators=self.n_estimators, + max_depth=self.max_depth, + learning_rate=self.learning_rate, + reg_alpha=0, + scale_pos_weight=1, + base_score=0.5, + ).fit(X_train, y_train) + + # extend GBRT model into a probabilistic estimator + self.model = IBUGWrapper(k=self.k).fit( + self.gbrt_model, X_train, y_train, X_val=X_val, y_val=y_val + ) + + _, _, _, y_val_neighbor_vals = self.model.pred_dist(X_val, return_kneighbors=True) + self.best_bandwith = _find_best_bandwith(y_val_neighbor_vals.T, y_val) + return self + + def predict(self, X: Float[ndarray, "batch x_dim"]) -> Float[ndarray, "batch y_dim"]: + """ + Predict the mean for each input. + """ + # predict mean and variance for unseen instances + location, scale = self.model.pred_dist(X) + location = location * self.sigma + self.mu + return location.reshape(-1, 1) + + def sample( + self, X: Float[ndarray, "batch x_dim"], n_samples=10, seed=None + ) -> Float[ndarray, "n_samples batch y_dim"]: + """ + Sample from the probability distribution for each input. + """ + location, scale, _, neighbor_values = self.model.pred_dist(X, return_kneighbors=True) + assert location.shape == (len(X),) + assert scale.shape == (len(X),) + assert neighbor_values.shape == ( + len(X), + self.model.k_, + ) # might be different than our k + + samples = [] + for i in range(len(X)): + kde = KernelDensity( + bandwidth=self.best_bandwith, algorithm="auto", kernel="gaussian" + ) + kde.fit(neighbor_values[i].reshape(-1, 1)) + samples.append(kde.sample(n_samples, random_state=seed)) + + samples = np.array(samples).transpose(1, 0, 2) + samples = samples * self.sigma + self.mu + assert samples.shape == (n_samples, len(X), 1) + return samples + + @staticmethod + def search_space() -> dict: + """ + Return the search space for parameters of the model. 
+ """ + return { + "k": Integer(20, 250), + "n_estimators": Integer(10, 1000), + "learning_rate": Real(0.01, 0.5, prior="log-uniform"), + "max_depth": Integer(1, 100), + } + + +def _find_best_bandwith( + y_samples: Float[ndarray, "n_samples batch"], + y_true: Float[ndarray, "batch"], +) -> float: + batch = y_true.shape[0] + assert y_samples.shape == (y_samples.shape[0], batch) + + best_bandwith = 0 + best_log_prob = -np.inf + + for bandwidth in _BANDWIDTH_SEARCH_SPACE: + log_prob = 0 + for i in range(batch): + y_i = y_samples[:, i].reshape(-1, 1) + kde = KernelDensity(bandwidth=bandwidth, algorithm="auto", kernel="gaussian") + kde.fit(y_i) + log_prob += kde.score(y_true[i].reshape(-1, 1)) + + if log_prob > best_log_prob: + best_log_prob = log_prob + best_bandwith = bandwidth + + return best_bandwith diff --git a/testbed/src/testbed/models/point_prediction_model.py b/testbed/src/testbed/models/point_prediction_model.py index f50ce94a..5f61f9fe 100644 --- a/testbed/src/testbed/models/point_prediction_model.py +++ b/testbed/src/testbed/models/point_prediction_model.py @@ -105,7 +105,7 @@ def search_space() -> dict: return { "n_estimators": Integer(10, 1000), "learning_rate": Real(0.01, 0.5, prior="log-uniform"), - "max_depth": Integer(1, 10), + "max_depth": Integer(1, 100), } diff --git a/testbed/tests/models/test_all_models.py b/testbed/tests/models/test_all_models.py new file mode 100644 index 00000000..add18f67 --- /dev/null +++ b/testbed/tests/models/test_all_models.py @@ -0,0 +1,51 @@ +""" +Contains test for simple sanity checks for the models +""" + +from typing import Type + +import numpy as np +import pytest +from sklearn.metrics import r2_score +from testbed.models import ProbabilisticModel +from testbed.models.ibug_kde_ import IBugXGBoostKDE + + +@pytest.mark.parametrize( + "model_class, params", + [ + (IBugXGBoostKDE, {"k": 10}), + ], +) +def test_all_models_sanity(model_class: Type[ProbabilisticModel], params: dict): + """ + Simple test to check if our wrapper for bayesian optimization works + """ + # add seed + np.random.seed(0) + + n_samples = 1000 + n_pred_samples = 10 + n_features = 1 + n_targets = 1 + + X = np.random.rand(n_samples, n_features) + epsilon = np.random.rand(n_samples, n_targets) * 0.1 + beta = np.random.rand(n_features, n_targets) + + y = X @ beta + epsilon + model = model_class(**params) + model.fit(X, y) + + # Check that everything runs + y_pred = model.predict(X) + y_samples = model.sample(X, n_samples=n_pred_samples) + + assert y_pred.shape == (n_samples, n_targets) + assert y_samples.shape == (n_pred_samples, n_samples, n_targets) + + r2_pred = r2_score(y.flatten(), y_pred.flatten()) + r2_samples = r2_score(y.flatten(), y_samples.mean(axis=0).flatten()) + + assert r2_pred > 0.9 + assert r2_samples > 0.9