diff --git a/testbed/scripts/submit_slurm_job_rebuttal_ginsburg.py b/testbed/scripts/submit_slurm_job_rebuttal_ginsburg.py
new file mode 100644
index 00000000..50c58d40
--- /dev/null
+++ b/testbed/scripts/submit_slurm_job_rebuttal_ginsburg.py
@@ -0,0 +1,108 @@
+import os
+from pathlib import Path
+
+template = """#!/bin/sh
+#
+#SBATCH --account=stats
+#SBATCH -c 4
+#SBATCH --time=11:59:00
+#SBATCH --mem-per-cpu=12gb
+#SBATCH --export=ALL
+
+module load anaconda
+
+cd /burg/stats/users/ag3750/treeffuser
+
+source .venv/bin/activate
+"""
+
+path = Path(__file__).resolve().parent
+parent_path = path.parent
+main_script_path = parent_path / "src" / "testbed" / "__main__.py"
+
+seeds = [0]
+evaluation_mode = "bayes_opt"
+split_idx = list(range(10))
+model_names = [
+    # "ngboost",
+    # "ibug",
+    # "ibug_kde",
+    "treeffuser",
+    # "deep_ensemble",
+    # "quantile_regression_tree",
+    # "mc_dropout",
+    # "card",
+    "ppm_lightgbm",
+    "ppm_xgboost",
+    # "ppm_mlp",
+]
+
+
+datasets = [
+    "bike",
+    "boston",
+    # "communities",  # contains NaN
+    # "energy",  # y is 2d
+    "facebook",
+    "kin8nm",
+    # "msd",  # very big X (463715, 90)
+    "naval",
+    "news",
+    "power",
+    "protein",
+    "superconductor",
+    # "wave",  # very big
+    "wine",
+    "yacht",
+    "movies",
+]
+
+
+def get_cmd(
+    model,
+    seed,
+    split_idx,
+    dataset,
+    evaluation_mode,
+):
+    tmp = (
+        f"python {main_script_path}"
+        f" --models {model}"
+        f" --seed {seed}"
+        f" --split_idx {split_idx}"
+        f" --datasets {dataset}"
+        f" --evaluation_mode {evaluation_mode}"
+        f" --wandb_project ale-ppm"
+        f" --n_iter_bayes_opt 25"
+    )
+    return tmp
+
+
+def get_slurm_script(
+    model,
+    seed,
+    split_idx,
+    dataset,
+    evaluation_mode,
+):
+    cmd = get_cmd(model, seed, split_idx, dataset, evaluation_mode)
+    return f"{template}\n{cmd}"
+
+
+jobs_scripts_path = Path("jobs_scripts")
+jobs_scripts_path.mkdir(parents=True, exist_ok=True)
+
+scripts = []
+for model in model_names:
+    for seed in seeds:
+        for split in split_idx:
+            for dataset in datasets:
+                script = get_slurm_script(model, seed, split, dataset, evaluation_mode)
+                scripts.append(script)
+
+for i, script in enumerate(scripts):
+    slurm_script_path = jobs_scripts_path / f"job_{i}.sh"
+    with slurm_script_path.open("w") as f:
+        f.write(script)
+    cmd = f"sbatch {slurm_script_path}"
+    os.system(cmd)  # noqa S605
diff --git a/testbed/src/testbed/__main__.py b/testbed/src/testbed/__main__.py
index 9e7d279d..d5002179 100644
--- a/testbed/src/testbed/__main__.py
+++ b/testbed/src/testbed/__main__.py
@@ -112,6 +112,11 @@ def get_model(
 
         return IBugXGBoost
 
+    if model_name == "ibug_kde":
+        from testbed.models.ibug_kde_ import IBugXGBoostKDE
+
+        return IBugXGBoostKDE
+
     if model_name == "drf":
         from testbed.models.drf_ import DistributionalRandomForest
 
diff --git a/testbed/src/testbed/data/uci/kin8nm/preprocess.py b/testbed/src/testbed/data/uci/kin8nm/preprocess.py
new file mode 100644
index 00000000..80ee1020
--- /dev/null
+++ b/testbed/src/testbed/data/uci/kin8nm/preprocess.py
@@ -0,0 +1,41 @@
+import argparse
+import zipfile
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from testbed.data.utils import _assign_k_splits
+
+
+def main(path_raw_dataset_dir: Path):
+    # Process the raw data archive data.zip
+    raw_data_path = path_raw_dataset_dir / "data.zip"
+    # unzip the data
+    with zipfile.ZipFile(raw_data_path, "r") as zip_ref:
+        zip_ref.extractall(path_raw_dataset_dir)
+    raw_data_path = path_raw_dataset_dir / "data.csv"
+
+    df = pd.read_csv(raw_data_path)
+    y = df["y"].values
+    x = df.drop(columns=["y"]).values
+
+    categorical = []
+
+    k_fold_splits = _assign_k_splits(x.shape[0], 10, 0)
+
+    # Save preprocessed data
+    np.save(
+        path_raw_dataset_dir.parent / "data.npy",
+        {
+            "x": x,
+            "y": y,
+            "categorical": categorical,
+            "k_fold_splits": k_fold_splits,
+        },
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("path", type=Path)
+    args = parser.parse_args()
+    main(args.path)
diff --git a/testbed/src/testbed/data/uci/kin8nm/raw/data.zip b/testbed/src/testbed/data/uci/kin8nm/raw/data.zip
new file mode 100644
index 00000000..d61d9b2c
Binary files /dev/null and b/testbed/src/testbed/data/uci/kin8nm/raw/data.zip differ
diff --git a/testbed/src/testbed/models/ibug_.py b/testbed/src/testbed/models/ibug_.py
index 4ec859bf..4198bb97 100644
--- a/testbed/src/testbed/models/ibug_.py
+++ b/testbed/src/testbed/models/ibug_.py
@@ -4,6 +4,7 @@
 from numpy import ndarray
 from sklearn.model_selection import train_test_split
 from skopt.space import Integer
+from skopt.space import Real
 from xgboost import XGBRegressor
 
 from testbed.models import ProbabilisticModel
@@ -14,10 +15,13 @@ class IBugXGBoost(ProbabilisticModel):
     IBug wrapper for XGBoost.
     """
 
-    def __init__(self, k=100, seed=0):
+    def __init__(self, k=100, n_estimators=100, max_depth=2, learning_rate=0.1, seed=0):
         super().__init__(seed)
         self.model = None
         self.k = k
+        self.n_estimators = n_estimators
+        self.max_depth = max_depth
+        self.learning_rate = learning_rate
 
     def fit(
         self,
@@ -42,6 +46,9 @@ def fit(
 
         # train GBRT model
         self.gbrt_model = XGBRegressor(
+            n_estimators=self.n_estimators,
+            max_depth=self.max_depth,
+            learning_rate=self.learning_rate,
             reg_alpha=0,
             scale_pos_weight=1,
             base_score=0.5,
@@ -77,5 +84,8 @@ def search_space() -> dict:
         Return the search space for parameters of the model.
         """
         return {
-            "k": Integer(20, 150),
+            "k": Integer(20, 250),  # ibug runs CV on k by default
+            "n_estimators": Integer(10, 1000),
+            "learning_rate": Real(0.01, 0.5, prior="log-uniform"),
+            "max_depth": Integer(1, 100),
         }
diff --git a/testbed/src/testbed/models/ibug_kde_.py b/testbed/src/testbed/models/ibug_kde_.py
new file mode 100644
index 00000000..10bb0bdb
--- /dev/null
+++ b/testbed/src/testbed/models/ibug_kde_.py
@@ -0,0 +1,152 @@
+import numpy as np
+from ibug import IBUGWrapper
+from jaxtyping import Float
+from numpy import ndarray
+from sklearn.model_selection import train_test_split
+from sklearn.neighbors import KernelDensity
+from skopt.space import Integer
+from skopt.space import Real
+from xgboost import XGBRegressor
+
+from testbed.models import ProbabilisticModel
+
+_BANDWIDTH_SEARCH_SPACE = np.logspace(-4, 3, 10)
+
+
+class IBugXGBoostKDE(ProbabilisticModel):
+    """
+    IBug wrapper for XGBoost with KDE-based sampling.
+    """
+
+    def __init__(self, k=100, n_estimators=100, max_depth=2, learning_rate=0.1, seed=0):
+        super().__init__(seed)
+        self.model = None
+        self.best_bandwidth = None
+
+        self.k = k
+        self.n_estimators = n_estimators
+        self.max_depth = max_depth
+        self.learning_rate = learning_rate
+
+        # For scaling
+        self.mu = None
+        self.sigma = None
+
+    def fit(
+        self,
+        X: Float[ndarray, "batch x_dim"],
+        y: Float[ndarray, "batch y_dim"],
+    ) -> ProbabilisticModel:
+        """
+        Fit the model to the data.
+ """ + self.mu = np.mean(y, axis=0) + self.sigma = np.std(y, axis=0) + + # Normalize y + y = (y - self.mu) / self.sigma + + if y.shape[1] > 1: + raise ValueError("IBugXGBoost only accepts 1 dimensional y values.") + + y = y[:, 0] + self.k = min(self.k, len(X)) # make sure k is not larger than the number of instances + + X_train, X_val, y_train, y_val = train_test_split( + X, + y, + test_size=0.1, + random_state=self.seed, + ) + + # train GBRT model + self.gbrt_model = XGBRegressor( + n_estimators=self.n_estimators, + max_depth=self.max_depth, + learning_rate=self.learning_rate, + reg_alpha=0, + scale_pos_weight=1, + base_score=0.5, + ).fit(X_train, y_train) + + # extend GBRT model into a probabilistic estimator + self.model = IBUGWrapper(k=self.k).fit( + self.gbrt_model, X_train, y_train, X_val=X_val, y_val=y_val + ) + + _, _, _, y_val_neighbor_vals = self.model.pred_dist(X_val, return_kneighbors=True) + self.best_bandwith = _find_best_bandwith(y_val_neighbor_vals.T, y_val) + return self + + def predict(self, X: Float[ndarray, "batch x_dim"]) -> Float[ndarray, "batch y_dim"]: + """ + Predict the mean for each input. + """ + # predict mean and variance for unseen instances + location, scale = self.model.pred_dist(X) + location = location * self.sigma + self.mu + return location.reshape(-1, 1) + + def sample( + self, X: Float[ndarray, "batch x_dim"], n_samples=10, seed=None + ) -> Float[ndarray, "n_samples batch y_dim"]: + """ + Sample from the probability distribution for each input. + """ + location, scale, _, neighbor_values = self.model.pred_dist(X, return_kneighbors=True) + assert location.shape == (len(X),) + assert scale.shape == (len(X),) + assert neighbor_values.shape == ( + len(X), + self.model.k_, + ) # might be different than our k + + samples = [] + for i in range(len(X)): + kde = KernelDensity( + bandwidth=self.best_bandwith, algorithm="auto", kernel="gaussian" + ) + kde.fit(neighbor_values[i].reshape(-1, 1)) + samples.append(kde.sample(n_samples, random_state=seed)) + + samples = np.array(samples).transpose(1, 0, 2) + samples = samples * self.sigma + self.mu + assert samples.shape == (n_samples, len(X), 1) + return samples + + @staticmethod + def search_space() -> dict: + """ + Return the search space for parameters of the model. 
+ """ + return { + "k": Integer(20, 250), + "n_estimators": Integer(10, 1000), + "learning_rate": Real(0.01, 0.5, prior="log-uniform"), + "max_depth": Integer(1, 100), + } + + +def _find_best_bandwith( + y_samples: Float[ndarray, "n_samples batch"], + y_true: Float[ndarray, "batch"], +) -> float: + batch = y_true.shape[0] + assert y_samples.shape == (y_samples.shape[0], batch) + + best_bandwith = 0 + best_log_prob = -np.inf + + for bandwidth in _BANDWIDTH_SEARCH_SPACE: + log_prob = 0 + for i in range(batch): + y_i = y_samples[:, i].reshape(-1, 1) + kde = KernelDensity(bandwidth=bandwidth, algorithm="auto", kernel="gaussian") + kde.fit(y_i) + log_prob += kde.score(y_true[i].reshape(-1, 1)) + + if log_prob > best_log_prob: + best_log_prob = log_prob + best_bandwith = bandwidth + + return best_bandwith diff --git a/testbed/src/testbed/models/point_prediction_model.py b/testbed/src/testbed/models/point_prediction_model.py index f50ce94a..5f61f9fe 100644 --- a/testbed/src/testbed/models/point_prediction_model.py +++ b/testbed/src/testbed/models/point_prediction_model.py @@ -105,7 +105,7 @@ def search_space() -> dict: return { "n_estimators": Integer(10, 1000), "learning_rate": Real(0.01, 0.5, prior="log-uniform"), - "max_depth": Integer(1, 10), + "max_depth": Integer(1, 100), } diff --git a/testbed/tests/models/test_all_models.py b/testbed/tests/models/test_all_models.py new file mode 100644 index 00000000..add18f67 --- /dev/null +++ b/testbed/tests/models/test_all_models.py @@ -0,0 +1,51 @@ +""" +Contains test for simple sanity checks for the models +""" + +from typing import Type + +import numpy as np +import pytest +from sklearn.metrics import r2_score +from testbed.models import ProbabilisticModel +from testbed.models.ibug_kde_ import IBugXGBoostKDE + + +@pytest.mark.parametrize( + "model_class, params", + [ + (IBugXGBoostKDE, {"k": 10}), + ], +) +def test_all_models_sanity(model_class: Type[ProbabilisticModel], params: dict): + """ + Simple test to check if our wrapper for bayesian optimization works + """ + # add seed + np.random.seed(0) + + n_samples = 1000 + n_pred_samples = 10 + n_features = 1 + n_targets = 1 + + X = np.random.rand(n_samples, n_features) + epsilon = np.random.rand(n_samples, n_targets) * 0.1 + beta = np.random.rand(n_features, n_targets) + + y = X @ beta + epsilon + model = model_class(**params) + model.fit(X, y) + + # Check that everything runs + y_pred = model.predict(X) + y_samples = model.sample(X, n_samples=n_pred_samples) + + assert y_pred.shape == (n_samples, n_targets) + assert y_samples.shape == (n_pred_samples, n_samples, n_targets) + + r2_pred = r2_score(y.flatten(), y_pred.flatten()) + r2_samples = r2_score(y.flatten(), y_samples.mean(axis=0).flatten()) + + assert r2_pred > 0.9 + assert r2_samples > 0.9