IBUG rebuttal #109


Open

wants to merge 14 commits into main
108 changes: 108 additions & 0 deletions testbed/scripts/submit_slurm_job_rebuttal_ginsburg.py
@@ -0,0 +1,108 @@
import os
from pathlib import Path

template = """#!/bin/sh
#
#SBATCH --account=stats
#SBATCH -c 4
#SBATCH --time=11:59:00
#SBATCH --mem-per-cpu=12gb
#SBATCH --export=ALL

module load anaconda

cd /burg/stats/users/ag3750/treeffuser

source .venv/bin/activate
"""

path = Path(__file__).resolve().parent
parent_path = path.parent
main_script_path = parent_path / "src" / "testbed" / "__main__.py"

seeds = [0]
evaluation_mode = "bayes_opt"
split_idx = list(range(10))
model_names = [
# "ngboost",
# "ibug",
# "ibug_kde",
"treeffuser",
# "deep_ensemble",
# "quantile_regression_tree",
# "mc_dropout",
# "card",
"ppm_lightgbm",
"ppm_xgboost",
# "ppm_mlp",
]


datasets = [
"bike",
"boston",
# "communities", # contains NaN
# "energy", # y is 2d
"facebook",
"kin8nm",
# "msd", # very big X (463715, 90)
"naval",
"news",
"power",
"protein",
"superconductor",
# "wave", # very big
"wine",
"yacht",
"movies",
]


def get_cmd(
model,
seed,
split_idx,
dataset,
evaluation_mode,
):
    cmd = (
        f"python {main_script_path}"
        f" --models {model}"
        f" --seed {seed}"
        f" --split_idx {split_idx}"
        f" --datasets {dataset}"
        f" --evaluation_mode {evaluation_mode}"
        " --wandb_project ale-ppm"
        " --n_iter_bayes_opt 25"
    )
    return cmd


def get_slurm_script(
model,
seed,
split_idx,
dataset,
evaluation_mode,
):
cmd = get_cmd(model, seed, split_idx, dataset, evaluation_mode)
return f"{template}\n{cmd}"


jobs_scripts_path = Path("jobs_scripts")
jobs_scripts_path.mkdir(parents=True, exist_ok=True)

# Build one job script per (model, seed, split, dataset) combination.
scripts = []
for model in model_names:
for seed in seeds:
for split in split_idx:
for dataset in datasets:
script = get_slurm_script(model, seed, split, dataset, evaluation_mode)
scripts.append(script)

# Write each script to disk and submit it with sbatch.
for i, script in enumerate(scripts):
slurm_script_path = jobs_scripts_path / f"job_{i}.sh"
with slurm_script_path.open("w") as f:
f.write(script)
cmd = f"sbatch {slurm_script_path}"
os.system(cmd) # noqa S605
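One note on the submission loop: os.system runs sbatch through the shell and discards its exit status, so a failed submission passes silently. A minimal sketch of a stricter alternative using subprocess.run (the error handling is an assumption, not part of this PR):

import subprocess

# Sketch: surface a non-zero sbatch exit code instead of ignoring it.
# slurm_script_path is the job file written in the loop above.
result = subprocess.run(
    ["sbatch", str(slurm_script_path)],
    capture_output=True,
    text=True,
)
if result.returncode != 0:
    raise RuntimeError(f"sbatch failed for {slurm_script_path}: {result.stderr.strip()}")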
5 changes: 5 additions & 0 deletions testbed/src/testbed/__main__.py
@@ -112,6 +112,11 @@ def get_model(

return IBugXGBoost

if model_name == "ibug_kde":
from testbed.models.ibug_kde_ import IBugXGBoostKDE

return IBugXGBoostKDE

if model_name == "drf":
from testbed.models.drf_ import DistributionalRandomForest

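With this branch added to get_model, the new baseline is selectable from the command line like any other model; for example (flags taken from the submission script above, values illustrative):

python testbed/src/testbed/__main__.py --models ibug_kde --seed 0 --split_idx 0 --datasets kin8nm --evaluation_mode bayes_opt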
41 changes: 41 additions & 0 deletions testbed/src/testbed/data/uci/kin8nm/preprocess.py
@@ -0,0 +1,41 @@
import argparse
import zipfile
from pathlib import Path

import numpy as np
import pandas as pd
from testbed.data.utils import _assign_k_splits


def main(path_raw_dataset_dir: Path):
    # Unzip the raw data archive and read the extracted CSV
    raw_data_path = path_raw_dataset_dir / "data.zip"
with zipfile.ZipFile(raw_data_path, "r") as zip_ref:
zip_ref.extractall(path_raw_dataset_dir)
raw_data_path = path_raw_dataset_dir / "data.csv"

df = pd.read_csv(raw_data_path)
y = df["y"].values
x = df.drop(columns=["y"]).values
categorical = []

k_fold_splits = _assign_k_splits(x.shape[0], 10, 0)

    # Save preprocessed data (np.save pickles the dict; load with allow_pickle=True)
np.save(
path_raw_dataset_dir.parent / "data.npy",
{
"x": x,
"y": y,
"categorical": categorical,
"k_fold_splits": k_fold_splits,
},
)


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("path", type=Path)
args = parser.parse_args()
main(args.path)
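Because np.save is handed a dict, NumPy stores it as a pickled 0-d object array; a quick sketch of reading it back (path illustrative):

import numpy as np

# allow_pickle is required for pickled objects, and .item() unwraps the 0-d array.
data = np.load("data.npy", allow_pickle=True).item()
x, y = data["x"], data["y"]
splits = data["k_fold_splits"]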
Binary file added testbed/src/testbed/data/uci/kin8nm/raw/data.zip
Binary file not shown.
14 changes: 12 additions & 2 deletions testbed/src/testbed/models/ibug_.py
@@ -4,6 +4,7 @@
from numpy import ndarray
from sklearn.model_selection import train_test_split
from skopt.space import Integer
+from skopt.space import Real
from xgboost import XGBRegressor

from testbed.models import ProbabilisticModel
@@ -14,10 +15,13 @@ class IBugXGBoost(
IBug wrapper for XGBoost.
"""

-    def __init__(self, k=100, seed=0):
+    def __init__(self, k=100, n_estimators=100, max_depth=2, learning_rate=0.1, seed=0):
super().__init__(seed)
self.model = None
self.k = k
+        self.n_estimators = n_estimators
+        self.max_depth = max_depth
+        self.learning_rate = learning_rate

def fit(
self,
@@ -42,6 +46,9 @@ def fit(

# train GBRT model
self.gbrt_model = XGBRegressor(
+            n_estimators=self.n_estimators,
+            max_depth=self.max_depth,
+            learning_rate=self.learning_rate,
reg_alpha=0,
scale_pos_weight=1,
base_score=0.5,
@@ -77,5 +84,8 @@ def search_space() -> dict:
Return the search space for parameters of the model.
"""
return {
"k": Integer(20, 150),
"k": Integer(20, 250), # ibug runs CV on k by default
"n_estimators": Integer(10, 1000),
"learning_rate": Real(0.01, 0.5, prior="log-uniform"),
"max_depth": Integer(1, 100),
}
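These skopt dimensions map one-to-one onto the new constructor arguments; a sketch of drawing one random configuration from the space (how the testbed's Bayesian optimization actually consumes it is not shown in this diff):

# Each skopt Dimension exposes rvs(); the sampled names match the __init__ kwargs.
space = IBugXGBoost.search_space()
params = {name: dim.rvs(n_samples=1, random_state=0)[0] for name, dim in space.items()}
model = IBugXGBoost(**params, seed=0)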
152 changes: 152 additions & 0 deletions testbed/src/testbed/models/ibug_kde_.py
@@ -0,0 +1,152 @@
import numpy as np
from ibug import IBUGWrapper
from jaxtyping import Float
from numpy import ndarray
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KernelDensity
from skopt.space import Integer
from skopt.space import Real
from xgboost import XGBRegressor

from testbed.models import ProbabilisticModel

_BANDWIDTH_SEARCH_SPACE = np.logspace(-4, 3, 10)


class IBugXGBoostKDE(ProbabilisticModel):
    """
    IBug wrapper for XGBoost that samples via a Gaussian KDE fit to each
    instance's k nearest training targets.
    """

def __init__(self, k=100, n_estimators=100, max_depth=2, learning_rate=0.1, seed=0):
super().__init__(seed)
self.model = None
        self.best_bandwidth = None

self.k = k
self.n_estimators = n_estimators
self.max_depth = max_depth
self.learning_rate = learning_rate

# For scaling
self.mu = None
self.sigma = None

def fit(
self,
X: Float[ndarray, "batch x_dim"],
y: Float[ndarray, "batch y_dim"],
) -> ProbabilisticModel:
"""
Fit the model to the data.
"""
self.mu = np.mean(y, axis=0)
self.sigma = np.std(y, axis=0)

# Normalize y
y = (y - self.mu) / self.sigma

        if y.shape[1] > 1:
            raise ValueError("IBugXGBoostKDE only accepts 1-dimensional y values.")

y = y[:, 0]
self.k = min(self.k, len(X)) # make sure k is not larger than the number of instances

X_train, X_val, y_train, y_val = train_test_split(
X,
y,
test_size=0.1,
random_state=self.seed,
)

# train GBRT model
self.gbrt_model = XGBRegressor(
n_estimators=self.n_estimators,
max_depth=self.max_depth,
learning_rate=self.learning_rate,
reg_alpha=0,
scale_pos_weight=1,
base_score=0.5,
).fit(X_train, y_train)

# extend GBRT model into a probabilistic estimator
self.model = IBUGWrapper(k=self.k).fit(
self.gbrt_model, X_train, y_train, X_val=X_val, y_val=y_val
)

        _, _, _, y_val_neighbor_vals = self.model.pred_dist(X_val, return_kneighbors=True)
        self.best_bandwidth = _find_best_bandwidth(y_val_neighbor_vals.T, y_val)
return self

def predict(self, X: Float[ndarray, "batch x_dim"]) -> Float[ndarray, "batch y_dim"]:
"""
Predict the mean for each input.
"""
# predict mean and variance for unseen instances
location, scale = self.model.pred_dist(X)
location = location * self.sigma + self.mu
return location.reshape(-1, 1)

def sample(
self, X: Float[ndarray, "batch x_dim"], n_samples=10, seed=None
) -> Float[ndarray, "n_samples batch y_dim"]:
"""
Sample from the probability distribution for each input.
"""
location, scale, _, neighbor_values = self.model.pred_dist(X, return_kneighbors=True)
assert location.shape == (len(X),)
assert scale.shape == (len(X),)
        assert neighbor_values.shape == (
            len(X),
            self.model.k_,
        )  # k_ may differ from the k we passed in; ibug tunes k during fit

samples = []
for i in range(len(X)):
kde = KernelDensity(
                bandwidth=self.best_bandwidth, algorithm="auto", kernel="gaussian"
)
kde.fit(neighbor_values[i].reshape(-1, 1))
samples.append(kde.sample(n_samples, random_state=seed))

samples = np.array(samples).transpose(1, 0, 2)
samples = samples * self.sigma + self.mu
assert samples.shape == (n_samples, len(X), 1)
return samples

@staticmethod
def search_space() -> dict:
"""
Return the search space for parameters of the model.
"""
return {
"k": Integer(20, 250),
"n_estimators": Integer(10, 1000),
"learning_rate": Real(0.01, 0.5, prior="log-uniform"),
"max_depth": Integer(1, 100),
}


def _find_best_bandwidth(
    y_samples: Float[ndarray, "n_samples batch"],
    y_true: Float[ndarray, "batch"],
) -> float:
    """Grid-search the KDE bandwidth that maximizes the total held-out log-likelihood."""
    batch = y_true.shape[0]
    assert y_samples.shape == (y_samples.shape[0], batch)

    best_bandwidth = 0
    best_log_prob = -np.inf

    for bandwidth in _BANDWIDTH_SEARCH_SPACE:
        # Total log-likelihood of the true values under one KDE per instance.
        log_prob = 0
        for i in range(batch):
            y_i = y_samples[:, i].reshape(-1, 1)
            kde = KernelDensity(bandwidth=bandwidth, algorithm="auto", kernel="gaussian")
            kde.fit(y_i)
            log_prob += kde.score(y_true[i].reshape(-1, 1))

        if log_prob > best_log_prob:
            best_log_prob = log_prob
            best_bandwidth = bandwidth

    return best_bandwidth
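For reference, a minimal end-to-end sketch of IBugXGBoostKDE on synthetic data (the data and hyperparameters are made up; shapes follow the assertions in sample):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 8))
y = (X[:, 0] + 0.1 * rng.normal(size=500)).reshape(-1, 1)  # one-column 2-d y

model = IBugXGBoostKDE(k=50, n_estimators=200, max_depth=3, learning_rate=0.1, seed=0)
model.fit(X, y)

mean = model.predict(X[:5])                 # shape (5, 1)
draws = model.sample(X[:5], n_samples=100)  # shape (100, 5, 1)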
2 changes: 1 addition & 1 deletion testbed/src/testbed/models/point_prediction_model.py
@@ -105,7 +105,7 @@ def search_space() -> dict:
return {
"n_estimators": Integer(10, 1000),
"learning_rate": Real(0.01, 0.5, prior="log-uniform"),
"max_depth": Integer(1, 10),
"max_depth": Integer(1, 100),
}

