diff --git a/elfi/examples/example_bolfi_invocation.py b/elfi/examples/example_bolfi_invocation.py new file mode 100644 index 00000000..60f71f32 --- /dev/null +++ b/elfi/examples/example_bolfi_invocation.py @@ -0,0 +1,314 @@ +import elfi +import matplotlib +import matplotlib.pyplot as plt +import numpy as np +import scipy.stats as ss + +from elfi import BOLFI +from matplotlib.animation import FuncAnimation, FFMpegWriter + +seed = 0 + +# Number of minimum samples. +n_low = 1 +# Number of maximum samples. +n = 5 +np.random.seed(seed) +gauss_samples = ss.norm().rvs(size=n) +""" +t_plus_range = [0.36, 0.44] +t_plus = np.linspace(*t_plus_range, 200) +model = lambda t_plus: t_plus**2 +x_range = [0.36, 0.44] +y_range = [-0.8, 1.5] +canvas = np.meshgrid(np.linspace(*x_range, 200), + np.linspace(*reversed(y_range), 200)) +t_plus_samples = t_plus_range.copy() + +fit_spline = [] +fit_spline_uncertainty = [] +fit_acq = [] +fit_discrepancy = [] +fit_acq_t_plus = [] + +prior = elfi.Prior('norm', 0.4, 0.02**2, name='t_+') +elfi_simulator = elfi.Simulator( + lambda *args, **_: model(args[0]) + ss.norm().rvs(size=1)[0], + prior, observed=model(0.415) +) +processed_data = elfi.Summary(lambda data: data, elfi_simulator) +distance = elfi.Distance('euclidean', processed_data) +log_distance = elfi.Operation(np.log, distance) +bolfi = elfi.BOLFI( + log_distance, initial_evidence=10, bounds={'t_+': t_plus_range}, seed=seed +) +for i in range(20): + bolfi.fit(n_evidence=10 + i + 1, bar=True) + #plt.plot(t_plus, bolfi.target_model.predict(t_plus, noiseless=True)[0]) + #plt.show() +""" + +t_plus_range = [0.36, 0.44] +t_plus = np.linspace(*t_plus_range, 200) + + +def model(t_plus): + return t_plus**2 + + +x_range = [0.358, 0.442] +t_plus_eval = np.linspace(*x_range, 200) +y_range = [-0.012, 0.052] +canvas = np.meshgrid(np.linspace(*x_range, 200), + np.linspace(*reversed(y_range), 200)) +t_plus_samples = t_plus_range.copy() + +fit_spline = [] +fit_spline_uncertainty = [] +fit_acq = [] +fit_discrepancy = [] +fit_acq_t_plus = [] + +prior = elfi.Prior('norm', 0.4, 0.02, name='t_+') +elfi_simulator = elfi.Simulator( + lambda *args, **_: model(args[0]) + 0.001 * ss.norm().rvs(size=1)[0], + prior, observed=model(0.415) +) +processed_data = elfi.Summary(lambda data: data, elfi_simulator) +distance = elfi.Distance('euclidean', processed_data) +# log_distance = elfi.Operation(np.log, distance) + + +class Animate_GP: + def __init__(self, fig, ax, initial_evidence=0, frames=None): + self.bolfi = elfi.BOLFI( + distance, + initial_evidence=initial_evidence, + bounds={'t_+': t_plus_range}, + seed=seed, + batch_size=10, + ) + self.ax = ax + self.ax.set_xlim(*x_range) + self.ax.set_ylim(*y_range) + self.ax.xaxis.set_ticks([]) + self.ax.yaxis.set_ticks([]) + self.ax.set_xlabel("model parameter") + self.ax.set_ylabel("discrepancy") + self.initial_evidence = initial_evidence + self.line, = self.ax.plot([], []) + self.acq, = self.ax.plot([], []) + self.img = self.ax.imshow( + 0 * canvas[0], + cmap='gist_yarg', + vmin=0, + vmax=1, + aspect='auto', + extent=[*x_range, *y_range] + ) + self.pts, = self.ax.plot( + [], [], lw=0, marker='o', + color=plt.rcParams['axes.prop_cycle'].by_key()['color'][0] + ) + trans = matplotlib.transforms.ScaledTranslation( + 240/72, -10/72, fig.dpi_scale_trans + ) + self.txt = self.ax.text( + 0.0, 1.0, str(self.initial_evidence) + " samples ", + transform=ax.transAxes + trans, verticalalignment='top' + ) + self.frames = frames + + def __call__(self, i): + if self.frames is not None: + n_evidence = 
self.initial_evidence + self.frames[i] + else: + n_evidence = self.initial_evidence + i + self.bolfi.fit(n_evidence=n_evidence) + mean, var = self.bolfi.target_model.predict( + t_plus_eval, noiseless=True + ) + self.line.set_data(t_plus_eval, mean) + eta_squared = 2 * np.log( + n_evidence**4 * np.pi**2 / 0.3 + ) + self.acq.set_data(t_plus_eval, mean - np.sqrt(eta_squared * var)) + unc = ( + np.exp(-(canvas[1] - mean.T)**2 / (2 * var.T)) + / np.sqrt(2 * np.pi * var.T) + ) + self.ax.imshow( + unc, + cmap='gist_yarg', + vmin=0, + vmax=np.max(unc), + aspect='auto', + extent=[*x_range, *y_range] + ) + self.pts.set_data( + self.bolfi.target_model._gp.X, self.bolfi.target_model._gp.Y + ) + self.txt.set_text(str(n_evidence) + " samples") + return self.line, self.acq, self.img, self.pts, self.txt + + +initial_evidence = 3 +frames = np.array([3, 4, 13, 14]) - initial_evidence + +fig, ax = plt.subplots(figsize=(6, 4)) +anim_gp = Animate_GP(fig, ax, initial_evidence=initial_evidence, frames=frames) +anim = FuncAnimation( + fig, anim_gp, frames=4, interval=200, blit=False, repeat=False +) + +bolfi = elfi.BOLFI( + distance, + initial_evidence=initial_evidence, + bounds={'t_+': t_plus_range}, + seed=seed, + batch_size=10, +) + +likelihood_range = [0.405, 0.425] +likelihood_eval = np.linspace(*likelihood_range, 200) +gps_at_each_frame = [] + +fig_static, axes = plt.subplots(figsize=(6, 4), nrows=2, ncols=2) + +for i, ax in enumerate(axes.flatten()): + ax.xaxis.set_ticks([]) + ax.yaxis.set_ticks([]) + ax.set_xlim(*x_range) + ax.set_ylim(*y_range) + if frames is not None: + n_evidence = initial_evidence + frames[i] + else: + n_evidence = initial_evidence + i + bolfi.fit(n_evidence=n_evidence) + trans = matplotlib.transforms.ScaledTranslation( + 126/72, -5/72, fig.dpi_scale_trans + ) + mean, var = bolfi.target_model.predict(t_plus_eval, noiseless=True) + gps_at_each_frame.append( + bolfi.target_model.predict(likelihood_eval, noiseless=True) + ) + ax.plot(t_plus_eval, mean) + eta_squared = 2 * np.log( + n_evidence**4 * np.pi**2 / 0.3 + ) + ax.plot(t_plus_eval, mean - np.sqrt(eta_squared * var)) + unc = ( + np.exp(-(canvas[1] - mean.T)**2 / (2 * var.T)) + / np.sqrt(2 * np.pi * var.T) + ) + ax.imshow( + unc, + cmap='gist_yarg', + vmin=0, + vmax=np.max(unc), + aspect='auto', + extent=[*x_range, *y_range] + ) + ax.scatter( + bolfi.target_model._gp.X, bolfi.target_model._gp.Y + ) + ax.text( + 0.0, 1.0, str(n_evidence) + " samples ", + transform=ax.transAxes + trans, + verticalalignment='top', horizontalalignment='right' + ) +fig_static.supxlabel(r"Model parameter $\theta$") +fig_static.supylabel("Model-data distance") +fig_static.tight_layout() + +posterior = bolfi.extract_posterior() +threshold = posterior.threshold +likelihood_yrange = [-0.0012, 0.0082] +likelihood_canvas = np.meshgrid( + np.linspace(*likelihood_range, 200), + np.linspace(*reversed(likelihood_yrange), 200) +) +mean, var = bolfi.target_model.predict(likelihood_eval, noiseless=True) +likelihood = ss.norm().cdf((threshold - mean) / np.sqrt(var)) +normalizing = np.sum( + 0.5 + * (likelihood[:-1] + likelihood[1:]) + * (t_plus_eval[1:] - t_plus_eval[:-1]) +) + +fig_int, (ax_gp, ax_int) = plt.subplots(figsize=(6, 4), nrows=2) +ax_gp.set_xlim(*likelihood_range) +ax_int.set_xlim(*likelihood_range) +ax_gp.xaxis.set_ticks([]) +ax_int.xaxis.set_ticks([]) +ax_gp.yaxis.set_ticks([]) +ax_int.yaxis.set_ticks([]) +ax_gp.set_ylabel(r"$\log||y_i(\theta)-y_i^\star||$") +ax_int.set_ylabel(r"$L_K(\theta)$") +ax_int.set_xlabel(r"Model parameter $\theta$") +trans_int = 
matplotlib.transforms.ScaledTranslation( + 10/72, -5/72, fig_int.dpi_scale_trans +) +ax_gp.text(0.0, 1.0, '(a)', transform=ax_gp.transAxes + trans_int, + verticalalignment='top', fontsize=20) +ax_int.text(0.0, 1.0, '(b)', transform=ax_int.transAxes + trans_int, + verticalalignment='top', fontsize=20) +trans_samples = matplotlib.transforms.ScaledTranslation( + 162/72, -10/72, fig.dpi_scale_trans +) +ax_gp.text( + 0.0, 1.0, str(n_evidence) + " samples ", + transform=ax_gp.transAxes + trans_samples, verticalalignment='top' +) +ax_gp.plot(likelihood_eval, mean) +ax_gp.plot(likelihood_eval, mean - np.sqrt(eta_squared * var)) +ax_gp.plot(likelihood_eval, [threshold] * len(likelihood_eval)) +ax_gp.scatter( + bolfi.target_model._gp.X, bolfi.target_model._gp.Y +) +truncated_unc = ( + np.exp(-(likelihood_canvas[1] - mean.T)**2 / (2 * var.T)) + / np.sqrt(2 * np.pi * var.T) +) / normalizing * (likelihood_canvas[1] < threshold) +ax_gp.imshow( + truncated_unc, + cmap='gist_yarg', + vmin=0, + vmax=np.max(truncated_unc), + aspect='auto', + extent=[*likelihood_range, *likelihood_yrange] +) +ax_int.plot( + likelihood_eval, + likelihood / normalizing, + color=plt.rcParams['axes.prop_cycle'].by_key()['color'][4] +) + +fig_int.tight_layout() + +eps_min = -0.005 +eps_max = 0.04 +threshold_eval = np.linspace(eps_min, eps_max, 200) +norm = matplotlib.colors.Normalize(eps_min, eps_max) +cmap = plt.get_cmap('viridis') +fig_eps, ax_eps = plt.subplots( + figsize=(4 * 2**0.5, 4), constrained_layout=True) +# mean, var = gps_at_each_frame[2] +for eps in threshold_eval: + likelihood_eps = ss.norm().cdf((eps - mean) / np.sqrt(var)) + normalizing_eps = np.sum( + 0.5 + * (likelihood_eps[:-1] + likelihood_eps[1:]) + * (t_plus_eval[1:] - t_plus_eval[:-1]) + ) + ax_eps.plot( + likelihood_eval, + likelihood_eps / normalizing_eps, + color=cmap(norm(eps)) + ) +ax_eps.set_title("Normalized Likelihoods") +fig_eps.colorbar(matplotlib.cm.ScalarMappable( + norm=norm, cmap=cmap +), ax=ax_eps, label="threshold") + +plt.show() diff --git a/elfi/examples/example_gpytorchbolfi_invocation.py b/elfi/examples/example_gpytorchbolfi_invocation.py new file mode 100644 index 00000000..6b744094 --- /dev/null +++ b/elfi/examples/example_gpytorchbolfi_invocation.py @@ -0,0 +1,321 @@ +import torch +# import pyro +import elfi +import matplotlib +import matplotlib.pyplot as plt +import numpy as np +import scipy.stats as ss + +from elfi.methods.inference.gpytorch_bolfi import GPyTorchBOLFI +from copy import deepcopy +from matplotlib.animation import FuncAnimation, FFMpegWriter +from torch import tensor + +seed = 0 + +np.random.seed(seed) +t_plus_range = [0.36, 0.44] +t_plus = np.linspace(*t_plus_range, 200) + + +def model(t_plus): + return t_plus**2 + + +x_range = [0.358, 0.442] +t_plus_eval = np.linspace(*x_range, 200) +y_range = [-0.012, 0.052] +canvas = np.meshgrid(np.linspace(*x_range, 200), + np.linspace(*reversed(y_range), 200)) +t_plus_samples = t_plus_range.copy() + +fit_spline = [] +fit_spline_uncertainty = [] +fit_acq = [] +fit_discrepancy = [] +fit_acq_t_plus = [] + +prior = elfi.Prior('norm', 0.4, 0.02, name='t_+') +# pyro_torch_prior = pyro.distributions.Normal(0.4, 0.02) +elfi_simulator = elfi.Simulator( + lambda *args, **_: model(args[0]) + 0.001 * ss.norm().rvs(size=1)[0], + prior, observed=model(0.415) +) +processed_data = elfi.Summary(lambda data: data, elfi_simulator) +distance = elfi.Distance('euclidean', processed_data) +# log_distance = elfi.Operation(np.log, distance) + +class Animate_GP: + def __init__(self, fig, ax, 
initial_evidence=0, frames=None): + + self.bolfi = GPyTorchBOLFI( + distance, + initial_evidence=initial_evidence, + bounds={'t_+': t_plus_range}, + seed=seed, + max_opt_iters=200, + # pyro_torch_prior=pyro_torch_prior, + ) + self.ax = ax + self.ax_acq = self.ax.twinx() + self.ax.set_xlim(*x_range) + self.ax_acq.set_xlim(*x_range) + # self.ax.set_ylim(*y_range) + self.ax.xaxis.set_ticks([]) + self.ax.yaxis.set_ticks([]) + self.ax_acq.xaxis.set_ticks([]) + self.ax_acq.yaxis.set_ticks([]) + self.ax.set_xlabel("model parameter") + self.ax.set_ylabel("discrepancy") + self.initial_evidence = initial_evidence + self.line, = self.ax.plot([], []) + self.acq, = self.ax_acq.plot([], [], color="orange") + self.img = self.ax.imshow( + 0 * canvas[0], + cmap='gist_yarg', + vmin=0, + vmax=1, + aspect='auto', + extent=[*x_range, *y_range] + ) + self.pts, = self.ax.plot( + [], [], lw=0, marker='o', + color=plt.rcParams['axes.prop_cycle'].by_key()['color'][0] + ) + trans = matplotlib.transforms.ScaledTranslation( + 240/72, -10/72, fig.dpi_scale_trans + ) + self.txt = self.ax.text( + 0.0, 1.0, str(self.initial_evidence) + " samples ", + transform=ax.transAxes + trans, verticalalignment='top' + ) + self.frames = frames + self.target_models = [] + self.posteriors = [] + + def __call__(self, i): + if self.frames is not None: + n_evidence = self.initial_evidence + self.frames[i] + else: + n_evidence = self.initial_evidence + i + self.bolfi.fit(n_evidence=n_evidence) + mean, var = self.bolfi.target_model.predict( + t_plus_eval, noiseless=True + ) + self.line.set_data(t_plus_eval, mean) + self.acq.set_data(t_plus_eval, self.bolfi.acquisition_method.evaluate( + torch.tensor(t_plus_eval).unsqueeze(-1).unsqueeze(-1), + t=n_evidence - self.initial_evidence + ).detach().numpy()) + unc = ( + np.exp(-(canvas[1] - mean)**2 / (2 * var)) + / np.sqrt(2 * np.pi * var) + ) + self.ax.imshow( + unc, + cmap='gist_yarg', + vmin=0, + vmax=np.max(unc), + aspect='auto', + extent=[*x_range, *y_range] + ) + self.pts.set_data( + self.bolfi.target_model.X.T.detach().numpy(), + self.bolfi.target_model.Y.detach().numpy() + ) + self.txt.set_text(str(n_evidence) + " samples") + # Store the results. 
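+        # A deep copy keeps a snapshot of the GP for this frame; the
+        # static four-panel figure below re-plots these stored models
+        # and posteriors after the animation has run.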
+ self.target_models.append(deepcopy(self.bolfi.target_model)) + self.posteriors.append(self.bolfi.extract_posterior()) + return self.line, self.acq, self.pts, self.txt # ,self.img + + +initial_evidence = 10 +frames = np.array([i for i in range(10, 24)]) - initial_evidence +# initial_evidence = 10 +# frames = np.array([i for i in range(10, 41)]) - initial_evidence + +fig, ax = plt.subplots(figsize=(6, 4)) +anim_gp = Animate_GP(fig, ax, initial_evidence=initial_evidence, frames=frames) +anim = FuncAnimation( + fig, anim_gp, frames=len(frames), interval=200, blit=False, repeat=False +) +plt.show() + +likelihood_range = [0.405, 0.425] +likelihood_eval = np.linspace(*likelihood_range, 200) +gps_at_each_frame = [] + +fig_static, axes = plt.subplots(figsize=(6, 4), nrows=2, ncols=2) + +for i, ax in enumerate(axes.flatten()): + if i == 2: + i = -2 + if i == 3: + i = -1 + + ax.xaxis.set_ticks([]) + ax.yaxis.set_ticks([]) + ax.set_xlim(*x_range) + ax_acq = ax.twinx() + ax_acq.set_xlim(*x_range) + ax.set_ylim(*y_range) + target_model = anim_gp.target_models[i] + trans = matplotlib.transforms.ScaledTranslation( + 126/72, -5/72, fig.dpi_scale_trans + ) + mean, var = target_model.predict(t_plus_eval, noiseless=True) + gps_at_each_frame.append( + target_model.predict(likelihood_eval, noiseless=True) + ) + ax.plot(t_plus_eval, mean) + if frames is not None: + n_evidence = initial_evidence + frames[i] + else: + n_evidence = initial_evidence + i + ax_acq.plot(t_plus_eval, anim_gp.bolfi.acquisition_method.evaluate( + torch.tensor(t_plus_eval).unsqueeze(-1).unsqueeze(-1), + t=frames[-1] - initial_evidence + ).detach().numpy(), color="orange") + unc = ( + np.exp(-(canvas[1] - mean)**2 / (2 * var)) + / np.sqrt(2 * np.pi * var) + ) + ax.imshow( + unc, + cmap='gist_yarg', + vmin=0, + vmax=np.max(unc), + aspect='auto', + extent=[*x_range, *y_range] + ) + ax.scatter( + target_model.X.T.detach().numpy(), + target_model.Y.detach().numpy() + ) + ax.text( + 0.0, 1.0, str(n_evidence) + " samples ", + transform=ax.transAxes + trans, + verticalalignment='top', horizontalalignment='right' + ) +fig_static.supxlabel(r"Model parameter $\theta$") +fig_static.supylabel("Model-data distance") +fig_static.tight_layout() + +posterior = anim_gp.posteriors[-1] +target_model = anim_gp.target_models[-1] +threshold = posterior.threshold # .loc.detach().numpy() +likelihood_yrange = [-0.0012, 0.0082] +likelihood_canvas = np.meshgrid( + np.linspace(*likelihood_range, 200), + np.linspace(*reversed(likelihood_yrange), 200) +) +mean, var = target_model.predict(likelihood_eval, noiseless=True) +likelihood = anim_gp.bolfi.acquisition_method.evaluate( + torch.tensor(likelihood_eval).unsqueeze(-1).unsqueeze(-1), + t=frames[-1] - initial_evidence +).detach().numpy() +normalizing = np.sum( + 0.5 + * (likelihood[:-1] + likelihood[1:]) + * (t_plus_eval[1:] - t_plus_eval[:-1]) +) + +fig_int, (ax_gp, ax_int) = plt.subplots(figsize=(6, 4), nrows=2) +ax_gp_acq = ax_gp.twinx() +ax_gp.set_xlim(*likelihood_range) +ax_gp_acq.set_xlim(*likelihood_range) +ax_int.set_xlim(*likelihood_range) +ax_gp.xaxis.set_ticks([]) +ax_int.xaxis.set_ticks([]) +ax_gp_acq.xaxis.set_ticks([]) +ax_gp.yaxis.set_ticks([]) +ax_int.yaxis.set_ticks([]) +ax_gp_acq.yaxis.set_ticks([]) +ax_gp.set_ylabel(r"$\log||y_i(\theta)-y_i^\star||$") +ax_int.set_ylabel(r"$L_K(\theta)$") +ax_int.set_xlabel(r"Model parameter $\theta$") +trans_int = matplotlib.transforms.ScaledTranslation( + 10/72, -5/72, fig_int.dpi_scale_trans +) +ax_gp.text(0.0, 1.0, '(a)', transform=ax_gp.transAxes + trans_int, + 
verticalalignment='top', fontsize=20) +ax_int.text(0.0, 1.0, '(b)', transform=ax_int.transAxes + trans_int, + verticalalignment='top', fontsize=20) +trans_samples = matplotlib.transforms.ScaledTranslation( + 162/72, -10/72, fig.dpi_scale_trans +) +ax_gp.text( + 0.0, 1.0, str(n_evidence) + " samples ", + transform=ax_gp.transAxes + trans_samples, verticalalignment='top' +) +ax_gp.plot(likelihood_eval, mean) +ax_gp_acq.plot( + likelihood_eval, + anim_gp.bolfi.acquisition_method.evaluate( + torch.tensor(t_plus_eval).unsqueeze(-1).unsqueeze(-1), + t=frames[-1] - initial_evidence + ).detach().numpy(), + color="orange" +) +ax_gp.plot(likelihood_eval, [threshold] * len(likelihood_eval)) +ax_gp.scatter( + target_model.X.T.detach().numpy(), + target_model.Y.detach().numpy() +) +truncated_unc = ( + np.exp(-(likelihood_canvas[1] - mean)**2 / (2 * var)) + / np.sqrt(2 * np.pi * var) +) / normalizing * (likelihood_canvas[1] < threshold) +ax_gp.imshow( + truncated_unc, + cmap='gist_yarg', + vmin=0, + vmax=np.max(truncated_unc), + aspect='auto', + extent=[*likelihood_range, *likelihood_yrange] +) +ax_int.plot( + likelihood_eval, + likelihood / normalizing, + color=plt.rcParams['axes.prop_cycle'].by_key()['color'][4] +) + +fig_int.tight_layout() + +eps_min = -1 +eps_max = 1 +threshold_eval = np.linspace(eps_min, eps_max, 200) +norm = matplotlib.colors.Normalize(eps_min, eps_max) +cmap = plt.get_cmap('viridis') +fig_eps, ax_eps = plt.subplots( + figsize=(4 * 2**0.5, 4), constrained_layout=True +) +# mean, var = gps_at_each_frame[2] +for eps in threshold_eval: + likelihood_eps = ss.norm().cdf((eps - mean) / np.sqrt(var)) + normalizing_eps = np.sum( + 0.5 + * (likelihood_eps[:-1] + likelihood_eps[1:]) + * (t_plus_eval[1:] - t_plus_eval[:-1]) + ) + ax_eps.plot( + likelihood_eval, + likelihood_eps / normalizing_eps, + color=cmap(norm(eps)) + ) +ax_eps.set_title("Normalized Likelihoods") +fig_eps.colorbar(matplotlib.cm.ScalarMappable( + norm=norm, cmap=cmap +), ax=ax_eps, label="threshold") + +plt.show() + +print(anim_gp.bolfi.target_model._gp(torch.linspace(*x_range, 200))) +print(anim_gp.bolfi.target_model._gp.likelihood(torch.linspace(*x_range, 200))) +print(anim_gp.bolfi.target_model._gp.likelihood( + anim_gp.bolfi.target_model._gp(torch.linspace(*x_range, 200)) +)) + +sample_size = 20 +print(anim_gp.bolfi.sample(sample_size, threshold=0, n_chains=1)) diff --git a/elfi/methods/bo/botorch_acquisition.py b/elfi/methods/bo/botorch_acquisition.py new file mode 100644 index 00000000..1dd2f287 --- /dev/null +++ b/elfi/methods/bo/botorch_acquisition.py @@ -0,0 +1,134 @@ +# SPDX-FileCopyrightText: 2024 Yannick Kuhn +# +# SPDX-License-Identifier: BSD-3-Clause + + +from botorch.acquisition.analytic import UpperConfidenceBound +from botorch.generation.gen import gen_candidates_torch, get_best_candidates +from botorch.optim.initializers import gen_batch_initial_conditions +from elfi.methods.bo.acquisition import AcquisitionBase +from numpy import log, pi, random +from torch import float, long, tensor + + +class BoTorchAcquisitionBase(AcquisitionBase): + """Mimics the AcquisitionBase in ELFI, but botorch-compatible. + """ + + def __init__( + self, + model, + prior=None, + n_inits=10, + max_opt_iters=1000, + noise_var=None, + exploration_rate=10, + seed=None, + constraints=None, + ): + """ + :param model: + An instance of ``GPyTorchRegression``. Needs to have the + following attributes: `input_dim`, `bounds`, + `parameter_names`, and a callable `predict(x, noiseless)`. 
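+        :param prior:
+            Not supported yet; must be left as ``None``.
+        :param exploration_rate:
+            Exploration rate of the LCB acquisition rule.
+        :param seed:
+            Seed for the NumPy random state used in acquisitions.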
+ """ + self.model = model + if prior is not None: + raise NotImplementedError( + "Priors for acquisition function not implemented." + ) + else: + self.prior = prior + self.n_inits = int(n_inits) + self.max_opt_iters = int(max_opt_iters) + self.constraints = constraints + if noise_var is not None: + self._check_noise_var(noise_var) + self.noise_var = self._transform_noise_var(noise_var) + else: + self.noise_var = noise_var + self.exploration_rate = exploration_rate + self.random_state = ( + random if seed is None else random.RandomState(seed) + ) + self.seed = 0 if seed is None else seed + + def acquire(self, n, t=None): + raise NotImplementedError + + +class BoTorchLCBSC(BoTorchAcquisitionBase): + def _beta(self, t): + # Count from 0. + if t is None: + t = 0 + t += 1 + d = self.model.input_dim + return 2 * log(t**(2 * d + 2) * pi**2 / (3 / self.exploration_rate)) + + def evaluate(self, x, t=None): + """ + Evaluate the acquisition function at `x`. + + :param x: + The test variable to evaluate at. + :param t: + The index of the current iteration, starting from 0. + :returns: + the lower confindence bound at `x`. + """ + lcb = UpperConfidenceBound( + self.model._gp, + beta=self._beta(t), + maximize=False + ) + return -lcb(x) + + def acquire(self, n, t=None, std_scale=None): + """ + Minimize the acquisition function. + + :param n: + Number of acquisition points to return. + :param t: + The index of the current iteration, starting from 0. + :param std_scale: + If left at None, logarithmically scales up the exploration. + Else gives a fixed numerical value for the prefactor of the + standard deviation in ``mu - sqrt(prefactor) * std``. + :returns: + np.ndarray; the shape is ``(n, input_dim)``. + """ + lcb = UpperConfidenceBound( + self.model._gp, + beta=std_scale or self._beta(t), + maximize=False, + ) + bounds = self.model.torch_bounds + x_init = gen_batch_initial_conditions( + lcb, + bounds, + q=n, + num_restarts=25, + raw_samples=500 * 2**self.model.input_dim, + inequality_constraints=[ + (tensor([i], dtype=long), + tensor([1], dtype=float), + b) + for i, b in enumerate(bounds[0]) + ] + [ + (tensor([i], dtype=long), + tensor([-1], dtype=float), + -b) + for i, b in enumerate(bounds[1]) + ] + ) + # While gen_candidates_scipy is exact, the noise is actually useful. 
+ batch_candidates, batch_acq_values = gen_candidates_torch( + initial_conditions=x_init, + acquisition_function=lcb, + lower_bounds=bounds[0], + upper_bounds=bounds[1], + ) + candidate = get_best_candidates(batch_candidates, -batch_acq_values) + return candidate.detach().numpy() diff --git a/elfi/methods/bo/gpytorch_bolfi_model.py b/elfi/methods/bo/gpytorch_bolfi_model.py new file mode 100644 index 00000000..cfe23708 --- /dev/null +++ b/elfi/methods/bo/gpytorch_bolfi_model.py @@ -0,0 +1,460 @@ +# SPDX-FileCopyrightText: 2024 Yannick Kuhn +# +# SPDX-License-Identifier: BSD-3-Clause + +from botorch.models.gp_regression import SingleTaskGP +import gpytorch +from gpytorch.constraints import Positive +from gpytorch.kernels import RBFKernel, ScaleKernel +from gpytorch.means.mean import Mean +from gpytorch.priors import GammaPrior, NormalPrior +import numpy as np +from scipy.optimize import minimize +import torch + + +class ParabolicMean(Mean): + """ + A parabolic Prior mean function, i.e.: + + µ(x) = a ⋅ x² + b ⋅ x + c + + or more generally for multidimensional inputs: + + µ(x) = Σⱼ (aⱼ ⋅ xⱼ² + bⱼ ⋅ xⱼ + c) + """ + + def __init__( + self, + list_of_square_priors, + list_of_linear_priors, + constant_prior, + batch_shape=torch.Size(), + **kwargs, + ): + super().__init__() + + self.dim = len(list_of_square_priors) + self.batch_shape = batch_shape + + for j, square_prior in enumerate(list_of_square_priors): + self.register_parameter( + name='raw_square_coefficient_' + str(j), + parameter=torch.nn.Parameter( + square_prior.mean * torch.ones(batch_shape) + ) + ) + self.register_prior( + 'square_coefficient_' + str(j) + '_prior', + square_prior, + lambda module, index=j: module.square_coefficients[index], + lambda module, value, index=j: ( + module._square_closure(value, index) + ), + ) + self.register_constraint( + 'raw_square_coefficient_' + str(j), Positive() + ) + + for j, linear_prior in enumerate(list_of_linear_priors): + self.register_parameter( + name='raw_linear_coefficient_' + str(j), + parameter=torch.nn.Parameter( + linear_prior.mean * torch.ones(batch_shape) + ) + ) + self.register_prior( + 'linear_coefficient_' + str(j) + '_prior', + linear_prior, + lambda module, index=j: module.linear_coefficients[index], + lambda module, value, index=j: ( + module._linear_closure(value, index) + ), + ) + + self.register_parameter( + name='raw_constant', + parameter=torch.nn.Parameter( + constant_prior.mean * torch.ones(batch_shape) + ) + ) + self.register_prior( + 'constant_prior', + constant_prior, + lambda module: module.constant, + lambda module, value: module._constant_closure(value) + ) + + @property + def square_coefficients(self): + return torch.stack([ + getattr( + self, 'raw_square_coefficient_' + str(index) + '_constraint' + ).transform( + getattr(self, 'raw_square_coefficient_' + str(index)) + ) + for index in range(self.dim) + ], dim=0) + + @square_coefficients.setter + def square_coefficients(self, value): + # When value is e.g. a NumPy array, convert to a Torch tensor + # and copy dtype and device (CPU/GPU) from raw parameters. 
+ if not torch.is_tensor(value): + value = torch.as_tensor(value).to(self.square_coefficients) + + for index in range(self.dim): + self._square_closure(self, value, index) + + def _square_closure(self, value, index): + self.initialize( + **{ + 'raw_square_coefficient_' + str(index): ( + getattr( + self, + 'raw_square_coefficient_' + str(index) + '_constraint' + ).inverse_transform(value) + ) + } + ) + + @property + def linear_coefficients(self): + return torch.stack([ + getattr(self, 'raw_linear_coefficient_' + str(index)) + for index in range(self.dim) + ], dim=0) + + @linear_coefficients.setter + def linear_coefficients(self, value): + if not torch.is_tensor(value): + value = torch.as_tensor(value).to(self.linear_coefficients) + + for index in range(self.dim): + self._linear_closure(self, value, index) + + def _linear_closure(self, value, index): + self.initialize( + **{'raw_linear_coefficient_' + str(index): value} + ) + + @property + def constant(self): + return self.raw_constant + + @constant.setter + def constant(self, value): + self._constant_closure(self, value) + + def _constant_closure(self, value): + if not torch.is_tensor(value): + value = torch.as_tensor(value).to(self.constant) + + self.initialize(raw_constant=value) + + def forward(self, x): + # The unsqueeze makes these have shape batch_shape x 1. + square_coefficients = self.square_coefficients.unsqueeze(-1) + linear_coefficients = self.linear_coefficients.unsqueeze(-1) + constant = self.constant.unsqueeze(-1) + + square_term = torch.sum(square_coefficients.T * x**2, dim=-1) + linear_term = torch.sum(linear_coefficients.T * x, dim=-1) + + return square_term + linear_term + constant + + +def BOLFIKernel(num_dims, length_scale, kernel_var): + # In original BOLFI, a variance term ("bias") would be added. + return ScaleKernel( + # ProductStructureKernel( + RBFKernel( + lengthscale_prior=GammaPrior(length_scale, 1), + ), + # num_dims=num_dims + # ), + outputscale_prior=GammaPrior(kernel_var, 1)) + + +class BOLFIKernel_manually_implemented(gpytorch.kernels.Kernel): + """ + Diagonal RBF kernel with importance weighting, i.e.: + + v(x, y) = σ_f² ⋅ exp((x - y)² / λ²) + + or more generally for multidimensional inputs: + + v(x, y) = σ_f² ⋅ exp(Σⱼ (xⱼ - yⱼ)² / λⱼ²) + + `list_of_lengthscale_priors` refers to the λⱼ. + `multiplicative_variance` refers to σ_f². + """ + + is_stationary = True + + def __init__( + self, + list_of_lengthscale_priors, + multiplicative_variance_prior, + batch_shape=torch.Size(), + **kwargs + ): + super().__init__(**kwargs) + + self.dim = len(list_of_lengthscale_priors) + + for j, lengthscale_prior in enumerate(list_of_lengthscale_priors): + # Register the raw hyperparameter (i.e. without constraints) + # with the inherited helper method register_parameter. + self.register_parameter( + name='raw_lengthscale_coefficient_' + str(j), + parameter=torch.nn.Parameter( + torch.zeros(self.batch_shape) + ) + ) + + # Register the constraint with the inherited helper method + # register_constraint. + self.register_constraint( + 'raw_lengthscale_coefficient_' + str(j), Positive() + ) + + # Set the parameter Prior with the inherited helper method + # register_prior. + # The first argument of the lambdas is an instance of this + # class, so treat it as self. The first lambda calls the + # method of this class that defines the parameter; its + # result will be input in the underlying probability + # distribution, initialized by the second argument. 
+ # The second lambda additionally takes a tensor in + # (transformed) parameter space and initializes the internal + # parameter representation to the proper value by applying + # the inverse transform. + # This enables sampling parameter values from Priors. + if lengthscale_prior is not None: + self.register_prior( + 'lengthscale_coefficient_' + str(j) + '_prior', + lengthscale_prior, + lambda module, index=j: ( + module.lengthscale_coefficients[index] + ), + lambda module, value, index=j: ( + module._lengthscale_closure(value, index) + ), + ) + + self.register_parameter( + name='raw_multiplicative_variance', + parameter=torch.nn.Parameter(torch.zeros(batch_shape)) + ) + self.register_constraint('raw_multiplicative_variance', Positive()) + self.register_prior( + 'multiplicative_variance_prior', + multiplicative_variance_prior, + lambda module: module.multiplicative_variance, + lambda module, value: ( + module._multiplicative_variance_closure(value) + ) + ) + self.register_parameter( + name='raw_multiplicative_variance', + parameter=torch.nn.Parameter(torch.zeros(batch_shape)) + ) + self.register_constraint('raw_multiplicative_variance', Positive()) + self.register_prior( + 'multiplicative_variance_prior', + multiplicative_variance_prior, + lambda module: module.multiplicative_variance, + lambda module, value: ( + module._multiplicative_variance_closure(value) + ) + ) + + @property + def lengthscale_coefficients(self): + """Actual definition of the length scale as such.""" + # The attributes stem from the register_* helper methods. + # Any constraints are applied here. + return torch.stack([ + getattr( + self, + 'raw_lengthscale_coefficient_' + str(index) + '_constraint' + ).transform( + getattr(self, 'raw_lengthscale_coefficient_' + str(index)) + ) + for index in range(self.dim) + ], dim=0) + + @lengthscale_coefficients.setter + def lengthscale(self, value): + if not torch.is_tensor(value): + value = torch.as_tensor(value).to(self.square_coefficients) + + for index in range(self.dim): + self._lengthscale_closure(self, value, index) + + def _lengthscale_closure(self, value, index): + # When setting the paramater, transform the actual value to a raw one + # by applying the inverse transform. initialize is inherited. + self.initialize( + **{ + 'raw_lengthscale_coefficient_' + str(index): ( + getattr( + self, + 'raw_lengthscale_coefficient_' + str(index) + + '_constraint' + ).inverse_transform(value[index]) + ) + } + ) + + @property + def multiplicative_variance(self): + return self.raw_multiplicative_variance_constraint.transform( + self.raw_multiplicative_variance + ) + + @multiplicative_variance.setter + def multiplicative_variance(self, value): + self._multiplicative_variance_closure(self, value) + + def _multiplicative_variance_closure(self, value): + if not torch.is_tensor(value): + value = torch.as_tensor(value).to(self.multiplicative_variance) + + self.initialize(raw_multiplicative_variance=value) + + def forward(self, x1, x2, **params): + """Kernel function.""" + # Read: x_ = x / self.lengthscale. + x1_ = x1.div(self.lengthscale) + x2_ = x2.div(self.lengthscale) + + # covar_dist is an inherited helper method for computing the + # Euclidean distance between all pairs of points in x1 and x2. 
+ # diff_matrix = self.covar_dist(x1_, x2_, **params) + diff_squared = torch.sum((x1_ - x2_)**2, dim=1) + + return self.multiplicative_variance * torch.exp(diff_squared) + + +class BOLFIModel(SingleTaskGP): + + def __init__( + self, + train_x, + train_y, + likelihood=gpytorch.likelihoods.GaussianLikelihood( + noise_constraint=gpytorch.constraints.GreaterThan(0) + ), + bounds=[] + ): + if bounds == []: + raise NotImplementedError( + "Default bounds not implemented yet, please provide them." + ) + + self.bounds = torch.tensor(bounds) + + self.dim = len(train_x[0]) + + # Heuristics to choose kernel parameters based on the initial + # data. + length_scale = (torch.max(self.bounds) - torch.min(self.bounds)) / 3 + kernel_var = (torch.max(train_y) / 3)**2 + # This refers to the fallback in BOLFI when no mean function is + # specified. Instead, a bias term gets added to the kernel. + # The bias term used has a Gamma distribution with expected + # value and variance both equal to bias_var. + # bias_var = kernel_var / 4 + """ + # In BOLFI, all Hyperpriors in the kernel are Gamma + # distributions. The lengthscale Priors are set to have expected + # value and variance + # both equal to length_scale. The multiplicate variance is set + # to have expected value and variance both equal to kernel_var. + self.covar_module = BOLFIKernel_manually_implemented( + list_of_lengthscale_priors=[ + GammaPrior(length_scale, 1) for _ in range(self.dim) + ], + multiplicative_variance_prior=GammaPrior(kernel_var, 1), + # bias_variance_prior=GammaPrior(bias_var, 1), + # The first index in the shape is the "length" of the batch, + # while the second is always 1 due to technical reasons. + batch_shape=torch.Size([1, 1]) + ) + """ + covar_module = BOLFIKernel(self.dim, length_scale, kernel_var) + + # Use the initial parabolic fit to the initial training data + # as means of the hyperparameters. + def parabolic_fit_function(x, args): + a = np.array(args[:self.dim]) + b = np.array(args[self.dim:-1]) + c = args[-1] + return np.sum(a * x**2 + b * x, axis=1) + c + + np_x = np.array(train_x.cpu()) + # Since BoTorch requires an explicit output dimension, but it is + # assumed to be 1 here, re-squeeze the training tensor. + np_y = np.array(train_y.squeeze().cpu()) + + if len(train_x) == 1: + np_y = np.atleast_1d(np_y) + a = np.zeros(self.dim) + b = np.zeros(self.dim) + c = np_y[0] + elif len(train_x) == 2: + a = np.zeros(self.dim) + b = (np_y[1] - np_y[0]) / (np_x[1] - np_x[0]) + c = np_y[0] - np.sum(b * np_x[0]) + else: + parabolic_fit = minimize( + lambda *args: np.sum( + (parabolic_fit_function(np_x, *args) - np_y)**2 + )**0.5, + x0=np.append(np.ones(self.dim), np.zeros(self.dim + 1)), + method='trust-constr' + ).x + a = np.array(parabolic_fit[:self.dim]) + b = np.array(parabolic_fit[self.dim:-1]) + c = parabolic_fit[-1] + + # Heuristics for the variances of the parabolic hyperparameters. + # a: check the "squaredness" of (train_y - b ⋅ train_x - c) / a. + if len(train_x) > 2: + a_var = np.atleast_1d(np.sum( + (np.sqrt(np.abs( + (np_y[:, None] - b * np_x - c) / a + )) - np_x)**2, axis=0 + )) # **0.5 would be standard deviation + else: + a_var = np.ones(self.dim) + # b: check the deviation of parabola minimum and data minimum + # location. + b_var = np.atleast_1d((-b - 2 * a * np_x[np.argmin(np_y)])**2) + # c: check the deviation of parabola minimum and data minimum. 
+ c_var = (c - np.min(np_y))**2 + + mean_module = ParabolicMean( + [NormalPrior(a_j, a_j_var) for a_j, a_j_var in zip(a, a_var)], + [NormalPrior(b_j, b_j_var) for b_j, b_j_var in zip(b, b_var)], + NormalPrior(c, c_var), + # The first index in the shape is the "length" of the batch, + # while the second is always 1 due to technical reasons. + # batch_shape=torch.Size([1, 1]) + ) + + super().__init__( + train_X=train_x, + train_Y=train_y, + likelihood=likelihood, + covar_module=covar_module, + mean_module=mean_module + ) + + def forward(self, x): + mean_x = self.mean_module(x) + covar_x = self.covar_module(x) + + return gpytorch.distributions.MultivariateNormal(mean_x, covar_x) diff --git a/elfi/methods/bo/gpytorch_regression.py b/elfi/methods/bo/gpytorch_regression.py new file mode 100644 index 00000000..6edcd913 --- /dev/null +++ b/elfi/methods/bo/gpytorch_regression.py @@ -0,0 +1,404 @@ +# SPDX-FileCopyrightText: 2024 Yannick Kuhn +# +# SPDX-License-Identifier: BSD-3-Clause + +# This module contains an interface for using GPyTorch in ELFI. + +import copy +import gpytorch +import torch +import warnings + +from elfi.methods.bo.gpytorch_bolfi_model import BOLFIModel + + +class GPyTorchRegression: + """Gaussian Process regression using the GPyTorch library.""" + + def __init__( + self, + parameter_names=None, + bounds=None, + optimizer=None, + max_opt_iters=50, + gp=None, + initial_evidence=None, + **gp_params + ): + """ + Initialize GPyTorchRegression. + + :param parameter_names: list of str, optional + Names of parameter nodes. If None, sets dimension to 1. + :param bounds: dict, optional + The region where to estimate the posterior for each + parameter in ``model.parameters``. + ``{'parameter_name':(lower, upper), ... }`` + If not supplied, defaults to (0, 1) bounds for all + dimensions. + :param optimizer: string, optional + Optimizer for the GP hyper parameters. + :param max_opt_iters: int, optional + :param gp: gpytorch.models.ExactGP, optional + """ + + if isinstance(initial_evidence, dict): + raise ValueError( + "Batch dict for initial evidence not implemented in this " + "re-implementation of BOLFI. Please implement it like in ELFI." + ) + + if parameter_names is None: + input_dim = 1 + elif isinstance(parameter_names, (list, tuple)): + input_dim = len(parameter_names) + else: + raise ValueError( + "Keyword `parameter_names` must be a list of strings." + ) + + if bounds is None: + warnings.warn( + "Parameter bounds not specified. Using [0,1] for each " + "parameter." + ) + bounds = [(0, 1)] * input_dim + elif len(bounds) != input_dim: + raise ValueError( + ( + "Length of `bounds` ({}) does not match the length of " + + "`parameter_names` ({})." + ).format(len(bounds), input_dim) + ) + elif isinstance(bounds, dict): + if len(bounds) == 1: # might be parameter_names==None + bounds = [bounds[n] for n in bounds.keys()] + else: + # turn bounds dict into a list in the same order as + # parameter_names + bounds = [bounds[n] for n in parameter_names] + else: + raise ValueError( + "Keyword `bounds` must be a dictionary " + "`{'parameter_name': (lower, upper), ... }`" + ) + + # Make a 2 x d tensor of bounds for PyTorch and BoTorch. 
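+        # Row 0 holds the lower bounds and row 1 the upper bounds,
+        # which is the layout BoTorch expects for its `bounds` input.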
+ torch_bounds = torch.tensor(bounds).T + + self.parameter_names = parameter_names + self.bounds = bounds + self.torch_bounds = torch_bounds + self.optimizer = optimizer + self.max_opt_iters = max_opt_iters + self._gp = gp + self.gp_params = gp_params + + self.input_dim = input_dim + self.is_sampling = False # set to True once in sampling phase + + self.initial_evidence = initial_evidence + self.initial_X = None + self.initial_Y = None + + self.first_update = False + + # def __str__(self): + # """Return __str__ of underlying Gaussian Process.""" + # return self._gp.__str__() + + # def __repr__(self): + # """Return __repr__ of underlying Gaussian Process.""" + # return self._gp.__repr__() + + def predict(self, x, noiseless=True, numpy_output=True): + """ + Return the GP model posterior mean and variance at x. + + Fast variance inference is made with LOVE via fast_pred_var(). + For accurate variance inference, you can just comment out the + part. + + :param x: np.array + NumPy compatible (n, input_dim) array of points to evaluate + if ``len(x.shape) == 1`` will be cast to 2D with + ``x[None, :]``. + :param noiseless: bool + Whether to include the noise variance or not to the returned + variance. + :param numpy_output: bool + Compatibility with ELFI built-in methods. Set to False to + retain PyTorch Tensor output. + + :returns: + Returns as expected by ELFI: + - pred.mean; torch.tensor, the predictive mean + - pred.variance; torch.tensor, the predictive variance + Shape: + GP posterior (mean, var) at x where + mean : np.array + with shape ``(x.shape[0], 1)`` + var : np.array + with shape ``(x.shape[0], 1)`` + """ + + # Make sure that the GP is in "computing predictions" mode. + self._gp.eval() + self._gp.likelihood.eval() + + x = torch.Tensor(x) + + if not noiseless: + raise NotImplementedError( + "GP prediction with additional diagonal kernel noise not " + "implemented." + ) + + try: + with gpytorch.settings.fast_pred_var(): + pred = self._gp.likelihood(self._gp(x)) + except RuntimeError: + warnings.warn("Cholesky failed. Adding more jitter...") + with ( + gpytorch.settings.cholesky_jitter(float=1e-2) + ): + pred = self._gp.likelihood(self._gp(x)) + + if numpy_output: + return pred.mean.detach().numpy(), pred.variance.detach().numpy() + else: + return pred.mean, pred.variance + + def predict_mean(self, x, numpy_output=True): + """ + Return the GP model posterior mean function at x. + + :param numpy_output: bool + Compatibility with ELFI built-in methods. Set to False to + retain PyTorch Tensor output. + :returns: + np.array with shape ``(x.shape[0], 1)``. + """ + + return self.predict(x, numpy_output=numpy_output)[0] + + def predictive_gradients(self, x, numpy_output=True): + """ + Return the gradients of the GP model posterior mean and variance. + + Given a set of points at which to predict X* (size [N*,Q]), + compute the derivatives of the mean and variance. + Resulting arrays are sized: + dmu_dX* -- [N*, Q ,D], where D is the number of output in + this GP (usually one). + + Note that this is not the same as computing the mean and + variance of the derivative of the function! + dv_dX* -- [N*, Q], since all outputs have same variance. + + :rtype: [np.ndarray (N*, Q ,D), np.ndarray (N*,Q) ] + + :param x: np.array + NumPy compatible (n, input_dim) array of points to evaluate + if ``len(x.shape) == 1`` will be cast to 2D with + ``x[None, :]``. + :param numpy_output: bool + Compatibility with ELFI built-in methods. Set to False to + retain PyTorch Tensor output. 
+ :returns: + tuple + GP (grad_mean, grad_var) at x where + grad_mean : np.array + with shape ``(x.shape[0], input_dim)`` + grad_var : np.array + with shape ``(x.shape[0], input_dim)`` + """ + + self._gp.eval() + self._gp.likelihood.eval() + + def pred_evaluation(X_eval): + try: + with gpytorch.settings.fast_pred_var(): + pred = self._gp.likelihood(self._gp(X_eval)) + except RuntimeError: + warnings.warn("Cholesky failed. Adding more jitter...") + with gpytorch.settings.cholesky.jitter(float=1e-2): + pred = self._gp.likelihood(self._gp(X_eval)) + return pred + + # Note how the gradient is stored in X_mean rather than the + # function. + X_mean = torch.autograd.Variable(torch.Tensor(x), requires_grad=True) + pred_for_mean = pred_evaluation(X_mean) + pred_mean_grad_term = pred_for_mean.mean.sum() + pred_mean_grad_term.backward() + pred_mean_grad = X_mean.grad + + X_var = torch.autograd.Variable(torch.Tensor(x), requires_grad=True) + pred_for_var = pred_evaluation(X_var) + pred_var_grad_term = pred_for_var.variance.sum() + pred_var_grad_term.backward() + pred_var_grad = X_var.grad + + if numpy_output: + return ( + pred_mean_grad.detach().numpy(), pred_var_grad.detach().numpy() + ) + else: + return pred_mean_grad, pred_var_grad + + def predictive_gradient_mean(self, x, numpy_output=True): + """ + Return the gradient of the GP model posterior mean at x. + + + :param x: np.array + NumPy compatible (n, input_dim) array of points to evaluate + if ``len(x.shape) == 1`` will be cast to 2D with + ``x[None, :]``. + :param numpy_output: bool + Compatibility with ELFI built-in methods. Set to False to + retain PyTorch Tensor output. + :returns: + np.array with shape ``(x.shape[0], input_dim)``. + """ + + return self.predictive_gradients(x, numpy_output=numpy_output)[0] + + def update(self, x, y, optimize=False): + """ + Update the GP model with new data. + + :param inputs: + A torch.Tensor of the shape + "b1 x ... x bk x m x d" + or + "f x b1 x ... x bk x m x d". + Locations of (fantasy) observations. + :param targets: + A torch:Tensor of the shape + "b1 x ... x bk x m" + or + "f x b1 x ... x bk x m". + Labels of (fantasy) observations. + :param optimize: + Whether or not to optimize hyperparameters. + :returns: + An ``ExactGP`` model with ``n + m`` training examples, where + the ``m`` fantasy examples have been added and all test-time + caches have been updated. + """ + + x = torch.tensor(x, dtype=torch.double) + y = torch.tensor(y, dtype=torch.double).unsqueeze(-1) + initial_optimize = self.first_update + + if self.initial_X is None: + self.initial_X = x + self.initial_Y = y + self._gp = BOLFIModel( + self.initial_X, self.initial_Y, bounds=self.bounds + ) + self._gp.set_train_data(self.X, self.Y, strict=False) + elif self.initial_evidence > len(self.initial_X): + self.initial_X = torch.cat((self.initial_X, x)) + self.initial_Y = torch.cat((self.initial_Y, y)) + self._gp = BOLFIModel( + self.initial_X, self.initial_Y, bounds=self.bounds + ) + self._gp.set_train_data(self.X, self.Y, strict=False) + # Use two passes (the optimizer gets stored between samples. + if len(self.initial_X) >= self.initial_evidence - 2: + optimize = True + else: + # "get_fantasy_model" updates the GP, but discards previous + # hyperparameter training, as the model gets replaced and + # hence the links get broken. 
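+            # set_train_data below keeps the existing model object
+            # (and its trained hyperparameters) and only swaps in the
+            # enlarged training tensors.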
+ # self._gp = self._gp.get_fantasy_model(x[0], y[0]) + self._gp.set_train_data( + torch.concatenate((self.X, x)), + torch.concatenate((self.Y, y[0])), + strict=False, + ) + + if optimize or initial_optimize: + self.optimize() + self.first_update = False + + def optimize(self): + """Optimize GP hyperparameters. Use Adam by default.""" + self._gp.train() + self._gp.likelihood.train() + optimizer = self.optimizer if self.optimizer else torch.optim.Adam( + self._gp.parameters(), lr=0.1 + ) # Includes GaussianLikelihood parameters + # One possible loss function for GPs: marginal log likehood. + mll = gpytorch.mlls.ExactMarginalLogLikelihood( + self._gp.likelihood, self._gp + ) + for i in range(self.max_opt_iters): + # Delete gradients from previous iteration by overwriting with 0. + optimizer.zero_grad() + # Output from model. + output = self._gp(self.X) + # Calculate loss and backpropagation gradients. + loss = -mll(output, self.Y) + loss.backward() + # print( + # 'Iter %d/%d - Loss: %.3f' % ( + # i + 1, self.max_opt_iters, loss.item(), + # ) + # ) + optimizer.step() + + @property + def n_evidence(self): + """Return the number of observed samples.""" + if self._gp is None: + return 0 + return len(self._gp.train_inputs[0]) + + @property + def X(self): + """Return input evidence.""" + return self._gp.train_inputs[0] + + @property + def Y(self): + """Return output evidence.""" + return self._gp.train_targets + + @property + def noise(self): + """Return the noise.""" + # In GPy: + # return self._gp.Gaussian_noise.variance[0] + raise NotImplementedError( + "No additional noise to GP implemented." + ) + + @property + def instance(self): + """Return the gp instance.""" + return self._gp + + def copy(self): + """Return a copy of current instance.""" + kopy = copy.copy(self) + if self._gp: + kopy._gp = self._gp.copy() + + # if 'kernel' in self.gp_params: + # kopy.gp_params['kernel'] = self.gp_params['kernel'].copy() + + # if 'mean_function' in self.gp_params: + # kopy.gp_params['mean_function'] = ( + # self.gp_params['mean_function'].copy() + # ) + + return kopy + + def __copy__(self): + """Return a copy of current instance.""" + return self.copy() diff --git a/elfi/methods/inference/gpytorch_bolfi.py b/elfi/methods/inference/gpytorch_bolfi.py new file mode 100644 index 00000000..d7991152 --- /dev/null +++ b/elfi/methods/inference/gpytorch_bolfi.py @@ -0,0 +1,601 @@ +# SPDX-FileCopyrightText: 2024 Yannick Kuhn +# +# SPDX-License-Identifier: BSD-3-Clause + +from botorch.acquisition.analytic import UpperConfidenceBound +from botorch.generation.gen import gen_candidates_scipy, get_best_candidates +from botorch.optim.initializers import gen_batch_initial_conditions +from elfi.loader import get_sub_seed +from elfi.methods.inference.bolfi import BayesianOptimization, BOLFI +import elfi.methods.mcmc as mcmc +from elfi.methods.posteriors import BolfiPosterior +from elfi.methods.results import BolfiSample, OptimizationResult +from elfi.methods.utils import arr2d_to_batch # , resolve_sigmas +from elfi.model.elfi_model import NodeReference +from elfi.model.extensions import ModelPrior +import gpytorch +import numpy as np +import pyro +from pyro.infer.mcmc import NUTS, MCMC +import torch + +from elfi.methods.bo.botorch_acquisition import BoTorchLCBSC +from elfi.methods.bo.gpytorch_regression import GPyTorchRegression + + +class GPyTorchBayesianOptimization(BayesianOptimization): + """Bayesian Optimization of an unknown target function.""" + + def __init__( + self, + model, + target_name=None, + bounds=None, + 
initial_evidence=None, + update_interval=10, + target_model=None, + acquisition_method=None, + acq_noise_var=0, + exploration_rate=10, + batch_size=1, + batches_per_acquisition=None, + async_acq=False, + pyro_torch_prior=None, + **kwargs + ): + """ + Initialize Bayesian optimization. + + :param model: ElfiModel, NodeReference + :param target_name: str, NodeReference.+ + Only needed if model is an ElfiModel + :param bounds : dict, optional + The region where to estimate the posterior for each + parameter in ``model.parameters``: + ``dict('parameter_name':(lower, upper), ... )``. + Not used if custom `target_model` is given. + :param initial_evidence: int, dict, optional + Number of initial evidence or a precomputed batch dict + containing parameter and discrepancy values. Default value + depends on the dimensionality. + :param update_interval: int, optional + How often to update the GP hyperparameters of the + `target_model`. + :param target_model: GPyRegression, optional + :param acquisition_method: AnalyticAcquisitionFunction, optional + Method of acquiring evidence points. Defaults to LCBSC. + :param acq_noise_var: float, dict, optional + Variance(s) of the noise added in the default LCBSC + acquisition method. If a dictionary, values should be float + specifying the variance for each dimension. + :param exploration_rate: float, optional + Exploration rate of the acquisition method + :param batch_size: int, optional + Elfi batch size. Defaults to 1. + :param batches_per_acquisition: int, optional + How many batches will be requested from the acquisition + function at one go. Defaults to `max_parallel_batches`. + :param async_acq: bool, optional + Allow acquisitions to be made asynchronously, i.e., do not + wait for all the results from the previous acquisition + before making the next. This can be more efficient with a + large amount of workers (e.g. in cluster environments) but + foregoes the guarantee for the exactly same result with the + same initial conditions (e.g. the seed). Default: False. + :param pyro_torch_prior: torch.distributions, optional + (Temporarily) needed as a by-pass of the internal processing + of the elfi.Prior, if the trained model has to be sampled. + Provide as the mathematically same object as the elfi.Prior. + If a series of elfi.Priors was given for a multivariate + distribution, give this as a multivariate distribution. + :param **kwargs: + """ + self.pyro_torch_prior = pyro_torch_prior + target_model = target_model or GPyTorchRegression( + ( + model.model.parameter_names + if isinstance(model, NodeReference) else + model.parameter_names + ), + bounds=bounds, + initial_evidence=initial_evidence, + **kwargs + ) + super(GPyTorchBayesianOptimization, self).__init__( + model, + target_name=target_name, + bounds=bounds, + initial_evidence=initial_evidence, + update_interval=update_interval, + target_model=target_model, + acquisition_method=acquisition_method, + acq_noise_var=acq_noise_var, + exploration_rate=exploration_rate, + batch_size=batch_size, + batches_per_acquisition=batches_per_acquisition, + async_acq=async_acq, + # **kwargs + ) + if acquisition_method is None: + self.acquisition_method = BoTorchLCBSC( + self.target_model, + noise_var=acq_noise_var, + exploration_rate=exploration_rate, + seed=self.seed + ) + + def extract_result(self): + """ + Extract the result from the current state. + Overrides the SciPy-based method in ELFI. + + :returns: + OptimizationResult + """ + # SciPy version for GPy. 
+ # x_min, _ = stochastic_optimization( + # self.target_model.predict_mean, + # self.target_model.bounds, seed=self.seed + # ) + bounds = self.target_model.torch_bounds + # Re-use the Upper Confidence Bound as Mean with beta = 0. + # Reminder: ucb = mean + sqrt(beta * variance). With + # maximize = False, it is ucb = -mean + sqrt(beta * variance). + mean = UpperConfidenceBound( + self.target_model._gp, beta=0, maximize=False + ) + x_init = gen_batch_initial_conditions( + mean, + bounds, + q=1, + num_restarts=25, + raw_samples=500 * 2**self.target_model.input_dim + ) + # gen_candidates_scipy gives the minimum without noise. + x_min, _ = gen_candidates_scipy( + initial_conditions=x_init, + acquisition_function=mean, + lower_bounds=bounds[0], + upper_bounds=bounds[1], + ) + x_min = x_min[0].T.detach().numpy() + + batch_min = arr2d_to_batch( + x_min, self.target_model.parameter_names + ) + outputs = arr2d_to_batch( + self.target_model.X.detach().numpy(), + self.target_model.parameter_names + ) + + # batch_min = arr2d_to_batch(x_min, self.parameter_names) + # outputs = arr2d_to_batch( + # self.target_model.X, self.parameter_names + # ) + outputs[self.target_name] = self.target_model.Y[0] + + return OptimizationResult( + x_min=batch_min, + outputs=outputs, + **self._extract_result_kwargs() + ) + + +# The order of inheritance is the order as it is written here. +# Hence, GPyTorchBayesianOptimization.__init__ gets inherited, +# not BOLFI.__init__, which was inherited from BayesianOptimization. +class GPyTorchBOLFI(GPyTorchBayesianOptimization, BOLFI): + """ + Bayesian Optimization for Likelihood-Free Inference (BOLFI). + + Approximates the discrepancy function by a stochastic regression + model. Discrepancy model is fit by sampling the discrepancy function + at points decided by the acquisition function. + The method implements the framework introduced in + Gutmann & Corander, 2016. + + References + ---------- + Gutmann M U, Corander J (2016). Bayesian Optimization for Likelihood + -Free Inference of Simulator-Based Statistical Models. + JMLR 17(125):1−47, 2016. + http://jmlr.org/papers/v17/15-017.html + """ + + def extract_posterior(self, threshold=None): + """ + Return an object representing the approximate posterior. + + The approximation is based on surrogate model regression. + + :param threshold: float, optional + Discrepancy threshold for creating the posterior (log with + log discrepancy). + + :returns: + elfi.methods.posteriors.BolfiPosterior + """ + if self.state['n_evidence'] == 0: + raise ValueError( + 'Model is not fitted yet, please see the `fit` method.') + + # Original ELFI implementation. + # prior = ModelPrior( + # self.model, + # parameter_names=self.target_model.parameter_names + # ) + # if self.pyro_torch_prior is None: + # print( + # "Warning: Translation from elfi.model.extensions." + # "ModelPrior into a PyTorch-compatible distribution " + # "not implemented." + # ) + # prior = self.pyro_torch_prior + prior = ModelPrior( + self.model, + parameter_names=self.target_model.parameter_names + ) + if threshold is None: + bolfi_model = self.target_model._gp + bounds = self.target_model.torch_bounds + mean = UpperConfidenceBound( + bolfi_model, beta=0, maximize=False + ) + x_init = gen_batch_initial_conditions( + mean, + bounds, + q=1, + num_restarts=25, + raw_samples=500 * 2**self.target_model.input_dim + ) + # gen_candidates_scipy gives the minimum without noise. 
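+            # The GP prediction at this minimizer is used as the
+            # discrepancy threshold of the returned BolfiPosterior.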
+ batch_candidates, batch_acq_values = gen_candidates_scipy( + initial_conditions=x_init, + acquisition_function=mean, + lower_bounds=bounds[0], + upper_bounds=bounds[1], + ) + candidate = get_best_candidates( + batch_candidates, -batch_acq_values + ) + with gpytorch.settings.fast_computations(False, False, False): + threshold = bolfi_model.likelihood( + bolfi_model(candidate) + ).mean.cpu().detach().numpy() + return BolfiPosterior( + self.target_model, threshold=threshold, prior=prior + ) + + def sample( + self, + n_samples, + warmup=None, + n_chains=4, + threshold=None, + initials=None, + algorithm='nuts', + n_evidence=None, + **kwargs + ): + r""" + Sample the posterior distribution of BOLFI. + + Here the likelihood is defined through the cumulative density + function of the standard normal distribution: + L(\theta) \propto F((h-\mu(\theta)) / \sigma(\theta)) + where h is the threshold, and \mu(\theta) and \sigma(\theta) are + the posterior mean and (noisy) standard deviation of the + associated Gaussian process. The sampling is performed with an + MCMC sampler (the No-U-Turn Sampler, NUTS). + + :param n_samples: int + Number of requested samples from the posterior for each + chain. This includes warmup, and note that the effective + sample size is usually considerably smaller. + :param warmup: int, optional + Length of warmup sequence in MCMC sampling. Defaults to + ``n_samples//2``. + :param n_chains: int, optional + Number of independent chains. + :param threshold: float, optional + The threshold (bandwidth) for posterior (give as log if log + discrepancy). + :param initials: array of shape (n_chains, n_params), optional + Initial values for the sampled parameters for each chain. + Defaults to best evidence points. + :param algorithm: string, optional + Sampling algorithm to use. Currently 'nuts' (default) is + supported only. + :param n_evidence: int + If the regression model is not fitted yet, specify the + amount of evidence. + :returns: + BolfiSample + """ + # ToDo: change from ELFI-NUTS to Pyro-NUTS. + # The problem: Pyro-NUTS does not support unnormalized + # distributions. + + if self.state['n_batches'] == 0: + self.fit(n_evidence) + + # TODO: add more MCMC algorithms + if algorithm not in ['nuts', 'metropolis']: + raise ValueError("Unknown posterior sampler.") + if algorithm == 'metropolis': + raise NotImplementedError( + "Metropolis sampler not implemented. Use 'nuts' insteaad." + ) + + posterior = self.extract_posterior(threshold) + warmup = warmup or n_samples // 2 + + # Unless given, select the evidence points with smallest + # discrepancy + if initials is not None: + if np.asarray(initials).shape != ( + n_chains, self.target_model.input_dim + ): + raise ValueError( + "The shape of initials must be (n_chains, n_params).") + else: + inds = torch.argsort(self.target_model.Y) + initials = self.target_model.X[inds].detach().numpy() + + # enables caching for default RBF kernel + self.target_model.is_sampling = True + + tasks_ids = [] + ii_initial = 0 + """ + if algorithm == 'metropolis': + # ToDo: provide initial sigma proposals. 
+            sigma_proposals = resolve_sigmas(
+                self.target_model.parameter_names,
+                sigma_proposals,
+                self.target_model.bounds
+            )
+        """
+
+        # Sampling is embarrassingly parallel; depending on
+        # self.client this may run in parallel.
+        for ii in range(n_chains):
+            seed = get_sub_seed(self.seed, ii)
+            # discard bad initialization points
+            while np.isinf(posterior.logpdf(initials[ii_initial])):
+                ii_initial += 1
+                if ii_initial == len(inds):
+                    raise ValueError(
+                        "BOLFI.sample: Cannot find enough acceptable "
+                        "initialization points!"
+                    )
+
+            if algorithm == 'nuts':
+                tasks_ids.append(
+                    self.client.apply(
+                        mcmc.nuts,
+                        n_samples,
+                        initials[ii_initial],
+                        posterior.logpdf,
+                        posterior.gradient_logpdf,
+                        n_adapt=warmup,
+                        seed=seed,
+                        **kwargs))
+            """
+            elif algorithm == 'metropolis':
+                tasks_ids.append(
+                    self.client.apply(
+                        mcmc.metropolis,
+                        n_samples,
+                        initials[ii_initial],
+                        posterior.logpdf,
+                        sigma_proposals,
+                        warmup,
+                        seed=seed,
+                        **kwargs))
+            """
+            ii_initial += 1
+
+        # Get results from completed tasks or run sampling
+        # (client-specific).
+        chains = []
+        for task_id in tasks_ids:
+            chains.append(self.client.get_result(task_id))
+
+        chains = np.asarray(chains)
+        print(
+            "{} chains of {} iterations acquired. Effective sample size and "
+            "Rhat for each parameter:".format(n_chains, n_samples)
+        )
+        for ii, node in enumerate(self.target_model.parameter_names):
+            print(node, mcmc.eff_sample_size(chains[:, :, ii]),
+                  mcmc.gelman_rubin_statistic(chains[:, :, ii]))
+        self.target_model.is_sampling = False
+
+        return BolfiSample(
+            method_name='BOLFI',
+            chains=chains,
+            parameter_names=self.target_model.parameter_names,
+            warmup=warmup,
+            threshold=float(posterior.threshold),
+            n_sim=self.state['n_evidence'],
+            seed=self.seed)
+
+    def sample_torch_prototype(
+        self,
+        n_samples,
+        warmup=None,
+        n_chains=4,
+        threshold=None,
+        initials=None,
+        algorithm='nuts',
+        n_evidence=None,
+        **kwargs
+    ):
+        r"""
+        Sample the posterior distribution of BOLFI.
+
+        Here the likelihood is defined through the cumulative
+        distribution function of the standard normal distribution:
+        L(\theta) \propto F((h - \mu(\theta)) / \sigma(\theta)),
+        where h is the threshold, and \mu(\theta) and \sigma(\theta) are
+        the posterior mean and (noisy) standard deviation of the
+        associated Gaussian process. The sampling is performed with an
+        MCMC sampler (the No-U-Turn Sampler, NUTS).
+
+        :param n_samples: int
+            Number of requested samples from the posterior for each
+            chain. This includes warmup; note that the effective
+            sample size is usually considerably smaller.
+        :param warmup: int, optional
+            Length of the warmup sequence in MCMC sampling. Defaults to
+            ``n_samples // 2``.
+        :param n_chains: int, optional
+            Number of independent chains.
+        :param threshold: float, optional
+            The threshold (bandwidth) for the posterior (given in log
+            scale if a log discrepancy is used).
+        :param initials: array of shape (n_chains, n_params), optional
+            Initial values of the sampled parameters for each chain.
+            Defaults to the best evidence points.
+        :param algorithm: string, optional
+            Sampling algorithm to use. Currently only 'nuts' (default)
+            is supported.
+        :param n_evidence: int
+            If the regression model is not fitted yet, specify the
+            amount of evidence.
+        :returns:
+            BolfiSample
+        """
+
+        if self.state['n_batches'] == 0:
+            self.fit(n_evidence)
+
+        if algorithm not in ['nuts']:
+            raise ValueError("Unknown posterior sampler.")
+
+        posterior = self.extract_posterior(threshold)
+        if threshold is None:
+            # Use the threshold of the extracted posterior so that the
+            # CDF transform below is well defined.
+            threshold = float(posterior.threshold)
+        warmup = warmup or n_samples // 2
+
+        # Unless given, select the evidence points with the smallest
+        # discrepancy.
+        if initials is not None:
+            if torch.asarray(initials).shape != (
+                n_chains, self.target_model.input_dim
+            ):
+                raise ValueError(
+                    "The shape of initials must be (n_chains, n_params)."
+                )
+        else:
+            inds = torch.argsort(self.target_model.Y)
+            initials = torch.asarray(self.target_model.X[inds])
+
+        self.target_model.is_sampling = True
+
+        print(
+            "ToDo: check whether the CDF transform of the prior "
+            "distribution equals the distribution given by the prior "
+            "multiplied with the CDF."
+        )
+
+        self.target_model._gp.eval()
+        self.target_model._gp.likelihood.eval()
+
+        def pyro_model(*unused_args):
+            # The initial point passed via model_args is currently
+            # unused; see the ToDo notes below.
+            # Settings (matrices have "small" shape
+            # sample_size x sample_size):
+            #     covar_root_decomposition False,
+            #     log_prob False,
+            #     fast_solves False
+            x = self.pyro_torch_prior
+            with gpytorch.settings.fast_computations(False, False, False):
+                gp_eval = self.target_model._gp.likelihood(
+                    self.target_model._gp(x)
+                )
+            # The transform from GP mean and variance to the "GP CDF".
+            t = pyro.distributions.transforms.CumulativeDistributionTransform(
+                pyro.distributions.Normal(
+                    threshold - gp_eval.mean, torch.sqrt(gp_eval.variance)
+                )
+            )
+            # A uniform distribution over the bounds could be used here
+            # for likelihood (rather than posterior) sampling.
+            # lower_bounds = torch.tensor([
+            #     interval[0] for interval in self.bounds.values()
+            # ])
+            # upper_bounds = torch.tensor([
+            #     interval[1] for interval in self.bounds.values()
+            # ])
+            # u = torch.distributions.Uniform(
+            #     lower_bounds, upper_bounds
+            # )
+            return pyro.sample(
+                "posterior",
+                pyro.distributions.TransformedDistribution(
+                    posterior.prior, [t]
+                )
+            )
+
+        # seed = get_sub_seed(self.seed, ii)
+        # discard bad initialization points
+        # while torch.isinf(torch.asarray(
+        #     posterior.logpdf(initials[ii_initial])
+        # )):
+        #     ii_initial += 1
+        #     if ii_initial == len(inds):
+        #         raise ValueError(
+        #             "BOLFI.sample: Cannot find enough acceptable "
+        #             "initialization points!"
+        #         )
+
+        if algorithm == 'nuts':
+            print("ToDo: pass initial parameters to Pyro-NUTS.")
+            print("ToDo: pass seed to Pyro-NUTS.")
+            print("ToDo: switch from Pyro to NumPyro for performance.")
+            init_params, potential_fn, transforms, _ = (
+                pyro.infer.mcmc.util.initialize_model(
+                    pyro_model,
+                    model_args=(initials[0],),
+                    num_chains=n_chains,
+                )
+            )
+            nuts_kernel = NUTS(potential_fn=potential_fn)
+            mcmc_run = MCMC(
+                nuts_kernel,
+                num_samples=n_samples,
+                warmup_steps=warmup,
+                num_chains=n_chains,
+                initial_params=init_params,
+                transforms=transforms,
+                disable_progbar=False,
+            )
+            mcmc_run.run()
+            # get_samples() returns a dict keyed by sample site; take the
+            # 'posterior' site grouped by chain to obtain a tensor of
+            # shape (n_chains, n_samples, n_params).
+            chains = mcmc_run.get_samples(group_by_chain=True)['posterior']
+            print(chains)
+
+        chains = torch.asarray(chains)
+        print(
+            "{} chains of {} iterations acquired. Effective sample size and "
+            "Rhat for each parameter:".format(n_chains, n_samples)
+        )
+        # for ii, node in enumerate(self.target_model.parameter_names):
+        #     print(
+        #         node,
+        #         eff_sample_size(chains[:, :, ii]),
+        #         gelman_rubin_statistic(chains[:, :, ii])
+        #     )
+        mcmc_run.summary(prob=0.95)
+        self.target_model.is_sampling = False
+
+        self.target_model._gp.train()
+        self.target_model._gp.likelihood.train()
+
+        return BolfiSample(
+            method_name='BOLFI',
+            chains=chains,
+            parameter_names=self.target_model.parameter_names,
+            warmup=warmup,
+            threshold=float(posterior.threshold),
+            n_sim=self.state['n_evidence'],
+            seed=self.seed,
+        )
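The optimization-result extraction and extract_posterior above both locate the minimizer of the GP posterior mean by reusing BoTorch's UpperConfidenceBound with beta=0 and maximize=False. The following standalone sketch demonstrates that trick on a toy one-dimensional SingleTaskGP; it assumes a recent BoTorch/GPyTorch installation, and the training data are made up purely for illustration.

# Sketch: minimize a GP posterior mean via UpperConfidenceBound(beta=0).
import torch
from botorch.acquisition import UpperConfidenceBound
from botorch.fit import fit_gpytorch_mll
from botorch.generation.gen import gen_candidates_scipy, get_best_candidates
from botorch.models import SingleTaskGP
from botorch.optim.initializers import gen_batch_initial_conditions
from gpytorch.mlls import ExactMarginalLogLikelihood

# Toy "discrepancy" observations with a minimum near x = 0.6.
train_x = torch.linspace(0.0, 1.0, 8, dtype=torch.double).unsqueeze(-1)
train_y = (train_x - 0.6) ** 2
gp = SingleTaskGP(train_x, train_y)
fit_gpytorch_mll(ExactMarginalLogLikelihood(gp.likelihood, gp))

bounds = torch.tensor([[0.0], [1.0]], dtype=torch.double)
# With beta=0 and maximize=False the acquisition value is -mean(x), so
# maximizing it yields the minimizer of the posterior mean.
neg_mean = UpperConfidenceBound(gp, beta=0, maximize=False)
x_init = gen_batch_initial_conditions(
    neg_mean, bounds, q=1, num_restarts=10, raw_samples=256
)
candidates, acq_values = gen_candidates_scipy(
    initial_conditions=x_init,
    acquisition_function=neg_mean,
    lower_bounds=bounds[0],
    upper_bounds=bounds[1],
)
x_min = get_best_candidates(candidates, acq_values)
print("posterior-mean minimizer:", x_min.detach().numpy())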
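The sample docstrings above define the likelihood approximation L(theta) proportional to F((h - mu(theta)) / sigma(theta)). Once the surrogate's predictive mean and variance are available on a parameter grid, this quantity is cheap to evaluate directly. In the sketch below, gp_mean and gp_var are made-up placeholder arrays standing in for a GP prediction, and the flat prior is likewise only illustrative.

# Sketch: evaluate the BOLFI likelihood approximation on a 1-D grid.
import numpy as np
import scipy.stats as ss

theta = np.linspace(0.0, 1.0, 200)
gp_mean = (theta - 0.6) ** 2              # placeholder GP predictive mean
gp_var = np.full_like(theta, 1e-4)        # placeholder predictive variance

h = gp_mean.min()                         # threshold: minimum of the GP mean
likelihood = ss.norm.cdf((h - gp_mean) / np.sqrt(gp_var))

# Multiply by a prior density and normalize with the trapezoidal rule to
# obtain an approximate posterior on the grid.
prior_pdf = np.ones_like(theta)           # flat prior, illustration only
posterior = likelihood * prior_pdf
posterior /= np.trapz(posterior, theta)
print("approximate posterior mode:", theta[np.argmax(posterior)])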
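sample reports ELFI's effective sample size and Gelman-Rubin statistic for each parameter. As a reference for what the latter measures, here is a minimal NumPy version of the (non-split) Gelman-Rubin R-hat for a single parameter; in practice ELFI's own mcmc.gelman_rubin_statistic should be preferred.

# Sketch: Gelman-Rubin potential scale reduction factor (R-hat) for one
# parameter, with chains arranged as an array of shape (n_chains, n_samples).
import numpy as np


def gelman_rubin(chains):
    m, n = chains.shape
    chain_means = chains.mean(axis=1)
    within = chains.var(axis=1, ddof=1).mean()     # W: within-chain variance
    between = n * chain_means.var(ddof=1)          # B: between-chain variance
    var_hat = (n - 1) / n * within + between / n   # pooled variance estimate
    return np.sqrt(var_hat / within)


rng = np.random.default_rng(0)
well_mixed = rng.normal(size=(4, 1000))
print(gelman_rubin(well_mixed))   # close to 1.0 for well-mixed chains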
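sample_torch_prototype builds its Pyro model by pushing the prior through a CumulativeDistributionTransform of a normal distribution derived from the GP output, then hands the resulting potential function to NUTS. The sketch below exercises the same Pyro plumbing (TransformedDistribution, initialize_model, NUTS(potential_fn=...), MCMC) with a fixed toy base distribution in place of the GP; it assumes a Pyro version that provides CumulativeDistributionTransform, the distribution parameters are arbitrary, and a single chain is used so the example runs without a multiprocessing guard.

# Sketch: the Pyro NUTS invocation pattern used in sample_torch_prototype,
# with a toy base distribution instead of the GP-derived one.
import torch
import pyro
import pyro.distributions as dist
from pyro.distributions.transforms import CumulativeDistributionTransform
from pyro.infer import MCMC, NUTS
from pyro.infer.mcmc.util import initialize_model


def toy_model():
    base = dist.Normal(torch.tensor(0.0), torch.tensor(1.0))
    # Map the base sample through the CDF of another normal, mimicking the
    # "GP CDF" transform above; the transformed site lives on (0, 1).
    cdf = CumulativeDistributionTransform(
        dist.Normal(torch.tensor(0.1), torch.tensor(0.5))
    )
    return pyro.sample(
        "posterior", dist.TransformedDistribution(base, [cdf])
    )


init_params, potential_fn, transforms, _ = initialize_model(
    toy_model, num_chains=1
)
kernel = NUTS(potential_fn=potential_fn)
mcmc_run = MCMC(
    kernel,
    num_samples=200,
    warmup_steps=100,
    num_chains=1,
    initial_params=init_params,
    transforms=transforms,
    disable_progbar=True,
)
mcmc_run.run()
samples = mcmc_run.get_samples(group_by_chain=True)["posterior"]
print(samples.shape)   # torch.Size([1, 200])
mcmc_run.summary(prob=0.95)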