"""
Implement GP using GPytorch which naturally supports gradient computation.
It's fairly scalable since it uses GPU and some other tricks.
The gradient computation is a pain but eventually I was able to do it after some search.
"""
import copy
import tqdm
from pdb import set_trace
import numpy as np
import numpy.linalg as la
from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformIntegerHyperparameter
try:
import torch
import gpytorch
from gpytorch.models import ApproximateGP
from gpytorch.variational import CholeskyVariationalDistribution
from gpytorch.variational import VariationalStrategy
from torch.utils.data import TensorDataset, DataLoader
except:
print("GPytorch is not installed, cannot import this module")
from .model import Model, ModelFactory
def transform_input(xu_means, xu_std, XU):
XUt = []
for i in range(XU.shape[1]):
XUt.append((XU[:,i] - xu_means[i]) / xu_std[i])
return np.vstack(XUt).T
def transform_output(xu_means, xu_std, XU):
XUt = []
for i in range(XU.shape[1]):
XUt.append((XU[:,i] * xu_std[i]) + xu_means[i])
return np.vstack(XUt).T
class GPytorchGP(Model):
"""Define a base class that can be extended to both scalable and un-scalable case"""
def __init__(self, system, mean='constant', kernel='RBF', niter=40, lr=0.1,
use_cuda=True):
super().__init__(system)
self.niter = niter
self.lr = lr
self.device = (torch.device('cuda') if (use_cuda and torch.cuda.is_available())
else torch.device('cpu'))
if use_cuda and torch.cuda.is_available():
print("Cuda is used for GPytorch")
self.gpmodel = None
self.gp_mean = mean
self.gp_kernel = kernel
@staticmethod
def get_configuration_space(system):
cs = ConfigurationSpace()
return cs
def update_state(self, state, new_ctrl, new_obs):
return np.copy(new_obs)
def traj_to_state(self, traj):
return traj[-1].obs[:]
def state_to_obs(self, state):
return state[:]
def pred(self, state, ctrl):
X = np.concatenate([state, ctrl])
X = X[np.newaxis,:]
Xt = transform_input(self.xu_means, self.xu_std, X)
# for this one, make a prediction is easy...
TsrXt = torch.from_numpy(Xt).to(self.device)
predy = self.gpmodel.likelihood(self.gpmodel(TsrXt))
out = predy.mean.cpu().data.numpy()
dy = transform_output(self.dy_means, self.dy_std, out).flatten()
return state + dy
def get_sampler(self):
d = self.system.obs_dim
u = np.random.normal(loc=0, scale=1, size=d).reshape((d, 1))
def sample(state, ctrl):
X = np.concatenate([state, ctrl])
X = X[np.newaxis,:]
Xt = transform_input(self.xu_means, self.xu_std, X)
# for this one, make a prediction is easy...
TsrXt = torch.from_numpy(Xt).to(self.device)
predy = self.gpmodel.likelihood(self.gpmodel(TsrXt))
#predf = self.gpmodel(TsrXt)
mean = predy.mean.cpu().data.reshape((d,1))
cov = predy.covariance_matrix.cpu().data
L = np.linalg.cholesky(cov)
out = mean + np.dot(L, u)
out = out.reshape((1,d))
#out2 = predy.sample().cpu().data.numpy()
dy = transform_output(self.dy_means, self.dy_std, out).flatten()
return state + dy
return sample
def sample(self, state, ctrl):
X = np.concatenate([state, ctrl])
X = X[np.newaxis,:]
Xt = transform_input(self.xu_means, self.xu_std, X)
# for this one, make a prediction is easy...
TsrXt = torch.from_numpy(Xt).to(self.device)
predy = self.gpmodel.likelihood(self.gpmodel(TsrXt))
#predf = self.gpmodel(TsrXt)
d = self.system.obs_dim
mean = predy.mean.cpu().data.reshape((d,1))
cov = predy.covariance_matrix.cpu().data
L = np.linalg.cholesky(cov)
u = np.random.normal(loc=0, scale=1, size=d).reshape((d, 1))
out = mean + np.dot(L, u)
out = out.reshape((1,d))
#out2 = predy.sample().cpu().data.numpy()
dy = transform_output(self.dy_means, self.dy_std, out).flatten()
return state + dy
def pred_timeit(self, state, ctrl):
import time
start = time.time()
X = np.concatenate([state, ctrl])
X = X[np.newaxis,:]
Xt = transform_input(self.xu_means, self.xu_std, X)
print("time1=", (time.time() - start)*1000, "ms")
# for this one, make a prediction is easy...
TsrXt = torch.from_numpy(Xt).to(self.device)
print("time2=", (time.time() - start)*1000, "ms")
predy = self.gpmodel.likelihood(self.gpmodel(TsrXt))
print("time3=", (time.time() - start)*1000, "ms")
out = predy.mean.cpu().data.numpy()
print("time4=", (time.time() - start)*1000, "ms")
dy = transform_output(self.dy_means, self.dy_std, out).flatten()
print("time5=", (time.time() - start)*1000, "ms")
return state + dy
def pred_batch(self, state, ctrl):
"""The batch mode"""
X = np.concatenate([state, ctrl], axis=1)
Xt = transform_input(self.xu_means, self.xu_std, X)
TsrXt = torch.from_numpy(Xt).to(self.device)
predy = self.gpmodel.likelihood(self.gpmodel(TsrXt))
out = predy.mean.cpu().data.numpy()
dy = transform_output(self.dy_means, self.dy_std, out).flatten()
return state + dy.reshape((state.shape[0], self.state_dim))
def sample_parallel(self, state, ctrl):
"""The batch mode"""
X = np.concatenate([state, ctrl], axis=1)
Xt = transform_input(self.xu_means, self.xu_std, X)
TsrXt = torch.from_numpy(Xt).to(self.device)
predy = self.gpmodel.likelihood(self.gpmodel(TsrXt))
out = predy.sample().cpu().data.numpy()
dy = transform_output(self.dy_means, self.dy_std, out).flatten()
return state + dy.reshape((state.shape[0], self.state_dim))
def pred_diff(self, state, ctrl):
"""Prediction, but with gradient information"""
X = np.concatenate([state, ctrl])
X = X[np.newaxis,:]
Xt = transform_input(self.xu_means, self.xu_std, X)
obs_dim = len(state)
# get the Tensor
TsrXt = torch.from_numpy(Xt).to(self.device)
TsrXt = TsrXt.repeat(obs_dim, 1)
TsrXt.requires_grad_(True)
predy = self.gpmodel.likelihood(self.gpmodel(TsrXt)).mean
predy.backward(torch.eye(obs_dim).to(self.device), retain_graph=True)
jac = TsrXt.grad.cpu().data.numpy()
# properly scale back...
jac = jac / self.xu_std[None] * self.dy_std[:, np.newaxis]
# since repeat, y value is the first one...
out = predy[:1,:].cpu().data.numpy()
dy = transform_output(self.dy_means, self.dy_std, out).flatten()
n = self.system.obs_dim
state_jac = jac[:, :n] + np.eye(n)
ctrl_jac = jac[:, n:]
return state + dy, state_jac, ctrl_jac
def pred_diff_parallel(self, state, ctrl):
"""Prediction, but with gradient information"""
X = np.concatenate([state, ctrl], axis=1)
Xt = transform_input(self.xu_means, self.xu_std, X)
obs_dim = state.shape[1]
m = state.shape[0]
# get the Tensor
TsrXt = torch.from_numpy(Xt).to(self.device)
TsrXt = TsrXt.repeat(obs_dim, 1, 1).permute(1,0,2).flatten(0,1)
TsrXt.requires_grad_(True)
predy = self.gpmodel.likelihood(self.gpmodel(TsrXt)).mean
predy.backward(torch.eye(obs_dim).to(self.device).repeat(m,1), retain_graph=True)
predy = predy.reshape((m, obs_dim, obs_dim))
#predy.backward(retain_graph=True)
jac = TsrXt.grad.cpu().data.numpy()
jac = jac.reshape((m, obs_dim, TsrXt.shape[-1]))
# properly scale back...
jac = jac / np.tile(self.xu_std, (m,obs_dim,1)) * np.tile(self.dy_std, (m,1))[:,:,np.newaxis]
# since repeat, y value is the first one...
out = predy[:,0,:].cpu().data.numpy()
dy = transform_output(self.dy_means, self.dy_std, out)
n = self.system.obs_dim
state_jacs = jac[:, :, :n] + np.tile(np.eye(n), (m,1,1))
ctrl_jacs = jac[:, :, n:]
return state + dy, state_jacs, ctrl_jacs
@property
def state_dim(self):
return self.system.obs_dim
def get_parameters(self):
raise NotImplementedError
def set_parameters(self, params):
raise NotImplementedError
class BatchIndependentMultitaskGPModel(gpytorch.models.ExactGP):
def __init__(self, num_task, mean='constant', kernel='RBF'):
likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(num_tasks=num_task)
super().__init__(None, None, likelihood)
self.mean_module = gpytorch.means.ConstantMean(batch_shape=torch.Size([num_task]))
self.covar_module = gpytorch.kernels.ScaleKernel(
gpytorch.kernels.RBFKernel(batch_shape=torch.Size([num_task])),
batch_shape=torch.Size([num_task])
)
def forward(self, x):
mean_x = self.mean_module(x)
covar_x = self.covar_module(x)
return gpytorch.distributions.MultitaskMultivariateNormal.from_batch_mvn(
gpytorch.distributions.MultivariateNormal(mean_x, covar_x), task_dim=0
)
class ApproximateGPytorchModel(gpytorch.models.ApproximateGP):
def __init__(self, inducing_points, num_task, mean='constant', kernel='RBF'):
# We have to mark the CholeskyVariationalDistribution as batch
# so that we learn a variational distribution for each task
size = torch.Size([num_task])
variational_distribution = gpytorch.variational.CholeskyVariationalDistribution(
inducing_points.size(-2), batch_shape=size
)
# We have to wrap the VariationalStrategy in a MultitaskVariationalStrategy
# so that the output will be a MultitaskMultivariateNormal rather than a batch output
variational_strategy = gpytorch.variational.IndependentMultitaskVariationalStrategy(
gpytorch.variational.VariationalStrategy(
self, inducing_points, variational_distribution, learn_inducing_locations=True
), num_tasks=num_task
)
super().__init__(variational_strategy)
# The mean and covariance modules should be marked as batch
# so we learn a different set of hyperparameters
self.mean_module = gpytorch.means.ConstantMean(batch_shape=size)
self.covar_module = gpytorch.kernels.ScaleKernel(
gpytorch.kernels.RBFKernel(batch_shape=size),
batch_shape=size
)
def forward(self, x):
# The forward function should be written as if we were dealing with each output
# dimension in batch
mean_x = self.mean_module(x)
covar_x = self.covar_module(x)
return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
class LargeGaussianProcess(GPytorchGP):
def __init__(self, system, mean='constant', kernel='RBF', niter=40, lr=0.1):
super().__init__(system, mean, kernel, niter, lr)
self.gpmodel = BatchIndependentMultitaskGPModel(self.system.obs_dim, mean, kernel).double()
self.gpmodel = self.gpmodel.to(self.device)
def train(self, trajs, silent=False):
# Initialize kernels
self.gpmodel.train()
self.gpmodel.likelihood.train()
optimizer = torch.optim.Adam(self.gpmodel.parameters(), lr=self.lr) # Includes GaussianLikelihood parameters
# "Loss" for GPs - the marginal log likelihood
mll = gpytorch.mlls.ExactMarginalLogLikelihood(self.gpmodel.likelihood, self.gpmodel)
# prepare data
X = np.concatenate([traj.obs[:-1,:] for traj in trajs])
dY = np.concatenate([traj.obs[1:,:] - traj.obs[:-1,:] for traj in trajs])
U = np.concatenate([traj.ctrls[:-1,:] for traj in trajs])
XU = np.concatenate((X, U), axis = 1) # stack X and U together
self.xu_means = np.mean(XU, axis=0)
self.xu_std = np.std(XU, axis=0)
XUt = transform_input(self.xu_means, self.xu_std, XU)
self.dy_means = np.mean(dY, axis=0)
self.dy_std = np.std(dY, axis=0)
dYt = transform_input(self.dy_means, self.dy_std, dY)
# convert into desired tensor
train_x = torch.from_numpy(XUt)
train_y = torch.from_numpy(dYt)
train_x = train_x.to(self.device)
train_y = train_y.to(self.device).contiguous()
self.gpmodel.set_train_data(train_x, train_y, False)
for i in range(self.niter):
optimizer.zero_grad()
output = self.gpmodel(train_x)
loss = -mll(output, train_y)
loss.backward()
print('Iter %d/%d - Loss: %.3f' % (i + 1, self.niter, loss.item()))
optimizer.step()
# training is finished, now go to eval mode
self.gpmodel.eval()
self.gpmodel.likelihood.eval()
# this part implements the approximate GP
[docs]class ApproximateGPModelFactory(ModelFactory):
"""
Gaussian Processes (GPs) are a non-parametric regression method. Since GPs have trouble
scaling to large training sets, this class provides a variational GP which automatically
selects a subset of the training data for inference. This functionality is provided by
the gPyTorch library. For more details see the original documentation_, and the
corresponding paper at https://arxiv.org/pdf/1411.2005.pdf
.. _documentation: https://docs.gpytorch.ai/en/v1.1.1/examples/04_Variational_and_Approximate_GPs/SVGP_Regression_CUDA.html
Hyperparameters:
- *induce_count* (Type: int, Lower: 50, Upper: 200, Default: 100): Number of inducing points
to include in the gaussian process.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.Model = ApproximateGPModel
self.name = "ApproximateGP"
def get_configuration_space(self):
cs = ConfigurationSpace()
induce_count = UniformIntegerHyperparameter("induce_count", lower=50,
upper=200, default_value=100)
cs.add_hyperparameter(induce_count)
return cs
class ApproximateGPModel(GPytorchGP, Model):
def __init__(self, system, mean='constant', kernel='RBF', niter=5, lr=0.1, batch_size=1024, induce_count=500, **kwargs):
super().__init__(system, mean, kernel, niter, lr, **kwargs)
self.batch_size = batch_size
self.induce_count = induce_count
def train(self, trajs, silent=False):
"""Given collected trajectories, train the GP to approximate the actual dynamics"""
# extract transfer pairs from data
X = np.concatenate([traj.obs[:-1,:] for traj in trajs])
dY = np.concatenate([traj.obs[1:,:] - traj.obs[:-1,:] for traj in trajs])
num_task = dY.shape[1]
self.num_task = num_task
U = np.concatenate([traj.ctrls[:-1,:] for traj in trajs])
XU = np.concatenate((X, U), axis = 1) # stack X and U together
self.xu_means = np.mean(XU, axis=0)
self.xu_std = np.std(XU, axis=0)
XUt = transform_input(self.xu_means, self.xu_std, XU)
self.dy_means = np.mean(dY, axis=0)
self.dy_std = np.std(dY, axis=0)
dYt = transform_input(self.dy_means, self.dy_std, dY)
# convert into desired tensor data loader
train_x = torch.from_numpy(XUt)
train_y = torch.from_numpy(dYt)
train_x = train_x.to(self.device)
train_y = train_y.to(self.device)
train_dataset = TensorDataset(train_x, train_y)
train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
# construct the approximate GP instance
induce = torch.stack([train_x[:self.induce_count] for _ in range(num_task)], dim=0)
self.induce = induce
self.gpmodel = ApproximateGPytorchModel(induce, num_task, self.gp_mean, self.gp_kernel).double()
self.gpmodel = self.gpmodel.to(self.device)
likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(num_tasks=num_task)
likelihood = likelihood.to(self.device)
# Initialize kernels
self.gpmodel.train()
likelihood.train()
optimizer = torch.optim.Adam([
{'params': self.gpmodel.parameters()},
{'params': likelihood.parameters()},
], lr=self.lr)
# Our loss object. We're using the VariationalELBO
mll = gpytorch.mlls.VariationalELBO(likelihood, self.gpmodel, num_data=train_y.size(0))
if silent:
itr = range(self.niter)
else:
itr = tqdm.tqdm(range(self.niter))
for i in itr:
# Within each iteration, we will go over each minibatch of data
for x_batch, y_batch in train_loader:
optimizer.zero_grad()
output = self.gpmodel(x_batch)
loss = -mll(output, y_batch)
# minibatch_iter.set_postfix(loss=loss.item())
loss.backward()
optimizer.step()
self.gpmodel.eval()
likelihood.eval()
self.gpmodel.likelihood = likelihood
def get_parameters(self):
return {"gpmodel_state" : self.gpmodel.state_dict(),
"induce" : self.induce,
"xu_means" : self.xu_means,
"xu_std" : self.xu_std,
"dy_means" : self.dy_means,
"dy_std" : self.dy_std,
"num_task" : self.num_task}
def set_parameters(self, params):
self.xu_means = params["xu_means"]
self.xu_std = params["xu_std"]
self.dy_means = params["dy_means"]
self.dy_std = params["dy_std"]
self.induce = params["induce"]
self.num_task = params["num_task"]
self.gpmodel = ApproximateGPModel(self.induce, self.num_task, self.gp_mean,
self.gp_kernel).double()
self.gpmodel = self.gpmodel.to(self.device)
likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(
num_tasks=self.num_task)
likelihood = likelihood.to(self.device)
self.gpmodel.likelihood = likelihood
self.gpmodel.load_state_dict(params["gpmodel_state"])