# Short demo of how to fit and use the GP class to predict unseen values based on a
# mean function and prior distributions.
# Before loading reticulate, you will need to configure your Python Path to
# use the correct Python version where mogp_emulator is installed.
# mogp_emulator requires Python 3, but some OSs still have Python 2 as the
# default, so you may not get the right one unless you explicitly configure
# it in reticulate. I use the Python that I installed on my Mac with homebrew,
# though on Linux the Python installed via a package manager may have a
# different path.
# The environment variable is RETICULATE_PYTHON, and I set it to
# "/usr/local/bin/python" as this is the Python where mogp_emulator is installed.
# This is set automatically in my .Renviron startup file in my home directory,
# but you may want to configure it some other way. No matter how you decide
# to configure it, you have to set it prior to loading the reticulate library.
library(reticulate)
mogp_emulator <- import("mogp_emulator")
mogp_priors <- import("mogp_emulator.Priors")
# create some data
n_train <- 10
x_scale <- 2.
x1 <- runif(n_train)*x_scale
x2 <- runif(n_train)*x_scale
y <- exp(-x1**2 - x2**2)
x <- data.frame(x1, x2, y)
# GaussianProcess requires data as a matrix, but often you may want to do some
# regression using a data frame in R. To do this, we can split this data frame
# into inputs, targets, and a dictionary mapping column names to integer indices
# using the function below
extract_targets <- function(df, target_cols = list("y")) {
"separate a data frame into inputs, targets, and inputdict for use with GP class"
for (t in target_cols) {
stopifnot(t %in% names(x))
}
n_targets <- length(target_cols)
inputs <- matrix(NA, ncol=ncol(x) - n_targets, nrow=nrow(x))
targets <- matrix(NA, ncol=n_targets, nrow=nrow(x))
inputdict <- dict()
input_count <- 1
target_count <- 1
for (n in names(x)) {
if (n %in% target_cols) {
targets[,target_count] <- as.matrix(x[n])
} else {
inputs[,input_count] <- as.matrix(x[n])
inputdict[n] <- as.integer(input_count - 1)
input_count <- input_count + 1
}
}
if (n_targets == 1) {
targets <- c(targets)
}
return(list(inputs, targets, inputdict))
}
target_list <- extract_targets(x)
inputs <- target_list[[1]]
targets <- target_list[[2]]
inputdict <- target_list[[3]]
# Create the mean function formula as a string (or you could extract from the
# formula found via regression). If you want correct expansion of your formula
# in the Python code, you will need to install the patsy package (it is pip
# installable) as it is used internally in mogp_emulator to parse formulas.
# Additionally, you will need to convert the column names from the data frame
# to integer indices in the inputs matrix. This is done with a dict object as
# illustrated below.
mean_func <- "y ~ x[0] + x[1] + I(x[0]*x[1])"
# Priors are specified by giving a list of prior objects (or NULL if you
# wish to use weak prior information). Each distribution has some parameters
# to set -- NormalPrior is (mean, std), Gamma is (shape, scale), and
# InvGammaPrior is (shape, scale). See the documentation or code for the exact
# functional format of the PDF.
# If you don't know how many parameters you need to specify, it depends on
# the mean function and the number of input dimensions. Mean functions
# have a fixed number of parameters (though in some cases this can depend
# on the dimension of the inputs as well), and then covariance functions have
# one correlation length per input dimension plus a covariance scale and
# a nugget parameter.
# If in doubt, you can create the GP instance with no priors, use gp$n_params
# to get the number, and then set the priors manually using gp$priors <- priors
# In this case, we have 4 mean function parameters (normal distribution on a
# linear scale), 2 correlations lengths (normal distribution on a log scale,
# so lognormal), a sigma^2 covariance parameter (inverse gamma) and a nugget
# (Gamma). If you choose an adaptive or fixed nugget, the nugget prior is ignored.
priors <- mogp_priors$GPPriors(mean=mogp_priors$MeanPriors(mean=c(0., 0., 0., 0.),
cov=c(1., 1., 1., 1.)),
corr=list(mogp_priors$LogNormalPrior(1., 1.),
mogp_priors$LogNormalPrior(1., 1.)),
cov=mogp_priors$InvGammaPrior(2., 1.),
nugget=mogp_priors$GammaPrior(1., 0.2))
# Finally, create the GP instance. If we had multiple outputs, we would
# create a MultiOutputGP class in a similar way, but would have the option
# of giving a single mean and list of priors (assumes it is the same for
# each emulator), or a list of mean functions and a list of lists of
# prior distributions. nugget can also be set with a single value or a list.
gp <- mogp_emulator$GaussianProcess(inputs, targets,
mean=mean_func,
priors=priors,
nugget="fit")
# gp is fit using the fit_GP_MAP function. It accepts a GaussianProcess or
# MultiOutputGP object and returns the same type of object with the
# hyperparameters fit via MAP estimation, with some options for how to perform
# the minimization routine. You can also pass the arguments to create a GP/MOGP
# to this function and it will return the object with estimated hyperparameters
gp <- mogp_emulator$fit_GP_MAP(gp)
print(gp$current_logpost)
print(gp$theta)
# now create some test data to make predictions and compare with known values
n_test <- 10000
x1_test <- runif(n_test)*x_scale
x2_test <- runif(n_test)*x_scale
x_test <- cbind(x1_test, x2_test)
y_actual <- exp(-x1_test**2 - x2_test**2)
y_predict <- gp$predict(x_test)
# y_predict is an object holding the mean, variance and derivatives (if computed)
# access the values via y_predict$mean, y_predict$unc, and y_predict$deriv
print(sum((y_actual - y_predict$mean)**2)/n_test)