Gaussian Process Regression (GPR) (a.k.a. Kriging) Introduction¶
This notebook covers the basics of Gaussian Process Regression (GPR), including how to construct a covariance matrix, the prior distribution, and the posterior distribution, as well as how to make predictions using the posterior. A list of useful resources can be found at the bottom.
The Multivariate Distribution¶
Let's see an example, starting with the simplest multivariate normal distribution - the bivariate normal distribution.
If we have a two-dimensional random vector $\textbf{X}$, composed of random variables $ X_1$ and $X_2$, the bivariate normal distribution would be written as
$$ \begin{bmatrix} {X_1} \\ {X_2} \end{bmatrix} \sim \mathcal{N} \left (\mu, \Sigma \right) $$
where $\mu$ represents the mean vector, containing the mean (expected) values of $X_1$ and $X_2$, i.e.
$$ \mu = \begin{bmatrix} \mathbb E {X_1} \\ \mathbb E {X_2} \end{bmatrix} $$
and $\Sigma$ represents the covariance matrix containing covariance values between the two random variables
$$ \Sigma = \begin{bmatrix} cov( X_1, X_1) & cov( X_1, X_2) \\ cov(X_2, X_1) & cov( X_2, X_2) \end{bmatrix} $$
When the covariance is evaluated between a random variable and itself, as in $cov(X_1, X_1)$, the covariance equals the variance of the random variable $X_1$.
The simplest covariance matrix for a bivariate distribution can be represented with the identity matrix
$$ \Sigma = \begin{bmatrix} 1 & 0 \\ 0 & 1 \end{bmatrix} $$
in which the covariance between the two random variables, $cov( X_1, X_2)$ and $cov( X_2, X_1)$, is 0. When the covariance between the two variables is 0, these variables are independent, meaning that one variable does not affect the probability of the other.
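As a quick numerical check (a sketch, not part of the original notebook), we can sample from this bivariate standard normal distribution and confirm that the empirical covariance matrix is close to the identity:

```python
import numpy as np

rng = np.random.default_rng(0)
mu = np.zeros(2)
Sigma = np.eye(2)  # identity covariance: X1 and X2 are independent
samples = rng.multivariate_normal(mu, Sigma, size=100_000)  # shape (100000, 2)

emp_cov = np.cov(samples.T)  # empirical 2x2 covariance matrix
print(emp_cov)  # close to [[1, 0], [0, 1]]
```

With 100,000 samples the off-diagonal entries (the covariances) come out near 0 and the diagonal entries (the variances) near 1, as the identity covariance matrix dictates.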
Scaling this bivariate normal distribution to $n$ number of dimensions expands the multivariate normal distribution to
$$ \begin{align*} \mathbf X = \begin{bmatrix} \mathbf{X_1} \\ \mathbf{X_2} \\ \vdots \\ \mathbf{X_n} \end{bmatrix} \sim \mathcal{N} \left ( \begin{bmatrix} \mathbb E \mathbf{X_1} \\ \mathbb E \mathbf{X_2} \\ \vdots \\ \mathbb E \mathbf{X_n} \end{bmatrix} , \begin{bmatrix} cov(\mathbf{X}_1, \mathbf{X}_1) & \ldots & cov(\mathbf{X}_1, \mathbf{X}_n) \\ \vdots & \ddots & \vdots \\ cov(\mathbf{X}_n, \mathbf{X}_1) & \ldots & cov(\mathbf{X}_n, \mathbf{X}_n) \end{bmatrix} \right) \end{align*} $$
Note that in this case the random variables are written in bold. This is because they can also be sub-vectors of a random vector $\mathbf X$. An $n$-dimensional multivariate distribution where the mean vector is $\mathbf 0$ and the covariance matrix is an $n$-dimensional identity matrix is called a standard normal multivariate distribution. This distribution describes an $n$-dimensional vector where all the entries are mutually independent. It will be used later when we start forming the so-called prior distribution of the Gaussian Process.
What is a Gaussian Process (GP)?¶
From Wikipedia: In probability theory and statistics, a Gaussian process is a stochastic process (a collection of random variables indexed by time or space), such that every finite collection of those random variables has a multivariate normal distribution, i.e. every finite linear combination of them is normally distributed. The distribution of a Gaussian process is the joint distribution of all those (infinitely many) random variables, and as such, it is a distribution over functions with a continuous domain, e.g. time or space.
Let's break this statement down.
"... a Gaussian process is a stochastic process..." A stochastic process is a system that evolves randomly over some period of time (or space). An example of a stochastic process is Brownian motion, where particles move randomly in a medium over a period of time. A particle's change in position $\Delta d$ represents the random variable and is calculated for each timestep ($\Delta t$). The position of a particle after a timestep, given its current position $d(t)$, can be found as
$$ d(t+\Delta t) = d(t) + \Delta d $$
Brownian motion can be simulated over time by plotting a position vs. time graph. The code is taken from peterroelants.github.io.
# Imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib notebook
# 1D simulation of the Brownian motion process
total_time = 1  # total time
nb_steps = 500  # number of timesteps
delta_t = total_time / nb_steps  # value of timestep
nb_processes = 5  # simulate 5 different motions
mean = 0  # mean of each movement
stdev = np.sqrt(delta_t)  # standard deviation of each movement
# Simulate the Brownian motions in a 1D space by cumulatively
# making a new movement delta_d
positions = np.cumsum(
    # Move randomly from current location to N(0, delta_t)
    np.random.normal(mean, stdev, (nb_processes, nb_steps)),
    axis=1)
plt.figure(figsize=(6, 4))
# Make the plots
t = np.arange(0, total_time, delta_t)
for i in range(nb_processes):
    plt.plot(t, positions[i, :])
    # plt.plot(t, positions[i, :], alpha=0.1)  # use this if you change nb_processes to 500:
    # you will see that many functions are centered around 0 (the mean);
    # the further you move away from the mean, the less common the functions become.
    # This is what it means to have a (normal) distribution over functions.
plt.title('Brownian motion process\n'
          'Position over time for 5 independent realizations')
plt.xlabel('$t$ (time)', fontsize=13)
plt.ylabel('$d$ (position)', fontsize=13)
plt.xlim([0, 1])
plt.tight_layout()
plt.show()
Code Walk-through¶
First we define the total time of the simulation, the number of steps, the timestep ($\Delta t$), and the number of particle paths (realizations) to plot. Finally, we define the mean and standard deviation of the normal distribution from which we sample each random movement $\Delta d$, written as
$$ \Delta d \sim \mathcal{N} \left( 0,\Delta t \right) $$
This equation means we sample from a univariate normal distribution with a mean of 0 and variance of $\Delta t$. Of course, we could have chosen a different mean or standard deviation/variance for the distribution which would have affected the $\Delta d$ values and thus the paths of the particles. To get the positions of each particle after a period of time, we do a cumulative sum over the $\Delta d$ values that are sampled from the above distribution.
If we want to sample many independent random variables $\Delta d_n$ at once, we can sample from a multivariate normal distribution that has the same mean (in this case a mean vector) and the same variance (in the form of a covariance matrix).
$$ \begin{align*} \begin{bmatrix} \Delta d_1 \\ \Delta d_2 \\ \vdots \\ \Delta d_n \end{bmatrix} \sim \mathcal{N} \left ( \begin{bmatrix} 0 \\ 0 \\ \vdots \\ 0 \end{bmatrix} , \begin{bmatrix} var(\Delta d_1) & \ldots & 0 \\ \vdots & \ddots & \vdots \\ 0 & \ldots & var(\Delta d_n) \end{bmatrix} \right) \end{align*} $$
This is the same as sampling from a standard normal multivariate distribution with covariance matrix $\mathbf I_n$, where $n$ is the dimensionality, except that instead of the diagonal having a variance of 1, it is multiplied by the value $\Delta t$ to give the variance of $\Delta d$. Notice that all the off-diagonal values are 0 because there is no covariance between any two $\Delta d$ variables.
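This equivalence can be checked numerically (a sketch with illustrative values): one draw from the $n$-dimensional distribution $\mathcal N(\mathbf 0, \Delta t \cdot \mathbf I)$ behaves like $n$ independent $\mathcal N(0, \Delta t)$ samples:

```python
import numpy as np

rng = np.random.default_rng(1)
n = 500
delta_t = 1 / n

# One draw from the n-dimensional distribution N(0, delta_t * I)
delta_d = rng.multivariate_normal(np.zeros(n), delta_t * np.eye(n))

# The entries behave like independent N(0, delta_t) samples,
# so their empirical variance is close to delta_t
print(delta_d.var())
```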
We can have infinitely many realizations of this stochastic process, and all the realizations are going to have similar properties because the random variables $\Delta d$ are sampled from the same multivariate distribution. Therefore, we can think of a stochastic process as a distribution over functions, which is what a Gaussian Process (GP) is.
Not all random variables are independent of each other. When there is a dependency between random variables, we can exploit it to predict the distribution of one random variable based on another. This dependency is embedded in the covariance matrix in the form of the covariance between each pair of random variables. In Gaussian Process Regression (GPR), we define how two output values depend on each other (i.e. we define how the covariance matrix is evaluated) by kernel functions.
GPR Kernels¶
In Gaussian Process Regression (GPR), a kernel function expresses how output values are related to one another.
The most commonly used kernel is the radial basis function (RBF) kernel, also known as the squared exponential (SE) kernel. It is written as
$$ SE(\textbf {x},\textbf {x'}) = \sigma^2\exp\left({-\frac{1}{2}\frac{(\textbf {x} - \textbf {x'})^2}{l^2}}\right) $$
where $\textbf {x}$ and $\textbf {x'}$ are two input vectors, $\sigma^2$ is the variance, and $l$ is the lengthscale. The lengthscale determines how smooth the functions in a Gaussian Process are (i.e. how fast they change in space/time). The variance determines the average distance of the function from its mean and is just a scaling factor which is sometimes omitted.
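For intuition, here is the kernel evaluated by hand for a few pairs of scalar inputs (a minimal sketch; the helper `se` and the values of `sigma` and `l` are illustrative):

```python
import numpy as np

def se(x1, x2, sigma=1.0, l=2.0):
    """Squared exponential kernel for scalar inputs."""
    return sigma**2 * np.exp(-0.5 * (x1 - x2)**2 / l**2)

print(se(0.0, 0.0))   # 1.0: zero distance gives the variance sigma^2
print(se(0.0, 2.0))   # exp(-0.5 * 4 / 4) = exp(-0.5) ≈ 0.6065
print(se(0.0, 10.0))  # distant points are almost uncorrelated
```

The covariance is maximal (equal to $\sigma^2$) when the inputs coincide and decays smoothly toward 0 as the inputs move apart.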
There are many kernels to choose from, a list can be found in the Kernel Cookbook.
It is important to note that adding or multiplying two kernels together results in a new kernel. This means kernels can be mixed and matched to find one that best fits your data. Finding an optimal kernel is still an active area of research and an automated selection process has been outlined in the Kernel Cookbook.
To improve the SE kernel, a SE automatic relevance detection (SE-ARD) kernel can be constructed as a product of many SE kernels
$$ SE\text{-}ARD(\textbf {x},\textbf {x'}) =\prod_{d=1}^{D} {\sigma_d^2\exp\left({-\frac{1}{2}\frac{(x_d - x_d')^2}{l_d^2}}\right)} = {\sigma_f^2\exp\left({-\frac{1}{2}\sum_{d=1}^{D}\frac{(x_d - x_d')^2}{l_d^2}}\right)} $$
What this does is define a lengthscale $l_d$ for each dimension $d$ of the input vectors $\textbf {x}$ and $\textbf{x'}$. If changing one dimension of the input vector does not result in changes to the outputs, this suggests using a longer lengthscale along that dimension, and vice versa.
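A sketch of the SE-ARD kernel for 2-D inputs with a separate lengthscale per dimension (the helper name and the values are illustrative, not from the notebook):

```python
import numpy as np

def se_ard(x, x_prime, sigmas, ls):
    """SE-ARD kernel: product of per-dimension SE kernels."""
    x, x_prime = np.asarray(x, float), np.asarray(x_prime, float)
    terms = sigmas**2 * np.exp(-0.5 * (x - x_prime)**2 / ls**2)
    return np.prod(terms)

sigmas = np.array([1.0, 1.0])
ls = np.array([0.5, 10.0])  # dimension 2 barely matters: long lengthscale

# Moving by 1 in dimension 1 changes the covariance a lot...
print(se_ard([0, 0], [1, 0], sigmas, ls))  # exp(-2) ≈ 0.135
# ...moving the same amount in dimension 2 barely changes it.
print(se_ard([0, 0], [0, 1], sigmas, ls))  # exp(-0.005) ≈ 0.995
```

The long lengthscale along dimension 2 encodes that this dimension is nearly irrelevant to the output, which is exactly the "automatic relevance detection" idea.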
The Prior Distribution¶
The prior distribution is a distribution over functions, which expresses one's beliefs about the outputs before making any observations (before you have observed any data).
How to Construct the Prior¶
To make the prior, we will need to make an $n$-dimensional vector of input values from which we can calculate an $n \times n$ covariance matrix. We can then sample a random vector of values (I'll call this vector $\mathbf Z$) from an $n$-dimensional standard normal multivariate distribution, corresponding to our $n$ inputs. We can write this as
$$ \begin{align*} \begin{bmatrix} \mathbf{Z_1} \\ \mathbf{Z_2} \\ \vdots \\ \mathbf{Z_n} \end{bmatrix} \sim \mathcal{N} \left ( \begin{bmatrix} 0 \\ 0 \\ \vdots \\ 0 \end{bmatrix} , \begin{bmatrix} 1 & \ldots & 0 \\ \vdots & \ddots & \vdots \\ 0 & \ldots & 1 \end{bmatrix} \right) \end{align*} $$
The last thing to do is to transform this random vector $\mathbf Z$ into a new vector $\mathbf Y$. This new vector $\mathbf Y$ will represent our output values, following the mean vector $\mu$ and covariance matrix $\Sigma$ that we select (note that $\mathbb E \mathbf Z = \mathbf 0$, so the mean of the transformed vector is simply $\mu$).
$$ \begin{align*} \begin{bmatrix} \mathbf{Y_1} \\ \mathbf{Y_2} \\ \vdots \\ \mathbf{Y_n} \end{bmatrix} \sim \mathcal{N} \left ( \mu , \Sigma \right) \end{align*} $$
We will look into how to do this transformation a bit later, for now let's construct the covariance matrix.
Covariance Matrix ($\Sigma$)¶
To make the prior distribution, we begin with constructing a covariance matrix. The same approach is used when utilizing other kernels, however it is important that the selected kernel can fit your data well (for example if you know the data is going to be periodic, you should probably select a periodic kernel or combine a periodic kernel with an RBF kernel).
In this case, an RBF kernel is going to be used to make the matrix. Let us look at a Python example for a one-dimensional input covariance matrix.
n = 100  # number of input points
x = np.linspace(-7, 7, n).reshape(n, 1)  # vector of evenly distributed input points

def se_kernel(a, b, sigma, l):
    """SE kernel with lengthscale l and standard deviation sigma"""
    dist2 = (a - b.T)**2  # squared distance between every pair of inputs
    return sigma**2 * np.exp(-dist2 / (2 * l**2))

sigma = 1  # sigma usually stands for standard deviation, variance = stdev^2
l = 2  # lengthscale
K_ss = se_kernel(x, x, sigma, l)
df = pd.DataFrame(K_ss,
                  index=[f"input{i}" for i in range(1, n+1)],
                  columns=[f"input{i}" for i in range(1, n+1)])
display(df.iloc[:10, :10])
|  | input1 | input2 | input3 | input4 | input5 | input6 | input7 | input8 | input9 | input10 |
|---|---|---|---|---|---|---|---|---|---|---|
| input1 | 1.000000 | 0.997503 | 0.990051 | 0.977753 | 0.960793 | 0.939419 | 0.913940 | 0.884717 | 0.852158 | 0.816703 |
| input2 | 0.997503 | 1.000000 | 0.997503 | 0.990051 | 0.977753 | 0.960793 | 0.939419 | 0.913940 | 0.884717 | 0.852158 |
| input3 | 0.990051 | 0.997503 | 1.000000 | 0.997503 | 0.990051 | 0.977753 | 0.960793 | 0.939419 | 0.913940 | 0.884717 |
| input4 | 0.977753 | 0.990051 | 0.997503 | 1.000000 | 0.997503 | 0.990051 | 0.977753 | 0.960793 | 0.939419 | 0.913940 |
| input5 | 0.960793 | 0.977753 | 0.990051 | 0.997503 | 1.000000 | 0.997503 | 0.990051 | 0.977753 | 0.960793 | 0.939419 |
| input6 | 0.939419 | 0.960793 | 0.977753 | 0.990051 | 0.997503 | 1.000000 | 0.997503 | 0.990051 | 0.977753 | 0.960793 |
| input7 | 0.913940 | 0.939419 | 0.960793 | 0.977753 | 0.990051 | 0.997503 | 1.000000 | 0.997503 | 0.990051 | 0.977753 |
| input8 | 0.884717 | 0.913940 | 0.939419 | 0.960793 | 0.977753 | 0.990051 | 0.997503 | 1.000000 | 0.997503 | 0.990051 |
| input9 | 0.852158 | 0.884717 | 0.913940 | 0.939419 | 0.960793 | 0.977753 | 0.990051 | 0.997503 | 1.000000 | 0.997503 |
| input10 | 0.816703 | 0.852158 | 0.884717 | 0.913940 | 0.939419 | 0.960793 | 0.977753 | 0.990051 | 0.997503 | 1.000000 |
Code Walk-through¶
First, we generate our inputs, which in this case are 100 evenly spaced values between -7 and 7 inclusive. These values are stored in a vector $\textbf x$. Then we define the SE kernel as discussed previously. In this case, each component of $\textbf x$ is a one-dimensional input (i.e. a scalar value), so the SE kernel and the SE-ARD kernel are identical. We also specify the standard deviation "sigma" (the standard deviation is the square root of the variance) and the lengthscale to be used. Finally, we create and display the covariance matrix (a symmetric positive-definite matrix). Note: Only the first 10 rows and columns of the matrix are shown (the whole matrix is square, 100x100, because we selected 100 input values between -7 and 7).
Relating Outputs to One Another¶
The end goal of GPR is to be able to model how outputs relate to one another, based on inputs. Thus, if you know (if you have observed or evaluated) the output at certain input points, you would be able to predict outputs at new input points that you have not seen before. So far, we have only been talking about how to construct a covariance matrix from the inputs. The numbers in the matrix represent the covariance between each two output points based on their respective input points. In the SE kernel, the covariance between two output points decreases as the distance between their respective input points increases. You will notice this trend in the covariance matrix above. The covariance values decrease along a row (or column) as the distance between two input points increases. A low covariance does not mean that input values that are far away from each other must have different output values. It only suggests that the output values are not related, meaning that you cannot confidently predict an output value at a new input location, based on a known output value that is far away.
When the distance between two points is 0, the covariance between the points essentially becomes the variance of that particular output point. This is because the distance between the two points is 0, so the exponent of the kernel evaluates to 0. Then $e^0 = 1$, which is scaled by the variance $\sigma^2$ (again, the variance is sometimes omitted from the SE kernel).
Sampling prior outputs using the covariance matrix¶
As explained previously, we want our prior outputs $\mathbf Y$ to follow the normal distribution $$ \begin{align*} \begin{bmatrix} \mathbf{Y_1} \\ \mathbf{Y_2} \\ \vdots \\ \mathbf{Y_n} \end{bmatrix} \sim \mathcal{N} \left ( \mu , \Sigma \right) \end{align*} $$
Instead of directly sampling $\mathbf Y$ from this distribution to get our prior output samples, we can take $n$ number of samples from an $n$-dimensional standard normal distribution as explained previously, i.e.
$$ \begin{align*} \begin{bmatrix} \mathbf{Z_1} \\ \mathbf{Z_2} \\ \vdots \\ \mathbf{Z_n} \end{bmatrix} \sim \mathcal{N} \left ( \begin{bmatrix} 0 \\ 0 \\ \vdots \\ 0 \end{bmatrix} , \begin{bmatrix} 1 & \ldots & 0 \\ \vdots & \ddots & \vdots \\ 0 & \ldots & 1 \end{bmatrix} \right) \end{align*} $$
To transform the vector $\mathbf Z$ into $\mathbf Y$, we need to compute the Cholesky decomposition of the covariance matrix so that
$$ \Sigma = LL^T $$
Then this transformation is computed by
$$ \mathbf Y= L \mathbf Z + \mu $$
What we are essentially doing is changing the vector $\mathbf Z$ to incorporate how the output values covary with each other based on the covariance matrix (the $L \mathbf Z$ term) and shifting the outcome by some vector $\mu$. The result is a vector $\mathbf Y$ distributed as if it were sampled from a distribution with mean vector $\mu$ (since $\mathbb E \mathbf Z = \mathbf 0$) and covariance matrix $\Sigma$. This is known as an affine transformation.
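The affine transformation can be verified numerically (a sketch with toy values): draw many standard normal vectors $\mathbf Z$, map them through $L$, and check that the sample mean and covariance of $\mathbf Y = L\mathbf Z + \mu$ approximate $\mu$ and $\Sigma$:

```python
import numpy as np

rng = np.random.default_rng(2)
Sigma = np.array([[1.0, 0.8],
                  [0.8, 1.0]])  # target covariance matrix
mu = np.array([3.0, -1.0])      # target mean vector
L = np.linalg.cholesky(Sigma)   # Sigma = L @ L.T

Z = rng.standard_normal((2, 100_000))  # columns are standard normal vectors
Y = (L @ Z) + mu.reshape(2, 1)         # affine transformation Y = L Z + mu

print(Y.mean(axis=1))  # close to mu
print(np.cov(Y))       # close to Sigma
```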
Check out this post for explanation.
Below is an example of doing this with Python and finally plotting what the prior looks like.
# Prior distribution using the SE kernel
mu = 0
L_ss = np.linalg.cholesky(K_ss + 1e-14*np.eye(n))  # triangular L matrix from Cholesky decomposition (roughly the "square root" of K_ss)
# + 1e-14*np.eye(n) ensures numerical stability of the Cholesky decomposition
k = 5  # number of samples to plot
rand_v = np.random.normal(size=(n, k))  # k n-dimensional vectors from the standard normal distribution N(0, I)
prior = np.dot(L_ss, rand_v) + mu  # transform into prior samples from N(mu, K_ss)
plt.figure(figsize=(6, 4))
plt.plot(x, prior)  # input and output coordinates
plt.plot(x, mu*np.ones(n), 'r--', lw=2)  # plot the mean, mean=0
plt.axis([-7, 7, -5, 5])
plt.gca().fill_between(x.reshape(-1), -2*sigma+mu, 2*sigma+mu, color="#dddddd")  # 2 sigma (~95%) confidence interval around the mean
# plt.title(str(k) + ' samples from the GP prior + 2 $\sigma$')
plt.xlabel("Input")
plt.ylabel("Output")
plt.savefig("prior.png")  # save before plt.show(), which clears the current figure
plt.show()
Code Walk-through¶
To construct the visual, we first use Cholesky decomposition to decompose our covariance matrix into two triangular matrices (Cholesky decomposition factors a symmetric positive-definite matrix $A$ into $L$ and $L^T$). Notice that we add a very small number to the diagonal of the covariance matrix. This addition is known as jitter and it is used to prevent the matrix from being numerically ill-conditioned. The bigger the matrix, the larger the jitter you may need to add to the diagonal. Then we use numpy's random.normal method to get 5 random vectors containing numbers sampled from a standard normal multivariate distribution. Vectors generated this way are known as standard normal vectors. By finding the dot product between the lower triangular matrix $L$ and the standard normal vectors, and then adding a mean vector $\mu$, we generate 5 normal random vectors (corresponding to 5 sampled $\mathbf Y$ vectors that agree with the given covariance matrix). In this case, we assume a $\mu$ vector of 0 (as "mu" is set to 0 in the above code). Read up on these types of vectors on the Multivariate normal distribution Wikipedia page.
So now we have 5 normal random vectors that can serve as output values for our 1-dimensional-input GP. The only thing left to do is to plot. We plot our input and output values, creating 5 sampled functions from our distribution over functions. We also plot our prior mean function as a dashed red line. The mean function represents the expected value of the function at a particular input point. Finally, we plot the confidence of our GP by filling between $-2\sigma$ and $+2\sigma$, i.e. the region in which we expect 95% of output values to lie. In theory, our GP has infinitely many output points (so realizations can be expressed as functions) and we could draw infinitely many sample functions; we cannot plot infinitely many things with matplotlib, but the distribution captures them all. Notice that if you rerun the script a couple of times, some functions will have output values outside of the 95% confidence interval. Also notice that although each sampled function has a different output value at a given input point, the output values of all the functions are expected to average out to the mean at that point.
The variance $\sigma^2$ from the SE kernel determines the vertical variation of the GP. A high variance gives very large confidence intervals (meaning that your outputs can lie in a very large range of values). This is not ideal because it means your model will be more uncertain about where the output values are. A small variance means that the outputs lie in a tighter range of values (therefore the distribution will be narrower). This will come up again when we construct the posterior distribution. The lengthscale determines how fast the covariance between two output points drops off: a short lengthscale means the covariance between two points decreases quickly, and a long lengthscale means it decreases slowly (across a row or column of the covariance matrix). Change the variable "l" in the above script to see the impact it has on the covariance matrix values (and on the visual representation of the functions). Check out the article A Visual Exploration of Gaussian Processes for very good visual representations of lengthscale and variance for various kernels.
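The lengthscale effect can be seen numerically (a sketch; the helper here mirrors the notebook's se_kernel for scalar inputs):

```python
import numpy as np

def se(x1, x2, sigma=1.0, l=1.0):
    """SE kernel for scalar inputs."""
    return sigma**2 * np.exp(-0.5 * (x1 - x2)**2 / l**2)

d = 1.0  # fixed distance between two input points

short = se(0.0, d, l=0.5)  # short lengthscale: covariance drops off fast
long_ = se(0.0, d, l=5.0)  # long lengthscale: covariance drops off slowly
print(short)  # exp(-2) ≈ 0.135
print(long_)  # exp(-0.02) ≈ 0.980
```

At the same input distance, the short-lengthscale kernel has nearly forgotten the other point, while the long-lengthscale kernel still treats the two outputs as strongly related.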
This prior represents the beliefs we have about the data we are going to model before seeing any observations. We incorporate those prior beliefs in the kernel that we chose before. With an SE kernel, we get smooth functions, noting that all functions in the GP have similar properties (they look similar visually). If our data was expected to have periodicity, we might chose a periodic kernel (or make our own custom kernel by multiplying or adding kernels together). Let's look at how to construct the posterior distribution now.
The Posterior Distribution¶
The posterior distribution is constructed by inserting observed data into the GP, thus restricting the prior distribution to functions that fit our observed data. The observed outputs $\textbf y$ and the test outputs $\mathbf{f}_*$ (unknown outputs for which we want to predict values) are jointly Gaussian, meaning they come from the same Gaussian distribution. This distribution is written as
$$ \begin{bmatrix} \mathbf{y} \\ \mathbf{f}_* \end{bmatrix} \sim \mathcal{N}\left( \begin{bmatrix}\mathbb E \mathbf{y} \\ \mathbb E \mathbf{f}_* \end{bmatrix} , \begin{bmatrix} K(X, X) + \sigma_n^2 I & K(X, X_*) \\ K(X_*, X) & K(X_*, X_*)\end{bmatrix}\right) $$
where mean vector $\begin{bmatrix}\mathbb E \mathbf{y} \\ \mathbb E \mathbf{f}_* \end{bmatrix}$ is usually set to $\mathbf 0$ for simplicity. There are 4 sub-matrices of the covariance matrix: $K(X,X)+\sigma_n^2I$ representing covariance between observed output points, $K(X_*, X)$ representing covariance between test outputs and training outputs, $K(X, X_*)$ representing covariance between training outputs and test outputs (transpose of $K(X_*, X)$), and finally $K(X_*, X_*)$ representing covariance between test outputs and test outputs. All of these matrices are computed by applying the kernel function on the corresponding inputs. The sub-matrices $K(X,X)$ and $K(X_*, X_*)$ are always square matrices, while the other two can be rectangular. In our case, the $K(X_*, X_*)$ matrix has already been computed when we constructed the prior for the 100 points between -7 and 7.
We already know how the outputs are going to vary in the -7 to 7 region because we have computed the covariance matrix in that region (K_ss in the Python code). We just need some data now to be able to predict the outputs in that region, based on the data we put into the GP. Notice that $\sigma_n^2I$ is an additional term that can optionally be added to $K(X, X)$. It represents noise added to the observations. This noise is assumed to be independent and identically distributed for each observation, so it is only added to the diagonal of the covariance matrix (it does not affect the covariances, only the variances).
Computing Predictions¶
The GP posterior is found by conditioning the joint GP prior distribution on the observations, so it is a conditional distribution written as $$p(\mathbf{f}_* | X_*, X, \mathbf{y}) = \mathcal{N}\left(\bar{\mathbf{f}}_*, \text{cov}(\mathbf{f}_*)\right),$$ where $$ \begin{align*} \bar{\mathbf{f}}_* &= K(X_*, X)\left[K(X, X) + \sigma_n^2 I\right]^{-1}\mathbf{y} \\ \text{cov}(\mathbf{f}_*) &= K(X_*, X_*) - K(X_*, X)\left[K(X, X) + \sigma_n^2 I\right]^{-1}K(X, X_*) \end{align*} $$
$\bar{\mathbf{f}}_*$ is the conditional mean vector (which contains the predictions of the output values at the test points) and $\text{cov}(\mathbf{f}_*)$ is the conditional covariance matrix, which contains the variance of the posterior GP along the diagonal. This variance at each test output point shows how confident the model is in its predictions, which is one advantage Gaussian Process Regression has over other machine learning methods. Derivations for the conditional distribution can be seen here.
Finding the inverse matrix $\left[K(X, X) + \sigma_n^2 I\right]^{-1}$ becomes more and more computationally expensive as the matrix grows with each training point that is added. Therefore, the Cholesky decomposition can be utilized to speed up the process. It is utilized in the following way:
$$ \bar{\mathbf{f}}_* = K(X_*, X)\left[K(X, X) + \sigma_n^2 I\right]^{-1}\mathbf{y} $$
$$ \begin{align*} \left[K(X, X) + \sigma_n^2 I\right] \mathbf x &= \left[K(X, X) + \sigma_n^2 I\right] \left[K(X, X) + \sigma_n^2 I\right]^{-1}\mathbf{y} \\ LL^T \mathbf x &= \mathbf{y} \\ L\mathbf z &= \mathbf{y} \\ L^T \mathbf x &= \mathbf z \end{align*} $$
which is more directly written as
$$ \begin{align*} \alpha &= \left[K(X, X) + \sigma_n^2 I\right]^{-1}\mathbf{y} \\ \alpha &= L^T \backslash (L \backslash \mathbf y) \end{align*} $$
This can be solved easily because $L$ and $L^T$ are lower and upper triangular matrices (we can use forward/back substitution). Therefore $\bar{\mathbf{f}}_*$ becomes $$ \bar{\mathbf{f}}_* = K(X_*, X)\alpha $$
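A small check (a sketch with toy matrices, not the notebook's data) that the two triangular solves reproduce the direct inverse:

```python
import numpy as np

rng = np.random.default_rng(3)
A = rng.standard_normal((5, 5))
K = A @ A.T + 5 * np.eye(5)  # symmetric positive-definite stand-in for K(X,X) + sigma_n^2 I
y = rng.standard_normal((5, 1))

# Direct inverse (what we want to avoid for large matrices)
alpha_direct = np.linalg.inv(K) @ y

# Two triangular solves via the Cholesky factor: alpha = L^T \ (L \ y)
L = np.linalg.cholesky(K)
alpha_chol = np.linalg.solve(L.T, np.linalg.solve(L, y))

print(np.allclose(alpha_direct, alpha_chol))  # True
```

Beyond speed, the Cholesky route is also numerically more stable than forming the explicit inverse.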
Moving on to the conditional covariance matrix at the test points:
$$ \text{cov}(\mathbf{f}_*) = K(X_*, X_*) - K(X_*, X)\left[K(X, X) + \sigma_n^2 I\right]^{-1}K(X, X_*) $$
$$ \begin{align*} \mathbf{v} &= L^T \left[K(X, X) + \sigma_n^2 I\right]^{-1}K(X, X_*) \\ \mathbf{v} &= L \backslash K(X, X_*) \\ \mathbf{v}^T &= K(X, X_*)^T\left[\left[K(X, X) + \sigma_n^2 I\right]^{-1}\right]^T\left[L^T\right]^T \end{align*} $$
$$ \begin{align*} \mathbf{v}^T\mathbf{v} &= K(X, X_*)^T\left[\left[K(X, X) + \sigma_n^2 I\right]^{-1}\right]^T\left[L^T\right]^TL^T \left[K(X, X) + \sigma_n^2 I\right]^{-1}K(X, X_*) \\ &= K(X_*, X)\left[K(X, X) + \sigma_n^2 I\right]^{-1}LL^T \left[K(X, X) + \sigma_n^2 I\right]^{-1}K(X, X_*) \\ &= K(X_*, X)\left[K(X, X) + \sigma_n^2 I\right]^{-1}\left[K(X, X) + \sigma_n^2 I\right]\left[K(X, X) + \sigma_n^2 I\right]^{-1}K(X, X_*) \\ &= K(X_*, X)\left[K(X, X) + \sigma_n^2 I\right]^{-1}K(X, X_*) \end{align*} $$
$$ \text{cov}(\mathbf{f}_*) = K(X_*, X_*) - \mathbf{v}^T\mathbf{v} $$
x_train = np.array([-1, 2, -3, 1]).reshape(-1, 1)  # 4 training inputs x
y_train = np.array([2, 1, 4, 1]).reshape(-1, 1)  # 4 observed output values at the training points
noise = 0.00*np.eye(x_train.shape[0])  # change to 0.05*np.eye(x_train.shape[0]) for added Gaussian noise
K = se_kernel(x_train, x_train, sigma, l) + noise  # K(X,X): 4 x 4 covariance matrix of the training points, using the previously defined se_kernel
L = np.linalg.cholesky(K)
K_s = se_kernel(x, x_train, sigma, l)  # K(X*,X): covariance between the 'test points' x (from the prior) and the training points x_train
alpha = np.linalg.solve(L.T, np.linalg.solve(L, y_train))
v = np.linalg.solve(L, K_s.T)
posterior_mu = np.dot(K_s, alpha).reshape(-1)  # mean of the posterior, containing the predictions
posterior_cov = K_ss - np.dot(v.T, v)  # cov(f*)
# Compute the variance from the posterior covariance matrix (diagonal)
var = np.diag(posterior_cov).reshape(-1)  # diagonal of the posterior covariance matrix
stdv = np.sqrt(var)  # standard deviations as elementwise sqrt of the variance
# Create sample functions that fit the posterior covariance matrix
L_posterior = np.linalg.cholesky(posterior_cov + 1e-13*np.eye(n))  # Cholesky-decomposed posterior covariance L
posterior_samples = posterior_mu.reshape(-1, 1) + np.dot(L_posterior, np.random.normal(size=(n, 3)))  # sampling the posterior with mean posterior_mu and the posterior covariance matrix
plt.figure()
plt.gca().fill_between(x.reshape(-1), posterior_mu-2*stdv, posterior_mu+2*stdv, color="#dddddd")  # 2 sigma confidence interval fill
plt.plot(x.reshape(-1), posterior_mu, 'r--', lw=2)  # posterior mean
plt.plot(x_train, y_train, 'ko', ms=4)  # training points
plt.axis([-7, 7, -5, 5])
plt.plot(x, posterior_samples)  # posterior samples
plt.xlabel("Input")
plt.ylabel("Output")
# plt.title('GP posterior')
plt.savefig("posterior.png")  # save before plt.show(), which clears the current figure
plt.show()
Code Walk-through¶
We define the training inputs and training outputs (x_train and y_train respectively). Then we compute $K(X,X)$ and decompose it with Cholesky decomposition. Optionally, we can add noise to the observed outputs with the "noise" variable. We compute the $K(X_*,X)$ matrix, and thus we also know its transpose $K(X,X_*)$. We already have $K(X_*,X_*)$, as we computed it previously for the prior, so we just reuse it here. Now that we have the 4 sub-matrices, we compute $\alpha$ and $\mathbf v$ as described previously, and from them the posterior mean and posterior covariance matrix. The variance of the posterior is the diagonal of the covariance matrix, so we extract that and take the square root, giving the standard deviation at each test output point. Finally, we make 3 sampled functions that fit our posterior by the same method we used to sample the prior distribution (i.e. we perform Cholesky decomposition of the posterior covariance matrix $\text {cov}(\mathbf f_*)$ and find the dot product between the resulting matrix and $n$-dimensional standard normal vectors). Lastly, we plot everything. The gray area indicates the 95% confidence interval (2 standard deviations, computed from the diagonal variances).
As you can see, where we have training data, the gray area is very narrow (our variance and standard deviations are low), so the model has high confidence in its predictions in this area. As we move away from the training points, the posterior distribution eventually falls back to the prior distribution. This is because there is no information in a region with no training data, so the model is not confident in its predictions there. NOTE: In practice, the prior itself is never constructed explicitly like we did in this notebook (for demonstration purposes). When we define a covariance kernel, we are already defining the prior in the n-dimensional space we are working in, and we just need to update it with the data we feed into the GP. Only the 4 sub-matrices need to be constructed before predictions at unknown output points can be computed.
Predicting outputs at new test points¶
Although we have already predicted outputs when we constructed the posterior figure previously, I just want to go through the process again, computing the predicted output for 1 test point (you can scale this to many test points if you want to; the process of predicting is the same).
Let's say we have training points $$ \begin{align*} x &= (-1,2) \\ y&= (2,1) \end{align*} $$
where x are your observed/training inputs and y are observed outputs. If we want to predict the output value at input 0, we would need to construct the 4 matrices as discussed previously.
sigma = 1  # defined in a previous code block, just copied here again
l = 2  # defined in a previous code block, just copied here again
x_train = np.array([-1, 2]).reshape(-1, 1)  # 2 training inputs x
y_train = np.array([2, 1]).reshape(-1, 1)  # 2 observed output values at the training points
x_test = np.array([0]).reshape(-1, 1)  # 1 test point: the input at 0
# Constructing the covariance matrices
K = se_kernel(x_train, x_train, sigma, l)  # training point-training point K(X,X)
K_s = se_kernel(x_test, x_train, sigma, l)  # test point-training point K(X*, X)
K_s_transpose = K_s.T  # only for demonstration;
# usually you don't have to store the transposed matrix, as you can transpose the original when needed, saving memory
K_ss = se_kernel(x_test, x_test, sigma, l)  # covariance between test point and test point (in this case only one test point)
Alright, we have the 4 matrices. The joint multivariate distribution now looks like this (the 4 sub-matrices are separated for clarity):
$$ \begin{bmatrix} 2 \\ 1 \\ \mathbf {f}_* \end{bmatrix} \sim \mathcal{N}\left( \mathbf 0, \begin{bmatrix} \begin{bmatrix} 1 & 0.325 \\ 0.325 & 1 \end{bmatrix} & \begin{bmatrix} 0.882 \\ 0.606 \end{bmatrix} \\ \begin{bmatrix} 0.882 & 0.606 \end{bmatrix} & \begin{bmatrix} 1 \end{bmatrix} \end{bmatrix}\right) $$
so now we just compute $\bar{\mathbf{f}}_*$ and $\text{cov}(\mathbf{f}_*)$ as before.
L = np.linalg.cholesky(K)
alpha = np.linalg.solve(L.T, np.linalg.solve(L, y_train))
fs_mu = np.dot(K_s, alpha).reshape(-1)  # vector of predicted output values at the test points (in this case one-dimensional)
print("Predicted value/s:")
print(fs_mu)
v = np.linalg.solve(L, K_s.T)
posterior_cov = K_ss - np.dot(v.T, v)  # cov(f*)
# For one test point the posterior covariance is just a number,
# corresponding to the variance of the GP at that test point
var = np.diag(posterior_cov).reshape(-1)  # diagonal of the posterior covariance matrix
print("The variance at the predicted value/s:")
print(var)
stdv = np.sqrt(var)  # standard deviations as elementwise sqrt of the variance
print("The standard deviation at the predicted value/s:")
print(stdv)
Predicted value/s:
[1.89044808]
The variance at the predicted value/s:
[0.10671625]
The standard deviation at the predicted value/s:
[0.32667453]
Bayes' Theorem¶
To wrap up, here is a small introduction to how Bayes' Theorem is related to Gaussian Process Regression. Bayes' Theorem is usually written as
$$ P(A|B) = \frac{P(B|A)P(A)}{P(B)} $$
Where $P(A|B)$ is known as the posterior, $P(B|A)$ is the likelihood, $P(A)$ is the prior, and $P(B)$ is the total probability of $B$.
When you see something like $P(A|B)$, it is read as "the probability of A given B". This type of probability is also referred to as conditional probability because the probability of $A$ is conditioned on $B$.
Essentially, what you are doing with Bayesian reasoning is updating your prior beliefs (i.e. $P(A)$) based on the data you are seeing, to get your posterior beliefs $P(A|B)$.
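A tiny numeric example of such an update (the scenario and all numbers are made up for illustration): a test that is 90% sensitive with a 5% false positive rate, for a condition with 1% prevalence:

```python
# Bayes' theorem: P(A|B) = P(B|A) * P(A) / P(B)
p_a = 0.01              # prior: P(condition)
p_b_given_a = 0.90      # likelihood: P(positive | condition)
p_b_given_not_a = 0.05  # false positive rate: P(positive | no condition)

# Total probability of a positive test, P(B)
p_b = p_b_given_a * p_a + p_b_given_not_a * (1 - p_a)

# Posterior P(A|B)
posterior = p_b_given_a * p_a / p_b
print(posterior)  # ≈ 0.154: the 1% prior is updated to about 15%
```

Observing a positive test does not make the condition certain; it shifts the prior belief toward the posterior, exactly the kind of update a GP performs on its prior over functions when it observes data.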
This video by 3Blue1Brown explains the concept visually very well.
Specifics on the mean, standard deviation, and variance can be found in this video.