# Copyright (C) 2018 Juan Pablo Carbajal
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

# Author: Juan Pablo Carbajal 
# Created: 2018-03-01

Emulation of the three gauges simultaneously

The following script shows the process to emulate the water-depth timeseries as measured by all the gauges. The data is combined into a single 3D parametric curve and then approximated with singular components. The emulator is built to reproduce the loadings, or coefficients, of those components. This way time is not an input to the emulator.

First we load the data

fname = @(s) fullfile('data',sprintf('cchannel_%s.dat',s));

[N dim]   = size (X); # N = 200 input values, dim = 3 hydrograph parameters.
[nT nP N] = size (Y); # nT number of time samples, nP = 3 number of gauges.

Highly correlated outputs

The three gauges generate outputs that are highly correlated. The plot below shows the 3D curves corresponding to each input parameter.

figure (1)
# Move the realisation index to the 2nd dimension: one column per input vector
g = permute (Y, [1 3 2]);
# Each 3D curve pairs the three gauge signals of the same realisation
g1 = g(:,:,1);
g2 = g(:,:,2);
g3 = g(:,:,3);
plot3 (g1, g2, g3);
axis tight
xlabel ('Gauge 1')
ylabel ('Gauge 2')
zlabel ('Gauge 3')
clear g g1 g2 g3


We take the 3D curves built from the three gauges:

$$ \vec{C}(t,\vec{x}_i) = \left(g_1(t,\vec{x}_i), g_2(t,\vec{x}_i), g_3(t, \vec{x}_i)\right)$$

where $\vec{x}_i$ is the i-th input vector (there are N=200 of these).

Then we decompose them in the following way

$$ \vec{C}(t,\vec{x}_i) = \sum_{k=1}^K \vec{p}_k(t) b_k(\vec{x}_i) $$

where $\vec{p}$ are singular vectors and only the loadings $b$ depend on the inputs. The number of components is chosen to produce a reconstruction error of tol=1e-2.

# This part of the script runs only once; to re-run, clear the variable _P_
if !exist ('P', 'var')
  # POD: proper orthogonal decomposition of the stacked gauge curves.
  # Stack the three gauges along dim 2: C is nT x 3 x N.
  C = cat (2, Y(:,1,:), Y(:,2,:), Y(:,3,:));
  # Flatten each realisation into one row: U is N x (nT*nP).
  U = reshape(C, [nT*nP, N]).';

  # Number of components is chosen so the reconstruction error is below tol
  tol = 1e-2;
  [P,l,B] = pod (U, tol);

  # Signed maximum residual of the low-rank reconstruction P*B
  res     = (P*B - U)(:);
  [err i] = max (abs (res));
  printf ("POD error (%d): %.2e\n", size(P,2), err * sign(res(i)));
  fflush (stdout);

  # Number of loadings to emulate, one GP per singular component
  nEmu = length (l);

  # NOTE(review): the exploratory plotting code below is commented out and
  # appears truncated in this excerpt (loops are never closed).
  #pkg load statistics
  #dCov = zeros (dim, nE);
  #for i=1:dim
  #  for j=1:nE
  #    dCov(i,j) = dcov (X(:,i), P(:,j));
  #  endfor

  #for i=1:nE
  #  figure(i+1)
  #  clf
  #  [~, o] = sort (dCov(:,i), 'descend');
  #  scatter3 (X(:,o(1)), X(:, o(2)), P(:,i), 15, X(:,o(3)), 'filled');
  #  xlabel (X_name{o(1)});
  #  ylabel (X_name{o(2)});
  #  zlabel (num2str(i));
  #  hc = colorbar;
  #  set(hc,'ylabel', X_name{o(3)});
  #  axis tight

POD error (7): -7.44e-03

Train the emulator

We build an emulator for the loadings $b$. When plotted (see commented code in the previous section) these loadings are fairly smooth. Hence we use a GP with a linear mean function and a squared exponential covariance function with automatic relevance determination.

$$ m(\vec{x}) = a_0 + \sum_{i=1}^3 a_i x_i $$

$$ K(\vec{x}, \vec{y}) = \sigma^2 \exp \left(-\frac{1}{2}\sum_{i=1}^3 \left(\frac{x_i - y_i}{\ell_i}\right)^2\right) $$

where $\sigma$ is the amplitude of the covariance and $\ell_i$ are the ARD lengths. The higher the ARD length, the less relevant the given component is for the regression.

# This part of the script runs only once; to re-run, clear the variable _x_
# If idx is not generated, generate it.
if ~exist ('x', 'var')
  # Split inputs X and loadings P into training (70%) and test sets
  printf ("Building train/test data\n");
  fflush (stdout);

  # splitdata returns the two data subsets and the corresponding row indices
  [Dtrn Dtst trn tst] = splitdata ([X P], 0.7);

  # Pack everything into structs with .trn / .tst fields
  x = y = idx = n = struct ('trn', [], 'tst', []);
  x.trn = Dtrn(:,1:dim); x.tst = Dtst(:,1:dim);         # inputs
  y.trn = Dtrn(:,dim+1:end); y.tst = Dtst(:,dim+1:end); # POD loadings (targets)
  idx.trn = trn; idx.tst = tst;                         # original row indices
  n.trn = length (trn); n.tst = length (tst);           # set sizes

  printf ("# training points: %d\n", n.trn);
  printf ("# testing points : %d\n", n.tst);
  fflush (stdout);

pkg load gpml

# GP prior: linear-plus-constant mean, squared exponential covariance with ARD
meanfunc = {@meanSum, {{@meanLinear}, {@meanConst}}};
covfunc  = {@covSEard};

# Set this to false to avoid printing summary results
verbose = true;
if ~exist ('POST', 'var')

  # Argument list for the GP trainer; inference function left at its default
  arg_ = {x.trn, y.trn, covfunc,'mean', meanfunc};%, 'inference', inffunc};
  if verbose
    [HYP, POST, args, NLML] = gpcell (arg_{:}, 'verbose');
    # NOTE(review): an 'else' branch (and a 'tic' before training) appears to
    # be missing from this excerpt; the two gpcell calls look like the
    # verbose/quiet alternatives of a single if/else.
    [HYP, POST, args, NLML] = gpcell (arg_{:});

  train_time = toc ();
  printf ("Training duration %.2f s\n", train_time);
  fflush (stdout);

  # Cache the trained emulator and the data split to disk.
  # NOTE(review): the save call's argument list is truncated in this excerpt.
  save (fname ('podgp'), 'HYP', 'POST', 'args','x','y','idx','n', ...

if n.tst > 0
  % Error metrics of the emulator on the held-out test set
  [m_tst dm2_tst] = gp_emulator (HYP, POST, args, x.tst);
  test_time = toc ();

  % Per-component RMS error and its value relative to the target RMS
  err   = sqrt (sumsq (y.tst - m_tst) / n.tst);
  rerr  = err ./ sqrt (sumsq (y.tst) / n.tst);

  % Maximum absolute error, the target value where it occurs, the relative
  % version, and the original row index of that worst test point
  [maerr i]  = max (abs(y.tst - m_tst));
  ymae       = y.tst(i);
  marerr     = maerr ./ abs (ymae);
  idx_mae    = idx.tst(i);

# Print a per-component summary of the training/testing results
if verbose
  printf ("--- GP training result ---\n");
  for i=1:nEmu
  printf ("### Component %d\n", i);
    printf ("Log. Marginal Likelihood (N=%4d): %.3g\n", n.trn, -NLML{i});
    if n.tst > 0
      # NOTE(review): the argument list of the first printf below, and
      # several endif/endfor closers in this section, appear truncated
      # in this excerpt.
      printf ("L2-norm error on test set        : %.2g (~%.2g%%)\n", err(i), ...
      printf ("Max. abs. error on test set      : %.2g@%.2g (~%.2g%%)\n", ...
               maerr(i), ymae(i), marerr(i)*100);

  if n.tst > 0
    printf ("Runtime test set (N=%4d)        : %.2f s\n", n.tst, test_time);
  printf ("--------------------------\n");

# Non-verbose fallback: print only the maximum-absolute-error summary
elseif n.tst > 0
  printf ("MAE (%d@%d): %.2g@%.2g (~%.2g%%)\n", ...
  n.tst, n.trn, maerr, ymae, marerr*100);

fflush (stdout);
Building train/test data
# training points: 140
# testing points : 60
Function evaluation 97; Value -1.396967e+03
Function evaluation 92; Value -1.202512e+03
Function evaluation 83; Value -9.918300e+02
Function evaluation 85; Value -8.182661e+02
Function evaluation 77; Value -7.717795e+02
Function evaluation 102; Value -6.838930e+02
Function evaluation 118; Value -5.884541e+02
Training duration 12.23 s
--- GP training result ---
### Component 1
Log. Marginal Likelihood (N= 140): 1.4e+03
L2-norm error on test set        : 2.4e-06 (~0.0035%)
Max. abs. error on test set      : 8.1e-06@-0.03 (~0.027%)
### Component 2
Log. Marginal Likelihood (N= 140): 1.2e+03
L2-norm error on test set        : 6.8e-06 (~0.01%)
Max. abs. error on test set      : 1.5e-05@-0.091 (~0.017%)
### Component 3
Log. Marginal Likelihood (N= 140): 992
L2-norm error on test set        : 6.2e-05 (~0.097%)
Max. abs. error on test set      : 0.0003@-0.03 (~1%)
### Component 4
Log. Marginal Likelihood (N= 140): 818
L2-norm error on test set        : 0.00015 (~0.22%)
Max. abs. error on test set      : 0.00042@-0.056 (~0.75%)
### Component 5
Log. Marginal Likelihood (N= 140): 772
L2-norm error on test set        : 0.00023 (~0.39%)
Max. abs. error on test set      : 0.0013@-0.03 (~4.5%)
### Component 6
Log. Marginal Likelihood (N= 140): 684
L2-norm error on test set        : 0.00053 (~0.74%)
Max. abs. error on test set      : 0.0028@-0.03 (~9.5%)
### Component 7
Log. Marginal Likelihood (N= 140): 588
L2-norm error on test set        : 0.00082 (~1.2%)
Max. abs. error on test set      : 0.0043@-0.03 (~15%)
Runtime test set (N=  60)        : 0.07 s