#!/usr/bin/env python3
# vim: set tabstop=4
# s_pcr_pH.py

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see http://www.gnu.org/licenses/.
#
# Juan Pablo Carbajal <ajuanpi+dev@gmail.com>
# 01.02.2018

Principal component regression (PCR) of effluent NH4 on the pH signal.

Author: Juan Pablo Carbajal <ajuanpi+dev@gmail.com>
Date: 01.02.2018

import numpy as np

import matplotlib
import matplotlib.pyplot as plt

# Render all figure text with LaTeX, using a sans-serif font.
matplotlib.rcParams['text.usetex'] = True
# 'text.latex.unicode' was deprecated in matplotlib 3.0 and removed in 3.3;
# assigning a removed key raises KeyError, so set it only when it still exists.
if 'text.latex.unicode' in matplotlib.rcParams:
    matplotlib.rcParams['text.latex.unicode'] = True
matplotlib.rcParams['font.family'] = 'sans-serif'
matplotlib.rcParams['font.sans-serif'] = 'DejaVu Sans'

from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, ARDRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import RepeatedKFold

from dataparser import pHData, ioData

Data loading and pre-processing

First load the data, filter out NA values, and select the output to regress.
In this case we use the NH4 concentration at the effluent.

filename = '../data/180109_AI4_MW.csv'

# NOTE(review): this rebinds the imported class name to the instance;
# the name is kept because later plotting code reads pHData.time.
pHData = pHData(filename)
pH = pHData.interp_nan()

Ydata = ioData(filename)
# Keep only cycles whose effluent NH4 reading is a finite number.
NH4 = Ydata.NH4[:, 1]
keep = ~np.isnan(NH4)
NH4 = NH4[keep]
pH = pH[keep, :]
# Discard cycles whose NH4 does not exceed 0.3.
keep = NH4 > 0.3
NH4 = NH4[keep]
pH = pH[keep, :]

PCR training

We extract the principal components (PCs) from the whole dataset.

# Keep as many principal components as needed to explain 99.5% of the
# variance (float n_components with the full SVD solver).
pca = PCA(n_components=0.995, svd_solver='full').fit(pH)
PCA(copy=True, iterated_power='auto', n_components=0.995,
    random_state=None, svd_solver='full', tol=0.0, whiten=False)

We choose the regression strategy

# Regression strategy: sparse Bayesian (ARD) regression; a plain
# LinearRegression is kept here as the alternative.
#reg = LinearRegression()
# NOTE(review): scikit-learn >= 1.3 renames ARDRegression's `n_iter`
# to `max_iter` — confirm the installed version before upgrading.
reg = ARDRegression(n_iter=1000)

Then we find the best regressor coefficients by cross-validation.

# 5-fold cross-validation repeated 50 times: each fold fits the regressor
# on the PC scores of its training cycles and stores the learned
# coefficient vector.  (The per-fold test split was computed but never
# used in the original loop, so it is no longer materialized.)
rkf = RepeatedKFold(n_splits=5, n_repeats=50)
coef = []
for train_index, _ in rkf.split(pH):
    X_train = pca.transform(pH[train_index, :])
    reg.fit(X_train, NH4[train_index])
    coef.append(reg.coef_)

# Aggregate robustly: the element-wise median coefficient vector over folds.
# NOTE(review): reg.intercept_ still comes from the last fold only —
# confirm pairing it with the median coef_ is intended.
reg.coef_ = np.median(coef, axis=0)

Regression coefficients as a filter

The regression coefficients \(\beta_t\) can be interpreted as a linear filter
applied to the time signal of each cycle \(i\):

\[ \sum_{t=1}^T X_{it}\beta_t = y_i \]

# Map the regression coefficients from PC space back to the time domain.
beta = pca.inverse_transform(reg.coef_)
plt.ion()
fig1 = plt.figure(1)
fig1.clf()
ax = fig1.gca()
ax.plot(pHData.time, beta, '-')
ax.set_xlabel('Time')
ax.set_ylabel(r'$\beta_t$')
plt.show()
Filters per phase.

Regression quality

Here we simply assess the quality of the regression using the dataset.

# Make predictions for every cycle from its PC scores.
# alternatively do pH_test.dot(M)
scores = pca.transform(pH)
y_pred = reg.predict(scores)

print("Mean squared error: %.2f " % mean_squared_error(NH4, y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f ' % r2_score(NH4, y_pred))
Mean squared error: 52.06
Variance score: 0.60
# Data vs. prediction, one marker per cycle.
fig2 = plt.figure(2)
fig2.clf()
ax = fig2.gca()
ax.plot(NH4, 'o', label='data')
ax.plot(y_pred, 'x', label='pred.')
ax.set_ylabel('NH4')
ax.set_xlabel('Cycle')
ax.legend()
plt.show()
Regression results.