# vim: set tabstop=4
# s_pcr_phase_pH.py
#!/usr/bin/env python3

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see http://www.gnu.org/licenses/.
#
# Juan Pablo Carbajal <ajuanpi+dev@gmailcom>
# 01.02.2018

PCR extracting PCs from each phase of the pH signal

Author: Juan Pablo Carbajal ajuanpi+dev@gmailcom
Date: 01.02.2018

import numpy as np

import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['text.usetex'] = True
matplotlib.rcParams['text.latex.unicode'] = True
matplotlib.rcParams['font.family'] = 'sans-serif'
matplotlib.rcParams['font.sans-serif'] = 'DejaVu Sans'

from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, ARDRegression
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_error, r2_score

from dataparser import pHData, ioData

Data load and pre-process

First load the data, filter NA values and select the output to regress. In this case we use the NH4 value at the effluent.

filename = '../data/180109_AI4_MW.csv'

pHData = pHData(filename)
pH = pHData.interp_nan()

Ydata = ioData(filename)
# Filter NA values in output
NH4 = Ydata.NH4[:,1]
nonan = np.logical_not(np.isnan(NH4))
NH4 = NH4[nonan]
pH = pH[nonan,:]
# Filter outputs that are lower than 0.3
above = np.greater(NH4,0.3)
NH4 = NH4[above]
pH = pH[above,:]

# List of phases
phases = np.unique(pHData.phase)

PCR training

We extract PCs from the whole dataset on each phase. We use variance explanation to select the number of components

phased_pca = []
idx = []
for i in phases:
    idx.append(np.nonzero(pHData.phase==i)[0])
    phased_pca.append(PCA(n_components=0.995, svd_solver='full'))
    phased_pca[-1].fit(pH[:,idx[-1]])

The extracted number components tell use where is the variability in the pH signal

for i,p in zip(phases,phased_pca):
    print ('Phase %d: %d components' % (i,p.n_components_))
Phase 1: 1 components
Phase 2: 2 components
Phase 3: 2 components
Phase 4: 4 components
Phase 5: 2 components
Phase 6: 2 components
Phase 7: 1 components

We define functions to transform on each phase

def phasedPCA_transform(S):
    tmp = []
    for i,p in enumerate(phases):
        tmp.append(phased_pca[i].transform(S[:,idx[i]]))
    X = np.concatenate(tmp,axis=1)
    return X

We choose the regression strategy

#reg = LinearRegression()
reg = ARDRegression(n_iter=int(1e3))

Then we find the best regressor coeficients by cross validation

rkf = RepeatedKFold(n_splits=5, n_repeats=50)
coef = []
wcoef = []
W = 0
for train_index, test_index in rkf.split(pH):
    pH_train, pH_test = pH[train_index,:], pH[test_index,:]
    y_train, y_test = NH4[train_index], NH4[test_index]
   
    X_train = phasedPCA_transform(pH_train)
    reg.fit (X_train, y_train)
    
    coef.append(reg.coef_)
    
    y_pred = reg.predict(phasedPCA_transform(pH_test))
    w = np.exp(r2_score(y_test, y_pred)-1)
    W += w;
    wcoef.append(reg.coef_ * w)

#reg.coef_ = np.sum(wcoef, axis=0) / W
reg.coef_ = np.median(coef, axis=0)

Regression coefficients as filter

By concatenating the regression coefficients \(\beta_{jt}\) for phase \(j \in [2,7]\) into a vector \(\beta_t\), it can be interpreted as a linear filter applied to the time signal of each cycle \(i\)

\[ \sum_{t=1}^T X_{it}\beta_t = y_i\]

# Build the filter
beta = []
n=0
for p in phased_pca:
    j = range(n,n+p.n_components_)
    n = j[-1]+1
    beta.append(p.inverse_transform (reg.coef_[j]))
plt.ion()
plt.figure(1)
plt.clf()
for i,m in enumerate(beta):
    p = phases[i]
    plt.subplot(len(beta),1,i+1)
    t = pHData.time[idx[i]]
    plt.plot((t-t[0])/(t[-1]-t[0]), m)
    plt.ylabel(r'$\beta_{%d,t}$' % p)

plt.xlabel('Time')
plt.show()
Filters per phase.

Regression quality

Here we simply assess the quality of the regression using the dataset.

# Make predictions
y_pred = reg.predict(phasedPCA_transform(pH))

print("Mean squared error: %.2f "
      % mean_squared_error(NH4, y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f ' 
      % r2_score(NH4, y_pred))
Mean squared error: 67.51
Variance score: 0.48
plt.figure(2)
plt.clf()
plt.plot(NH4,'o', label='data')
plt.plot(y_pred,'x', label='pred.')
plt.ylabel('NH4')
plt.xlabel('Cycle')
plt.legend()
plt.show()
Regression results.