# vim: set tabstop=4
# s_ARDfeatures_pH.py
#!/usr/bin/env python3

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see http://www.gnu.org/licenses/.
#
# Juan Pablo Carbajal <ajuanpi+dev@gmailcom>
# 01.02.2018

Automatic feature extraction on pH signal using ARD

Author: Juan Pablo Carbajal ajuanpi+dev@gmailcom
Date: 01.02.2018

import numpy as np

import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['text.usetex'] = True
matplotlib.rcParams['text.latex.unicode'] = True
matplotlib.rcParams['font.family'] = 'sans-serif'
matplotlib.rcParams['font.sans-serif'] = 'DejaVu Sans'

from sklearn.linear_model import ARDRegression
from sklearn.metrics import mean_squared_error, r2_score

from dataparser import pHData, ioData

Data load and pre-process

First load the data, filter NA values and select the output to regress. In this case we use the NH4 value at the effluent.

filename = '../data/180109_AI4_MW.csv'

pHData = pHData(filename)
pH = pHData.interp_nan()

Ydata = ioData(filename)
# filter NA values in output
NH4 = Ydata.NH4[:,1]
nonan = np.logical_not(np.isnan(NH4))
NH4 = NH4[nonan]
pH = pH[nonan,:]
# Filter outputs that are lower than 0.3
above = np.greater(NH4,0.3)
NH4 = NH4[above]
pH = pH[above,:]

if 'reg' not in vars():
    #' ## Regress with ARD and check the weights of each time step
    reg = ARDRegression (n_iter=int(100), compute_score=True)
    reg.fit(pH, NH4)

    #' Order features with decreasing weight
    o = np.argsort(np.abs(reg.coef_))[::-1]
    idx = o[:16]

Regress with selected features

reg_s = ARDRegression (n_iter=int(300))
reg_s.fit(pH[:,idx], NH4)
y_pred = reg_s.predict(pH[:,idx])

print("Mean squared error: %.2f "
      % mean_squared_error(NH4, y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f ' 
      % r2_score(NH4, y_pred))
Mean squared error: 63.44
Variance score: 0.51
plt.ion()
plt.figure(1)
plt.clf()
plt.subplot(2,1,1)
plt.plot(pHData.time, reg.coef_)
plt.ylabel('Weight of the model')
plt.subplot(2,1,2)

s = np.abs(reg.coef_[idx])
s = 36 * s / s.max() + 16
plt.scatter(pHData.time[idx], np.mean(pH[:,idx],axis=0), s=s, alpha=0.9)
#plt.autoscale(enable=False, axis='y', tight=True)
plt.ylim(7.25,8.3)
plt.autoscale(enable=False, axis='x', tight=True)
plt.plot(pHData.time, pH.T,'-', linewidth=0.5)
plt.ylabel('pH')
plt.show()
ARD feature selection
plt.figure(2)
plt.clf()
plt.plot(NH4,'o', label='data')
plt.plot(y_pred,'x', label='pred.')
plt.ylabel('NH4')
plt.xlabel('Cycle')
plt.legend()
plt.show()

#plt.figure(3)
#plt.clf()
#plt.plot(reg.scores_)
#plt.xlabel('Iteration')
#plt.ylabel('Log marginal likelihood')
Regression results.