# vim: set tabstop=4
# s_ENGfeatures_regress.py
#!/usr/bin/env python3

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see http://www.gnu.org/licenses/.
# Juan Pablo Carbajal <ajuanpi+dev@gmailcom>
# 01.02.2018

Regression using engineered features

Author: Juan Pablo Carbajal ajuanpi+dev@gmailcom
Date: 12.03.2018

import numpy as np

import matplotlib.pyplot as plt

import platform
if platform.system() is not 'Windows':
    import matplotlib
    matplotlib.rcParams['text.usetex'] = True
    matplotlib.rcParams['text.latex.unicode'] = True
    matplotlib.rcParams['font.family'] = 'sans-serif'
    matplotlib.rcParams['font.sans-serif'] = 'DejaVu Sans'

from sklearn.linear_model import ARDRegression
from sklearn.metrics import mean_squared_error, r2_score

from dataparser import sensorData, ioData
from pH_features import aeration_valley, aeration_climax
from O2_features import respirogram

# TODO: do this OS robust
datapath = '../data/'
files = {'pH':'180215_AI4_MW.csv', 'O2':'180130_AI2_MW.csv'}

Load data and verify integrity of outputs

sensor = dict()
_NH4 = np.array([])
for signal, filenam in files.items():
    filename = datapath + filenam
    print('Processing signal {} from {}'.format(signal, filename))

    Ydata = ioData(filename)
    if not _NH4.size > 0:
        _NH4 = Ydata.NH4[:,1]

        # filter NA values in output
        nonan = np.isfinite(_NH4)
        _NH4 = _NH4[nonan]

    # Check allfiles have the same output values
    if not np.allclose (_NH4, Ydata.NH4[nonan,1]):
        raise ValueError('Output signals differ!')

    sensor[signal] = sensorData(filename)
Processing signal pH from ../data/180215_AI4_MW.csv
Processing signal O2 from ../data/180130_AI2_MW.csv

Filter inputs that do not show pH aeration valley

## Select aeration phase
pH_aer = np.nonzero(sensor['pH'].phase==4)[0]
_pH = sensor['pH'].interp_nan()[nonan,:][:,pH_aer]
pH_t = sensor['pH'].time[pH_aer]
pH_t = (pH_t - pH_t[0])/(pH_t[-1] - pH_t[0])
dt = pH_t[1] - pH_t[0]

## Compute amonia valley
fp_valley_opt = {'order':5}
s_valley_opt = {'freq':5}

novalley = np.array ([True]*_pH.shape[0])
for i,s in enumerate(_pH): # s is all pH values for one cycle
    m, s_smooth = aeration_valley(pH_t, s, \
                           smooth_opt=s_valley_opt, \
    if m:
        novalley[i] = False
    # uncomment here to see the signals
#    plt.figure(1)
#    plt.clf()
#    plt.plot(pH_t, s, label='signal')
#    plt.plot(pH_t, s_smooth, label='smoothed signal')
#    if m:
#        plt.plot(*m, 'go', label='valley')
#    plt.xlabel('time')
#    plt.ylabel('value')
#    plt.autoscale(enable=True, axis='x', tight=True)
#    plt.show()

print("Number of signals with amonia valley: {}/{}".format(\
    np.sum(np.logical_not(novalley)), _pH.shape[0]))

## Select data without amoinia valley
NH4 = _NH4[novalley]
pH = _pH[novalley,:]

O2_aer = np.nonzero(sensor['O2'].phase==4)[0]
_O2 = sensor['O2'].interp_nan()[nonan,:][:,O2_aer]
O2_t = sensor['O2'].time[O2_aer]
O2_t = (O2_t - O2_t[0])/(O2_t[-1] - O2_t[0])
O2 = _O2[novalley,:]
Number of signals with amonia valley: 44/79

Extract features

from scipy.signal import argrelextrema
fp_climax_opt = {'finder':argrelextrema, 'comparator':np.greater_equal, 
s_climax_opt = {'freq':10}

pH_M = np.zeros ([pH.shape[0],2])
for i,s in enumerate(pH): # s is all pH values for one cycle
    val, s_smooth = aeration_climax(pH_t, s, \
            smooth_opt=s_climax_opt, findpeaks_opt=fp_climax_opt)
    pH_M[i,:] = np.array(val)
    # uncomment here to see the signals
#    plt.figure(1)
#    plt.clf()
#    plt.plot(pH_t, s, label='signal')
#    plt.plot(pH_t, s_smooth, label='smoothed signal')
#    plt.plot(*val, 'go', label='climax')
#    plt.xlabel('time')
#    plt.ylabel('value')
#    plt.autoscale(enable=True, axis='x', tight=True)
#    plt.show()

O2_R = np.zeros ([O2.shape[0],2])
for i,s in enumerate(O2): # s is all O2 values for one cycle
    Rm, Rp, idx = respirogram (O2_t, s)
    O2_R[i,0] = np.mean(Rm[idx+1])
    O2_R[i,1] = np.mean(Rp[idx-1])

Regress with ARD and check the weights

N = pH.shape[0]
X = np.zeros([N,4])
X[:,0:2] = pH_M

X[:,2:4] = O2_R
scale = lambda x: (x - np.mean(x,axis=0)) / np.std(x,axis=0) # zscore
#scale = lambda x: (x - x.min(axis=0)) / (x.max(axis=0) - x.min(axis=0)) # baseline
X = scale(X)

## Nonlinear mapping of inputs
# The O2_R zscored  features seem to be have an exponential relation with the output
e_zs_O2R = scale(np.exp(np.abs(X[:,2:4])))
plt.plot(e_zs_O2R, NH4, 'o')

X = np.concatenate((X, e_zs_O2R), axis=1)

## ARD regression
reg = ARDRegression (n_iter=int(1e3), compute_score=True)
reg.fit(X, NH4)
y_pred = reg.predict(X)

print("Mean squared error: %.2f "
      % mean_squared_error(NH4, y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f ' 
      % r2_score(NH4, y_pred))
Mean squared error: 50.74
Variance score: 0.64
_n_ = np.arange(len(reg.coef_))
plt.bar(_n_, reg.coef_)
plt.ylabel('Weight in the model')
name = np.array(['pH\_t\_max', 'pH\_max','O2\_R-', 'O2\_R+',\
plt.xticks(_n_, name)
ARD feature selection
plt.plot(NH4,'o', label='data')
plt.plot(y_pred,'x', label='pred.')
Regression results.