Advanced Introduction to C++, Scientific Computing and Machine Learning




Claudius Gros, WS 2019/20

Institut für theoretische Physik
Goethe-University Frankfurt a.M.

Machine Learning - Linear Classifier

machine learning vs. AI

what is intelligence?

learning

ML toolboxes

sample problem settings

unsupervised learning

clustering

dimensionality reduction

supervised learning

classification

regression

linear classifier



$\displaystyle\qquad\quad y= f(\mathbf{w}\cdot\mathbf{x}-b), \qquad\quad \mathbf{w}\cdot\mathbf{x}=\sum_{j=0}^{d-1} w_j x_j $

hyperplane

$$ \mathbf{x}\qquad \mathrm{such\ that} \qquad \fbox{$\phantom{\big|}\mathbf{w}\cdot\mathbf{x}=b\phantom{\big|}$} $$ $$ \begin{array}{rcl} \mathrm{right-side} &:& \mathbf{w}\cdot\mathbf{x}> b \\ \mathrm{left-side} &:& \mathbf{w}\cdot\mathbf{x}< b \end{array} $$ $$ y(z<0)\ \to\ -1,\quad\qquad y(z>0)\ \to\ 1,\quad\qquad z=\mathbf{w}\cdot\mathbf{x}-b $$

task

training / testing sets

supervised learning


$\displaystyle \qquad \begin{array}{rcl} \mathrm{first\ feature} &:& \{\mathbf{x}_\alpha\}, \qquad \alpha\in[1,N_1] \\ \mathrm{second\ feature} &:& \{\mathbf{x}_\beta\}, \qquad \beta\in[N_1+1,N] \end{array} $




center of mass of individual classes

$$ \mathbf{m}_1 = \frac{1}{N_1}\sum_\alpha \mathbf{x}_\alpha, \quad\qquad \mathbf{m}_2 = \frac{1}{N_2}\sum_\beta \mathbf{x}_\beta $$

information theory - covariance matrix



probability distribution


$\displaystyle\qquad\quad p(\mathbf{x})\ge0, \qquad\qquad \int p(\mathbf{x})\,d\mathbf{x}=1 $

covariance matrix

$$ S_{ij} = \big\langle (x_i-m_i)(x_j-m_j)\big\rangle $$
$$ \hat S = \big\langle (\mathbf{x}-\mathbf{m})\,(\mathbf{x}-\mathbf{m})^T\big\rangle, \qquad\quad S_{ij} = (\hat{S})_{ij}, \qquad\quad \mathbf{x}=\left(\begin{array}{c} x_0\\ \vdots \\ x_{d-1} \end{array}\right) $$

information theory - multivariate Gaussian

diagonal case

$$ \hat{S}=\left(\begin{array}{ccc} \sigma_0^2 & \dots & 0 \\ \vdots & \ddots & \vdots \\ 0 & \dots & \sigma_{d-1}^2 \end{array}\right), \qquad\quad \sigma_k^2 = \big\langle (x_i-m_i)^2\big\rangle $$ $$ |\hat{S}| = \prod_k \sigma_k^2, \qquad\quad \hat{S}^{-1}=\left(\begin{array}{ccc} 1/\sigma_0^2 & \dots & 0 \\ \vdots & \ddots & \vdots \\ 0 & \dots & 1/\sigma_{d-1}^2 \end{array}\right) $$ $$ N(\mathbf{x}) = \prod_k \frac{\exp(-(x_k-m_k)^2/(2\sigma_k^2))}{\sqrt{2\pi\sigma_k^2}} $$

statistics of feature

in terms of the covariance matrix

$$ \fbox{$\phantom{\big|}\displaystyle\sigma_\alpha^2= \mathbf{w}^T \hat S_\alpha \mathbf{w}\phantom{\big|}$}\,, \qquad\quad \hat S_\alpha = \big\langle (\mathbf{x}_\alpha-\mathbf{m}_\alpha)\, (\mathbf{x}_\alpha-\mathbf{m}_\alpha)^T\big\rangle $$

linear discriminant analysis (LDA)

Fisher objective function

task

$$ S = \frac{(\mu_1-\mu_2)^2}{\sigma_1^2+\sigma_2^2} = \frac{(\mathbf{w}\cdot(\mathbf{m}_1-\mathbf{m}_2))^2} {\mathbf{w}^T (\hat{S}_1+\hat{S}_2) \mathbf{w}} = \frac{\mathbf{w}^T(\mathbf{m}_1-\mathbf{m}_2) (\mathbf{m}_1-\mathbf{m}_2)^T\mathbf{w}} {\mathbf{w}^T (\hat{S}_1+\hat{S}_2) \mathbf{w}} $$ $$ S = \frac{\mathbf{w}^T\hat{A}\mathbf{w}} {\mathbf{w}^T \hat{B}\mathbf{w}}, \qquad\quad \hat{A}=(\mathbf{m}_1-\mathbf{m}_2)(\mathbf{m}_1-\mathbf{m}_2)^T, \qquad\quad \hat{B}=\hat{S}_1+\hat{S}_2 $$

LDA - maximization

$$ S = \frac{\mathbf{w}^T\hat{A}\mathbf{w}} {\mathbf{w}^T \hat{B}\mathbf{w}}, \qquad\quad \hat{A}=(\mathbf{m}_1-\mathbf{m}_2)(\mathbf{m}_1-\mathbf{m}_2)^T, \qquad\quad \hat{B}=\hat{S}_1+\hat{S}_2 $$

proportionality factors

$$ \mathbf{w}^T\hat{A}=\mathbf{w}^T \big((\mathbf{m}_1-\mathbf{m}_2)(\mathbf{m}_1-\mathbf{m}_2)^T\big) \propto (\mathbf{m}_1-\mathbf{m}_2)^T $$

regularization

LDA threshold


miss-classifications

$$ E= \int_{-\infty}^b p_2(z)\,dz + \int_b^{\infty} p_1(z)\,dz $$ $$ \frac{\partial E}{\partial b} = 0, \qquad\quad \fbox{$\displaystyle\phantom{\big|} p_1(b) = p_2(b)\phantom{\big|}$} $$

LDA standalone code

Copy Copy to clipboad
Downlaod Download
#include <iostream>
#include <sstream>     // for stringstream 
#include <stdio.h>     // for printf
#include <string>      // c_str()
#include <cmath>       // for abs
#include <fstream>     // file streams
#include <stdlib.h>    // srand, rand
using namespace std;

/* contains
 *
 * double nextRandom()
 * string xyValuesGnu()
 * void plotViaGnuplot()
 * void gaussElimination()
 * void generateClassificationData()
 * void calcCoVarMatrices()
 * void evaluatePlane()
 */


// ***
// *** random number in [0,1] 
// ***
double nextRandom()
{ static bool firstTime = true;
 if (firstTime)
   {
   srand( (unsigned)time( NULL ) );
   firstTime = false;
   }
 return (rand()/(double)RAND_MAX); 
} // end of nextRandom()


// ***
// *** (x,y) string for inline data input for gnuplot
// ***

template<int N>
string xyValuesGnu(double (&xyValues)[2][N], string dataName)
{
 stringstream ss;
 ss << dataName << " << " << "EOD\n";       // defining data block
 for (int i=0; i<N; i++)
   ss << xyValues[0][i] << " " << xyValues[1][i] << endl;
 ss << "EOD\n";                             // terminating data block
//
 return ss.str();
} // end of xyValuesGnu()


// *** 
// *** plot data via gnuplot
// *** 

void plotViaGnuplot(string dataNameOne,  string dataToPlotOne,
                    string dataNameTwo,  string dataToPlotTwo,
                    string dataNamePlane,string dataToPlotPlane,
                    string dataNameBasis,string dataToPlotBasis)
{
 FILE *pipeGnu = popen("gnuplot", "w");            // streaming to gnuplot
 fprintf(pipeGnu, "set term qt persist size 1000,800\n");
 fprintf(pipeGnu, "set nokey\n");                  // no legend
//
 fprintf(pipeGnu, "%s\n",dataToPlotOne.c_str());   
 fprintf(pipeGnu, "%s\n",dataToPlotTwo.c_str());   
 fprintf(pipeGnu, "%s\n",dataToPlotPlane.c_str()); // send data block
 fprintf(pipeGnu, "%s\n",dataToPlotBasis.c_str()); // send data block
//                                                 // plot data
 fprintf(pipeGnu,   "plot \"%s\" w points pt 5 ps 2 \n", dataNameOne.c_str());
 fprintf(pipeGnu, "replot \"%s\" w points pt 5 ps 2 \n", dataNameTwo.c_str());
 fprintf(pipeGnu, "replot \"%s\" w lines lw 7 \n", dataNamePlane.c_str());
 fprintf(pipeGnu, "replot \"%s\" w linesp pt 7 ps 4 lw 4 \n", dataNameBasis.c_str());
//
 fclose(pipeGnu);    // closing the pipe to gnuplot
} // end of plotViaGnuplot


// *** Gauss elimination of Ax=b with pivoting.
// *** On input quadratic matrix A and vector b.
// *** On ouput solution x.
 
template<int N>
void gaussElimination(double (&A)[N][N], double (&b)[N], double (&x)[N]) 
{
 double temp;                       // temp variable
 const double smallNumber = 1e-10;  // a small number
//
//--- loop over all columns A[][col];  A[][col]
 for (int col=0; col<N; col++) 
   {             
//--- find pivot row; swap pivot with actualy row
   int pivotRow = col; 
   for (int row=col+1;row<N;row++) 
     if ( abs(A[row][col]) > abs(A[pivotRow][col]) ) 
       pivotRow = row;
 
//--- swap row A[col][] with A[pivotRow][]
   for (int i=0;i<N;i++)
     {
     temp = A[col][i];
     A[col][i] = A[pivotRow][i];
     A[pivotRow][i] = temp;
     }
 
//--- swap elements b[col] with b[pivotRow]
   temp = b[col];  
   b[col] = b[pivotRow]; 
   b[pivotRow] = temp;
 
//--- throw a warning if matrix is singular or nearly singular
   if (abs(A[col][col]) <= smallNumber) 
     cerr << "Matrix is singular or nearly singular" << endl;
 
//--- Gauss with pivot within A and b
   for (int row=col+1;row<N;row++) 
     {
     double alpha = A[row][col] / A[col][col];  // divide by pivot
     b[row] -= alpha*b[col];
     for (int j=col;j<N;j++) 
       A[row][j] -= alpha * A[col][j];
     }
      }  // end of loop over all columns
 
//--- back substitution
 for (int i=N-1;i>=0;i--) 
   {
   double sum = 0.0;
   for (int j=i+1;j<N;j++) 
      sum += A[i][j] * x[j];
   x[i] = (b[i] - sum) / A[i][i];   // pivots on diagonal after swapping
   }
}  // end of gaussElimination()
 
 
// ***
// *** generate data to be classified; only for (dim==2)
// ***
template<int nOne, int nTwo>
void generateClassificationData(double dataClassOne[2][nOne], double dataClassTwo[2][nTwo]) 
{
 double angleOne =  0.75*M_PI;          // angles of the large axis of the data with
 double angleTwo =  0.75*M_PI;          // respect to the x-axis
 double L1One[2], L1Two[2];             // large eigen-directions of data distribution
 double L2One[2], L2Two[2];             // minor eigen-directions of data distribution
//
 double L1OneLength = 1.5;              // respective lengths
 double L2OneLength = 0.2;              // respective lengths
 double L1TwoLength = 1.0;         
 double L2TwoLength = 0.2;        
//
 double rr, ss;
//
 L1One[0] =  L1OneLength*cos(angleOne);
 L1One[1] =  L1OneLength*sin(angleOne);
 L2One[0] =  L2OneLength*sin(angleOne);   // L2 orthognal to L1
 L2One[1] = -L2OneLength*cos(angleOne);
//
 L1Two[0] =  L1TwoLength*cos(angleTwo);
 L1Two[1] =  L1TwoLength*sin(angleTwo);
 L2Two[0] =  L2TwoLength*sin(angleTwo);   
 L2Two[1] = -L2TwoLength*cos(angleTwo);
//
 for (int iOne=0; iOne<nOne; iOne++)     // generate data for first class
  {
  rr = 2.0*nextRandom()-1.0;
  ss = 2.0*nextRandom()-1.0;
  dataClassOne[0][iOne] =  1.0 + rr*L1One[0] + ss*L2One[0];  // x component
  dataClassOne[1][iOne] =  0.0 + rr*L1One[1] + ss*L2One[1];  // y component
  }
//
 for (int iTwo=0; iTwo<nTwo; iTwo++)     // generate data for second class
  {
  rr = 2.0*nextRandom()-1.0;
  ss = 2.0*nextRandom()-1.0;
  dataClassTwo[0][iTwo] = -1.0 + rr*L1Two[0] + ss*L2Two[0];
  dataClassTwo[1][iTwo] =  0.0 + rr*L1Two[1] + ss*L2Two[1]; 
  }
// test printing
 if (1==2)
   {
   for (int iOne=0; iOne<nOne; iOne++)   
     cout <<  dataClassOne[0][iOne] << " " << dataClassOne[1][iOne] << endl;
   cout << endl;
   for (int iTwo=0; iTwo<nTwo; iTwo++)   
     cout <<  dataClassTwo[0][iTwo] << " " << dataClassTwo[1][iTwo] << endl;
   }
}    // end of generateClassificationData()
 

// ***
// *** calculate the covariance matrices
// ***
template<int nOne, int nTwo>
void calcCoVarMatrices(double dataClassOne[2][nOne], double dataClassTwo[2][nTwo],
                       double coVarOne[2][2], double coVarTwo[2][2],
                       double mOne[2], double mTwo[2], 
                       double S[2][2], double deltaM[2])
{
 mOne[0] = 0;
 mOne[1] = 0;
 mTwo[0] = 0;
 mTwo[1] = 0;
//
 for (int iOne=0; iOne<nOne; iOne++)   
   {
   mOne[0] += dataClassOne[0][iOne]/nOne;
   mOne[1] += dataClassOne[1][iOne]/nOne;
   }
 for (int iTwo=0; iTwo<nTwo; iTwo++)   
   {
   mTwo[0] += dataClassTwo[0][iTwo]/nTwo;
   mTwo[1] += dataClassTwo[1][iTwo]/nTwo;
   }
//
   deltaM[0] = mTwo[0] - mOne[0];
   deltaM[1] = mTwo[1] - mOne[1];
//
 for (int iOne=0; iOne<nOne; iOne++)   
   for (int i=0; i<2; i++) 
     for (int j=0; j<2; j++) 
       coVarOne[i][j] += (dataClassOne[i][iOne]-mOne[i])*
                         (dataClassOne[j][iOne]-mOne[j])/nOne;
 for (int iTwo=0; iTwo<nTwo; iTwo++)   
   for (int i=0; i<2; i++) 
     for (int j=0; j<2; j++) 
       coVarTwo[i][j] += (dataClassTwo[i][iTwo]-mTwo[i])*
                         (dataClassTwo[j][iTwo]-mTwo[j])/nTwo;
//
 for (int i=0; i<2; i++) 
   for (int j=0; j<2; j++) 
     S[i][j] = coVarOne[i][j] + coVarTwo[i][j];
//
} // end of calcCoVarMatrices()


// ***
// *** evaluate the plane
// ***
void evaluatePlane(double w[2], double dataPlane[2][2])
{
 double rr = sqrt(w[0]*w[0]+w[1]*w[1]);
 w[0] = w[0]/rr;
 w[1] = w[1]/rr;        // normalization
//
 dataPlane[0][0] =  1.8*w[1];      // orthogonal
 dataPlane[1][0] = -1.8*w[0];
 dataPlane[0][1] = -dataPlane[0][0]; 
 dataPlane[1][1] = -dataPlane[1][0];
//
// test printing
 if (1==1)
   cout <<  w[0] << " " << w[1] << endl;
} // end of evaluatePlane()


// ***
// *** main
// ***
int main() 
{
 double dataClassOne[2][900];        // the data points 
 double dataClassTwo[2][800];
 double dataBasis[2][2];             // connecting the center of masses
//
 dataBasis[0][0] = -1.0;             // (-1,0)
 dataBasis[1][0] =  0.0;    
 dataBasis[0][1] =  1.0;             // (1,0)
 dataBasis[1][1] =  0.0;    
//
 double mOne[2];                     // center of masses
 double mTwo[2];                
 double deltaM[2];                   // mTwo-mOne
 double w[2];                        // feature vector
 double dataPlane[2][2];             // orthogonal to the feature vector
//
 double coVarOne[2][2];              // covariance matrices
 double coVarTwo[2][2];
 double        S[2][2];              // sum of covariance matrices
//
 generateClassificationData(dataClassOne,dataClassTwo); 
//
 calcCoVarMatrices(dataClassOne, dataClassTwo, coVarOne, coVarTwo,
                   mOne, mTwo, S, deltaM);               
//
 gaussElimination(S,deltaM,w);             // solve    "S deltaM = w"
 evaluatePlane(w, dataPlane);              // w defines the plane
//
 string dataNameOne   = "$dataOne";                             // dataName
 string dataNameTwo   = "$dataTwo";                             
 string dataNamePlane = "$dataPlane";                             
 string dataNameBasis = "$dataBasis";                             
 string dataToPlotOne   = xyValuesGnu(dataClassOne,dataNameOne);  // data to string
 string dataToPlotTwo   = xyValuesGnu(dataClassTwo,dataNameTwo);  
 string dataToPlotPlane = xyValuesGnu(dataPlane,dataNamePlane);  
 string dataToPlotBasis = xyValuesGnu(dataBasis,dataNameBasis);  
//
 plotViaGnuplot(dataNameOne  ,dataToPlotOne,
                dataNameTwo  ,dataToPlotTwo, 
                dataNamePlane,dataToPlotPlane,
                dataNameBasis,dataToPlotBasis);
 return 1;
}  // end of main()