Machine Learning Primer -- Basics




Claudius Gros, WS 2024/25

Institut für theoretische Physik
Goethe-University Frankfurt a.M.

Machine Learning - Linear Classifier

linear classifier



$\qquad\quad y= f(\mathbf{w}\cdot\mathbf{x}-b), \qquad\quad \mathbf{w}\cdot\mathbf{x}=\sum_{j=0}^{d-1} w_j x_j $

hyperplane

$$ \mathbf{x}\qquad \mathrm{such\ that} \qquad \fbox{$\phantom{\big|}\mathbf{w}\cdot\mathbf{x}=b\phantom{\big|}$} $$

$$ \begin{array}{rcl} \mathrm{right\ side} &:& \mathbf{w}\cdot\mathbf{x}> b \\ \mathrm{left\ side} &:& \mathbf{w}\cdot\mathbf{x}< b \end{array} $$

$$ y(z<0)\ \to\ -1, \qquad\quad y(z>0)\ \to\ 1, \qquad\quad z=\mathbf{w}\cdot\mathbf{x}-b $$
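
a minimal sketch of the decision rule in code (function and variable names are illustrative, not part of the lecture code):

import numpy as np

def linearClassifier(x, w_vector, b_threshold):
  "returns -1/+1 depending on the side of the hyperplane w.x = b"
  z = np.dot(w_vector, x) - b_threshold    # z = w.x - b
  return 1 if z > 0.0 else -1

w_vector    = np.array([1.0, 2.0])
b_threshold = 0.5
print(linearClassifier(np.array([ 1.0, 1.0]), w_vector, b_threshold))   #  1
print(linearClassifier(np.array([-1.0,-1.0]), w_vector, b_threshold))   # -1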

task


training / testing sets

supervised learning


$ \qquad \begin{array}{rcl} \mathrm{first\ feature} &:& \{\mathbf{x}_\alpha\}, \qquad \alpha\in[1,N_1] \\ \mathrm{second\ feature} &:& \{\mathbf{x}_\beta\}, \qquad \beta\in[N_1+1,N] \end{array} $
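
as a sketch, stacking two labeled classes and splitting off a testing set (the 80/20 ratio and all names are assumptions for illustration):

import numpy as np

rng    = np.random.default_rng(42)
N1, N2 = 60, 40
class1 = rng.normal( 2.0, 1.0, size=(N1, 2))    # samples x_alpha
class2 = rng.normal(-2.0, 1.0, size=(N2, 2))    # samples x_beta

data   = np.vstack((class1, class2))            # N = N1 + N2 points
labels = np.concatenate((np.ones(N1), -np.ones(N2)))

perm   = rng.permutation(N1+N2)                 # shuffle before splitting
nTrain = int(0.8*(N1+N2))                       # 80% training, 20% testing
xTrain, xTest = data[perm[:nTrain]], data[perm[nTrain:]]
yTrain, yTest = labels[perm[:nTrain]], labels[perm[nTrain:]]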




center of mass of individual classes

$$ \mathbf{m}_1 = \frac{1}{N_1}\sum_\alpha \mathbf{x}_\alpha, \quad\qquad \mathbf{m}_2 = \frac{1}{N_2}\sum_\beta \mathbf{x}_\beta $$
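
the class means in code, a self-contained sketch (class data is randomly generated for illustration):

import numpy as np

rng    = np.random.default_rng(0)
class1 = rng.normal( 2.0, 1.0, size=(60, 2))    # {x_alpha}, N1 = 60
class2 = rng.normal(-2.0, 1.0, size=(40, 2))    # {x_beta},  N2 = 40

m_1 = class1.mean(axis=0)                       # (1/N1) sum_alpha x_alpha
m_2 = class2.mean(axis=0)                       # (1/N2) sum_beta  x_beta
print("m_1 =", m_1, "  m_2 =", m_2)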

covariance matrix



probability distribution


$\qquad\quad p(\mathbf{x})\ge0, \qquad\qquad \int p(\mathbf{x})\,d\mathbf{x}=1 $
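
a quick numerical sanity check of the normalization, with a 1D standard normal as stand-in:

import numpy as np

x  = np.linspace(-8.0, 8.0, 2001)               # integration grid
p  = np.exp(-0.5*x**2)/np.sqrt(2.0*np.pi)       # standard normal density
dx = x[1] - x[0]
print("integral =", p.sum()*dx)                 # ~ 1.0 (Riemann sum)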

covariance matrix

$$ S_{ij} = \big\langle (x_i-m_i)(x_j-m_j)\big\rangle $$
$$ \hat S = \big\langle (\mathbf{x}-\mathbf{m})\,(\mathbf{x}-\mathbf{m})^T\big\rangle, \qquad\quad S_{ij} = (\hat{S})_{ij}, \qquad\quad \mathbf{x}=\left(\begin{array}{c} x_0\\ \vdots \\ x_{d-1} \end{array}\right) $$
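
the sample estimate of $\hat S$ in code, cross-checked against numpy's built-in estimator (np.cov with bias=True divides by N; the data is illustrative):

import numpy as np

rng  = np.random.default_rng(1)
data = rng.normal(size=(1000, 2))               # N data points, d = 2
m    = data.mean(axis=0)                        # mean vector

S = (data - m).T @ (data - m)/len(data)         # averaged outer products
print(S)
print(np.cov(data.T, bias=True))                # same matrix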

covariance ellipse

$$ \frac{x^2}{a^2}+\frac{y^2}{b^2} = 1, \qquad\quad \left\{\begin{array}{rcl} x(t) &=& \sqrt{\lambda_1}\cos\Theta\cos t -\sqrt{\lambda_2}\sin\Theta\sin t \\[0.1ex] y(t) &=& \sqrt{\lambda_1}\sin\Theta\cos t +\sqrt{\lambda_2}\cos\Theta\sin t \end{array}\right. $$

$$ \mathbf{x} = \hat{V} \cdot \hat{\Lambda} \cdot \mathbf{T}, \qquad\quad \mathbf{x} = {x\choose y}, \qquad\quad \mathbf{T} = {\cos t\choose \sin t} $$

$$ \hat{V}=\left(\begin{array}{cc} \cos\Theta & -\sin\Theta \\ \sin\Theta & \cos\Theta \end{array}\right), \qquad\quad \hat{\Lambda}=\left(\begin{array}{cc} \sqrt{\lambda_1} & 0 \\ 0 & \sqrt{\lambda_2} \end{array}\right) $$


plotting a covariance ellipse




#!/usr/bin/env python3
import math                      # math
import numpy as np               # numerics
import matplotlib.pyplot as plt  # plotting
from numpy import linalg as LA   # linear algebra

# ***
# *** covariance matrix from data
# ***

def covarianceMatrix(data):
  '''normalized covariance matrix of input data'''
  nRow = len(data)                     # number of data points
  nCol = len(data[0])                  # dimension
  mean = np.zeros(nCol)                # mean accumulator (numpy vector)
  for thisRow in data:                 # loop over data points
    mean += thisRow*1.0/nRow           # running vector-mean
#
  coVar = np.zeros((nCol,nCol))        # empty matrix
  for thisRow in data:
    dataPoint = thisRow - mean         # shifted data point
    coVar += np.outer(dataPoint,dataPoint)/nRow    # outer product
#
  if (1==2):
    print("\ninput data\n", data)
    print("nRow, nCol\n", nRow, nCol)
    print("mean\n", mean)
  print("\n covariance matrix: measured \n",coVar)
#
  return mean, coVar                   # returning both

# ***
# *** generate data for plotting an ellipse
# ***

def plotEllipseData(data):
  "generates ellipse from symmetric 2x2 slice of the input matrix"
#
  slice22 = [inner[:2] for inner in data[:2]]   # slice
  slice22[0][1] = slice22[1][0]                 # symmetrize
                                                # ^ should not be necessary
  eigenValues, eigenVectors = LA.eig(slice22)   # eigensystem
#
  if (eigenValues[0]<0.0) or (eigenValues[1]<0.0):
    print("# plotEllipseData: eigenvalues (variances) must be positive")
    return [], []                                # empty ellipse
#
  if (1==2):
    print("\n# data        \n", data   )
    print("\n# slice22     ", slice22)
    print("\n# eigenValues ", eigenValues)
    print("\n# eigenVectors\n",eigenVectors)
#
  a = math.sqrt(eigenValues[0])
  b = math.sqrt(eigenValues[1])
  cTheta = eigenVectors[0][0]            # first eigenvector is
  sTheta = eigenVectors[1][0]            # column 0 of eigenVectors
  x = []
  y = []
  for i in range(nPoints:=101):          # walrus assignment
    tt = i*2.0*math.pi/(nPoints-1)       # full loop
    cc = math.cos(tt)
    ss = math.sin(tt)
    xx = a*cTheta*cc - b*sTheta*ss
    yy = a*sTheta*cc + b*cTheta*ss
    x.append(xx)
    y.append(yy)
#   print(xx,yy)
  return x, y

# ***
# *** generate 2D test data
# ***

def testData(angle, var1, var2, nData, startMean=[0.0,0.0]):
  r"""2D Gaussian for a given angle and main variances;
      A = \sum_i \lambda_i |\lambda_i><\lambda_i|"""
#
  eigen1 = [math.cos(angle),-math.sin(angle)]
  eigen2 = [math.sin(angle), math.cos(angle)]
  startCoVar  = var1*np.outer(eigen1,eigen1)
  startCoVar += var2*np.outer(eigen2,eigen2)
  print("\n covariance matrix: data generation \n",startCoVar)
  return np.random.multivariate_normal(startMean, startCoVar, nData)

# ***
# *** main (with a block)
# ***

if (__name__ == "__main__"):                 # runs only when executed directly,
                                             # not on import
#
  dataMatrix = testData(0.3*math.pi, 1.0, 9.0, 100)
  mean, coVar = covarianceMatrix(dataMatrix)

  if (1==2):
    print("\n#main: data\n", dataMatrix)
    print("#main: mean\n", mean)
    print("\n#main: coVar matrix \n",coVar)

  xEllipse, yEllipse = plotEllipseData(coVar)  # coVar-ellipse

  xData = [thisRow[0] for thisRow in dataMatrix]
  yData = [thisRow[1] for thisRow in dataMatrix]

  Z_90 = math.sqrt(4.605)                      # 90% confidence
  Z_95 = math.sqrt(5.991)                      # 95% confidence
  Z_99 = math.sqrt(9.210)                      # 99% confidence
                                               # (chi-squared quantiles, 2 dof)
#
  xE_90 = [Z_90*xx + mean[0] for xx in xEllipse]
  yE_90 = [Z_90*yy + mean[1] for yy in yEllipse]
  xE_99 = [Z_99*xx + mean[0] for xx in xEllipse]
  yE_99 = [Z_99*yy + mean[1] for yy in yEllipse]
#
  plt.plot(xE_90, yE_90, label="90%")
  plt.plot(xE_99, yE_99, label="99%")
  plt.plot(xData, yData, "ob", markersize=5)
#
  plt.legend(loc="upper left")
  plt.axis('square')                           # square plot
  plt.savefig('foo.svg')                       # export figure
  plt.show()

Python intermezzo

#!/usr/bin/env python3

import math                        # math
import matplotlib.pyplot as plt    # plotting
import ML_covarianceMatrix as CM   # loading user-defined module


dataMatrix  = CM.testData(0.3*math.pi, 1.0, 9.0, 100)
mean, coVar = CM.covarianceMatrix(dataMatrix)

xEllipse, yEllipse = CM.plotEllipseData(coVar)  # coVar-ellipse

xData = [thisRow[0] for thisRow in dataMatrix]
yData = [thisRow[1] for thisRow in dataMatrix]

Z_90 = math.sqrt(4.605)                      # 90% confidence
Z_95 = math.sqrt(5.991)                      # 95% confidence
Z_99 = math.sqrt(9.210)                      # 99% confidence

xE_90 = [Z_90*xx + mean[0] for xx in xEllipse]
yE_90 = [Z_90*yy + mean[1] for yy in yEllipse]
xE_99 = [Z_99*xx + mean[0] for xx in xEllipse]
yE_99 = [Z_99*yy + mean[1] for yy in yEllipse]

plt.plot(xE_90, yE_90, label="90%")
plt.plot(xE_99, yE_99, label="99%")
plt.plot(xData, yData, "ob", markersize=5)

plt.legend(loc="upper left")
plt.axis('square')                           # square plot
plt.savefig('foo.svg')                       # export figure
plt.show()

multivariate Gaussians

diagonal case

$$ \hat{S}=\left(\begin{array}{ccc} \sigma_0^2 & \dots & 0 \\ \vdots & \ddots & \vdots \\ 0 & \dots & \sigma_{d-1}^2 \end{array}\right), \qquad\quad \sigma_k^2 = \big\langle (x_k-m_k)^2\big\rangle $$

$$ |\hat{S}| = \prod_k \sigma_k^2, \qquad\quad \hat{S}^{-1}=\left(\begin{array}{ccc} 1/\sigma_0^2 & \dots & 0 \\ \vdots & \ddots & \vdots \\ 0 & \dots & 1/\sigma_{d-1}^2 \end{array}\right) $$

$$ N(\mathbf{x}) = \prod_k \frac{\exp(-(x_k-m_k)^2/(2\sigma_k^2))}{\sqrt{2\pi\sigma_k^2}} $$
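
a numerical cross-check that for diagonal $\hat S$ the product form equals the general matrix expression $N(\mathbf{x})\propto\exp(-(\mathbf{x}-\mathbf{m})^T\hat S^{-1}(\mathbf{x}-\mathbf{m})/2)$ (all values are illustrative):

import numpy as np

sigma = np.array([1.0, 2.0, 0.5])               # sigma_k
m     = np.array([0.0, 1.0,-1.0])               # mean m_k
x     = np.array([0.5, 0.0, 0.0])               # evaluation point

prodForm = np.prod(np.exp(-(x-m)**2/(2.0*sigma**2))
                   /np.sqrt(2.0*np.pi*sigma**2))

S       = np.diag(sigma**2)                     # diagonal covariance
norm    = np.sqrt((2.0*np.pi)**len(x)*np.linalg.det(S))
matForm = np.exp(-0.5*(x-m) @ np.linalg.inv(S) @ (x-m))/norm
print(prodForm, matForm)                        # identical values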

statistics of features

variance of the projected coordinate $z=\mathbf{w}\cdot\mathbf{x}$, in terms of the covariance matrix of class $\alpha$

$$ \fbox{$\phantom{\big|}\sigma_\alpha^2= \mathbf{w}^T \hat S_\alpha \mathbf{w}\phantom{\big|}$} \qquad\quad \hat S_\alpha = \big\langle (\mathbf{x}_\alpha-\mathbf{m}_\alpha)\, (\mathbf{x}_\alpha-\mathbf{m}_\alpha)^T\big\rangle $$
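
this identity is easily verified numerically: project sampled data onto $\mathbf{w}$ and compare variances (the data and $\mathbf{w}$ are illustrative):

import numpy as np

rng  = np.random.default_rng(2)
data = rng.multivariate_normal([0.0,0.0], [[2.0,0.8],[0.8,1.0]], 5000)
w    = np.array([0.6, 0.8])

z = data @ w                                    # projected data
S = np.cov(data.T, bias=True)                   # estimated covariance
print(z.var(), w @ S @ w)                       # nearly identical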

linear discriminant analysis (LDA)

Fisher objective function

task

with the projected means $\mu_i=\mathbf{w}\cdot\mathbf{m}_i$ and variances $\sigma_i^2=\mathbf{w}^T\hat{S}_i\mathbf{w}$, the Fisher objective is

$$ S = \frac{(\mu_1-\mu_2)^2}{\sigma_1^2+\sigma_2^2} = \frac{(\mathbf{w}\cdot(\mathbf{m}_1-\mathbf{m}_2))^2} {\mathbf{w}^T (\hat{S}_1+\hat{S}_2) \mathbf{w}} = \frac{\mathbf{w}^T(\mathbf{m}_1-\mathbf{m}_2) (\mathbf{m}_1-\mathbf{m}_2)^T\mathbf{w}} {\mathbf{w}^T (\hat{S}_1+\hat{S}_2) \mathbf{w}} $$

$$ S = \frac{\mathbf{w}^T\hat{A}\mathbf{w}} {\mathbf{w}^T \hat{B}\mathbf{w}}, \qquad\quad \hat{A}=(\mathbf{m}_1-\mathbf{m}_2)(\mathbf{m}_1-\mathbf{m}_2)^T, \qquad\quad \hat{B}=\hat{S}_1+\hat{S}_2 $$
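
the objective as a function, a short sketch with assumed class statistics:

import numpy as np

def fisherScore(w, m_1, m_2, S_1, S_2):
  "between-class separation over within-class spread along w"
  separation = (w @ (m_1-m_2))**2               # (mu_1 - mu_2)^2
  spread     = w @ (S_1+S_2) @ w                # sigma_1^2 + sigma_2^2
  return separation/spread

m_1, m_2 = np.array([ 2.0, 0.0]), np.array([-2.0, 0.0])
S_1 = S_2 = np.array([[1.0, 0.0], [0.0, 4.0]])
print(fisherScore(np.array([1.0, 0.0]), m_1, m_2, S_1, S_2))   # 8.0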

LDA - maximization

$$ S = \frac{\mathbf{w}^T\hat{A}\mathbf{w}} {\mathbf{w}^T \hat{B}\mathbf{w}}, \qquad\quad \hat{A}=(\mathbf{m}_1-\mathbf{m}_2)(\mathbf{m}_1-\mathbf{m}_2)^T, \qquad\quad \hat{B}=\hat{S}_1+\hat{S}_2 $$

proportionality factors

$$ \mathbf{w}^T\hat{A}=\mathbf{w}^T \big((\mathbf{m}_1-\mathbf{m}_2)(\mathbf{m}_1-\mathbf{m}_2)^T\big) \propto (\mathbf{m}_1-\mathbf{m}_2)^T $$

setting the gradient $\partial S/\partial\mathbf{w}$ to zero yields $\hat{A}\mathbf{w}\propto\hat{B}\mathbf{w}$, and hence

$$ \fbox{$\phantom{\big|}\mathbf{w}\ \propto\ (\hat{S}_1+\hat{S}_2)^{-1}(\mathbf{m}_1-\mathbf{m}_2)\phantom{\big|}$} $$
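
a brute-force cross-check of the closed form: scanning directions $\mathbf{w}(\varphi)$ on the unit circle recovers $(\hat S_1+\hat S_2)^{-1}(\mathbf{m}_1-\mathbf{m}_2)$, up to sign (the numbers are illustrative):

import numpy as np

m_1, m_2 = np.array([2.0, 1.0]), np.array([-1.0, 0.0])
B = np.array([[2.0, 0.5], [0.5, 1.0]])          # S_1 + S_2

def score(w):                                   # Fisher objective
  return (w @ (m_1-m_2))**2/(w @ B @ w)

phis = np.linspace(0.0, np.pi, 10000)           # w and -w are equivalent
ws   = np.stack((np.cos(phis), np.sin(phis)), axis=1)
best = ws[np.argmax([score(w) for w in ws])]    # best scanned direction

w_lda  = np.linalg.inv(B) @ (m_1-m_2)           # closed form
w_lda /= np.linalg.norm(w_lda)
print(best, w_lda)                              # same direction (up to sign)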

regularization


LDA threshold


misclassifications

$$ E= \int_{-\infty}^b p_2(z)\,dz + \int_b^{\infty} p_1(z)\,dz $$

$$ \frac{\partial E}{\partial b} = 0 \qquad\Rightarrow\qquad \fbox{$\phantom{\big|} p_1(b) = p_2(b)\phantom{\big|}$} $$
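
the condition $p_1(b)=p_2(b)$ can be solved by bisection between the projected class means, here for two illustrative 1D Gaussians:

import numpy as np

def gauss(z, mu, sigma):                        # 1D Gaussian density
  return np.exp(-(z-mu)**2/(2.0*sigma**2))/np.sqrt(2.0*np.pi*sigma**2)

mu_1, sigma_1 =  2.0, 1.0                       # projected class 1
mu_2, sigma_2 = -2.0, 1.5                       # projected class 2

lo, hi = mu_2, mu_1                             # bracket the crossing
for _ in range(60):                             # bisection on p_1 - p_2
  b = 0.5*(lo+hi)
  if gauss(b, mu_1, sigma_1) > gauss(b, mu_2, sigma_2):
    hi = b                                      # crossing lies below b
  else:
    lo = b
print("threshold b =", b)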

basic LDA code



#!/usr/bin/env python3

import math                        # math
import matplotlib.pyplot as plt    # plotting
import numpy as np                 # numerics
from numpy.linalg import inv       # inverse matrix
import ML_covarianceMatrix as CM   # loading user-defined module


#
# --- data generation
#

dataMatrix_A  = CM.testData( 0.4*math.pi, 1.0, 9.0, 30, [ 6.0,0.0])
dataMatrix_B  = CM.testData(-0.4*math.pi, 1.0, 9.0, 30, [-6.0,0.0])
mean_A, coVar_A = CM.covarianceMatrix(dataMatrix_A)  
mean_B, coVar_B = CM.covarianceMatrix(dataMatrix_B)  

#
# --- LDA
#

S_inverse    = inv(coVar_A+coVar_B)
w_vector     = np.matmul(S_inverse,mean_B-mean_A)       # LDA

w_normalized = w_vector / np.sqrt(np.sum(w_vector**2))  # normalization
w_orthogonal = [w_normalized[1], -w_normalized[0]]      # \perp vector

LL = 5.0
midPoint = 0.5*(mean_B+mean_A)
x_lower  = midPoint[0] - LL*w_orthogonal[0]
y_lower  = midPoint[1] - LL*w_orthogonal[1]
x_upper  = midPoint[0] + LL*w_orthogonal[0]
y_upper  = midPoint[1] + LL*w_orthogonal[1]
x_LDA_plane = [x_lower, x_upper]
y_LDA_plane = [y_lower, y_upper]

if (1==2):
  print("\n#main w_orthogonal\n", w_orthogonal)
  print("\n#main midPoint    \n", midPoint    )

#
# --- printing
#

xEllipse_A, yEllipse_A = CM.plotEllipseData(coVar_A)  # coVar-ellipse
xEllipse_B, yEllipse_B = CM.plotEllipseData(coVar_B)  # coVar-ellipse

xData_A = [thisRow[0] for thisRow in dataMatrix_A]
yData_A = [thisRow[1] for thisRow in dataMatrix_A]
xData_B = [thisRow[0] for thisRow in dataMatrix_B]
yData_B = [thisRow[1] for thisRow in dataMatrix_B]

Z_95 = math.sqrt(5.991)                      # 95% confidence  
 
xE_95_A = [Z_95*xx + mean_A[0] for xx in xEllipse_A]
yE_95_A = [Z_95*yy + mean_A[1] for yy in yEllipse_A]
xE_95_B = [Z_95*xx + mean_B[0] for xx in xEllipse_B]
yE_95_B = [Z_95*yy + mean_B[1] for yy in yEllipse_B]

x_connectMean = [ mean_A[0], mean_B[0] ]
y_connectMean = [ mean_A[1], mean_B[1] ]
 
plt.plot(xE_95_A, yE_95_A, "k", linewidth=2.0, label="95% (A)")
plt.plot(xE_95_B, yE_95_B, "b", linewidth=2.0, label="95% (B)")
plt.plot(xData_A, yData_A, "ok", markersize=5)
plt.plot(xData_B, yData_B, "ob", markersize=5)

plt.plot(x_connectMean, y_connectMean, "r", linewidth=3.0)
plt.plot(x_connectMean, y_connectMean, "or", markersize=9.0)

plt.plot(x_LDA_plane, y_LDA_plane, "g", linewidth=4.0, label="LDA plane")
 
plt.legend(loc="upper left")
#plt.axis('square')                           # square plot
plt.savefig('foo.svg')                       # export figure
plt.show()