Machine Learning Primer -- Basics




Claudius Gros, WS 2024/25

Institut für theoretische Physik
Goethe-University Frankfurt a.M.

Support Vector Machines - Basics

maximal margins



$\qquad\quad \big\{\mathbf{x}_i,l_i\big\} \qquad l_i=\pm1 $ $\qquad\quad \mathbf{L} = \big(l_0,\,\dots,\,l_{N-1}\big) $ $\qquad\quad \mathbf{w}\cdot\mathbf{x}=b $

margin condition

$$ \begin{array}{rclcrcl} \mathbf{w}\cdot\mathbf{x}_i&\ge& b+\tilde d &\quad\mbox{for}\quad & l_i&=&+1 \\ \mathbf{w}\cdot\mathbf{x}_i&\le& b-\tilde d &\quad\mbox{for}\quad & l_i&=&-1 \\ \end{array} $$ $$ \pm \tilde{d}\ \to\ l_i \qquad\quad \fbox{$ \phantom{\big|}l_i\big(\mathbf{w}\cdot\mathbf{x}_i-b\big)- 1 \ge 0\phantom{\big|}$} \qquad\quad \forall i $$

maximal margin condition

$$ \mathbf{x}= x\frac{\mathbf{w}}{|\mathbf{w}|} \qquad\quad \mathbf{w}\cdot\mathbf{x}=b \qquad\quad x=\frac{b}{|\mathbf{w}|} $$

optimization conditions

$$ \mathrm{min}\,|\mathbf{w}|\qquad\quad l_i\big(\mathbf{w}\cdot\mathbf{x}_i-b\big)- 1 \ge 0 \qquad\quad \forall i $$

support vectors

$$ L_P = \frac{\mathbf{w}^2}{2} -\sum_{i=0}^{N-1} a_i\,\Big[ l_i\big(\mathbf{w}\cdot\mathbf{x}_i-b\big)- 1\Big] $$

minimisation vs. maximisation

support vectors

$$ l_i\big(\mathbf{w}\cdot\mathbf{x}_i-b\big)- 1 \ge 0, \qquad\quad l_s\big(\mathbf{w}\cdot\mathbf{x}_s-b\big)- 1 = 0 $$ $$ -a_i\,\Big[ l_i\big(\mathbf{w}\cdot\mathbf{x}_i-b\big)- 1\big], \qquad\quad a_i\ge0 $$ $$ \fbox{$\phantom{\big|} a_i\to0 \phantom{\big|}$} \qquad\mathrm{for\ non-support\ vectors} $$

dual representation

$$ L_P = \frac{\mathbf{w}^2}{2} -\sum_i a_i l_i\big(\mathbf{w}\cdot\mathbf{x}_i-b\big) +\sum_i a_i $$ $$ \frac{\partial L_P}{\partial\mathbf{w}} = 0 \qquad\quad \Rightarrow \qquad\quad \fbox{$\phantom{\big|} \mathbf{w} = \sum_i a_i l_i\mathbf{x}_i \phantom{\big|}$} $$ $$ \frac{\partial L_P}{\partial b} = 0 \qquad\quad \Rightarrow \qquad\quad \fbox{$\phantom{\big|} \sum_i a_i l_i=0 \phantom{\big|}$} $$ $$ L_D = \frac{\mathbf{w}^2}{2}-\mathbf{w}^2+\sum_i a_i = \sum_i a_i -\frac{1}{2}\sum_{i,j} \big( a_i l_i \mathbf{x}_i\big)\cdot \big( a_j l_j \mathbf{x}_j\big) $$

dual form

$$ \underset{\mathbf{a}}{\mathrm{max}}\left[ \sum_i a_i-\frac{1}{2}\sum_{ij} a_iH_{ij}a_j\right]\,, \qquad a_i\ge 0, \qquad \sum_i a_il_i=0 $$
$$ \fbox{$\phantom{\big|} H_{ij} = l_i\,\mathbf{x}_i\cdot\mathbf{x}_j\,l_j \phantom{\big|}$} $$ $$ -\frac{1}{2}\sum_{ij} a_iH_{ij}a_j = -\frac{1}{2}\mathbf{c}\cdot\mathbf{c}, \qquad\quad \mathbf{c} = \sum_j a_j l_j \mathbf{x}_j $$

hyperplane

$$ \underset{\{a_l\}}{\mathrm{max}}\left[ \sum_i a_i-\frac{1}{2}\sum_{ij} a_iH_{ij}a_j\right]\,, \qquad a_i\ge 0, \qquad \sum_i a_il_i=0 $$
$$ \mathbf{w}=\sum_ia_il_i\mathbf{x}_i =\sum_{m\in S} a_ml_m\mathbf{x}_m $$
$$ l_s\big(\mathbf{w}\cdot\mathbf{x}_s-b\big) = 1, \qquad\quad \mathbf{w}\cdot\mathbf{x}_s-b = l_s $$
$$ b = \sum_{m\in S} a_ml_m\,\mathbf{x}_m\cdot\mathbf{x}_s -l_s $$

numerics

$$ \frac{\partial L_D}{\partial a_k} =1-l_k\mathbf{x}_k\cdot\mathbf{w}, \qquad\quad L_D = \sum_i a_i -\frac{1}{2} \mathbf{w}\cdot\mathbf{w}, \qquad\quad \mathbf{w} = \sum_j a_j l_j \mathbf{x}_j $$

(1) update Lagrange parameters

$$ a_k \ \ \to\ \ a_k+\epsilon_a\big( 1-l_k\mathbf{x}_k\cdot\mathbf{w}) $$

(2) orthogonalize Lagrange parameters

$$ \frac{\partial L_P}{\partial b} = 0, \qquad\quad \sum_i a_i l_i=0, \qquad\quad \fbox{$\phantom{\big|} \mathbf{a}\cdot\mathbf{L}=0 \phantom{\big|}$}\,, \qquad\quad \mathbf{L}=\big(l_0,\,\dots,\,l_{N-1}\big) $$ $$ \fbox{$\phantom{\big|} \mathbf{a}\ \ \to\ \ \mathbf{a} - \mathbf{a}\cdot\mathbf{L}\frac{\mathbf{L}}{\mathbf{L}\cdot\mathbf{L}} \phantom{\big|}$}\,, \qquad\quad \mathbf{L}\cdot\left[\mathbf{a} - \mathbf{a}\cdot\mathbf{L}\frac{\mathbf{L}}{\mathbf{L}\cdot\mathbf{L}} \right]=0, \qquad\quad \mathbf{L}\cdot\mathbf{L} = N $$

(3) special treatment for non-support vectors


steepest ascent SVM code




Copy Copy to clipboad
Downlaod Download
#!/usr/bin/env python3

#!/usr/bin/env python3
#!/usr/bin/env python3

import math                        # math
import copy                        # e.g. deepcopy
import matplotlib.pyplot as plt    # plotting
import numpy as np
from numpy.linalg import inv       # inverse matrix
import ML_covarianceMatrix as CM   # loading user-defined module


# *******
# *** SVM 
# *******

def SVM_ascent(svmData, svmL):
  "Simple SVM code"
  nIter        =   1000         # fixed number of update iterations
  epsilon      =  0.001         # update rate
  svmThreshold =  0.001         # for support vectors
  nData = len(svmL)             # total number of labeled data points
  svmA  = np.random.rand(nData) # random [0,1] Lagrange parameters
  svmW  = np.zeros(2)           # 2D w-vector

  for iIter in range(nIter):                 # loop
    svmW = [0.0, 0.0]
    for ii in range(nData):   
      svmW += svmL[ii]*svmA[ii]*svmData[ii]
#   print(f'# {iIter:5d} {svmW[0]:6.2f} {svmW[1]:6.2f}')

    for ii in range(nData):                  # updating L-parameters
      svmA[ii] += epsilon*(1.0-svmL[ii]*np.dot(svmData[ii],svmW))

    factor = np.dot(svmA,svmL)*1.0/nData
    for ii in range(nData):                  # orthogonalization 
      svmA[ii] = svmA[ii] - factor*svmL[ii]

    for ii in range(nData):                  # positiveness
      svmA[ii] = max(0.0,svmA[ii])
#
# --- iteration loop finished
# --- threshold per support vector
#
  svmB  = np.zeros(nData) 
  BB_mean   = 0.0
  BB_number = 0
  for ii in range(nData):                    # positiveness
    if (svmA[ii]>svmThreshold):
      svmB[ii] = np.dot(svmData[ii],svmW) - svmL[ii]
      BB_mean   += svmB[ii]
      BB_number += 1
  BB_mean = BB_mean/BB_number

  print("# SVM data ")
# print(f'# {svmW[0]:6.2f} {svmW[1]:6.2f}')
  for ii in range(nData):                  
#   if (svmA[ii]>svmThreshold):
      print(f'{ii:5d} {svmA[ii]:10.6f} {svmL[ii]:5.1f} {svmB[ii]:10.4f} ',
            end="")
      print(f'{svmData[ii][0]:6.2f} {svmData[ii][1]:6.2f} ')
#
  return svmA, BB_mean, svmW     # Lagrange / threshold / w-vector
 

# ********
# *** main
# ********

dataMatrix_A  = CM.testData( 0.3*math.pi, 1.0, 9.0, 20, [ 8.0,0.0])
dataMatrix_B  = CM.testData(-0.3*math.pi, 1.0, 9.0, 20, [-8.0,0.0])
mean_A, coVar_A = CM.covarianceMatrix(dataMatrix_A)
mean_B, coVar_B = CM.covarianceMatrix(dataMatrix_B)

#
# --- SVM preparation
#
dataLabel_A = [ 1.0 for _ in range(len(dataMatrix_A))]
dataLabel_B = [-1.0 for _ in range(len(dataMatrix_B))]
data_SVM   = np.vstack((dataMatrix_A,dataMatrix_B))    # vertical stacking
data_label = np.hstack((dataLabel_A,dataLabel_B))      # horizontal stacking
                                                       # do SVM
data_lagrange, data_BB, data_ww = SVM_ascent(data_SVM, data_label)

if (1==2):
  print()
  print(dataMatrix_A)
  print(dataLabel_A)
  print(dataMatrix_B)
  print(dataLabel_B)
  print(data_SVM)
  print(data_label)

#
# --- LDA (for comparison)
#

S_inverse    = inv(coVar_A+coVar_B)
w_vector     = np.matmul(S_inverse,mean_B-mean_A)       # LDA

w_normalized = w_vector / np.sqrt(np.sum(w_vector**2))  # normalization
w_orthogonal = [w_normalized[1], -w_normalized[0]]      # \perp vector

LL = 5.0
midPoint = 0.5*(mean_B+mean_A)
x_lower  = midPoint[0] - LL*w_orthogonal[0]
y_lower  = midPoint[1] - LL*w_orthogonal[1]
x_upper  = midPoint[0] + LL*w_orthogonal[0]
y_upper  = midPoint[1] + LL*w_orthogonal[1]
x_LDA_plane = [x_lower, x_upper]
y_LDA_plane = [y_lower, y_upper]

#
# --- data points (normal, support)
#

xA_normal  = []
yA_normal  = []
xB_normal  = []
yB_normal  = []

xA_support = []
yA_support = []
xB_support = []
yB_support = []

svmThreshold = 0.001
for ii in range(len(data_label)):
  if (data_label[ii]>0) and (data_lagrange[ii]<svmThreshold):
   xA_normal.append(data_SVM[ii][0])
   yA_normal.append(data_SVM[ii][1])
  if (data_label[ii]<0) and (data_lagrange[ii]<svmThreshold):
   xB_normal.append(data_SVM[ii][0])
   yB_normal.append(data_SVM[ii][1])
  if (data_label[ii]>0) and (data_lagrange[ii]>svmThreshold):
   xA_support.append(data_SVM[ii][0])
   yA_support.append(data_SVM[ii][1])
  if (data_label[ii]<0) and (data_lagrange[ii]>svmThreshold):
   xB_support.append(data_SVM[ii][0])
   yB_support.append(data_SVM[ii][1])


#
# --- SVM plane and margins
#

svmPlane_x = [0.0, 0.0]
svmPlane_y = [0.0, 0.0]
svmMar_A_x = [0.0, 0.0]
svmMar_A_y = [0.0, 0.0]
svmMar_B_x = [0.0, 0.0]
svmMar_B_y = [0.0, 0.0]
r  = math.sqrt(data_ww[0]*data_ww[0]+data_ww[1]*data_ww[1])
rr = r*r
Len = 4.0
svmPlane_x[0] = (data_BB-0.0)*data_ww[0]/rr + Len*data_ww[1]/r
svmPlane_y[0] = (data_BB-0.0)*data_ww[1]/rr - Len*data_ww[0]/r
svmPlane_x[1] = (data_BB-0.0)*data_ww[0]/rr - Len*data_ww[1]/r
svmPlane_y[1] = (data_BB-0.0)*data_ww[1]/rr + Len*data_ww[0]/r

svmMar_A_x[0] = (data_BB-1.0)*data_ww[0]/rr + Len*data_ww[1]/r
svmMar_A_y[0] = (data_BB-1.0)*data_ww[1]/rr - Len*data_ww[0]/r
svmMar_A_x[1] = (data_BB-1.0)*data_ww[0]/rr - Len*data_ww[1]/r
svmMar_A_y[1] = (data_BB-1.0)*data_ww[1]/rr + Len*data_ww[0]/r

svmMar_B_x[0] = (data_BB+1.0)*data_ww[0]/rr + Len*data_ww[1]/r
svmMar_B_y[0] = (data_BB+1.0)*data_ww[1]/rr - Len*data_ww[0]/r
svmMar_B_x[1] = (data_BB+1.0)*data_ww[0]/rr - Len*data_ww[1]/r
svmMar_B_y[1] = (data_BB+1.0)*data_ww[1]/rr + Len*data_ww[0]/r

#
# --- printing, including covariance ellipse
#

xEllipse_A, yEllipse_A = CM.plotEllipseData(coVar_A)
xEllipse_B, yEllipse_B = CM.plotEllipseData(coVar_B)  

Z_95 = math.sqrt(5.991)                  
xE_95_A = [Z_95*xx + mean_A[0] for xx in xEllipse_A]
yE_95_A = [Z_95*yy + mean_A[1] for yy in yEllipse_A]
xE_95_B = [Z_95*xx + mean_B[0] for xx in xEllipse_B]
yE_95_B = [Z_95*yy + mean_B[1] for yy in yEllipse_B]

x_connectMean = [ mean_A[0], mean_B[0] ]
y_connectMean = [ mean_A[1], mean_B[1] ]

plt.plot(xE_95_A, yE_95_A, "k", linewidth=2.0, label="95%")
plt.plot(xE_95_B, yE_95_B, "b", linewidth=2.0, label="95%")

plt.plot(xA_normal, yA_normal, "ok", markersize=5)
plt.plot(xB_normal, yB_normal, "ob", markersize=5)
plt.plot(xA_support, yA_support, "ok", markersize=10, mfc='none')
plt.plot(xB_support, yB_support, "ob", markersize=10, mfc='none')

plt.plot(x_connectMean, y_connectMean, "r", linewidth=3.0)
plt.plot(x_connectMean, y_connectMean, "or", markersize=9.0)

plt.plot(svmPlane_x, svmPlane_y, color=[1.0,0.84,0.0], 
         linewidth=4.0, label="SVM plane")
plt.plot(svmMar_A_x, svmMar_A_y, color=[1.0,0.84,0.0], 
         linewidth=4.0, linestyle="dashed", label="SVM margin")
plt.plot(svmMar_B_x, svmMar_B_y, color=[1.0,0.84,0.0],
         linewidth=4.0, linestyle="dashed", label="")

plt.plot(x_LDA_plane, y_LDA_plane, "g", linewidth=2.0, label="LDA plane")

plt.legend(loc="upper left")
#plt.axis('square')                           # square plot
plt.savefig('foo.svg')                       # export figure
plt.show()