Machine Learning Primer -- Python Tutorial

Claudius Gros, WS 2024/25

Institut für theoretische Physik
Goethe-University Frankfurt a.M.

Acceleration / Functional Programming

never use loops

tensor operations are optimized
:: loops not
torch.matmul(A,B) last two indices
A: [...,R,C]
B: [...,C,S]
:: many matrices at once, $\quad \sum_{C}A_{...,R,C}\cdot B_{...,C,S}$

#!/usr/bin/env python3

import torch

# Problem size: nMatrix independent products (nRow x nCol) @ (nCol x nS).
nMatrix = 50
nRow = 40
nCol = 30
nS = 20

# All matrices are stacked along a leading index into one tensor each.
AA = torch.randn(nMatrix, nRow, nCol)
BB = torch.randn(nMatrix, nCol, nS)

# Result buffer; only needed as an accumulator by the explicit-loop variant.
YY = torch.zeros(nMatrix, nRow, nS)

# Switch between the single tensor call and the hand-written loops.
useTensorOp = True

if useTensorOp:
    # matmul multiplies over the last two indices, for all matrices at once
    YY = torch.matmul(AA, BB)
else:
    # the same contraction spelled out, one scalar product at a time
    for ii in range(nMatrix):
        for nn in range(nRow):
            for mm in range(nS):
                for ll in range(nCol):
                    YY[ii, nn, mm] += AA[ii, nn, ll] * BB[ii, ll, mm]

# Report the shapes involved.
print()
for label, tensor in (("AA", AA), ("BB", BB), ("YY", YY)):
    print(label, tensor.shape)
print()
print(f'we did multiply {nMatrix:d} matrices')
print(f'of type ({nRow:d}x{nCol:d}) and ({nCol:d}x{nS:d})')

vectorized maps

torch.vmap(f) for functions
f=f(x) with tensor arguments
:: adds an additional dimension
:: avoids loops → performance
example of functional programming
:: JAX alternative to PyTorch
:: emphasis on functional computing
example: full vs. row by row normalization
:: compare slide 'dividing tensors' $$ x\ \ \to\ \ \frac{x}{|x|}, \quad\qquad |x|^2 = \sum_i (x_i)^2 $$

#!/usr/bin/env python3

import torch
import random

def normalizeVector(x):
    """Return x divided by its Euclidean norm.

    The norm is the square root of the sum of squares over ALL entries
    of x, so a multi-dimensional tensor is normalized as a whole.
    """
    norm = x.pow(2).sum().sqrt()
    return x / norm

# main

SEPARATOR = "#====================\n"

print("original tensor")
x = torch.rand(4, 2)                 # alternative input: torch.arange(8).view(4,2)
print(x)
print(SEPARATOR)

# The whole tensor is treated as one vector: the squared entries
# of the result sum to one.

print("fully normalized tensor")
allNormal = normalizeVector(x)
print("|allNormal|^2 : ", allNormal.pow(2).sum().item())
print(allNormal)
print(SEPARATOR)

# Explicit loop: every row is normalized on its own. The rows are taken
# from allNormal, which differs from x only by a common scale factor.

print("looping over rows manually")
for ii, row in enumerate(allNormal):
    rowVector = normalizeVector(row)
    print("|rowVector|^2 : ", ii, rowVector.pow(2).sum().item())
print(SEPARATOR)

# Same row-wise normalization without a loop: vmap applies the
# function to every slice along the leading dimension of x.

print("row by row normalization with vmap")
g = torch.vmap(normalizeVector)
rowNormal = g(x)
for ii, row in enumerate(rowNormal):
    print("|rowNormal|^2 : ", ii, row.pow(2).sum().item())
print(rowNormal)
print(SEPARATOR)

#!/usr/bin/env python3

import torch
import random

def normalizeVector(x):
    """Return x divided by its Euclidean norm.

    The norm is the square root of the sum of squares over ALL entries
    of x, so a multi-dimensional tensor is normalized as a whole.
    """
    norm = x.pow(2).sum().sqrt()
    return x / norm

# main

SEPARATOR = "#====================\n"

print("original tensor")
x = torch.rand(4, 2)                 # alternative input: torch.arange(8).view(4,2)
print(x)
print(SEPARATOR)

# The whole tensor is treated as one vector: the squared entries
# of the result sum to one.

print("fully normalized tensor")
allNormal = normalizeVector(x)
print("|allNormal|^2 : ", allNormal.pow(2).sum().item())
print(allNormal)
print(SEPARATOR)

# Explicit loop: every row is normalized on its own. The rows are taken
# from allNormal, which differs from x only by a common scale factor.

print("looping over rows manually")
for ii, row in enumerate(allNormal):
    rowVector = normalizeVector(row)
    print("|rowVector|^2 : ", ii, rowVector.pow(2).sum().item())
print(SEPARATOR)

# Same row-wise normalization without a loop: vmap applies the
# function to every slice along the leading dimension of x.

print("row by row normalization with vmap")
g = torch.vmap(normalizeVector)
rowNormal = g(x)
for ii, row in enumerate(rowNormal):
    print("|rowNormal|^2 : ", ii, row.pow(2).sum().item())
print(rowNormal)
print(SEPARATOR)

conditional tensor operations

torch.where() element-wise conditional operation
:: avoids loop over elements
random masks for element-wise stochastic operations

#!/usr/bin/env python3

# conditional tensor operations

import torch

# --- conditional mapping: 1.0 for positive entries, 0.0 otherwise ---
xIn = torch.randn(6)
isPositive = xIn > 0
xOut = torch.where(isPositive, 1.0, 0.0)
print("xIn \n", xIn)
print("xOut\n", xOut)
print()

# --- conditional substitution: keep even entries, replace odd ones by 1 ---
yIn = torch.arange(10)
yOnes = torch.ones(10, dtype=torch.int32)   # integer ones
yIsEven = yIn % 2 == 0
yOut = torch.where(yIsEven, yIn, yOnes)

print("yIn   \n", yIn)
print("yOnes \n", yOnes)
print("yOut  \n", yOut)
print()

# --- element-wise stochastic operation ---
# A random 0/1 mask decides what replaces the odd entries.
zIn = torch.arange(10)
uniform = torch.rand(10)                     # uniform random numbers
randMask = torch.where(uniform < 0.5, 1, 0)  # thresholded to 0 or 1
zIsEven = zIn % 2 == 0
zOut = torch.where(zIsEven, zIn, randMask)

print("zIn      \n", zIn)
print("randMask \n", randMask)
print("zOut     \n", zOut)

basic threading

thread
:: asynchronous process for processing part of the code
:: newly generated with the program
:: controlled by the 'main' process
fork
:: divide entire program into two
:: generates two independent processes

#!/usr/bin/env python3

import threading   # nomen est omen
import time        # sleep, etc.

#
# function to run asynchronous
#
def do_work(tNumber=0, sleepSeconds=10):
    """Body of one demo thread.

    Thread number 0 just sleeps and then returns. Every other thread
    enters an endless floating-point loop and therefore never reaches
    the final print; it keeps running until the program is killed.

    Parameters:
        tNumber: index of the thread, used for the messages and to
            select between the sleeping and the busy variant.
        sleepSeconds: how long thread 0 sleeps, in seconds.
    """
    # print() does not apply %-formatting to its arguments, hence f-strings
    print(f"starting  thread # {tNumber}")
    rr = 1.0
    if tNumber == 0:
        time.sleep(sleepSeconds)     # idle thread
    else:
        while True:                  # doing heavy stuff, never terminates
            rr = 1.0/(1.0+rr)
    print(f"finishing thread # {tNumber}")

#
# main
#
# Create and launch five worker threads, numbered 0..4.
allThreads = []
for i in range(5):
    worker = threading.Thread(target=do_work, args=(i,))
    allThreads.append(worker)
    worker.start()
#
# Block until every thread has returned. Threads whose number is not 0
# loop forever inside do_work, so this wait does not end on its own.
for worker in allThreads:
    worker.join()
#
print("\n# all done folks")

automatic threading

automatic threading with torch / numpy
:: may use all available CPU if not controlled
tensors yes, lists no
:: for primary compute

#!/usr/bin/env python3

import torch
import os                          # operating system

# Problem size: nMatrix products of (nRow x nCol) with (nCol x nS) matrices.
nMatrix = 500
nRow    = 400
nCol    = 300
nS      = 200

# Default to a single thread; raised below if enough CPUs are present.
torch.set_num_threads(1)           # for cpu-based hardware
nCPU = os.cpu_count()              # number of cpu, None if undetermined
# os.cpu_count() may return None, which cannot be compared with an int
if nCPU is not None and nCPU > 2:
  torch.set_num_threads(nCPU-2)    # leave two for other uses
#
print("\n# number of CPU, threads: ",nCPU,torch.get_num_threads())
#
# Random input batches with the sizes defined above.
AA = torch.randn(nMatrix, nRow, nCol)
BB = torch.randn(nMatrix, nCol, nS)
#
# Repeat the batched product many times to generate CPU load;
# the counter is printed every 50 iterations as a progress marker.
nRepeat = 1000
for ii in range(nRepeat):
    if not ii % 50:
        print(ii)
    YY = AA @ BB

exponential forking

1 → 2 → 4 → ...
PID process identifier

#!/usr/bin/env python3

import os                         # underlying operation system

# Each os.fork() duplicates the running process, so the code after the
# first fork runs in two processes and after the second fork in four.
# Every process carries its own copy of the variable aa.
aa = 10
print()
print("main PID:", os.getpid())   # pid of main
# flush before forking: text still sitting in the output buffer would
# otherwise be copied into the child and could be printed twice
print(flush=True)

newPid = os.fork()                # 0 in the child, child's pid in the parent
myPid = os.getpid()               # pid after forking

aa = aa + 2
print(f"   myPID: {myPid:6d},  newPid: {newPid:6d},                    aa: {aa:6d}",
      flush=True)

newNewPid = os.fork()             # second fork, executed by both processes
myPid = os.getpid()

aa = aa + 10
# report the result of the second fork under the newNewPid label
print(f"   myPID: {myPid:6d},  newPid: {newPid:6d}, newNewPid: {newNewPid:6d}, aa: {aa:6d}")

a functional deep network

functions used in functional programming are 'pure'
:: no dependencies other than the arguments
→ can be reused
normally, class functions are not pure
torch.nn.functional functional module
log-softmax $\ =\ \log(\mathrm{softmax})$

$$ \mathrm{LogSoftmax}(x_i) = \log\left( \frac{\exp(x_i)}{\sum_k\exp(x_k)} \right) $$

leaky ReLU

$$ \mathrm{LeakyReLU}(x_i) = \left\{\begin{array}{rl} x_i & \mathrm{for}\ \ x_i>0\\[0.5ex] \gamma\, x_i & \mathrm{otherwise} \end{array}\right. \quad\qquad \gamma=0.01 $$

#!/usr/bin/env python3

# simple example of a functional network
# size defined by arguments

# leaky_relu(input, negative_slope=0.01) 

import torch
import torch.nn.functional as F

def functionalNetwork(x, w1, w2):
    """Two-layer network written as a pure function of its arguments.

    No state is kept: the weights are passed in, so the layer sizes are
    determined entirely by the shapes of w1 and w2.

    Parameters:
        x:  input batch, one sample per row.
        w1: weights of the first linear layer (no bias).
        w2: weights of the second linear layer (no bias).

    Returns:
        Log-softmax of the second layer's output, taken along dim=1.
    """
    # hidden layer; leaky ReLU with its default slope (F.relu also works)
    hidden = F.leaky_relu(F.linear(x, w1))
    logits = F.linear(hidden, w2)
    return F.log_softmax(logits, dim=1)

# usage: the network size follows from the shapes of the weight tensors
nSamples, nFeatures, nHidden, nClasses = 64, 1000, 500, 10

x = torch.randn(nSamples, nFeatures)     # input batch
w1 = torch.randn(nHidden, nFeatures)     # first layer weights
w2 = torch.randn(nClasses, nHidden)      # second layer weights

output = functionalNetwork(x, w1, w2)