Machine Learning Primer -- Python Tutorial




Claudius Gros, WS 2024/25

Institut für theoretische Physik
Goethe-University Frankfurt a.M.

PyTorch

PyTorch example

#!/usr/bin/env python3

import torch                     # PyTorch instead of NumPy
import math
import matplotlib.pyplot as plt


myType   = torch.float
myDevice = torch.device("cpu")   # "cuda:0" for GPU; not activated

# global parameters
nData = 2000                     # number of training pairs
nIter = 2000                     # number of training iterations
nPar  =    4                     # number of fit parameters

learning_rate = 0.5e-2/nData     # relative learning rate
fitPar = []                      # empty list; fit parameters
for i in range(nPar):            # randn() : normal distribution
  fitPar.append(torch.randn((), device=myDevice, dtype=myType))
print(fitPar)

def fitFunction(x):              # polynomial fitting function 
  sum = 0.0
  for i in range(nPar):
    sum += fitPar[i]*(x**i)
  return sum

# linspace returns a one-dimensional tensor
x = torch.linspace(-math.pi, math.pi, nData, device=myDevice, dtype=myType)
y = torch.sin(x)                 # target function y = sin(x)

# training iteration
for iIter in range(nIter):
  y_pred = fitFunction(x)                  # tensor; element-wise
  loss   = torch.square(y_pred - y).sum()  # sum of squared elements

  if iIter % 100 == 99:                    # test printout
    print(f'{iIter:5d}  {loss:10.6f}')

  grad_y_pred = 2.0 * (y_pred - y)         # error signal
  for i in range(nPar):                    # least-square fit
    gradient = ( grad_y_pred*(x**i) ).sum()
    fitPar[i] -= learning_rate * gradient

# showing result
plt.plot(x, torch.sin(x)             , 'b', label="sin(x)")
plt.plot(x, fitFunction(x)           , 'r', label="polynomial fit")
plt.plot(x, 0.0*x                    , '--k')
plt.legend()
plt.show()
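The by-hand parameter update in the loop above is plain gradient descent on the summed squared error; written out for the polynomial coefficients f_k,

$$ E = \sum_x \big(y_{\rm pred}(x)-\sin(x)\big)^2, \qquad \frac{\partial E}{\partial f_k} = 2\sum_x \big(y_{\rm pred}(x)-\sin(x)\big)\,x^k, \qquad f_k \to f_k - \eta\,\frac{\partial E}{\partial f_k} $$

which is exactly what the lines involving grad_y_pred and the update with learning_rate implement.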

automatic gradient evaluation




#!/usr/bin/env python3

import torch                     # PyTorch needs to be installed

dim = 2
eps = 0.1
x = torch.ones(dim, requires_grad=True)  # leaf of computational graph
print("x           : ",x)
print("x.data      : ",x.data)
print("x[0]        : ",x[0])
print("x[0].item() : ",x[0].item())
print()

y = x + 2
out = torch.dot(y,y)             # scalar product
print("y      : ",y)
print("out    : ",out)
print()

out.backward()                   # backward pass --> gradients
print("x.grad : ",x.grad)

with torch.no_grad():            # detach from computational graph
  x -= eps*x.grad                # updating parameter tensor
  x.grad = None                  # flush

print("x      : ",x.data)

print("\n#---")
print("#--- .backward() adds new gradient to old gradient")
print("#---             convenient for batch updating")
print("#---\n")

y = torch.zeros(dim, requires_grad=True)  
torch.dot(y+1,y+1).backward()
print("y.grad : ",y.grad)
torch.dot(y+1,y+1).backward()
print("y.grad : ",y.grad)
torch.dot(y+1,y+1).backward()
print("y.grad : ",y.grad)
torch.dot(y+1,y+1).backward()
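# the four .backward() calls above kept adding to y.grad;
# when accumulation is not wanted, the old gradient has to be flushed first
# (a minimal sketch, reusing the tensor y from above)
y.grad = None                              # flush, as done for x above
torch.dot(y+1,y+1).backward()
print("y.grad : ",y.grad)                  # single-pass gradient again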

least square fit

$$ y = \sin(x) \approx \sum_{k=0}^{{\rm nPar}-1} f_k x^k $$
#!/usr/bin/env python3

import torch                   
import math
import matplotlib.pyplot as plt


myType   = torch.float
myDevice = (
    "cuda"                       # for GPUs
    if torch.cuda.is_available()
    else "mps"                   # 'MultiProcessor Specification'
    if torch.backends.mps.is_available()
    else "cpu"                   # plain old CPU
)

# global parameters
nData = 2000                     # number of training pairs
nIter = 2000                     # number of training iterations
nPar  =    4                     # number of fit parameters
learning_rate = 0.5e-2/nData    

# gradients with respect to fitPar[] to be evaluated
fitPar = []                      # list of 1x1 tensors
for i in range(nPar):      
  fitPar.append(torch.randn((), device=myDevice, dtype=myType,\
                                requires_grad=True))  
print(fitPar)

x = torch.linspace(-math.pi, math.pi, nData, device=myDevice, dtype=myType)
y = torch.sin(x)                 # element-wise

def fitFunction(x):              # polynomial fitting function
  sum = 0.0
  for i in range(nPar):
    sum += fitPar[i]*(x**i)      # element-wise, x is a tensor
  return sum                     # returns a tensor

# training iteration
for iIter in range(nIter):
  y_pred     = fitFunction(x)              # forward pass
  lossTensor = (y_pred - y).pow(2).sum()   # element-wise pow(2)

  if iIter % 100 == 99:                    # print scalar loss value
    print(f'{iIter:5d}  {lossTensor.item():10.6f}')

# backward pass 
# calculates gradients, viz 'tensor.grad',
# with respect to tensors with "requires_grad=True"
  lossTensor.backward()                    

# temporarily 'detaching' all tensors for by-hand updating
# the value of   fitPar[i].grad   is not affected
  with torch.no_grad():        
    for i in range(nPar):                  # gradients via backward pass
      fitPar[i] -= learning_rate * fitPar[i].grad
      fitPar[i].grad = None

# "detach" tensors requiring gradients from computation graph
plt.plot(x, torch.sin(x)                    , 'b', label="sin(x)")
plt.plot(x, fitFunction(x).detach().numpy() , 'r', label="fit")
plt.plot(x, 0.0*x                           , '--k')
plt.legend()
plt.show()
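The explicit update under torch.no_grad() can also be delegated to one of PyTorch's built-in optimizers, which are not otherwise used in this tutorial. A minimal sketch, assuming the fitPar, fitFunction, x, y and hyperparameters from the script above:

optimizer = torch.optim.SGD(fitPar, lr=learning_rate)   # fitPar: list of leaf tensors
for iIter in range(nIter):
  loss = (fitFunction(x) - y).pow(2).sum()   # forward pass
  optimizer.zero_grad()                      # flush old gradients
  loss.backward()                            # backward pass
  optimizer.step()                           # gradient-descent step on all fitPar[i]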

inplace operations

#!/usr/bin/env python3

import torch    

a = torch.randn(5, requires_grad=True)
b = 2 * a
c = b ** 2          # non-linear
#c = b + 2          # linear
print(b._version)
b += 1              # an inplace operation
                    # b changed outside the computation graph
print(b._version)
print()
c.sum().backward()
print("a:      ", a)
print("a.grad: ", a.grad)

PyTorch modules

#!/usr/bin/env python3

import torch
import math
import matplotlib.pyplot as plt

#
# wrapping (x+p)^2 inside a module
#
class MyLayer(torch.nn.Module):   # inheritance
  def __init__(self, p):          # constructor
    super().__init__()            
    self.p = p

  def forward(self, x):           # define forward pass
    return torch.dot(x+self.p,x+self.p)

#
# main start
#
myLayerObject = MyLayer(2.0)      # instantiation
input = torch.ones(3,requires_grad=True) 
output = myLayerObject(input)     # forward pass (implicit)
output.backward()                 
print("\n# input")
print(input)
print("\n# output")
print(output)
print("\n# input.grad")
print(input.grad)
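The printed gradient can be checked by hand: the module evaluates the scalar product of x+p with itself,

$$ {\rm out} = \sum_i (x_i+p)^2, \qquad \frac{\partial\,{\rm out}}{\partial x_i} = 2\,(x_i+p) = 2\,(1+2) = 6 $$

for every component of input, in agreement with input.grad.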

a network layer

$$ \tanh(x), \quad\qquad \sigma(x)=\frac{1}{1+\exp(-x)}, \quad\qquad \mathrm{ReLU}(x) = \mathrm{max}(0,x) $$
#!/usr/bin/env python3

import torch
import math
import matplotlib.pyplot as plt

#
# relu = max(0,x)  layer (rectified linear)
#
class MyLayer(torch.nn.Module):   # inheritance
  def __init__(self, dim):        # constructor
    super().__init__()
    self.weights = torch.randn(dim,dim,requires_grad=True)

  def forward(self, x):           # default forward pass
    return torch.relu(torch.matmul(self.weights,x))

  def forward_tanh(self, x):      # alternative forward pass
    return torch.tanh(torch.matmul(self.weights,x))

  def update(self, eps):          # updating weights
    with torch.no_grad():
      self.weights -= eps*self.weights.grad
      self.weights.grad = None

#
# a single training pair  (myData,myValue)
#
dim           = 4
nIter         = 1000
learningRate  = 1.0e-2
myLayerObject = MyLayer(dim)                 # instantiation

myData  = torch.FloatTensor(dim).uniform_()
myValue = torch.relu(torch.FloatTensor(dim).uniform_())

print("\n# output")
for iIter in range(nIter):                    # training loop
#
  output = myLayerObject(myData)              # forward pass (implicit)
# output = myLayerObject.forward(myData)      # forward pass (explicit)
# output = myLayerObject.forward_tanh(myData) # forward pass (specific)
#
  loss   = (output-myValue).pow(2).sum()
  loss.backward()                             # backward pass
  myLayerObject.update(learningRate)          # weight updating
  print(output.data)
print("\n# myValue")
print(myValue)
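The activation functions listed at the top of this section are available as element-wise torch operations; a quick stand-alone check (the tensor xs is introduced only here):

xs = torch.linspace(-2.0, 2.0, 5)
print(torch.tanh(xs))            # tanh(x)
print(torch.sigmoid(xs))         # 1/(1+exp(-x))
print(torch.relu(xs))            # max(0,x)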

tensor reshaping

#!/usr/bin/env python3

import torch
import math
import matplotlib.pyplot as plt

uu = torch.ones(3)
xx = torch.ones(3,5)
print("\n# uu")
print(uu)
print("\n# xx")
print(xx)
print("\n# uu unsqueezed")
print(torch.unsqueeze(uu, 1))
print("\n# xx plus uu unsqueezed along 1")
print(xx+torch.unsqueeze(uu, 1))
#
aa = torch.arange(4.0)
print("\n# arranged\n", aa)
aa = torch.reshape(aa, (2, 2))
print("\n# reshaped\n", aa)
aa = torch.reshape(aa, (-1,))
print("\n#   ..back\n", aa)

dividing tensors

#!/usr/bin/env python3

import torch

aa = torch.arange(10.0).reshape(2,5)
bb = torch.arange(5) + 1.0
cc = torch.divide(aa,bb)
print("aa")
print(aa)
print("bb")
print(bb)
print("cc")
print(cc)
print()

print("# ======================")
print("# row-wise normalization")
print("# ======================")
aaSum   = aa.sum(1)                     # aaSum = aa.pow(2).sum(1) 
aaTrans = torch.divide(aa.transpose(0,1),aaSum)
aaNorm  = torch.transpose(aaTrans,0,1)
print("asSum")
print(aaSum)
print("asTrans")
print(aaTrans)
print("aaNorm")
print(aaNorm)
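The two transposes can be avoided by unsqueezing the vector of row sums instead, letting broadcasting do the row-wise division; a minimal sketch reproducing aaNorm (aaNorm2 is introduced only here):

aaNorm2 = aa / aaSum.unsqueeze(1)       # (2,5) divided element-wise by (2,1)
print(aaNorm2)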

batch processing

#!/usr/bin/env python3

import torch
import math
import matplotlib.pyplot as plt

#
# tanh layer module
#
class MyLayer(torch.nn.Module):      
  def __init__(self, dim, yesTheta): 
    super().__init__()
    self.weights  = torch.randn(dim,dim,requires_grad=True)
    self.theta    = torch.randn(dim,    requires_grad=True)
    self.yesTheta = yesTheta          # 1/0 with/without thresholds

  def forward(self, x):               # unsqueezing threshold vector
    tt = torch.unsqueeze(self.yesTheta*self.theta, 1)
    return torch.tanh(torch.matmul(self.weights,x)-self.yesTheta*tt)

  def update(self, eps):              # updating internal parameters
    with torch.no_grad():
      self.weights     -= eps*self.weights.grad
      self.theta       -= eps*self.theta.grad*self.yesTheta
      self.weights.grad = None
      self.theta.grad   = None

#
# a single training pair  (myData,myValue)
#
dim           = 4
nData         = 3
nIter         = 1000
learningRate  = 5.0e-2
myLayerObject = MyLayer(dim, 1.0)            # instantiation

myData  = torch.FloatTensor(dim,nData).uniform_()
myValue = torch.relu(torch.FloatTensor(dim,nData).uniform_())

print("\n# output")
for iIter in range(nIter):                   # training loop
  output = myLayerObject(myData)             # forward pass (implicit)
  loss   = (output-myValue).pow(2).sum()
  loss.backward()                            # backward pass
  myLayerObject.update(learningRate)         # weight updating
  print(loss.item())
print("\n# output")
print(output.data)
print("\n# myValue")
print(myValue)
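Batch processing here simply means that the data tensor holds one column per training pair; a quick shape check, using the objects defined above:

print(myData.shape)                                        # (dim, nData) = (4, 3)
print(torch.matmul(myLayerObject.weights, myData).shape)   # (4, 3); all pairs at once
print(torch.unsqueeze(myLayerObject.theta, 1).shape)       # (4, 1); broadcasts over the batch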

Boolean mapping network


bijective mapping
$$ \hspace{2ex}\begin{array}{cc|cc} X & Y & \mbox{XOR} & Y \\[0.1ex] \hline + & + & - & + \\ + & - & + & - \\ - & + & + & + \\ - & - & - & - \end{array}\hspace{2ex} $$
#!/usr/bin/env python3

import torch
import math
import matplotlib.pyplot as plt

# global parameters
nData    =  4                      # number of training pairs
nLayer   =  2                      # number of layers
unitsPerLayer = 2
b        =  0.9                    # (+b) / (-b)  : logical True/False

nIter   =    6000                  # training iterations
learning_rate = 1.5e-2

#
# tanh layer module, could be non-square (dim_out != dim_in)
#
class TanhLayer(torch.nn.Module):            # inheritance
  def __init__(self, dim_out, dim_in):       # constructor
    super().__init__()            
    self.weights  = torch.randn(dim_out,dim_in,requires_grad=True)
    self.theta    = torch.randn(dim_out,       requires_grad=True)

  def forward(self, x):            # define forward pass
    return torch.tanh(torch.matmul(self.weights,x)-self.theta)

  def update(self, eps):           # updating internal parameters
    with torch.no_grad():
      self.weights     -= eps*self.weights.grad
      self.theta       -= eps*self.theta.grad
      self.weights.grad = None
      self.theta.grad   = None

#
# n-identical layer model
#
allLayers = [TanhLayer(unitsPerLayer, unitsPerLayer) for _ in range(nLayer)]
def model(x):
  for iLayer in range(nLayer):     
    x = allLayers[iLayer](x)
  return x

#
# ( nData | unitsPerLayer )  tensor of training data
# element-wise mapping of uniform distribution [0,1] to binary
# automatic casting of boolean  (..>..)  to  0/1
#
allTraining_data   = torch.FloatTensor(nData,unitsPerLayer).uniform_()
allTraining_value  = torch.FloatTensor(nData,unitsPerLayer).uniform_()
for iData in range(nData):
  for unit in range(2):                   # boolean first two units
    allTraining_data[iData][unit] =\
      b*(2.0*(allTraining_data[iData][unit]>0.5)-1.0)

#
# (cross-identity | XOR) for the first two units
#

if (unitsPerLayer>0) and (nData>0) and (1==1): 
  allTraining_data[0][0]  =  b
  allTraining_data[0][1]  =  b
  allTraining_value[0][0] =  b 
  allTraining_value[0][1] = -b 

if (unitsPerLayer>0) and (nData>1) and (1==1): 
  allTraining_data[1][0]  =  b
  allTraining_data[1][1]  = -b
  allTraining_value[1][0] = -b
  allTraining_value[1][1] =  b

if (unitsPerLayer>0) and (nData>2) and (1==1): 
  allTraining_data[2][0]  = -b
  allTraining_data[2][1]  =  b
  allTraining_value[2][0] =  b
  allTraining_value[2][1] =  b

if (unitsPerLayer>0) and (nData>3) and (1==1): 
  allTraining_data[3][0]  = -b
  allTraining_data[3][1]  = -b
  allTraining_value[3][0] = -b
  allTraining_value[3][1] = -b
if (1==1):
  print("\n# traing data/value")
  print(allTraining_data)
  print(allTraining_value)

#
# explicit sum allows for experiments
#
def lossFunction(outputActivity, targetActivity):
  loss = torch.zeros(1)
  for ii in range(list(outputActivity.size())[0]):     # casting to list
# for ii in range(2):                                  # for testing
    loss += ( outputActivity[ii] - targetActivity[ii] ).pow(2)
  return loss

#
# iterating repeatedly over the same batch of training data
#
batchLoss = 0.0                        # average loss 
for iIter in range(nIter):
  iData = iIter%nData                  # go through all training data
  training_data = torch.clone(allTraining_data[iData])
  loss = lossFunction(model(training_data), allTraining_value[iData])
  loss.backward()
#
  batchLoss += loss.item()
  if (iData==0):
    if iIter % 200 == 0:
      print(f'{iIter:5d}  {batchLoss:10.6f}')
    batchLoss = 0
#
  for iLayer in range(nLayer):         # parameter updating
    allLayers[iLayer].update(learning_rate)

#
# performance testing
#
print()
for iData in range(nData):
  training_data = torch.clone(allTraining_data[iData])
  output        = model(training_data)
#
  xIn  = training_data[0]
  yIn  = training_data[1]
  xVal = allTraining_value[iData][0].item()
  yVal = allTraining_value[iData][1].item()
  xOut =        output[0]
  yOut =        output[1]
  print(f'{xIn:6.3f} {yIn:6.3f} | ', end="")
  print(f'{xVal:6.3f} {yVal:6.3f} || ', end="")
  print(f'{xOut:6.3f} {yOut:6.3f}')
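Written out, the model stacks nLayer tanh layers of identical shape,

$$ \vec{x}^{(\ell+1)} = \tanh\big(W^{(\ell)}\vec{x}^{(\ell)} - \vec{\theta}^{(\ell)}\big), \qquad \ell = 0,\dots,{\rm nLayer}-1 $$

with the loss being the summed squared difference between the activity of the final layer and the target pattern.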

complex-valued neural nets



$$\fbox{$\displaystyle\phantom{\big|} g(z) = \frac{z}{1+|z|} \phantom{\big|}$} $$
#!/usr/bin/env python3

#
# complex network, plane wave classification
#

import torch
import random
import math
import cmath                    # complex math

dataDim       = 10
nLayer        = 2               # two == one hidden layer
nData         = 10               
nBatch        = nData
nEpochs       = 20
nIter         = nBatch*nEpochs
learningRate  = 1.0e-2

print("# default data type: ", torch.get_default_dtype())
# default data type can be changed, but not (yet) to cfloat


class ComplexLayer(torch.nn.Module):   
  def __init__(self, dimOut, dimIn, zero_if_linear = 1.0): 
    super().__init__()
    self.weights = torch.randn(dimOut, dimIn,
                   requires_grad=True, dtype=torch.cfloat)
    self.bias    = torch.randn(dimOut, 
                   requires_grad=True, dtype=torch.cfloat)
    self.zero_if_linear = zero_if_linear     # zero for linear layer

  def forward(self, x):           # circular squashing
    z = torch.matmul(self.weights, x) - self.bias
    return z/(1.0+self.zero_if_linear*z.abs())

  def update(self, eps):          # updating parameters
    with torch.no_grad():
      self.weights -= eps*self.weights.grad
      self.bias    -= eps*self.bias.grad
      self.weights.grad = None
      self.bias.grad    = None

#
# model, output layer is linear
#
allLayers =     [ComplexLayer(dataDim, dataDim) for _ in range(nLayer-1)]
allLayers.append(ComplexLayer(1      , dataDim, zero_if_linear=0.0))
print("# allLayers : ", allLayers)

def model(x):
  for iLayer in range(nLayer):
    x = allLayers[iLayer](x)
  return x

#
# generate data: plane waves
#
myData   = torch.ones(nData, dataDim, dtype=torch.cfloat)
myValues = torch.ones(nData,       1, dtype=torch.cfloat)
delta_k = 2.0*math.pi/dataDim             # 2\pi / length

for iData in range(nData):
  qq = iData*delta_k                      # wave vector
  ww =  complex(math.cos(qq), math.sin(qq))
  myValues[iData][0] *= ww                # circular encoding
#
  for iDim in range(dataDim):
    zz = complex(math.cos(iDim*qq), math.sin(iDim*qq))
    myData[iData][iDim] *= zz

if (1==2):                                # test output: data
  for iDim in range(dataDim):
    print(myData[1][iDim].item().real,
          myData[1][iDim].item().imag)
if (1==2):                                # test output: targets
  for iData in range(nData):
    print(myValues[iData][0].item().real,
          myValues[iData][0].item().imag)

#
# training loop
#
for iIter in range(nIter):                    # training loop
  thisData = random.randrange(nData)          # select random data entry
  x = myData[thisData]
  y = model(x)                                # forward pass
  target = myValues[thisData][0]
  loss = abs((target-y).pow(2))               # loss must be real
  loss.backward()                             # summing over batch
#
  if (iIter%nBatch==0):                       # updating
    for iLayer in range(nLayer):
      allLayers[iLayer].update(learningRate/nBatch)
    print(f'{iIter:6d}  {loss.item():8.4f}')
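The training data generated above are plane waves with circularly encoded targets,

$$ x_j = e^{{\rm i}\,j\,q}, \qquad q = {\rm iData}\cdot\frac{2\pi}{{\rm dataDim}}, \qquad {\rm target} = e^{{\rm i}\,q} $$

so the network has to map every wave onto the point of the unit circle that encodes its wave vector.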

attention layer

#!/usr/bin/env python3

#
# basic attention layer, token-wise neural net not included
#
import torch
import math
import matplotlib.pyplot as plt

nLayer   =  3                      # number of layers
tokenPerLayer = 5                  # context length
nToken   =  tokenPerLayer*nLayer   # total number of tokens
dim      =    2                    # embedding dimension
yesMask  =  1.0                    # 1/0 masked attention on/off

b        =  0.9                    # (+b) / (-b)  : logical True/False
nIter   =   4400                   # training iterations
learning_rate = 1.5e-2

#
# attention layer module
#
class attentionLayer(torch.nn.Module):
  def __init__(self, dim, nContext, yesMask=1, yesNorm=True, myID=0):
    super().__init__()
    self.Q_mat  = torch.randn(nContext,dim,dim,requires_grad=True)
    self.K_mat  = torch.randn(nContext,dim,dim,requires_grad=True)
    self.V_mat  = torch.randn(nContext,dim,dim,requires_grad=True)
#
    mySigma = 1.0/(dim*dim)    # initialization outside computation graph
    torch.nn.init.normal_(self.Q_mat, mean=0.0, std=mySigma)
    torch.nn.init.normal_(self.K_mat, mean=0.0, std=mySigma)
    torch.nn.init.normal_(self.V_mat, mean=0.0, std=mySigma)
#
    self.alpha  = torch.zeros(nContext,nContext)
    self.yesMask  = yesMask    # masked self attention
    self.yesNorm  = yesNorm    # layer normalization
    self.nContext = nContext
    self.dim      = dim        # embedding
    self.ID       = myID
#
    self.paddingMask = torch.zeros(nContext,nContext)   # for masking
    for ii in range(nContext):
      for jj in range(ii+1,nContext):
        self.paddingMask[ii][jj] = -1.0e9               # exp -> 0

  def layerNorm(self, x):
    mean  = torch.zeros(self.dim)             # vector mean
    sigma = torch.tensor(0.0)                 # zero-dimensional tensor
#   for ii in range(self.nContext):
#      mean += x[ii] / self.nContext
    mean = torch.sum(x, 0) / self.nContext    # sum over rows
#
#   for ii in range(self.nContext):
#      sigma += torch.square(x[ii]-mean).sum()
#   sigma = torch.sqrt(sigma/self.nContext)
    sigma = torch.sqrt(torch.square(x-mean).sum() / self.nContext)
#
#   for ii in range(self.nContext):           # layer normalization
#     x[ii] -= mean
#     x[ii] /= sigma
    x = (x-mean)/sigma                        # for all rows
    return x

  def forward(self, x, storeAttention=False):
    if (self.yesNorm):
      x = self.layerNorm(x)        # layerNorm returns a new tensor
#  Q/K/V vectors
    Q  = torch.zeros(self.nContext,self.dim)
    K  = torch.zeros(self.nContext,self.dim)
    V  = torch.zeros(self.nContext,self.dim)
    for ii in range(self.nContext):
      Q[ii] = torch.matmul(self.Q_mat[ii],x[ii])
      K[ii] = torch.matmul(self.K_mat[ii],x[ii])
      V[ii] = torch.matmul(self.V_mat[ii],x[ii])
#  local attention matrix
    alpha = torch.zeros(self.nContext,self.nContext)
    for ii in range(self.nContext):
      for jj in range(self.nContext):
        alpha[ii][jj] = torch.exp( torch.dot(Q[ii],K[jj])\
                                 + self.yesMask*self.paddingMask[ii][jj] )
      alpha[ii] /= alpha[ii].sum()      # normalization
#  store attention matrix
    if storeAttention:
      self.alpha = alpha
    return torch.matmul(alpha,V)
#   return torch.matmul(alpha,V) + x    # with skip connections

  def update(self, eps):                # updating internal parameters
    with torch.no_grad():
      self.Q_mat -= eps*self.Q_mat.grad
      self.K_mat -= eps*self.K_mat.grad
      self.V_mat -= eps*self.V_mat.grad
      self.Q_mat.grad = None
      self.K_mat.grad = None
      self.V_mat.grad = None

#
# n-identical layer model
#
allLayers = [attentionLayer(dim,tokenPerLayer,myID=iL) for iL in range(nLayer)]
def model(x, storeAttention=False):
  for iLayer in range(nLayer):
    x = allLayers[iLayer](x, storeAttention)
  return x

#
# console printing of attention matrix
#
def printAttentionMatrix():
  for iLayer in range(nLayer):
    print()
    print("# attention matrix for layer ", iLayer)
    for ss in range(tokenPerLayer):
      for tt in range(tokenPerLayer):
         alpha = allLayers[iLayer].alpha[ss][tt]
         print(f'{alpha:9.4f}', end="")
      print()

#
# test output of token activities
#
def printTokenActivities(x, myString):
  print()
  print("# activity for", myString)
  for ii in range(dim):
    for token in range(tokenPerLayer):
        print(f'{x[token][ii]:8.4f}', end="")
    print()

#
# standard loss function
#
def lossFunction(outputActivity, targetActivity):
  return torch.square(outputActivity - targetActivity).sum()

#
# random boolean (\pm b) mapping
#
training_data  =\
    b*(2.0*(torch.FloatTensor(tokenPerLayer,dim).uniform_()>0.5)-1.0)
training_value =\
    b*(2.0*(torch.FloatTensor(tokenPerLayer,dim).uniform_()>0.5)-1.0)

#
# testing model
#
if (1==2):
  print("# training_data")
  print(training_data,"\n")
  print("# training_value")
  print(training_value,"\n")
#
for iIter in range(nIter):
  loss = lossFunction(model(training_data),training_value)
  if (loss<0.001):
    break
  loss.backward()
#
  for iLayer in range(nLayer):
    allLayers[iLayer].update(learning_rate)
  if (iIter%200==0):
    print(f'{iIter:4d} {loss.item():9.4f}')

#
# compare output with target
#
print()
yy = model(training_data, storeAttention=True)
printTokenActivities(training_value, "training_value")
printTokenActivities(yy            , "output activities")
#
if (1==2):
  print()
  printAttentionMatrix()
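For reference, the operation coded in forward() reads, per layer and per token position i,

$$ \alpha_{ij} = \frac{\exp\big(Q_i\cdot K_j + M_{ij}\big)}{\sum_{j'}\exp\big(Q_i\cdot K_{j'} + M_{ij'}\big)}, \qquad y_i = \sum_j \alpha_{ij}\,V_j, \qquad Q_i = Q^{(i)} x_i,\quad K_j = K^{(j)} x_j,\quad V_j = V^{(j)} x_j $$

where M_ij is the padding mask (a large negative number for j > i when masked self-attention is switched on, zero otherwise). Note that every token position carries its own Q/K/V matrices here, in contrast to the weight sharing of the standard transformer layer.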