Machine Learning Primer -- Python Tutorial




Claudius Gros, WS 2025/26

Institut für theoretische Physik
Goethe-University Frankfurt a.M.

PyTorch Modules

PyTorch modules

#!/usr/bin/env python3

import torch
import math
import matplotlib.pyplot as plt

#
# wrapping |x+p|^2 inside a module
#
class MyLayer(torch.nn.Module):   # inheritance
  def __init__(self, p):          # constructor
    super().__init__()            
    self.p = p

  def forward(self, x):           # define forward pass
    return torch.dot(x+self.p,x+self.p)

#
# main start
#
myLayerObject = MyLayer(2.0)      # instantiation
input = torch.ones(3,requires_grad=True) 
output = myLayerObject(input)     # forward pass (implicit)
output.backward()                 
print("\n# input")
print(input)
print("\n# output")
print(output)
print("\n# input.grad")
print(input.grad)
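
Since the module computes the scalar |x+p|^2 = sum_i (x_i+p)^2, the gradient with respect to the input is 2(x+p); for x_i = 1 and p = 2 every component of input.grad should equal 6. A minimal sketch of such a check:

import torch

p = 2.0
x = torch.ones(3, requires_grad=True)
out = torch.dot(x+p, x+p)                # |x+p|^2
out.backward()

analytic = 2.0*(x.detach()+p)            # d/dx_i |x+p|^2 = 2(x_i+p)
print(x.grad)                            # tensor([6., 6., 6.])
print(torch.allclose(x.grad, analytic))  # True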

a network layer

$$ \tanh(x), \quad\qquad \sigma(x)=\frac{1}{1+\exp(-x)}, \quad\qquad \mathrm{ReLU}(x) = \mathrm{max}(0,x) $$
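
All three activations are available as element-wise PyTorch functions; a minimal sketch evaluating them on a small sample tensor:

import torch

x = torch.linspace(-2.0, 2.0, 5)     # [-2, -1, 0, 1, 2]
print(torch.tanh(x))                 # tanh(x)
print(torch.sigmoid(x))              # 1/(1+exp(-x))
print(torch.relu(x))                 # max(0,x)
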
#!/usr/bin/env python3

import torch
import math
import matplotlib.pyplot as plt

#
# relu = max(0,x)  layer (rectified linear)
#
class MyLayer(torch.nn.Module):   # inheritance
  def __init__(self, dim):        # constructor
    super().__init__()
    self.weights = torch.randn(dim,dim,requires_grad=True)

  def forward(self, x):           # default forward pass
    return torch.relu(torch.matmul(self.weights,x))

  def forward_tanh(self, x):      # alternative forward pass
    return torch.tanh(torch.matmul(self.weights,x))

  def update(self, eps):          # updating weights
    with torch.no_grad():
      self.weights -= eps*self.weights.grad
      self.weights.grad = None

#
# a single training pair  (myData,myValue)
#
dim           = 4
nIter         = 1000
learningRate  = 1.0e-2
myLayerObject = MyLayer(dim)                 # instantiation

myData  = torch.FloatTensor(dim).uniform_()
myValue = torch.relu(torch.FloatTensor(dim).uniform_())

print("\n# output")
for iIter in range(nIter):                    # training loop
#
  output = myLayerObject(myData)              # forward pass (implicit)
# output = myLayerObject.forward(myData)      # forward pass (explicit)
# output = myLayerObject.forward_tanh(myData) # forward pass (specific)
#
  loss   = (output-myValue).pow(2).sum()
  loss.backward()                             # backward pass
  myLayerObject.update(learningRate)          # weight updating
  print(output.data)
print("\n# myValue")
print(myValue)

tensor batch processing

#!/usr/bin/env python3

import torch
import math
import matplotlib.pyplot as plt

#
# tanh layer module
#
class MyLayer(torch.nn.Module):      
  def __init__(self, dim, yesTheta): 
    super().__init__()
    self.weights  = torch.randn(dim,dim,requires_grad=True)
    self.theta    = torch.randn(dim,    requires_grad=True)
    self.yesTheta = yesTheta          # 1/0 with/without thresholds

  def forward(self, x):               # unsqueezing threshold vector
    tt = torch.unsqueeze(self.yesTheta*self.theta, 1)
    return torch.tanh(torch.matmul(self.weights,x)-self.yesTheta*tt)

  def update(self, eps):              # updating internal parameters
    with torch.no_grad():
      self.weights     -= eps*self.weights.grad
      self.theta       -= eps*self.theta.grad*self.yesTheta
      self.weights.grad = None
      self.theta.grad   = None

#
# a single training pair  (myData,myValue)
#
dim           = 4
nData         = 3
nIter         = 1000
learningRate  = 5.0e-2
myLayerObject = MyLayer(dim, 1.0)            # instantiation

myData  = torch.FloatTensor(dim,nData).uniform_()
myValue = torch.relu(torch.FloatTensor(dim,nData).uniform_())

print("\n# output")
for iIter in range(nIter):                   # training loop
  output = myLayerObject(myData)             # forward pass (implicit)
  loss   = (output-myValue).pow(2).sum()
  loss.backward()                            # backward pass
  myLayerObject.update(learningRate)         # weight updating
  print(loss.item())
print("\n# output")
print(output.data)
print("\n# myValue")
print(myValue)
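
The unsqueeze step is what makes the batch version work: torch.matmul(weights, x) has shape (dim, nData), while the threshold vector has shape (dim,), so it is reshaped to (dim, 1) and broadcast over the data dimension. A minimal sketch of this broadcasting:

import torch

dim, nData = 4, 3
x       = torch.randn(dim, nData)    # nData column vectors
weights = torch.randn(dim, dim)
theta   = torch.randn(dim)

tt  = torch.unsqueeze(theta, 1)      # (dim,) -> (dim, 1)
out = torch.tanh(torch.matmul(weights, x) - tt)
print(tt.shape, out.shape)           # torch.Size([4, 1]) torch.Size([4, 3])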

Boolean mapping network


bijective mapping
$$ \hspace{2ex}\begin{array}{cc|cc} X & Y & \mbox{XOR} & Y \\[0.1ex] \hline + & + & - & + \\ + & - & + & - \\ - & + & + & + \\ - & - & - & - \end{array}\hspace{2ex} $$
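
The training pairs assembled in the code below realize this mapping as (X,Y) -> (Y, XOR(X,Y)): the first output unit copies the second input (cross-identity), the second output unit carries the XOR. A minimal sketch of the truth table in the +b/-b encoding:

b = 0.9
for X in (b, -b):
  for Y in (b, -b):
    XOR = b if (X > 0) != (Y > 0) else -b    # XOR is true for unequal inputs
    print(f'{X:5.1f} {Y:5.1f}  ->  {Y:5.1f} {XOR:5.1f}')
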
#!/usr/bin/env python3

import torch
import math
import matplotlib.pyplot as plt

# global parameters
nData    =  4                      # number of training pairs
nLayer   =  2                      # number of layers
unitsPerLayer = 2
b        =  0.9                    # (+b) / (-b)  : logical True/False

nIter   =    6000                  # training iterations
learning_rate = 1.5e-2

#
# tanh layer module; the weight matrix may be non-square
#
class TanhLayer(torch.nn.Module):            # inheritance
  def __init__(self, dim_out, dim_in):       # constructor
    super().__init__()            
    self.weights  = torch.randn(dim_out,dim_in,requires_grad=True)
    self.theta    = torch.randn(dim_out,       requires_grad=True)

  def forward(self, x):            # define forward pass
    return torch.tanh(torch.matmul(self.weights,x)-self.theta)

  def update(self, eps):           # updating internal parameters
    with torch.no_grad():
      self.weights     -= eps*self.weights.grad
      self.theta       -= eps*self.theta.grad
      self.weights.grad = None
      self.theta.grad   = None

#
# n-identical layer model
#
allLayers = [TanhLayer(unitsPerLayer, unitsPerLayer) for _ in range(nLayer)]
def model(x):
  for iLayer in range(nLayer):     
    x = allLayers[iLayer](x)
  return x

#
# ( nData | unitsPerLayer )  tensor of training data
# element-wise mapping of uniform distribution [0,1] to binary
# automatic casting of boolean  (..>..)  to  0/1
#
allTraining_data   = torch.FloatTensor(nData,unitsPerLayer).uniform_()
allTraining_value  = torch.FloatTensor(nData,unitsPerLayer).uniform_()
for iData in range(nData):
  for unit in range(2):                   # boolean first two units
    allTraining_data[iData][unit] =\
      b*(2.0*(allTraining_data[iData][unit]>0.5)-1.0)

#
# (cross-identity|XOR) for first two units
#

if (unitsPerLayer>0) and (nData>0) and (1==1): 
  allTraining_data[0][0]  =  b
  allTraining_data[0][1]  =  b
  allTraining_value[0][0] =  b 
  allTraining_value[0][1] = -b 

if (unitsPerLayer>0) and (nData>1) and (1==1): 
  allTraining_data[1][0]  =  b
  allTraining_data[1][1]  = -b
  allTraining_value[1][0] = -b
  allTraining_value[1][1] =  b

if (unitsPerLayer>0) and (nData>2) and (1==1): 
  allTraining_data[2][0]  = -b
  allTraining_data[2][1]  =  b
  allTraining_value[2][0] =  b
  allTraining_value[2][1] =  b

if (unitsPerLayer>0) and (nData>3) and (1==1): 
  allTraining_data[3][0]  = -b
  allTraining_data[3][1]  = -b
  allTraining_value[3][0] = -b
  allTraining_value[3][1] = -b
if (1==1):
  print("\n# traing data/value")
  print(allTraining_data)
  print(allTraining_value)

#
# explicit sum allows for experiments
#
def lossFunction(outputActivity, targetActivity):
  loss = torch.zeros(1)
  for ii in range(list(outputActivity.size())[0]):     # casting to list
# for ii in range(2):                                  # for testing
    loss += ( outputActivity[ii] - targetActivity[ii] ).pow(2)
  return loss

#
# iterating over identical batch of training data
#
batchLoss = 0.0                        # average loss 
for iIter in range(nIter):
  iData = iIter%nData                  # go through all training data
  training_data = torch.clone(allTraining_data[iData])
  loss = lossFunction(model(training_data), allTraining_value[iData])
  loss.backward()
#
  batchLoss += loss.item()
  if (iData==0):
    if iIter % 200 == 0:
      print(f'{iIter:5d}  {batchLoss:10.6f}')
    batchLoss = 0
#
  for iLayer in range(nLayer):         # parameter updating
    allLayers[iLayer].update(learning_rate)

#
# performance testing
#
print()
for iData in range(nData):
  training_data = torch.clone(allTraining_data[iData])
  output        = model(training_data)
#
  xIn  = training_data[0]
  yIn  = training_data[1]
  xVal = allTraining_value[iData][0].item()
  yVal = allTraining_value[iData][1].item()
  xOut =        output[0]
  yOut =        output[1]
  print(f'{xIn:6.3f} {yIn:6.3f} | ', end="")
  print(f'{xVal:6.3f} {yVal:6.3f} || ', end="")
  print(f'{xOut:6.3f} {yOut:6.3f}')

complex-valued neural nets



$$\fbox{$\displaystyle\phantom{\big|} g(z) = \frac{z}{1+|z|} \phantom{\big|}$} $$
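
The squashing function g(z) maps the complex plane into the unit disc while leaving the phase of z unchanged, since it rescales z by the positive real factor 1/(1+|z|). A minimal sketch of its action on a complex tensor:

import torch

z = torch.tensor([3.0+4.0j, 0.5-0.5j, -1.0+1.0j], dtype=torch.cfloat)
g = z/(1.0+z.abs())
print(g.abs())                           # |g(z)| = |z|/(1+|z|) < 1
print(torch.angle(g) - torch.angle(z))   # phases agree (differences ~ 0)
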
#!/usr/bin/env python3

#
# complex network, plane wave classification
#

import torch
import random
import math
import cmath                    # complex math

dataDim       = 10
nLayer        = 2               # two == one hidden layer
nData         = 10               
nBatch        = nData
nEpochs       = 20
nIter         = nBatch*nEpochs
learningRate  = 1.0e-2

print("# default data type: ", torch.get_default_dtype())
# the default data type can be changed, but not (yet) to cfloat


class ComplexLayer(torch.nn.Module):   
  def __init__(self, dimOut, dimIn, zero_if_linear = 1.0): 
    super().__init__()
    self.weights = torch.randn(dimOut, dimIn,
                   requires_grad=True, dtype=torch.cfloat)
    self.bias    = torch.randn(dimOut, 
                   requires_grad=True, dtype=torch.cfloat)
    self.zero_if_linear = zero_if_linear     # zero for linear layer

  def forward(self, x):           # circular squashing
    z = torch.matmul(self.weights, x) - self.bias
    return z/(1.0+self.zero_if_linear*z.abs())

  def update(self, eps):          # updating parameters
    with torch.no_grad():
      self.weights -= eps*self.weights.grad
      self.bias    -= eps*self.bias.grad
      self.weights.grad = None
      self.bias.grad    = None

#
# model, output layer is linear
#
allLayers =     [ComplexLayer(dataDim, dataDim) for _ in range(nLayer-1)]
allLayers.append(ComplexLayer(1      , dataDim, zero_if_linear=0.0))
print("# allLayers : ", allLayers)

def model(x):
  for iLayer in range(nLayer):
    x = allLayers[iLayer](x)
  return x

#
# generate data: plane waves
#
myData   = torch.ones(nData, dataDim, dtype=torch.cfloat)
myValues = torch.ones(nData,       1, dtype=torch.cfloat)
delta_k = 2.0*math.pi/dataDim             # 2\pi / length

for iData in range(nData):
  qq = iData*delta_k                      # wave vector
  ww =  complex(math.cos(qq), math.sin(qq))
  myValues[iData][0] *= ww                # circular encoding
#
  for iDim in range(dataDim):
    zz = complex(math.cos(iDim*qq), math.sin(iDim*qq))
    myData[iData][iDim] *= zz

if (1==2):                                # test output: data
  for iDim in range(dataDim):
    print(myData[1][iDim].item().real,
          myData[1][iDim].item().imag)
if (1==2):                                # test output: targets
  for iData in range(nData):
    print(myValues[iData][0].item().real,
          myValues[iData][0].item().imag)

#
# training loop
#
for iIter in range(nIter):                    # training loop
  thisData = random.randrange(nData)          # select random data entry
  x = myData[thisData]
  y = model(x)                                # forward pass
  target = myValues[thisData][0]
  loss = abs((target-y).pow(2))               # loss must be real
  loss.backward()                             # gradients accumulate over the batch
#
  if (iIter%nBatch==0):                       # updating
    for iLayer in range(nLayer):
      allLayers[iLayer].update(learningRate/nBatch)
    print(f'{iIter:6d}  {loss.item():8.4f}')
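
The plane-wave phase factors above are built as complex(math.cos(phi), math.sin(phi)); with the imported cmath module the same factor can be written more compactly via Euler's formula, a minimal sketch:

import math
import cmath

phi = 2.0*math.pi/10.0
print(complex(math.cos(phi), math.sin(phi)))   # cos(phi) + i*sin(phi)
print(cmath.exp(1j*phi))                       # identical, exp(i*phi)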

built-in modules

#!/usr/bin/env python3
#
# torch.nn.linear() illustration
#
import torch
import torch.nn as nn

# linear layer, dimension  4 --> 3
#                      input --> output
LL = nn.Linear(4, 3)

print()
print("LL shape, weights :", LL.weight.shape)
print("LL shape, bias    :", LL.bias.shape)

print("\n# --- broadcasting examples ---")

# 1D input: just the feature dimension
x1   = torch.randn(4)
out1 = LL(x1)
print(f"\n1D input:\n {x1.shape} -> {out1.shape}")

# nn.Linear acts on last dimension only
# 2D input: batch dimension + features
#  5 samples, 4 features each
x2 = torch.randn(5, 4) 
out2 = LL(x2)
print(f"\n2D input:\n {x2.shape} -> {out2.shape}")

# 3D input: batch + sequence + features (like in transformers)
# 2 batches, 10 tokens, 4 features each
x3 = torch.randn(2, 10, 4)  
out3 = LL(x3)
print(f"\n3D input:\n {x3.shape} -> {out3.shape}")