matrices : NumPy
tensors : PyTorch

#!/usr/bin/env python3
import torch                              # PyTorch instead of NumPy
import math
import matplotlib.pyplot as plt

myType   = torch.float
myDevice = torch.device("cpu")            # "cuda:0" for GPU; not activated

# global parameters
nData = 2000                              # number of training pairs
nIter = 2000                              # number of training iterations
nPar  = 4                                 # number of fit parameters
learning_rate = 0.5e-2/nData              # relative learning rate

fitPar = []                               # empty list; fit parameters
for i in range(nPar):                     # randn() : normal distribution
    fitPar.append(torch.randn((), device=myDevice, dtype=myType))
print(fitPar)

def fitFunction(x):                       # polynomial fitting function
    sum = 0.0
    for i in range(nPar):
        sum += fitPar[i]*(x**i)
    return sum

# linspace returns a 1d tensor
x = torch.linspace(-math.pi, math.pi, nData, device=myDevice, dtype=myType)
y = torch.sin(x)                          # target function y = sin(x)

# training iteration
for iIter in range(nIter):
    y_pred = fitFunction(x)                   # tensor; element-wise
    loss = torch.square(y_pred - y).sum()     # sum of squared elements
    if iIter % 100 == 99:                     # test printout
        print(f'{iIter:5d} {loss:10.6f}')

    grad_y_pred = 2.0 * (y_pred - y)          # error signal
    for i in range(nPar):                     # least-square fit
        gradient = ( grad_y_pred*(x**i) ).sum()
        fitPar[i] -= learning_rate * gradient

# showing result
plt.plot(x, torch.sin(x),      'b', label="sin(x)")
plt.plot(x, fitFunction(x),    'r', label="polynomial fit")
plt.plot(x, 0.0*x,             '--k')
plt.legend()
plt.show()
requires_grad = True : gradients are tracked for this tensor
detach tensors (temporarily) for 'by hand' operations, i.e. requires_grad = False, via with torch.no_grad():
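Not part of the original listings: a minimal sketch contrasting .detach() with the with torch.no_grad(): block; the tensor names are illustrative only.

#!/usr/bin/env python3
import torch

w = torch.randn(3, requires_grad=True)    # leaf tensor, gradients tracked
loss = (w*w).sum()
loss.backward()                           # fills w.grad

w_frozen = w.detach()                     # shares the data, no tracking
print(w_frozen.requires_grad)             # False

with torch.no_grad():                     # block-wise suspension of tracking
    w -= 0.1*w.grad                       # 'by hand' update, allowed here
w.grad = None                             # flush the accumulated gradient
print(w)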
#!/usr/bin/env python3
import torch                              # PyTorch needs to be installed

dim = 2
eps = 0.1
x = torch.ones(dim, requires_grad=True)   # leaf of computational graph
print("x : ", x)
print("x.data : ", x.data)
print("x[0] : ", x[0])
print("x[0].item() : ", x[0].item())
print()

y = x + 2
out = torch.dot(y, y)                     # scalar product
print("y : ", y)
print("out : ", out)
print()

out.backward()                            # backward pass --> gradients
print("x.grad : ", x.grad)

with torch.no_grad():                     # detach from computational graph
    x -= eps*x.grad                       # updating parameter tensor
    x.grad = None                         # flush
print("x : ", x.data)

print("\n#---")
print("#--- .backward() adds new gradient to old gradient")
print("#--- convenient for batch updating")
print("#---\n")

y = torch.zeros(dim, requires_grad=True)
torch.dot(y+1, y+1).backward()
print("y.grad : ", y.grad)
torch.dot(y+1, y+1).backward()
print("y.grad : ", y.grad)
torch.dot(y+1, y+1).backward()
print("y.grad : ", y.grad)
torch.dot(y+1, y+1).backward()
#!/usr/bin/env python3
import torch
import math
import matplotlib.pyplot as plt

myType   = torch.float
myDevice = (
    "cuda"                                # for GPUs
    if torch.cuda.is_available()
    else "mps"                            # Apple 'Metal Performance Shaders'
    if torch.backends.mps.is_available()
    else "cpu"                            # plain old CPU
)

# global parameters
nData = 2000                              # number of training pairs
nIter = 2000                              # number of training iterations
nPar  = 4                                 # number of fit parameters
learning_rate = 0.5e-2/nData

# gradients with respect to fitPar[] to be evaluated
fitPar = []                               # list of zero-dimensional tensors
for i in range(nPar):
    fitPar.append(torch.randn((), device=myDevice, dtype=myType,
                              requires_grad=True))
print(fitPar)

x = torch.linspace(-math.pi, math.pi, nData)   # stays on the CPU
y = torch.sin(x)                               # element-wise

def fitFunction(x):                       # polynomial fitting function
    sum = 0.0
    for i in range(nPar):
        sum += fitPar[i]*(x**i)           # element-wise, x is a tensor
    return sum                            # returns a tensor

# training iteration
for iIter in range(nIter):
    y_pred = fitFunction(x)               # forward pass
    lossTensor = (y_pred - y).pow(2).sum()     # element-wise pow(2)
    if iIter % 100 == 99:                 # print scalar loss value
        print(f'{iIter:5d} {lossTensor.item():10.6f}')

    # backward pass
    # calculates gradients, viz 'tensor.grad',
    # with respect to tensors with "requires_grad=True"
    lossTensor.backward()

    # temporarily 'detaching' all tensors for by-hand updating;
    # the value of fitPar[i].grad is not affected
    with torch.no_grad():
        for i in range(nPar):             # gradients via backward pass
            fitPar[i] -= learning_rate * fitPar[i].grad
            fitPar[i].grad = None

# "detach" tensors requiring gradients from the computation graph
plt.plot(x, torch.sin(x),                    'b', label="sin(x)")
plt.plot(x, fitFunction(x).detach().numpy(), 'r', label="fit")
plt.plot(x, 0.0*x,                           '--k')
plt.legend()
plt.show()
tensor._version : version counter, incremented by in-place operations
#!/usr/bin/env python3
import torch

a = torch.randn(5, requires_grad=True)
b = 2 * a
c = b ** 2                  # non-linear
#c = b + 2                  # linear

print(b._version)
b += 1                      # an in-place operation;
                            # b changed outside the computation graph
print(b._version)
print()

# for the non-linear c, backward() needs the original b and raises a
# RuntimeError, since the in-place operation incremented b._version;
# the linear variant (c = b + 2) runs through
c.sum().backward()
print("a: ", a)
print("a.grad: ", a.grad)
class MyLayer():
#!/usr/bin/env python3
import torch
import math
import matplotlib.pyplot as plt

#
# wrapping (x+p)^2 inside a module
#
class MyLayer(torch.nn.Module):           # inheritance
    def __init__(self, p):                # constructor
        super().__init__()
        self.p = p

    def forward(self, x):                 # define forward pass
        return torch.dot(x+self.p, x+self.p)

#
# main start
#
myLayerObject = MyLayer(2.0)              # instantiation
input  = torch.ones(3, requires_grad=True)
output = myLayerObject(input)             # forward pass (implicit)
output.backward()

print("\n# input")
print(input)
print("\n# output")
print(output)
print("\n# input.grad")
print(input.grad)
#!/usr/bin/env python3
import torch
import math
import matplotlib.pyplot as plt

#
# relu = max(0,x) layer (rectified linear)
#
class MyLayer(torch.nn.Module):                  # inheritance
    def __init__(self, dim):                     # constructor
        super().__init__()
        self.weights = torch.randn(dim, dim, requires_grad=True)

    def forward(self, x):                        # default forward pass
        return torch.relu(torch.matmul(self.weights, x))

    def forward_tanh(self, x):                   # alternative forward pass
        return torch.tanh(torch.matmul(self.weights, x))

    def update(self, eps):                       # updating weights
        with torch.no_grad():
            self.weights -= eps*self.weights.grad
            self.weights.grad = None

#
# a single training pair (myData,myValue)
#
dim = 4
nIter = 1000
learningRate = 1.0e-2

myLayerObject = MyLayer(dim)                     # instantiation
myData  = torch.FloatTensor(dim).uniform_()
myValue = torch.relu(torch.FloatTensor(dim).uniform_())

print("\n# output")
for iIter in range(nIter):                       # training loop
#
    output = myLayerObject(myData)               # forward pass (implicit)
#   output = myLayerObject.forward(myData)       # forward pass (explicit)
#   output = myLayerObject.forward_tanh(myData)  # forward pass (specific)
#
    loss = (output-myValue).pow(2).sum()
    loss.backward()                              # backward pass
    myLayerObject.update(learningRate)           # weight updating
    print(output.data)

print("\n# myValue")
print(myValue)
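The listing above updates the weight tensor by hand. As an assumed alternative (not the lecture's own formulation), the same layer can register its weights as a torch.nn.Parameter and delegate the update to a standard optimizer such as torch.optim.SGD; the class name MyParamLayer is a placeholder.

#!/usr/bin/env python3
import torch

class MyParamLayer(torch.nn.Module):
    def __init__(self, dim):
        super().__init__()
        # nn.Parameter is registered automatically, requires_grad=True
        self.weights = torch.nn.Parameter(torch.randn(dim, dim))

    def forward(self, x):
        return torch.relu(torch.matmul(self.weights, x))

dim = 4
layer = MyParamLayer(dim)
optimizer = torch.optim.SGD(layer.parameters(), lr=1.0e-2)

myData  = torch.rand(dim)
myValue = torch.rand(dim)
for iIter in range(1000):                 # training loop
    loss = (layer(myData)-myValue).pow(2).sum()
    optimizer.zero_grad()                 # flush old gradients
    loss.backward()                       # backward pass
    optimizer.step()                      # gradient-descent step
print(loss.item())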
reshape, flatten, squeeze, unsqueeze, view
unsqueeze allows one to add a vector to a matrix row-wise (via broadcasting)
#!/usr/bin/env python3
import torch
import math
import matplotlib.pyplot as plt

uu = torch.ones(3)
xx = torch.ones(3, 5)
print("\n# uu")
print(uu)
print("\n# xx")
print(xx)
print("\n# uu unsqueezed")
print(torch.unsqueeze(uu, 1))
print("\n# xx plus uu unsqueezed along 1")
print(xx + torch.unsqueeze(uu, 1))
#
aa = torch.arange(4.0)
print("\n# arranged\n", aa)
aa = torch.reshape(aa, (2, 2))
print("\n# reshaped\n", aa)
aa = torch.reshape(aa, (-1,))
print("\n# ..back\n", aa)
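The listing above covers unsqueeze and reshape; a minimal sketch of the remaining operations flatten, squeeze and view (standard PyTorch calls, example tensors chosen freely):

#!/usr/bin/env python3
import torch

mm = torch.arange(6.0).reshape(2, 3)
print(mm.flatten())                       # 1d tensor: [0., 1., 2., 3., 4., 5.]
print(mm.view(3, 2))                      # same data, new shape, no copy

ss = torch.zeros(1, 3, 1)
print(ss.squeeze().shape)                 # drops all size-1 dimensions -> [3]
print(torch.unsqueeze(ss.squeeze(), 0).shape)   # adds one back -> [1, 3]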
#!/usr/bin/env python3
import torch

aa = torch.arange(10.0).reshape(2, 5)
bb = torch.arange(5) + 1.0
cc = torch.divide(aa, bb)
print("aa")
print(aa)
print("bb")
print(bb)
print("cc")
print(cc)
print()
print("# ======================")
print("# row-wise normalization")
print("# ======================")
aaSum = aa.sum(1)
# aaSum = aa.pow(2).sum(1)
aaTrans = torch.divide(aa.transpose(0, 1), aaSum)
aaNorm  = torch.transpose(aaTrans, 0, 1)
print("aaSum")
print(aaSum)
print("aaTrans")
print(aaTrans)
print("aaNorm")
print(aaNorm)
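As a sketch of an assumed alternative, the same row-wise normalization follows from unsqueezing the row sums and broadcasting, avoiding the double transpose:

#!/usr/bin/env python3
import torch

aa = torch.arange(10.0).reshape(2, 5)
aaSum  = aa.sum(1)                        # one sum per row, shape (2,)
aaNorm = aa / aaSum.unsqueeze(1)          # (2,5) / (2,1) broadcasts row-wise
print(aaNorm)
print(aaNorm.sum(1))                      # every row now sums to one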
#!/usr/bin/env python3
import torch
import math
import matplotlib.pyplot as plt

#
# tanh layer module
#
class MyLayer(torch.nn.Module):
    def __init__(self, dim, yesTheta):
        super().__init__()
        self.weights = torch.randn(dim, dim, requires_grad=True)
        self.theta   = torch.randn(dim, requires_grad=True)
        self.yesTheta = yesTheta                 # 1/0 with/without thresholds

    def forward(self, x):                        # unsqueezing threshold vector
        tt = torch.unsqueeze(self.yesTheta*self.theta, 1)
        return torch.tanh(torch.matmul(self.weights, x) - self.yesTheta*tt)

    def update(self, eps):                       # updating internal parameters
        with torch.no_grad():
            self.weights -= eps*self.weights.grad
            self.theta   -= eps*self.theta.grad*self.yesTheta
            self.weights.grad = None
            self.theta.grad   = None

#
# a single training pair (myData,myValue)
#
dim   = 4
nData = 3
nIter = 1000
learningRate = 5.0e-2

myLayerObject = MyLayer(dim, 1.0)                # instantiation
myData  = torch.FloatTensor(dim, nData).uniform_()
myValue = torch.relu(torch.FloatTensor(dim, nData).uniform_())

print("\n# output")
for iIter in range(nIter):                       # training loop
    output = myLayerObject(myData)               # forward pass (implicit)
    loss = (output-myValue).pow(2).sum()
    loss.backward()                              # backward pass
    myLayerObject.update(learningRate)           # weight updating
    print(loss.item())

print("\n# output")
print(output.data)
print("\n# myValue")
print(myValue)
#!/usr/bin/env python3
import torch
import math
import matplotlib.pyplot as plt

# global parameters
nData  = 4                      # number of training pairs
nLayer = 2                      # number of layers
unitsPerLayer = 2
b = 0.9                         # (+b) / (-b) : logical True/False
nIter = 6000                    # training iterations
learning_rate = 1.5e-2

#
# tanh layer module, could be non-squared
#
class TanhLayer(torch.nn.Module):                 # inheritance
    def __init__(self, dim_out, dim_in):          # constructor
        super().__init__()
        self.weights = torch.randn(dim_out, dim_in, requires_grad=True)
        self.theta   = torch.randn(dim_out, requires_grad=True)

    def forward(self, x):                         # define forward pass
        return torch.tanh(torch.matmul(self.weights, x) - self.theta)

    def update(self, eps):                        # updating internal parameters
        with torch.no_grad():
            self.weights -= eps*self.weights.grad
            self.theta   -= eps*self.theta.grad
            self.weights.grad = None
            self.theta.grad   = None

#
# n identical layers
#
allLayers = [TanhLayer(unitsPerLayer, unitsPerLayer) for _ in range(nLayer)]

def model(x):
    for iLayer in range(nLayer):
        x = allLayers[iLayer](x)
    return x

#
# ( unitsPerLayer | nData ) tensor of training data
# element-wise mapping of uniform distribution [0,1] to binary
# automatic casting of boolean (..>..) to 0/1
#
allTraining_data  = torch.FloatTensor(nData, unitsPerLayer).uniform_()
allTraining_value = torch.FloatTensor(nData, unitsPerLayer).uniform_()
for iData in range(nData):
    for unit in range(2):                         # boolean first two units
        allTraining_data[iData][unit] =\
            b*(2.0*(allTraining_data[iData][unit]>0.5)-1.0)

#
# (cross-identity|XOR) for first two units
#
if (unitsPerLayer>0) and (nData>0) and (1==1):
    allTraining_data[0][0]  = b
    allTraining_data[0][1]  = b
    allTraining_value[0][0] = b
    allTraining_value[0][1] = -b
if (unitsPerLayer>0) and (nData>1) and (1==1):
    allTraining_data[1][0]  = b
    allTraining_data[1][1]  = -b
    allTraining_value[1][0] = -b
    allTraining_value[1][1] = b
if (unitsPerLayer>0) and (nData>2) and (1==1):
    allTraining_data[2][0]  = -b
    allTraining_data[2][1]  = b
    allTraining_value[2][0] = b
    allTraining_value[2][1] = b
if (unitsPerLayer>0) and (nData>3) and (1==1):
    allTraining_data[3][0]  = -b
    allTraining_data[3][1]  = -b
    allTraining_value[3][0] = -b
    allTraining_value[3][1] = -b

if (1==1):
    print("\n# training data/value")
    print(allTraining_data)
    print(allTraining_value)

#
# explicit sum allows for experiments
#
def lossFunction(outputActivity, targetActivity):
    loss = torch.zeros(1)
    for ii in range(list(outputActivity.size())[0]):   # casting to list
#   for ii in range(2):                                # for testing
        loss += ( outputActivity[ii] - targetActivity[ii] ).pow(2)
    return loss

#
# iterating over identical batches of training data
#
batchLoss = 0.0                                   # loss summed over one batch
for iIter in range(nIter):
    iData = iIter%nData                           # go through all training data
    training_data = torch.clone(allTraining_data[iData])
    loss = lossFunction(model(training_data), allTraining_value[iData])
    loss.backward()
#
    batchLoss += loss.item()
    if (iData==0):
        if iIter % 200 == 0:
            print(f'{iIter:5d} {batchLoss:10.6f}')
        batchLoss = 0
#
    for iLayer in range(nLayer):                  # parameter updating
        allLayers[iLayer].update(learning_rate)

#
# performance testing
#
print()
for iData in range(nData):
    training_data = torch.clone(allTraining_data[iData])
    output = model(training_data)
#
    xIn  = training_data[0]
    yIn  = training_data[1]
    xVal = allTraining_value[iData][0].item()
    yVal = allTraining_value[iData][1].item()
    xOut = output[0]
    yOut = output[1]
    print(f'{xIn:6.3f} {yIn:6.3f} | ',    end="")
    print(f'{xVal:6.3f} {yVal:6.3f} || ', end="")
    print(f'{xOut:6.3f} {yOut:6.3f}')
matrix products : torch.matmul()
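A minimal sketch of the torch.matmul() shape conventions (example shapes chosen for illustration): matrix-matrix, matrix-vector, and broadcasting over leading batch dimensions.

#!/usr/bin/env python3
import torch

A = torch.randn(3, 4)
B = torch.randn(4, 2)
v = torch.randn(4)
print(torch.matmul(A, B).shape)           # matrix-matrix : torch.Size([3, 2])
print(torch.matmul(A, v).shape)           # matrix-vector : torch.Size([3])

batch = torch.randn(5, 3, 4)              # leading dimension is broadcast
print(torch.matmul(batch, B).shape)       # torch.Size([5, 3, 2])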
#!/usr/bin/env python3
#
# complex network, plane wave classification
#
import torch
import random
import math
import cmath                              # complex math

dataDim = 10
nLayer  = 2                               # two == one hidden layer
nData   = 10
nBatch  = nData
nEpochs = 20
nIter   = nBatch*nEpochs
learningRate = 1.0e-2

print("# default data type: ", torch.get_default_dtype())
# default data type can be changed, but (yet) not to cfloat

class ComplexLayer(torch.nn.Module):
    def __init__(self, dimOut, dimIn, zero_if_linear=1.0):
        super().__init__()
        self.weights = torch.randn(dimOut, dimIn, requires_grad=True,
                                   dtype=torch.cfloat)
        self.bias = torch.randn(dimOut, requires_grad=True,
                                dtype=torch.cfloat)
        self.zero_if_linear = zero_if_linear      # zero for linear layer

    def forward(self, x):                         # circular squashing
        z = torch.matmul(self.weights, x) - self.bias
        return z/(1.0+self.zero_if_linear*z.abs())

    def update(self, eps):                        # updating parameters
        with torch.no_grad():
            self.weights -= eps*self.weights.grad
            self.bias    -= eps*self.bias.grad
            self.weights.grad = None
            self.bias.grad    = None

#
# model, output layer is linear
#
allLayers = [ComplexLayer(dataDim, dataDim) for _ in range(nLayer-1)]
allLayers.append(ComplexLayer(1, dataDim, zero_if_linear=0.0))
print("# allLayers : ", allLayers)

def model(x):
    for iLayer in range(nLayer):
        x = allLayers[iLayer](x)
    return x

#
# generate data: plane waves
#
myData   = torch.ones(nData, dataDim, dtype=torch.cfloat)
myValues = torch.ones(nData, 1, dtype=torch.cfloat)
delta_k = 2.0*math.pi/dataDim                     # 2\pi / length
for iData in range(nData):
    qq = iData*delta_k                            # wave vector
    ww = complex(math.cos(qq), math.sin(qq))
    myValues[iData][0] *= ww                      # circular encoding
#
    for iDim in range(dataDim):
        zz = complex(math.cos(iDim*qq), math.sin(iDim*qq))
        myData[iData][iDim] *= zz

if (1==2):                                        # test output: data
    for iDim in range(dataDim):
        print(myData[1][iDim].item().real, myData[1][iDim].item().imag)
if (1==2):                                        # test output: targets
    for iData in range(nData):
        print(myValues[iData][0].item().real, myValues[iData][0].item().imag)

#
# training loop
#
for iIter in range(nIter):                        # training loop
    thisData = random.randrange(nData)            # select random data entry
    x = myData[thisData]
    y = model(x)                                  # forward pass
    target = myValues[thisData][0]
    loss = abs((target-y).pow(2))                 # loss must be real
    loss.backward()                               # summing over batch
#
    if (iIter%nBatch==0):                         # updating
        for iLayer in range(nLayer):
            allLayers[iLayer].update(learningRate/nBatch)
        print(f'{iIter:6d} {loss.item():8.4f}')
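A minimal sketch (not from the listing) of complex tensors with autograd, as used above; the values are illustrative, and the gradient follows PyTorch's conjugate Wirtinger convention.

#!/usr/bin/env python3
import torch

z = torch.tensor([1.0+1.0j, 2.0-1.0j], dtype=torch.cfloat, requires_grad=True)
target = torch.tensor([1.0+0.0j, 2.0+0.0j], dtype=torch.cfloat)
loss = (z-target).abs().pow(2).sum()      # the loss must be real
loss.backward()
print(z.grad)                             # complex-valued gradient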
initialization of parameter tensors in __init__() via torch.nn.init, acting outside the computation graph
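A minimal sketch (assumed, not from the listings) showing that the torch.nn.init functions work in place and outside the computation graph:

#!/usr/bin/env python3
import torch

w = torch.empty(3, 3, requires_grad=True)      # uninitialized leaf tensor
torch.nn.init.normal_(w, mean=0.0, std=0.1)    # in-place, wrapped in no_grad()
print(w)
print(w.requires_grad)                         # still True, graph untouched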
#!/usr/bin/env python3
#
# basic attention layer, token-wise neural net not included
#
import torch
import math
import matplotlib.pyplot as plt

nLayer = 3                       # number of layers
tokenPerLayer = 5                # context length
nToken = tokenPerLayer*nLayer    # total number of token
dim = 2                          # embedding dimension
yesMask = 1.0                    # 1/0 masked attention on/off
b = 0.9                          # (+b) / (-b) : logical True/False
nIter = 4400                     # training iterations
learning_rate = 1.5e-2

#
# attention layer module
#
class attentionLayer(torch.nn.Module):
    def __init__(self, dim, nContext, yesMask=1, yesNorm=True, myID=0):
        super().__init__()
        self.Q_mat = torch.randn(nContext, dim, dim, requires_grad=True)
        self.K_mat = torch.randn(nContext, dim, dim, requires_grad=True)
        self.V_mat = torch.randn(nContext, dim, dim, requires_grad=True)
#
        mySigma = 1.0/(dim*dim)
        # initialization outside computation graph
        torch.nn.init.normal_(self.Q_mat, mean=0.0, std=mySigma)
        torch.nn.init.normal_(self.K_mat, mean=0.0, std=mySigma)
        torch.nn.init.normal_(self.V_mat, mean=0.0, std=mySigma)
#
        self.alpha = torch.zeros(nContext, nContext)
        self.yesMask  = yesMask            # masked self-attention
        self.yesNorm  = yesNorm            # layer normalization
        self.nContext = nContext
        self.dim = dim                     # embedding
        self.ID  = myID
#
        self.paddingMask = torch.zeros(nContext, nContext)   # for masking
        for ii in range(nContext):
            for jj in range(ii+1, nContext):
                self.paddingMask[ii][jj] = -1.0e9             # exp -> 0

    def layerNorm(self, x):
        mean  = torch.zeros(self.dim)      # vector mean
        sigma = torch.tensor(0.0)          # zero-dimensional tensor
#
#       for ii in range(self.nContext):
#           mean += x[ii] / self.nContext
        mean = torch.sum(x, 0) / self.nContext           # sum over rows
#
#       for ii in range(self.nContext):
#           sigma += torch.square(x[ii]-mean).sum()
#       sigma = torch.sqrt(sigma/self.nContext)
        sigma = torch.sqrt(torch.square(x-mean).sum() / self.nContext)
#
#       for ii in range(self.nContext):    # layer normalization
#           x[ii] -= mean
#           x[ii] /= sigma
        x = (x-mean)/sigma                 # for all rows
        return x

    def forward(self, x, storeAttention=False):
        if (self.yesNorm):
            x = self.layerNorm(x)          # use the normalized activities
        # Q/K/V vectors
        Q = torch.zeros(self.nContext, self.dim)
        K = torch.zeros(self.nContext, self.dim)
        V = torch.zeros(self.nContext, self.dim)
        for ii in range(self.nContext):
            Q[ii] = torch.matmul(self.Q_mat[ii], x[ii])
            K[ii] = torch.matmul(self.K_mat[ii], x[ii])
            V[ii] = torch.matmul(self.V_mat[ii], x[ii])
        # local attention matrix
        alpha = torch.zeros(self.nContext, self.nContext)
        for ii in range(self.nContext):
            for jj in range(self.nContext):
                alpha[ii][jj] = torch.exp(torch.dot(Q[ii], K[jj])
                              + yesMask*self.paddingMask[ii][jj])
            alpha[ii] /= alpha[ii].sum()                  # normalization
        # store attention matrix
        if storeAttention:
            self.alpha = alpha
        return torch.matmul(alpha, V)
#       return torch.matmul(alpha, V) + x  # with skip connections

    def update(self, eps):                 # updating internal parameters
        with torch.no_grad():
            self.Q_mat -= eps*self.Q_mat.grad
            self.K_mat -= eps*self.K_mat.grad
            self.V_mat -= eps*self.V_mat.grad
            self.Q_mat.grad = None
            self.K_mat.grad = None
            self.V_mat.grad = None

#
# n identical layers
#
allLayers = [attentionLayer(dim, tokenPerLayer, myID=iL) for iL in range(nLayer)]

def model(x, storeAttention=False):
    for iLayer in range(nLayer):
        x = allLayers[iLayer](x, storeAttention)
    return x

#
# console printing of attention matrix
#
def printAttentionMatrix():
    for iLayer in range(nLayer):
        print()
        print("# attention matrix for layer ", iLayer)
        for ss in range(tokenPerLayer):
            for tt in range(tokenPerLayer):
                alpha = allLayers[iLayer].alpha[ss][tt]
                print(f'{alpha:9.4f}', end="")
            print()

#
# test output of token activities
#
def printTokenActivities(x, myString):
    print()
    print("# activity for", myString)
    for ii in range(dim):
        for token in range(tokenPerLayer):
            print(f'{x[token][ii]:8.4f}', end="")
        print()

#
# standard loss function
#
def lossFunction(outputActivity, targetActivity):
    return torch.square(outputActivity - targetActivity).sum()

#
# random boolean (\pm b) mapping
#
training_data =\
    b*(2.0*(torch.FloatTensor(tokenPerLayer, dim).uniform_()>0.5)-1.0)
training_value =\
    b*(2.0*(torch.FloatTensor(tokenPerLayer, dim).uniform_()>0.5)-1.0)

#
# testing model
#
if (1==2):
    print("# training_data")
    print(training_data, "\n")
    print("# training_value")
    print(training_value, "\n")
#
for iIter in range(nIter):
    loss = lossFunction(model(training_data), training_value)
    if (loss<0.001):
        break
    loss.backward()
#
    for iLayer in range(nLayer):
        allLayers[iLayer].update(learning_rate)
    if (iIter%200==0):
        print(f'{iIter:4d} {loss.item():9.4f}')

#
# compare output with target
#
print()
yy = model(training_data, storeAttention=True)
printTokenActivities(training_value, "training_value")
printTokenActivities(yy,             "output activities")
#
if (1==2):
    print()
    printAttentionMatrix()
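An assumed alternative, not part of the lecture code: the attention weights of the listing can equivalently be obtained with torch.softmax and a single matrix product, replacing the explicit exponential loops; Q, K, V are random stand-ins here.

#!/usr/bin/env python3
import torch

nContext, dim = 5, 2
Q = torch.randn(nContext, dim)
K = torch.randn(nContext, dim)
V = torch.randn(nContext, dim)

scores = torch.matmul(Q, K.transpose(0, 1))              # (nContext, nContext)
mask = torch.triu(torch.full((nContext, nContext), -1.0e9), diagonal=1)
alpha = torch.softmax(scores + mask, dim=1)              # masked, row-normalized
out = torch.matmul(alpha, V)                             # attention output
print(alpha)
print(out)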