Machine Learning Primer -- Part III: Advanced Topics




Claudius Gros, WS 2024/25

Institut für theoretische Physik
Goethe-University Frankfurt a.M.

Text Processing / Datasets

general text pre-processing

#!/usr/bin/env python3

# pip install -U pip setuptools wheel
# pip install -U spacy
# python -m spacy download en_core_web_sm
#                          small English model

import spacy
from spacy import displacy

def tokenSpacy(inputText):
  """Named Entity Recognition (NER) with spacy"""
  NER = spacy.load("en_core_web_sm")
  textSpacy = NER(inputText)

  print("# ===============================")
  cleanText = ""
  for token in textSpacy:
     print(f'{token.text:10s} | {token.pos_:10s} | {token.tag_:10s} | '
           f'{token.dep_:10s} | {token.is_alpha:b} {token.is_stop:b}')
     cleanText += " " + token.text
  print("# ===============================")
  for word in textSpacy.ents:
     print(f'{word.text:15s} {word.label_:12s}')
     if (word.label_=="PERSON"):                              # no real names
       cleanText =  cleanText.replace(word.text, word.label_)
  print("# ===============================")
  print(cleanText)
  print("# ===============================")
  svgIm = displacy.render(textSpacy, style="dep")             # visualization
  with open("foo.svg", "w") as svgFile:                       # export svg file
    svgFile.write(svgIm)


def tokenSimple(inputText):
  """cleans and tokenize a given input text; hands-on"""
  addSpace  = [".", ",", ";", ":","!", "?", "'", '"', "\\", "/"]
  addSpace += ["(", ")", "[", "]","{", "}", "<", ">", "|"]
  addSpace += ["@", "#", "$", "%","^", "&", "*", "=", "_"]
  addSpace += ["0","1","2","3","4","5","6","7","8","9"]
  toSpace   = ["\n"]
  for aa in addSpace:
    inputText = inputText.replace(aa," "+aa+" ")
  for tt in toSpace:
    inputText = inputText.replace(tt," ")
#
  inputList  = inputText.split()
  cleanText  = " ".join(inputList)        
  vocabulary = list(set(inputList))
#
  print("# -------------------------------")
  print(cleanText)
  print("# -------------------------------")
  print(vocabulary)
  print("# -------------------------------")

#
# main
#

rawText = """In 2024, the physics Nobel prize was 
             awarded to \nGeoff Hinton and John Hopfield 
             (did they really receive $1M each?)."""

tokenSpacy(rawText)
tokenSimple(rawText)
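
As a follow-up to tokenSimple, the vocabulary can be mapped to integer token IDs, the usual input format for downstream models. A minimal sketch; the helper name encodeText and the sample string are illustrative, not part of the script above.

def encodeText(cleanText):
  """maps every distinct token to an integer ID;
     returns the encoded text and the mapping"""
  tokens     = cleanText.split()
  vocabulary = sorted(set(tokens))
  word2id    = {word: ii for ii, word in enumerate(vocabulary)}
  return [word2id[tt] for tt in tokens], word2id

ids, word2id = encodeText("the prize was awarded to PERSON and PERSON .")
print(ids)
print(word2id)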

text cleaning/parsing

#!/usr/bin/env python3

# text preprocessing/cleaning/parsing
# pip install beautifulsoup4 lxml

import spacy
from spacy import displacy
from urlextract import URLExtract
import re                                 # regular expressions
from bs4 import BeautifulSoup

def html_handling(inputText):
  "straightfoward html tag removeal"
  soup = BeautifulSoup(inputText, 'lxml')
  return soup.get_text()


def hashtag_handling(inputText, hashToken="HASH"):
  "substitutes hashtags"
  inputTags = re.findall(r"#\w+", inputText)
  for hh in inputTags:
    inputText = inputText.replace(hh, hashToken)
  return inputText, inputTags


def email_handling(inputText, mailToken="EMAIL"):
  "substitutes email addresses"
  inputMails = re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", inputText)
  for ee in inputMails:
    inputText = inputText.replace(ee, mailToken)
  return inputText, inputMails


def URL_handling(inputText, URLtoken="URL"):
  "substitutes all links in input text by a given token"
  extractor = URLExtract()
  inputLinks = extractor.find_urls(inputText)
  for ll in inputLinks:
    inputText = inputText.replace(ll, URLtoken)
  return inputText, inputLinks


def tokenSpacy(inputText):
  """Named Entity Recognition (NER) with spacy.
     Returns a single-line string."""
  NER = spacy.load("en_core_web_sm")
  textSpacy = NER(inputText)

  cleanText = ""
  for token in textSpacy:
     cleanText += token.text + " "
  for word in textSpacy.ents:
     if (word.label_=="PERSON"):                   # no real names
       cleanText =  cleanText.replace(word.text, word.label_)
  cleanText = " ".join(cleanText.splitlines())     # only a single line
  cleanText = " ".join(cleanText.split())          # only single white spaces
  return cleanText

#
# main
#

rawText = """In 2024, the physics Nobel prize #nobelPrize
             was awarded to Geoff Hinton #geoffHinton and 
             John Hopfield #johnHopfield. <br>For more 
             information see https://www.nobelprize.org/, 
             or journalist@newspaper.universe."""
if (1==2):
  rawText = """It follows from x<y that y>x."""
print(rawText, "\n")

rawText = html_handling(rawText)
print(rawText, "\n")

rawText, _ = email_handling(rawText)
print(rawText, "\n")

rawText, _ = hashtag_handling(rawText)
print(rawText, "\n")

rawText, _ = URL_handling(rawText)
print(rawText, "\n")

cleanText = tokenSpacy(rawText)
print(cleanText)
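
The individual handlers above can also be chained into a single cleaning step. A minimal sketch, reusing the functions defined in this script; the wrapper name cleanPipeline is illustrative.

def cleanPipeline(inputText):
  """applies the cleaning steps in sequence;
     returns the clean text plus the extracted items"""
  inputText           = html_handling(inputText)
  inputText, mails    = email_handling(inputText)
  inputText, hashtags = hashtag_handling(inputText)
  inputText, links    = URL_handling(inputText)
  return tokenSpacy(inputText), {"mails": mails,
                                 "hashtags": hashtags,
                                 "links": links}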

word embedding

#!/usr/bin/env python3

import spacy

# ===  
# === model generated word vectors
# ===

#spacy.cli.download("en_core_web_lg")
nlp = spacy.load("en_core_web_lg")        # large model, dim = 300

vocabulary = ["apple", "banana", "cherry", "date", "elderberry"]

word_vectors = {}                         # dictionary for word vectors

for word in vocabulary:                   # word vectors
    token = nlp.vocab[word]
    word_vectors[word] = token.vector

print(word_vectors["apple"])
print()
print(len(word_vectors["apple"]))
print("# ==============================")

# === 
# === dynamic word vectors
# === 

nlp = spacy.load("en_core_web_sm")        # small, dynamic word vectors

sentence = "I like to eat an apple every morning."
doc = nlp(sentence)

for token in doc:
  print(f"{token.text:12s} | {len(token.vector):5d}")
# print(f"{token.text:12s} | {len(token.vector):5d} | {token.vector}")
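
Static word vectors become useful when compared. A minimal sketch computing cosine similarities with numpy, assuming en_core_web_lg is installed as above; the variable names are illustrative.

import numpy as np

def cosine(u, v):
  "cosine similarity of two word vectors"
  return float(np.dot(u, v) / (np.linalg.norm(u)*np.linalg.norm(v)))

nlp_lg = spacy.load("en_core_web_lg")     # static vectors, dim = 300
apple  = nlp_lg.vocab["apple"].vector
banana = nlp_lg.vocab["banana"].vector
cherry = nlp_lg.vocab["cherry"].vector

print(f'apple/banana : {cosine(apple, banana):6.3f}')
print(f'apple/cherry : {cosine(apple, cherry):6.3f}')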

training datasets

#!/usr/bin/env python3

# IMDB movie rating dataset
# 25 000 train/test,  31/32MByte
# https://huggingface.co/datasets/stanfordnlp/imdb

from datasets import load_dataset, load_from_disk
import os

# Load the IMDB dataset

def download_data(dataSetName = "imdb"):
  """download from Huggingface if not available locally"""
  localDatasetPath = dataSetName + "_all"
  yesLocal = os.path.exists(localDatasetPath)
#
  MyDataset = None
  if (yesLocal):
    print(f'# loading {dataSetName} from file')
    MyDataset = load_from_disk(localDatasetPath)
  else:
    print(f'# downloading {dataSetName} from Huggingface')
    MyDataset = load_dataset(dataSetName)
    MyDataset.save_to_disk(localDatasetPath)
#
  print(MyDataset)
  for split in MyDataset.keys():          # splits and features
    print("# split   ", split)
    print("# features ", MyDataset[split].features)
    print()
#
  if not yesLocal:                        # specific to IMDB
    for split in MyDataset.keys():
     fileName = dataSetName + "." + split
     with open(fileName, 'w', encoding='utf-8') as f:
       for example in MyDataset[split]:
         f.write(f"{example['label']:3d} {example['text']}\n")
#
  return MyDataset

#
# main
#

main_dataset = download_data()

# available datasets
if (1==2):
  from huggingface_hub import list_datasets
  with open("huggingface.sets", 'w') as f:
    [f.write(f"{dataset.id}\n") for dataset in list_datasets()]
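
Once loaded, a dataset is typically preprocessed example by example. A minimal sketch using Dataset.map to attach a whitespace token count; the column name n_tokens is chosen here for illustration only.

def add_token_count(example):
  "adds the number of whitespace tokens to every example"
  example["n_tokens"] = len(example["text"].split())
  return example

train_with_counts = main_dataset["train"].map(add_token_count)
print(train_with_counts[0]["n_tokens"])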

private datasets

#!/usr/bin/env python3

# creating a private dataset using
# the standard format

from datasets import Dataset, DatasetDict, load_from_disk

# defining data for each split
train_data = [
    {'text': 'First training example', 'label': 0},
    {'text': 'Second training example', 'label': 1}
             ]

test_data = [
    {'text': 'First test example', 'label': 1},
    {'text': 'Second test example', 'label': 0}
            ]

# creating datasets objects for each split
# :: could be stored separately
train_dataset = Dataset.from_list(train_data)
test_dataset  = Dataset.from_list(test_data)

# DatasetDict combining splits
dataset_dict = DatasetDict(
   {'train': train_dataset, 'test': test_dataset}
                          )
# saving entire DatasetDict
dataset_dict.save_to_disk('./my_dataset')

# loading entire dataset
loaded_dataset = load_from_disk('./my_dataset')
print(loaded_dataset)
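
Instead of defining the splits by hand, a single Dataset can also be split randomly. A minimal sketch using Dataset.train_test_split; the 50/50 ratio and the seed are arbitrary choices.

full_set  = Dataset.from_list(train_data + test_data)
split_set = full_set.train_test_split(test_size=0.5, seed=42)
print(split_set)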

time series dataset example

#!/usr/bin/env python3

# generate and store delayed XOR time series 
# x(t) = ( x(t-tau) + x(t-tau-1) ) % N

import random
from datasets import Dataset, DatasetDict, load_from_disk

class GP():
  "global parameters"
  tau      = 1           # delay
  N        = 10          # number of symbols
  nTrain   = 20          # number of training sequences
  nTest    = 10          # number of test sequences
  lenTrain =  7          # length of training sequences
  lenTest  =  6          # length of test sequences


def oneStep_NT(x, N, delay):
  """returns next token (NT time series)
     on input: current sequence x"""
  length = len(x)
  pair = x[length-delay-1:length-delay+1]
  return sum(pair)%N  


def oneSequence(length):
  """returns a time series of length 'length',
     starting with a random initial state"""
  sequence = random.sample(range(0, GP.N), GP.tau+1)
  for ll in range(length-GP.tau-1):
    sequence.append(oneStep_NT(sequence, GP.N, GP.tau))
  return sequence


def makeDataset():
  """returns the entire dataset object"""
  myType = "N" + str(GP.N) + "T" + str(GP.tau)

# generate training datasets
  train_data = []
  for _ in range(GP.nTrain):
    newSeries = oneSequence(GP.lenTrain)
    newRow = {"type":myType, "series":newSeries}
    train_data.append(newRow)

# generate test datasets
  test_data = []
  for _ in range(GP.nTest):
    newSeries = oneSequence(GP.lenTest)
    newRow = {"type":myType, "series":newSeries}
    test_data.append(newRow)

# make Dataset / DatasetDict objects
  train_dataset = Dataset.from_list(train_data)
  test_dataset  = Dataset.from_list(test_data)
  dataset_dict = DatasetDict(
    {'train': train_dataset, 'test': test_dataset}
                            )
  return dataset_dict

def testPrintingDataset(DS):
  print()
  print("# =============")
  print("# test printing")
  print("# =============")
  for split_name, split_dataset in DS.items():
    print(f"Split: {split_name}, Size: {len(split_dataset)}")
    for example in split_dataset.select(range(2)): 
       print(example["series"])
    print()

#
# main
#
print(dataset_dict:=makeDataset())

# saving entire DatasetDict
dataset_dict.save_to_disk('./my_dataset')

# loading entire dataset
loaded_dataset = load_from_disk('./my_dataset')
# print(loaded_dataset)

# test printing
testPrintingDataset(loaded_dataset)
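
For next-token training, every stored series is typically shifted by one step into input/target pairs. A minimal sketch on the loaded dataset; the variable names are illustrative.

for example in loaded_dataset["train"].select(range(2)):
  series  = example["series"]
  inputs  = series[:-1]                   # tokens 0 .. T-2
  targets = series[1:]                    # tokens 1 .. T-1
  print("inputs :", inputs)
  print("targets:", targets)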