Machine Learning Primer -- Part III: Advanced Topics




Claudius Gros, WS 2024/25

Institut für theoretische Physik
Goethe-University Frankfurt a.M.

Text Processing / Datasets

general text pre-processing

#!/usr/bin/env python3

# pip install -U pip setuptools wheel
# pip install -U spacy
# python -m spacy download en_core_web_sm
#                          small English model

import spacy
from spacy import displacy

def tokenSpacy(inputText):
  """Named Entity Recognition (NER) with spacy"""
  NER = spacy.load("en_core_web_sm")
  textSpacy = NER(inputText)

  print("# ===============================")
  cleanText = ""
  for token in textSpacy:
     print(f'{token.text:10s} | {token.pos_:10s} | {token.tag_:10s} | '
           f'{token.dep_:10s} | {token.is_alpha:b} {token.is_stop:b}')
     cleanText += " " + token.text
  print("# ===============================")
  for word in textSpacy.ents:
     print(f'{word.text:15s} {word.label_:12s}')
     if (word.label_=="PERSON"):                              # no real names
       cleanText =  cleanText.replace(word.text, word.label_)
  print("# ===============================")
  print(cleanText)
  print("# ===============================")
  svgIm = displacy.render(textSpacy, style="dep")             # visualization
  with open("foo.svg", "w") as svgFile:                       # export svg file
    svgFile.write(svgIm)


def tokenSimple(inputText):
  """cleans and tokenize a given input text; hands-on"""
  addSpace  = [".", ",", ";", ":","!", "?", "'", '"', "\\", "/"]
  addSpace += ["(", ")", "[", "]","{", "}", "<", ">", "|"]
  addSpace += ["@", "#", "$", "%","^", "&", "*", "=", "_"]
  addSpace += ["0","1","2","3","4","5","6","7","8","9"]
  toSpace   = ["\n"]
  for aa in addSpace:
    inputText = inputText.replace(aa," "+aa+" ")
  for tt in toSpace:
    inputText = inputText.replace(tt," ")
#
  inputList  = inputText.split()
  cleanText  = " ".join(inputList)        
  vocabulary = list(set(inputList))
#
  print("# -------------------------------")
  print(cleanText)
  print("# -------------------------------")
  print(vocabulary)
  print("# -------------------------------")

#
# main
#

rawText = """In 2024, the physics Nobel prize was 
             awarded to \nGeoff Hinton and John Hopfield 
             (did they really receive $1M each?)."""

tokenSpacy(rawText)
tokenSimple(rawText)
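
As a follow-up to tokenSimple, the vocabulary can be mapped to integer token IDs, the usual input format for downstream models. A minimal sketch; the helper name encodeText and the sample string are illustrative, not part of the script above.

def encodeText(cleanText):
  """maps every distinct token to an integer ID;
     returns the encoded text and the mapping"""
  tokens     = cleanText.split()
  vocabulary = sorted(set(tokens))
  word2id    = {word: ii for ii, word in enumerate(vocabulary)}
  return [word2id[tt] for tt in tokens], word2id

ids, word2id = encodeText("the prize was awarded to PERSON and PERSON .")
print(ids)
print(word2id)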

text cleaning/parsing

#!/usr/bin/env python3

# text preprocessing/cleaning/parsing
# pip install beautifulsoup4 lxml

import spacy
from spacy import displacy
from urlextract import URLExtract
import re                                 # regular expressions
from bs4 import BeautifulSoup

def html_handling(inputText):
  "straightfoward html tag removeal"
  soup = BeautifulSoup(inputText, 'lxml')
  return soup.get_text()


def hashtag_handling(inputText, hashToken="HASH"):
  "substitutes hashtags"
  inputTags = re.findall(r"#\w+", inputText)
  for hh in inputTags:
    inputText = inputText.replace(hh, hashToken)
  return inputText, inputTags


def email_handling(inputText, mailToken="EMAIL"):
  "substitutes email addresses"
  inputMails = re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", inputText)
  for ee in inputMails:
    inputText = inputText.replace(ee, mailToken)
  return inputText, inputMails


def URL_handling(inputText, URLtoken="URL"):
  "substitutes all links in input text by a given token"
  extractor = URLExtract()
  inputLinks = extractor.find_urls(inputText)
  for ll in inputLinks:
    inputText = inputText.replace(ll, URLtoken)
  return inputText, inputLinks


def tokenSpacy(inputText):
  """Named Entity Recognition (NER) with spacy.
     Returns a single-line string."""
  NER = spacy.load("en_core_web_sm")
  textSpacy = NER(inputText)

  cleanText = ""
  for token in textSpacy:
     cleanText += token.text + " "
  for word in textSpacy.ents:
     if (word.label_=="PERSON"):                   # no real names
       cleanText =  cleanText.replace(word.text, word.label_)
  cleanText = " ".join(cleanText.splitlines())     # only a single line
  cleanText = " ".join(cleanText.split())          # only single white spaces
  return cleanText

#
# main
#

rawText = """In 2024, the physics Nobel prize #nobelPrize
             was awarded to Geoff Hinton #geoffHinton and 
             John Hopfield #johnHopfield. <br>For more 
             information see https://www.nobelprize.org/, 
             or journalist@newspaper.universe."""
if (1==2):
  rawText = """It follows from x<y that y>x."""
print(rawText, "\n")

rawText = html_handling(rawText)
print(rawText, "\n")

rawText, _ = email_handling(rawText)
print(rawText, "\n")

rawText, _ = hashtag_handling(rawText)
print(rawText, "\n")

rawText, _ = URL_handling(rawText)
print(rawText, "\n")

cleanText = tokenSpacy(rawText)
print(cleanText)
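
The individual handlers above can also be chained into a single cleaning step. A minimal sketch, reusing the functions defined in this script; the wrapper name cleanPipeline is illustrative.

def cleanPipeline(inputText):
  """applies the cleaning steps in sequence;
     returns the clean text plus the extracted items"""
  inputText           = html_handling(inputText)
  inputText, mails    = email_handling(inputText)
  inputText, hashtags = hashtag_handling(inputText)
  inputText, links    = URL_handling(inputText)
  return tokenSpacy(inputText), {"mails": mails,
                                 "hashtags": hashtags,
                                 "links": links}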

word embedding

#!/usr/bin/env python3

import spacy

# ===  
# === model generated word vectors
# ===

#spacy.cli.download("en_core_web_lg")
nlp = spacy.load("en_core_web_lg")        # large model, dim = 300

vocabulary = ["apple", "banana", "cherry", "date", "elderberry"]

word_vectors = {}                         # dictionary for word vectors

for word in vocabulary:                   # word vectors
    token = nlp.vocab[word]
    word_vectors[word] = token.vector

print(word_vectors["apple"])
print()
print(len(word_vectors["apple"]))
print("# ==============================")

# === 
# === dynamic word vectors
# === 

nlp = spacy.load("en_core_web_sm")        # small, dynamic word vectors

sentence = "I like to eat an apple every morning."
doc = nlp(sentence)

for token in doc:
  print(f"{token.text:12s} | {len(token.vector):5d}")
# print(f"{token.text:12s} | {len(token.vector):5d} | {token.vector}")
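
Static word vectors become useful when compared. A minimal sketch computing cosine similarities with numpy, assuming en_core_web_lg is installed as above; the variable names are illustrative.

import numpy as np

def cosine(u, v):
  "cosine similarity of two word vectors"
  return float(np.dot(u, v) / (np.linalg.norm(u)*np.linalg.norm(v)))

nlp_lg = spacy.load("en_core_web_lg")     # static vectors, dim = 300
apple  = nlp_lg.vocab["apple"].vector
banana = nlp_lg.vocab["banana"].vector
cherry = nlp_lg.vocab["cherry"].vector

print(f'apple/banana : {cosine(apple, banana):6.3f}')
print(f'apple/cherry : {cosine(apple, cherry):6.3f}')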

training datasets

#!/usr/bin/env python3

# IMDB movie rating dataset
# 25 000 train/test,  31/32MByte
# https://huggingface.co/datasets/stanfordnlp/imdb

from datasets import load_dataset, load_from_disk
import os

# Load the IMDB dataset

def download_data(dataSetName = "imdb"):
  """download from Huggingface if not available locally"""
  localDatasetPath = dataSetName + "_all"
  yesLocal = os.path.exists(localDatasetPath)
#
  MyDataset = None
  if (yesLocal):
    print(f'# loading {dataSetName} from file')
    MyDataset = load_from_disk(localDatasetPath)
  else:
    print(f'# downloading {dataSetName} from Huggingface')
    MyDataset = load_dataset(dataSetName)
    MyDataset.save_to_disk(localDatasetPath)
#
  print(MyDataset)
  for split in MyDataset.keys():          # splits and features
    print("# split   ", split)
    print("# features ", MyDataset[split].features)
    print()
#
  if not yesLocal:                        # specific to IMDB
    for split in MyDataset.keys():
     fileName = dataSetName + "." + split
     with open(fileName, 'w', encoding='utf-8') as f:
       for example in MyDataset[split]:
         f.write(f"{example['label']:3d} {example['text']}\n")
#
  return MyDataset

#
# main
#

main_dataset = download_data()

# available datasets
if (1==2):
  from huggingface_hub import list_datasets
  with open("huggingface.sets", 'w') as f:
    [f.write(f"{dataset.id}\n") for dataset in list_datasets()]
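
Once loaded, a dataset is typically preprocessed example by example. A minimal sketch using Dataset.map to attach a whitespace token count; the column name n_tokens is chosen here for illustration only.

def add_token_count(example):
  "adds the number of whitespace tokens to every example"
  example["n_tokens"] = len(example["text"].split())
  return example

train_with_counts = main_dataset["train"].map(add_token_count)
print(train_with_counts[0]["n_tokens"])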

private datasets

#!/usr/bin/env python3

# creating a private dataset using
# the standard format

from datasets import Dataset, DatasetDict, load_from_disk

# defining data for each split
train_data = [
    {'text': 'First training example', 'label': 0},
    {'text': 'Second training example', 'label': 1}
             ]

test_data = [
    {'text': 'First test example', 'label': 1},
    {'text': 'Second test example', 'label': 0}
            ]

# creating datasets objects for each split
# :: could be stored separately
train_dataset = Dataset.from_list(train_data)
test_dataset  = Dataset.from_list(test_data)

# DatasetDict combining splits
dataset_dict = DatasetDict(
   {'train': train_dataset, 'test': test_dataset}
                          )
# saving entire DatasetDict
dataset_dict.save_to_disk('./my_dataset')

# loading entire dataset
loaded_dataset = load_from_disk('./my_dataset')
print(loaded_dataset)
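
Instead of defining the splits by hand, a single Dataset can also be split randomly. A minimal sketch using Dataset.train_test_split; the 50/50 ratio and the seed are arbitrary choices.

full_set  = Dataset.from_list(train_data + test_data)
split_set = full_set.train_test_split(test_size=0.5, seed=42)
print(split_set)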

time series dataset example

#!/usr/bin/env python3

# generate and store delayed XOR time series 
# x(t) = ( x(t-tau) + x(t-tau-1) ) % N

import random
from datasets import Dataset, DatasetDict, load_from_disk

class GP():
  "global parameters"
  tau      = 1           # delay
  N        = 10          # number of symbols
  nTrain   = 20          # number of training sequences
  nTest    = 10          # number of test sequences
  lenTrain =  7          # length of training sequences
  lenTest  =  6          # length of test sequences


def oneStep_NT(x, N, delay):
  """returns next token (NT time series)
     on input: current sequence x"""
  length = len(x)
  pair = x[length-delay-1:length-delay+1]
  return sum(pair)%N  


def oneSequence(length):
  """returns a time series of length 'length',
     starting with a random initial state"""
  sequence = random.sample(range(0, GP.N), GP.tau+1)
  for ll in range(length-GP.tau-1):
    sequence.append(oneStep_NT(sequence, GP.N, GP.tau))
  return sequence


def makeDataset():
  """returns the entire dataset object"""
  myType = "N" + str(GP.N) + "T" + str(GP.tau)

# generate training datasets
  train_data = []
  for _ in range(GP.nTrain):
    newSeries = oneSequence(GP.lenTrain)
    newRow = {"type":myType, "series":newSeries}
    train_data.append(newRow)

# generate test datasets
  test_data = []
  for _ in range(GP.nTest):
    newSeries = oneSequence(GP.lenTest)
    newRow = {"type":myType, "series":newSeries}
    test_data.append(newRow)

# make Dataset / DatasetDict objects
  train_dataset = Dataset.from_list(train_data)
  test_dataset  = Dataset.from_list(test_data)
  dataset_dict = DatasetDict(
    {'train': train_dataset, 'test': test_dataset}
                            )
  return dataset_dict

def testPrintingDataset(DS):
  print()
  print("# =============")
  print("# test printing")
  print("# =============")
  for split_name, split_dataset in DS.items():
    print(f"Split: {split_name}, Size: {len(split_dataset)}")
    for example in split_dataset.select(range(2)): 
       print(example["series"])
    print()

#
# main
#
print(dataset_dict:=makeDataset())

# saving entire DatasetDict
dataset_dict.save_to_disk('./my_dataset')

# loading entire dataset
loaded_dataset = load_from_disk('./my_dataset')
# print(loaded_dataset)

# test printing
testPrintingDataset(loaded_dataset)
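
For next-token training, every stored series is typically shifted by one step into input/target pairs. A minimal sketch on the loaded dataset; the variable names are illustrative.

for example in loaded_dataset["train"].select(range(2)):
  series  = example["series"]
  inputs  = series[:-1]                   # tokens 0 .. T-2
  targets = series[1:]                    # tokens 1 .. T-1
  print("inputs :", inputs)
  print("targets:", targets)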