#!/usr/bin/env python3
# pip install -U pip setuptools wheel
# pip install -U spacy
# python -m spacy download en_core_web_sm
# small english model
import spacy
from spacy import displacy
def tokenSpacy(inputText):
    """Named Entity Recognition (NER) with spaCy"""
    NER = spacy.load("en_core_web_sm")
    textSpacy = NER(inputText)
    print("# ===============================")
    cleanText = ""
    for token in textSpacy:                          # token-level attributes
        print(f'{token.text:10s} | {token.pos_:10s} | {token.tag_:10s} | {token.dep_:10s} | ',
              f'{token.is_alpha:b} {token.is_stop:b}')
        cleanText += " " + token.text
    print("# ===============================")
    for word in textSpacy.ents:                      # named entities
        print(f'{word.text:15s} {word.label_:12s}')
        if word.label_ == "PERSON":                  # no real names
            cleanText = cleanText.replace(word.text, word.label_)
    print("# ===============================")
    print(cleanText)
    print("# ===============================")
    svgIm = displacy.render(textSpacy, style="dep")  # dependency visualization
    with open("foo.svg", "w") as f:                  # export svg file
        f.write(svgIm)
def tokenSimple(inputText):
    """cleans and tokenizes a given input text; hands-on"""
    addSpace  = [".", ",", ";", ":", "!", "?", "'", '"', "\\", "/"]
    addSpace += ["(", ")", "[", "]", "{", "}", "<", ">", "|"]
    addSpace += ["@", "#", "$", "%", "^", "&", "*", "=", "_"]
    addSpace += ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
    toSpace = ["\n"]
    for aa in addSpace:                       # isolate punctuation and digits
        inputText = inputText.replace(aa, " " + aa + " ")
    for tt in toSpace:                        # newlines become plain spaces
        inputText = inputText.replace(tt, " ")
    #
    inputList  = inputText.split()
    cleanText  = " ".join(inputList)
    vocabulary = list(set(inputList))
    #
    print("# -------------------------------")
    print(cleanText)
    print("# -------------------------------")
    print(vocabulary)
    print("# -------------------------------")
#
# main
#
rawText = """In 2024, the physics Nobel prize was
awarded to \nGeoff Hinton and John Hopfield
(did they really receive $1M each?)."""
tokenSpacy(rawText)
tokenSimple(rawText)
#!/usr/bin/env python3
# text preprocessing/cleaning/parsing
# pip install beautifulsoup4 lxml urlextract
import spacy
from spacy import displacy
from urlextract import URLExtract
import re # regular expressions
from bs4 import BeautifulSoup
def html_handling(inputText):
    "straightforward HTML tag removal"
    soup = BeautifulSoup(inputText, 'lxml')
    return soup.get_text()
def hashtag_handling(inputText, hashToken="HASH"):
    "substitutes hashtags"
    inputTags = re.findall(r"#\w+", inputText)
    for hh in inputTags:
        inputText = inputText.replace(hh, hashToken)
    return inputText, inputTags
def email_handling(inputText, mailToken="EMAIL"):
    "substitutes email addresses"
    inputMails = re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", inputText)
    for ee in inputMails:
        inputText = inputText.replace(ee, mailToken)
    return inputText, inputMails
def URL_handling(inputText, URLtoken="URL"):
    "substitutes all links in the input text by a given token"
    extractor = URLExtract()
    inputLinks = extractor.find_urls(inputText)
    for ll in inputLinks:
        inputText = inputText.replace(ll, URLtoken)
    return inputText, inputLinks
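# The handlers above can be chained into one cleaning pass; this is only a
# sketch (the name cleanPipeline is an assumption, the order mirrors the main
# section below):
def cleanPipeline(inputText):
    """apply html, email, hashtag and URL handling in sequence"""
    inputText = html_handling(inputText)
    inputText, _ = email_handling(inputText)
    inputText, _ = hashtag_handling(inputText)
    inputText, _ = URL_handling(inputText)
    return inputText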
def tokenSpacy(inputText):
    """Named Entity Recognition (NER) with spaCy.
    Returns a single-line string."""
    NER = spacy.load("en_core_web_sm")
    textSpacy = NER(inputText)
    cleanText = ""
    for token in textSpacy:
        cleanText += token.text + " "
    for word in textSpacy.ents:
        if word.label_ == "PERSON":                  # no real names
            cleanText = cleanText.replace(word.text, word.label_)
    cleanText = " ".join(cleanText.splitlines())     # only a single line
    cleanText = " ".join(cleanText.split())          # only single white spaces
    return cleanText
#
# main
#
rawText = """In 2024, the physics Nobel prize #nobelPrize
was awarded to Geoff Hinton #geoffHinton and
John Hopfield #johnHopfield. <br>For more
information see https://www.nobelprize.org/,
or journalist@newspaper.universe."""
if (1==2):                                           # alternative test input
    rawText = """It follows from x<y that y>x."""
print(rawText, "\n")
rawText = html_handling(rawText)
print(rawText, "\n")
rawText, _ = email_handling(rawText)
print(rawText, "\n")
rawText, _ = hashtag_handling(rawText)
print(rawText, "\n")
rawText, _ = URL_handling(rawText)
print(rawText, "\n")
cleanText = tokenSpacy(rawText)
print(cleanText)
#!/usr/bin/env python3
import spacy
# ===
# === model generated word vectors
# ===
#spacy.cli.download("en_core_web_lg")
nlp = spacy.load("en_core_web_lg") # large model, dim = 300
vocabulary = ["apple", "banana", "cherry", "date", "elderberry"]
word_vectors = {} # dictionary for word vectors
for word in vocabulary:                              # static word vectors
    token = nlp.vocab[word]
    word_vectors[word] = token.vector
print(word_vectors["apple"])
print()
print(len(word_vectors["apple"]))
print("# ==============================")
# ===
# === dynamic word vectors
# ===
nlp = spacy.load("en_core_web_sm") # small, dynamic word vectors
sentence = "I like to eat an apple every morning."
doc = nlp(sentence)
for token in doc:
    print(f"{token.text:12s} | {len(token.vector):5d}")
    # print(f"{token.text:12s} | {len(token.vector):5d} | {token.vector}")
#!/usr/bin/env python3
# IMDB movie rating dataset
# 25 000 train/test reviews, 31/32 MByte
# access pattern: MyDataset[split][row][feature]  (split: string, row: int, feature: string)
# https://huggingface.co/datasets/stanfordnlp/imdb
from datasets import load_dataset, load_from_disk
import os
# Load the IMDB dataset
def download_data(dataSetName="imdb"):
    """download from Huggingface if not available locally"""
    localDatasetPath = dataSetName + "_all"
    yesLocal = os.path.exists(localDatasetPath)
    #
    MyDataset = None
    if yesLocal:
        print(f'# loading {dataSetName} from file')
        MyDataset = load_from_disk(localDatasetPath)
    else:
        print(f'# downloading {dataSetName} from Huggingface')
        MyDataset = load_dataset(dataSetName)
        MyDataset.save_to_disk(localDatasetPath)
    #
    print(MyDataset)
    for split in MyDataset.keys():                   # splits and features
        print("# split    ", split)
        print("# features ", MyDataset[split].features)
        print()
    #
    if not yesLocal:                                 # plain-text export, specific to IMDB
        for split in MyDataset.keys():
            fileName = dataSetName + "." + split
            with open(fileName, 'w', encoding='utf-8') as f:
                for example in MyDataset[split]:
                    f.write(f"{example['label']:3d} {example['text']}\n")
    #
    return MyDataset
#
# main
#
main_dataset = download_data()
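# Example of the access pattern MyDataset[split][row][feature] (a sketch; the
# printed review is whatever the first IMDB training row happens to be):
print(main_dataset["train"][0]["label"])             # integer class label
print(main_dataset["train"][0]["text"][:80])         # first 80 characters of the review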
# available datasets
if (1==2):                                           # list all available datasets
    from huggingface_hub import list_datasets
    with open("huggingface.sets", 'w') as f:
        [f.write(f"{dataset.id}\n") for dataset in list_datasets()]
#!/usr/bin/env python3
# creating a private dataset
# using the standard format
from datasets import Dataset, DatasetDict, load_from_disk
# defining data for each split
train_data = [
    {'text': 'First training example',  'label': 0},
    {'text': 'Second training example', 'label': 1}
]
test_data = [
    {'text': 'First test example',  'label': 1},
    {'text': 'Second test example', 'label': 0}
]
# creating datasets objects for each split
# :: could be stored separately
train_dataset = Dataset.from_list(train_data)
test_dataset = Dataset.from_list(test_data)
# DatasetDict combining splits
dataset_dict = DatasetDict(
    {'train': train_dataset, 'test': test_dataset}
)
# saving entire DatasetDict
dataset_dict.save_to_disk('./my_dataset')
# loading entire dataset
loaded_dataset = load_from_disk('./my_dataset')
print(loaded_dataset)
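# Two related operations that are often useful with such a DatasetDict (a
# sketch, not part of the original example): building a split from column
# lists with Dataset.from_dict, and adding a derived column with .map; the
# feature name n_chars is an assumption.
extra_dataset = Dataset.from_dict(
    {'text': ['Third training example'], 'label': [0]}
)
mapped_dataset = loaded_dataset.map(
    lambda example: {'n_chars': len(example['text'])}   # adds a new feature column
)
print(extra_dataset)
print(mapped_dataset)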
#!/usr/bin/env python3
# generate and store delayed XOR time series
# x(t) = ( x(t-tau) + x(t-tau-1) ) % N
import random
from datasets import Dataset, DatasetDict, load_from_disk
class GP():
    "global parameters"
    tau      = 1      # delay
    N        = 10     # number of symbols
    nTrain   = 20     # number of training sequences
    nTest    = 10     # number of test sequences
    lenTrain = 7      # length of training sequences
    lenTest  = 6      # length of test sequences
def oneStep_NT(x, N, delay):
    """returns next token (NT time series)
    on input: current sequence x"""
    length = len(x)
    pair = x[length-delay-1:length-delay+1]          # the two delayed entries
    return sum(pair) % N
def oneSequence(length):
    """returns a time series of length 'length',
    starting with a random initial state"""
    sequence = random.sample(range(0, GP.N), GP.tau+1)
    for ll in range(length-GP.tau-1):
        sequence.append(oneStep_NT(sequence, GP.N, GP.tau))
    return sequence
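# Worked example (illustrative initial state, not fixed by the script): with
# N = 10 and tau = 1, the initial state [3, 7] evolves as
#   (3 + 7) % 10 = 0,  (7 + 0) % 10 = 7,  (0 + 7) % 10 = 7,  (7 + 7) % 10 = 4, ...
# so oneSequence(7) seeded with [3, 7] would return [3, 7, 0, 7, 7, 4, 1].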
def makeDataset():
    """returns the entire dataset object"""
    myType = "N" + str(GP.N) + "T" + str(GP.tau)
    # generate training data
    train_data = []
    for _ in range(GP.nTrain):
        newSeries = oneSequence(GP.lenTrain)
        newRow = {"type": myType, "series": newSeries}
        train_data.append(newRow)
    # generate test data
    test_data = []
    for _ in range(GP.nTest):
        newSeries = oneSequence(GP.lenTest)
        newRow = {"type": myType, "series": newSeries}
        test_data.append(newRow)
    # make Dataset / DatasetDict objects
    train_dataset = Dataset.from_list(train_data)
    test_dataset = Dataset.from_list(test_data)
    dataset_dict = DatasetDict(
        {'train': train_dataset, 'test': test_dataset}
    )
    return dataset_dict
def testPrintingDataset(DS):
    print()
    print("# =============")
    print("# test printing")
    print("# =============")
    for split_name, split_dataset in DS.items():
        print(f"Split: {split_name}, Size: {len(split_dataset)}")
        for example in split_dataset.select(range(2)):
            print(example["series"])
        print()
#
# main
#
print(dataset_dict:=makeDataset())
# saving entire DatasetDict
dataset_dict.save_to_disk('./my_dataset')
# loading entire dataset
loaded_dataset = load_from_disk('./my_dataset')
# print(loaded_dataset)
# test printing
testPrintingDataset(loaded_dataset)
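# How such sequences are typically consumed for next-token (NT) training, as a
# sketch (the helper nextTokenPairs is an assumption, not part of the script):
# the first GP.tau+1 symbols act as seed, every later symbol is a prediction target.
def nextTokenPairs(series):
    """returns (context, target) pairs for one stored sequence"""
    return [(series[:ii], series[ii]) for ii in range(GP.tau+1, len(series))]
for example in loaded_dataset["train"].select(range(1)):    # first training sequence
    for context, target in nextTokenPairs(example["series"]):
        print(context, "->", target)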