#!/usr/bin/env python3
# pip install -U pip setuptools wheel
# pip install -U spacy
# python -m spacy download en_core_web_sm    # small English model

import spacy
from spacy import displacy

def tokenSpacy(inputText):
    """Named Entity Recognition (NER) with spaCy"""
    NER = spacy.load("en_core_web_sm")
    textSpacy = NER(inputText)
    print("# ===============================")
    cleanText = ""
    for token in textSpacy:                  # token | POS | fine-grained tag | dependency | alpha/stop flags
        print(f'{token.text:10s} | {token.pos_:10s} | {token.tag_:10s} | {token.dep_:10s} | ',
              f'{token.is_alpha:b} {token.is_stop:b}')
        cleanText += " " + token.text
    print("# ===============================")
    for word in textSpacy.ents:              # recognized entities
        print(f'{word.text:15s} {word.label_:12s}')
        if (word.label_=="PERSON"):          # no real names
            cleanText = cleanText.replace(word.text, word.label_)
    print("# ===============================")
    print(cleanText)
    print("# ===============================")
    svgIm = displacy.render(textSpacy, style="dep")    # dependency-parse visualization
    open("foo.svg", "w").write(svgIm)                  # export svg file

def tokenSimple(inputText):
    """cleans and tokenizes a given input text; hands-on"""
    addSpace  = [".", ",", ";", ":", "!", "?", "'", '"', "\\", "/"]
    addSpace += ["(", ")", "[", "]", "{", "}", "<", ">", "|"]
    addSpace += ["@", "#", "$", "%", "^", "&", "*", "=", "_"]
    addSpace += ["0","1","2","3","4","5","6","7","8","9"]
    toSpace = ["\n"]
    for aa in addSpace:                      # pad punctuation/digits with spaces so split() separates them
        inputText = inputText.replace(aa, " "+aa+" ")
    for tt in toSpace:                       # newlines become plain spaces
        inputText = inputText.replace(tt, " ")
#
    inputList = inputText.split()
    cleanText = " ".join(inputList)
    vocabulary = list(set(inputList))
#
    print("# -------------------------------")
    print(cleanText)
    print("# -------------------------------")
    print(vocabulary)
    print("# -------------------------------")

#
# main
#
rawText = """In 2024, the physics Nobel prize was awarded to \nGeoff Hinton and John Hopfield (did they really receive $1M each?)."""

tokenSpacy(rawText)
tokenSimple(rawText)
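Besides the dependency graph exported above, displacy can also highlight the recognized entities directly; a minimal sketch (the output file name ner.html is my own choice):

#!/usr/bin/env python3
# sketch: visualize the named entities (style="ent") instead of the
# dependency parse; the file name "ner.html" is arbitrary
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("In 2024, the physics Nobel prize was awarded to Geoff Hinton and John Hopfield.")

htmlEnt = displacy.render(doc, style="ent")   # returns an HTML string outside Jupyter
open("ner.html", "w").write(htmlEnt)          # open in a browser to inspect the entities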
#!/usr/bin/env python3
# text preprocessing/cleaning/parsing
# pip install beautifulsoup4 lxml

import spacy
from spacy import displacy
from urlextract import URLExtract
import re                                    # regular expressions
from bs4 import BeautifulSoup

def html_handling(inputText):
    "straightforward HTML tag removal"
    soup = BeautifulSoup(inputText, 'lxml')
    return soup.get_text()

def hashtag_handling(inputText, hashToken="HASH"):
    "substitutes hashtags"
    inputTags = re.findall(r"#\w+", inputText)
    for hh in inputTags:
        inputText = inputText.replace(hh, hashToken)
    return inputText, inputTags

def email_handling(inputText, mailToken="EMAIL"):
    "substitutes email addresses"
    inputMails = re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", inputText)
    for ee in inputMails:
        inputText = inputText.replace(ee, mailToken)
    return inputText, inputMails

def URL_handling(inputText, URLtoken="URL"):
    "substitutes all links in the input text by a given token"
    extractor = URLExtract()
    inputLinks = extractor.find_urls(inputText)
    for ll in inputLinks:
        inputText = inputText.replace(ll, URLtoken)
    return inputText, inputLinks

def tokenSpacy(inputText):
    """Named Entity Recognition (NER) with spaCy.
       Returns a single-line string."""
    NER = spacy.load("en_core_web_sm")
    textSpacy = NER(inputText)
    cleanText = ""
    for token in textSpacy:
        cleanText += token.text + " "
    for word in textSpacy.ents:
        if (word.label_=="PERSON"):          # no real names
            cleanText = cleanText.replace(word.text, word.label_)
    cleanText = " ".join(cleanText.splitlines())   # only a single line
    cleanText = " ".join(cleanText.split())        # only single white spaces
    return cleanText

#
# main
#
rawText = """In 2024, the physics Nobel prize #nobelPrize was awarded to
Geoff Hinton #geoffHinton and John Hopfield #johnHopfield.
<br>For more information see https://www.nobelprize.org/,
or journalist@newspaper.universe."""

if (1==2):
    rawText = """It follows from x<y that y>x."""

print(rawText, "\n")
rawText = html_handling(rawText)
print(rawText, "\n")
rawText, _ = email_handling(rawText)
print(rawText, "\n")
rawText, _ = hashtag_handling(rawText)
print(rawText, "\n")
rawText, _ = URL_handling(rawText)
print(rawText, "\n")
cleanText = tokenSpacy(rawText)
print(cleanText)
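The disabled test sentence "It follows from x<y that y>x." hints at a pitfall of HTML stripping: the parser may read "<y that y>" as a tag and silently drop part of the inequality. A small sketch to observe the (likely) effect:

#!/usr/bin/env python3
# sketch: HTML stripping can mangle plain-text inequalities, since
# "<y that y>" is likely interpreted as a (bogus) tag by the parser
from bs4 import BeautifulSoup

tricky = "It follows from x<y that y>x."
print(BeautifulSoup(tricky, 'lxml').get_text())   # expect something like "It follows from xx."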
spacy.cli.download("en_core_web_lg")
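A guarded variant of this download, fetching the large model only when it is not installed yet; a sketch using spacy.util.is_package:

#!/usr/bin/env python3
# sketch: download the large model only if it is missing locally
import spacy

MODEL = "en_core_web_lg"
if not spacy.util.is_package(MODEL):      # True once the model package is installed
    spacy.cli.download(MODEL)
nlp = spacy.load(MODEL)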
from transformers import AutoTokenizer
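The import above points to the Hugging Face tokenizers. A minimal sketch of subword tokenization, assuming "bert-base-uncased" merely as an example checkpoint:

#!/usr/bin/env python3
# sketch: subword tokenization with a Hugging Face tokenizer;
# "bert-base-uncased" is just an example checkpoint
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
sentence = "In 2024, the physics Nobel prize was awarded to Geoff Hinton."

print(tokenizer.tokenize(sentence))                       # subword tokens
encoded = tokenizer(sentence)                             # dict with input_ids, attention_mask, ...
print(encoded["input_ids"])
print(tokenizer.convert_ids_to_tokens(encoded["input_ids"]))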
#!/usr/bin/env python3

import spacy

# ===
# === model-generated word vectors
# ===

#spacy.cli.download("en_core_web_lg")
nlp = spacy.load("en_core_web_lg")           # large model, dim = 300

vocabulary = ["apple", "banana", "cherry", "date", "elderberry"]
word_vectors = {}                            # dictionary for word vectors
for word in vocabulary:                      # word vectors
    token = nlp.vocab[word]
    word_vectors[word] = token.vector

print(word_vectors["apple"])
print()
print(len(word_vectors["apple"]))
print("# ==============================")

# ===
# === dynamic word vectors
# ===

nlp = spacy.load("en_core_web_sm")           # small, dynamic word vectors
sentence = "I like to eat an apple every morning."
doc = nlp(sentence)

for token in doc:
    print(f"{token.text:12s} | {len(token.vector):5d}")
#   print(f"{token.text:12s} | {len(token.vector):5d} | {token.vector}")
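Static word vectors become useful once they are compared; a minimal sketch computing cosine similarities with numpy on top of the large-model vectors used above:

#!/usr/bin/env python3
# sketch: compare static word vectors via cosine similarity
import spacy
import numpy as np

nlp = spacy.load("en_core_web_lg")

def cosine(u, v):
    "cosine similarity of two vectors"
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

apple  = nlp.vocab["apple"].vector
banana = nlp.vocab["banana"].vector
cherry = nlp.vocab["cherry"].vector

print(f"apple vs banana : {cosine(apple, banana):6.3f}")
print(f"apple vs cherry : {cosine(apple, cherry):6.3f}")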
myDict[split][row][feature]      # string / int / string

#!/usr/bin/env python3
# IMDB movie rating dataset
# 25 000 train/test, 31/32 MByte
# https://huggingface.co/datasets/stanfordnlp/imdb

from datasets import load_dataset, load_from_disk
import os

# Load the IMDB dataset
def download_data(dataSetName = "imdb"):
    """download from Huggingface if not available locally"""
    localDatasetPath = dataSetName + "_all"
    yesLocal = True if os.path.exists(localDatasetPath)\
               else False
#
    MyDataset = None
    if (yesLocal):
        print(f'# loading {dataSetName} from file')
        MyDataset = load_from_disk(localDatasetPath)
    else:
        print(f'# downloading {dataSetName} from Huggingface')
        MyDataset = load_dataset(dataSetName)
        MyDataset.save_to_disk(localDatasetPath)
#
    print(MyDataset)
    for split in MyDataset.keys():           # splits and features
        print("# split    ", split)
        print("# features ", MyDataset[split].features)
        print()
#
    if not yesLocal:                         # specific to IMDB
        for split in MyDataset.keys():
            fileName = dataSetName + "." + split
            with open(fileName, 'w', encoding='utf-8') as f:
                for example in MyDataset[split]:
                    f.write(f"{example['label']:3d} {example['text']}\n")
#
    return MyDataset

#
# main
#
main_dataset = download_data()

# available datasets
if (1==2):
    from huggingface_hub import list_datasets
    with open("huggingface.sets", 'w') as f:
        [f.write(f"{dataset.id}\n") for dataset in list_datasets()]
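The indexing scheme myDict[split][row][feature] can be tried directly on the downloaded data. A minimal sketch, assuming the local copy "imdb_all" written by the script above already exists:

#!/usr/bin/env python3
# sketch: split/row/feature indexing on the locally stored IMDB dataset
from datasets import load_from_disk

MyDataset = load_from_disk("imdb_all")

print(MyDataset["train"][0]["label"])        # feature "label" of the first training row
print(MyDataset["train"][0]["text"][:80])    # first 80 characters of the review text
print(MyDataset["test"].num_rows)            # number of rows in the test split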
#!/usr/bin/env python3
# creating a private dataset using
# the standard format

from datasets import Dataset, DatasetDict, load_from_disk

# defining data for each split
train_data = [
    {'text': 'First training example',  'label': 0},
    {'text': 'Second training example', 'label': 1}
]
test_data = [
    {'text': 'First test example',  'label': 1},
    {'text': 'Second test example', 'label': 0}
]

# creating dataset objects for each split
# :: could be stored separately
train_dataset = Dataset.from_list(train_data)
test_dataset  = Dataset.from_list(test_data)

# DatasetDict combining splits
dataset_dict = DatasetDict(
    {'train': train_dataset,
     'test':  test_dataset}
)

# saving entire DatasetDict
dataset_dict.save_to_disk('./my_dataset')

# loading entire dataset
loaded_dataset = load_from_disk('./my_dataset')
print(loaded_dataset)
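For completeness, the same train split can also be built column-wise; a sketch using Dataset.from_dict as an alternative to the row-wise Dataset.from_list:

#!/usr/bin/env python3
# sketch: column-wise construction of the same two-example train split
from datasets import Dataset

train_dataset = Dataset.from_dict({
    'text':  ['First training example', 'Second training example'],
    'label': [0, 1]
})
print(train_dataset)
print(train_dataset[0])        # row access returns a dict of features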
#!/usr/bin/env python3
# generate and store delayed XOR time series
# x(t) = ( x(t-tau) + x(t-tau-1) ) % N

import random
from datasets import Dataset, DatasetDict, load_from_disk

class GP():
    "global parameters"
    tau      =  1        # delay
    N        = 10        # number of symbols
    nTrain   = 20        # number of training data
    nTest    = 10        # number of test data
    lenTrain =  7        # length of sequences
    lenTest  =  6
#

def oneStep_NT(x, N, delay):
    """returns next token (NT time series)
       on input: current sequence x"""
    length = len(x)
    pair = x[length-delay-1:length-delay+1]
    return sum(pair)%N

def oneSequence(length):
    """returns a time series of length 'length',
       starting with a random initial state"""
    sequence = random.sample(range(0, GP.N), GP.tau+1)
    for ll in range(length-GP.tau-1):
        sequence.append(oneStep_NT(sequence, GP.N, GP.tau))
    return sequence

def makeDataset():
    """returns the entire dataset object"""
    myType = "N" + str(GP.N) + "T" + str(GP.tau)
    # generate training data
    train_data = []
    for _ in range(GP.nTrain):
        newSeries = oneSequence(GP.lenTrain)
        newRow = {"type":myType, "series":newSeries}
        train_data.append(newRow)
    # generate test data
    test_data = []
    for _ in range(GP.nTest):
        newSeries = oneSequence(GP.lenTest)
        newRow = {"type":myType, "series":newSeries}
        test_data.append(newRow)
    # make Dataset / DatasetDict objects
    train_dataset = Dataset.from_list(train_data)
    test_dataset  = Dataset.from_list(test_data)
    dataset_dict = DatasetDict(
        {'train': train_dataset,
         'test':  test_dataset}
    )
    return dataset_dict

def testPrintingDataset(DS):
    print()
    print("# =============")
    print("# test printing")
    print("# =============")
    for split_name, split_dataset in DS.items():
        print(f"Split: {split_name}, Size: {len(split_dataset)}")
        for example in split_dataset.select(range(2)):
            print(example["series"])
        print()

#
# main
#
print(dataset_dict:=makeDataset())

# saving entire DatasetDict
dataset_dict.save_to_disk('./my_dataset')

# loading entire dataset
loaded_dataset = load_from_disk('./my_dataset')
# print(loaded_dataset)

# test printing
testPrintingDataset(loaded_dataset)
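A quick consistency check of the stored data; this sketch reloads the dataset and verifies the defining recursion, assuming the default parameters tau = 1 and N = 10 from the GP class:

#!/usr/bin/env python3
# sketch: verify that every stored sequence obeys
# x(t) = ( x(t-tau) + x(t-tau-1) ) % N, here with tau = 1 and N = 10
from datasets import load_from_disk

tau, N = 1, 10
loaded_dataset = load_from_disk('./my_dataset')

for example in loaded_dataset["train"]:
    s = example["series"]
    for t in range(tau+1, len(s)):                  # first tau+1 entries are random seeds
        assert s[t] == (s[t-tau] + s[t-tau-1]) % N
print("all training sequences consistent")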