#!/usr/bin/env python3

# pip install -U pip setuptools wheel
# pip install -U spacy
# python -m spacy download en_core_web_sm
#                          small English model

import spacy
from spacy import displacy

def tokenSpacy(inputText):
  """Named Entity Recognition (NER) with spacy.

  Runs the "en_core_web_sm" pipeline on the text and prints, in order:
  a per-token table (text, pos_, tag_, dep_, is_alpha / is_stop flags),
  the recognised entities with their labels, and the token-joined text in
  which every PERSON entity has been replaced by the literal "PERSON".
  Finally the dependency parse is rendered with displaCy and written to
  "foo.svg" in the current working directory.

  Args:
    inputText: the raw text to analyse.

  Returns:
    None; all results are printed or written to disk.
  """
  # NOTE: the model is (re)loaded on every call.
  NER = spacy.load("en_core_web_sm")
  textSpacy = NER(inputText)

  print("# ===============================")
  for token in textSpacy:
    # the booleans are formatted with ':b', i.e. shown as 0 / 1
    print(f'{token.text:10s} | {token.pos_:10s} | {token.tag_:10s} | {token.dep_:10s} | ',   f'{token.is_alpha:b}  {token.is_stop:b}')
  # Every token is prefixed by one blank, so the result starts with a space.
  cleanText = "".join(" " + token.text for token in textSpacy)
  print("# ===============================")
  for word in textSpacy.ents:
    print(f'{word.text:15s} {word.label_:12s}')
    if word.label_ == "PERSON":                               # no real names
      cleanText = cleanText.replace(word.text, word.label_)
  print("# ===============================")
  print(cleanText)
  print("# ===============================")
  # jupyter=False forces displaCy to return the markup as a string; with
  # the default auto-detection it returns None inside a notebook, which
  # would make the write below fail.
  svgIm = displacy.render(textSpacy, style="dep", jupyter=False)  # visualization
  # Context manager closes the handle; explicit encoding because the
  # rendered text may contain non-ASCII characters.
  with open("foo.svg", "w", encoding="utf-8") as svgFile:     # export svg file
    svgFile.write(svgIm)


def tokenSimple(inputText):
  """Clean and tokenize a given input text by hand and print the result.

  Punctuation, brackets, a set of special symbols and every digit are
  surrounded by blanks so that each becomes a token of its own; the text
  is then split on whitespace.

  Prints the cleaned text (tokens joined by single blanks) followed by the
  vocabulary, i.e. the distinct tokens in sorted order.

  Args:
    inputText: the raw text to tokenize.

  Returns:
    None; the results are only printed.
  """
  # Characters that are padded with a blank on both sides.
  addSpace = (
      ".,;:!?'\"\\/"
      "()[]{}<>|"
      "@#$%^&*=_"
      "0123456789"
  )
  # Characters that are replaced by a single blank.
  toSpace = "\n"

  # One translation pass replaces the chain of str.replace calls; the
  # result is the same because the padded characters are all distinct and
  # the padding itself only introduces blanks.
  table = {ord(ch): " " + ch + " " for ch in addSpace}
  table.update({ord(ch): " " for ch in toSpace})
  inputText = inputText.translate(table)

  inputList = inputText.split()
  cleanText = " ".join(inputList)
  # Sorted so the printed vocabulary is reproducible; plain set order for
  # strings changes from run to run with hash randomisation.
  vocabulary = sorted(set(inputList))

  print("# -------------------------------")
  print(cleanText)
  print("# -------------------------------")
  print(vocabulary)
  print("# -------------------------------")

#
# main
#

# Sample text for the demo; it contains line breaks, a year, two person
# names, brackets and a currency amount.
rawText = """In 2024, the physics Nobel prize was 
             awarded to \nGeoff Hinton and John Hopfield 
             (did they really receive $1M each?)."""

# Run the demo only when executed as a script, so that importing this
# module does not load the spaCy model or write foo.svg as a side effect.
if __name__ == "__main__":
  tokenSpacy(rawText)
  tokenSimple(rawText)
