#!/usr/bin/env python3

# text preprocessing/cleaning/parsing
# pip install beautifulsoup4 lxml spacy urlextract
# python -m spacy download en_core_web_sm

import spacy
from spacy import displacy
from urlextract import URLExtract
import re                                 # regular expressions
from bs4 import BeautifulSoup

def html_handling(inputText):
  """Strip HTML markup and return the remaining plain text.

  The input is parsed by BeautifulSoup with the 'lxml' parser; the
  result is whatever get_text() yields for the parsed document.
  """
  return BeautifulSoup(inputText, 'lxml').get_text()


def hashtag_handling(inputText, hashToken="HASH"):
  """Substitute every hashtag by a placeholder token.

  A hashtag is a '#' directly followed by one or more word characters.

  Args:
    inputText: text to be cleaned.
    hashToken: string inserted in place of each hashtag.

  Returns:
    A tuple (text, tags): the text with all hashtags substituted, and
    the list of hashtags in order of appearance (duplicates included).
  """
  hashtagPattern = re.compile(r"#\w+")
  inputTags = hashtagPattern.findall(inputText)
  # Substitute at the matched positions. A plain str.replace() per tag
  # would also hit the beginning of longer tags sharing the same prefix
  # ("#nobel" inside "#nobelPrize"), leaving fragments behind.
  # The callable keeps backslashes in hashToken literal.
  inputText = hashtagPattern.sub(lambda _match: hashToken, inputText)
  return inputText, inputTags


def email_handling(inputText, mailToken="EMAIL"):
  """Substitute every email address by a placeholder token.

  Addresses are matched case-insensitively, so that upper-case letters
  do not lead to partially masked or missed addresses.

  Args:
    inputText: text to be cleaned.
    mailToken: string inserted in place of each address.

  Returns:
    A tuple (text, mails): the text with all addresses substituted, and
    the list of addresses in order of appearance (duplicates included).
  """
  mailPattern = re.compile(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+",
                           re.IGNORECASE)
  inputMails = mailPattern.findall(inputText)
  # Substitute at the matched positions. A plain str.replace() per
  # address would also hit the tail of longer addresses ("a@b.com"
  # inside "aa@b.com"), leaving fragments behind.
  # The callable keeps backslashes in mailToken literal.
  inputText = mailPattern.sub(lambda _match: mailToken, inputText)
  return inputText, inputMails


def URL_handling(inputText, URLtoken="URL"):
  """Substitute all links in the input text by a given token.

  Links are detected with URLExtract.find_urls().

  Args:
    inputText: text to be cleaned.
    URLtoken: string inserted in place of each link.

  Returns:
    A tuple (text, links): the text with the links substituted, and the
    list of links exactly as returned by find_urls().
  """
  extractor = URLExtract()
  inputLinks = extractor.find_urls(inputText)
  # Replace the longest links first. In discovery order a short link
  # ("example.com") would overwrite the start of a longer one
  # ("example.com/docs"), and the remainder ("/docs") would stay in the text.
  for link in sorted(set(inputLinks), key=len, reverse=True):
    inputText = inputText.replace(link, URLtoken)
  return inputText, inputLinks


_SPACY_PIPELINES = {}   # cache: model name -> loaded spaCy pipeline


def _get_spacy_pipeline(modelName):
  "loads a spaCy pipeline on first use and reuses it afterwards"
  if modelName not in _SPACY_PIPELINES:
    _SPACY_PIPELINES[modelName] = spacy.load(modelName)
  return _SPACY_PIPELINES[modelName]


def tokenSpacy(inputText):
  """Tokenize with spaCy and mask person names found by NER.

  The tokens of the input are joined by single blanks, and every
  occurrence of a name that spaCy labelled PERSON is substituted by
  the string "PERSON".

  Args:
    inputText: text to be processed.

  Returns:
    A single-line string; tokens are separated by exactly one blank.
  """
  NER = _get_spacy_pipeline("en_core_web_sm")   # loaded once, not per call
  textSpacy = NER(inputText)

  # split()/join folds line breaks and repeated blanks into single blanks
  cleanText = " ".join(" ".join(token.text for token in textSpacy).split())

  # Names are rebuilt from their tokens, the same way as cleanText.
  # The raw entity text keeps the original spacing ("Mary-Jane"), which
  # differs from the token-joined form ("Mary - Jane") and would not be found.
  personNames = set()
  for entity in textSpacy.ents:
    if entity.label_ == "PERSON":                 # no real names
      name = " ".join(" ".join(token.text for token in entity).split())
      if name:
        personNames.add(name)
  if not personNames:
    return cleanText

  # One pass over the text, longest names first, so that a short name
  # ("Hinton") cannot break up a longer one ("Geoff Hinton"). The
  # lookarounds prevent hits inside other words ("Al" in "Also").
  alternatives = "|".join(re.escape(name)
                          for name in sorted(personNames, key=len, reverse=True))
  personPattern = re.compile(r"(?<!\w)(?:" + alternatives + r")(?!\w)")
  return personPattern.sub("PERSON", cleanText)

#
# main
#

# Demo input containing hashtags, person names, an HTML tag, a link
# and an email address.
rawText = """In 2024, the physics Nobel prize #nobelPrize
             was awarded to Geoff Hinton #geoffHinton and 
             John Hopfield #johnHopfield. <br>For more 
             information see https://www.nobelprize.org/, 
             or journalist@newspaper.universe."""
if 1 == 2:   # manual switch (never true as written): input with '<' and '>'
  rawText = """It follows from x<y that y>x."""
print(rawText, "\n")

# step 1: remove markup
rawText = html_handling(rawText)
print(rawText, "\n")

# steps 2-4: mask email addresses, hashtags and links (in this order);
# the lists of found items are discarded, the text is printed after each step
for handler in (email_handling, hashtag_handling, URL_handling):
  rawText, _ = handler(rawText)
  print(rawText, "\n")

# step 5: tokenize and mask person names
cleanText = tokenSpacy(rawText)
print(cleanText)
