#!/usr/bin/python3
# written by andrewt@unsw.edu.au as a COMP(2041|9044) example

# fetch and print the text of a web page
# using HTML parser BeautifulSoup

import re
import sys
import urllib.request
import bs4 as BeautifulSoup

# text whose parent element has one of these names is skipped, not printed
IGNORE_WEBPAGE_ELEMENTS = {
    "[document]",
    "head",
    "meta",
    "style",
    "script",
    "title",
}

# a newline followed by any run of whitespace (blank lines, indentation);
# compiled once because it is applied to every text node of every page
NEWLINE_THEN_WHITESPACE = re.compile(r"\n\s+")

# fetch each URL given as a command-line argument and print the text it
# contains, skipping text that sits directly inside an ignored element
for url in sys.argv[1:]:
    # the with-statement closes the connection even if reading fails
    with urllib.request.urlopen(url) as response:
        raw_page = response.read()
        # use the charset declared by the server, if any, instead of
        # assuming every page is UTF-8
        charset = response.headers.get_content_charset() or "utf-8"
    try:
        # replace undecodable bytes rather than abandoning the whole page
        webpage = raw_page.decode(charset, errors="replace")
    except LookupError:
        # the server declared an encoding name Python does not know
        webpage = raw_page.decode("utf-8", errors="replace")

    soup = BeautifulSoup.BeautifulSoup(webpage, "html5lib")
    # string=True selects every text node in the parsed document
    for element in soup.find_all(string=True):
        parent = element.parent.name.lower()
        if parent in IGNORE_WEBPAGE_ELEMENTS:
            continue
        # remove empty lines and leading whitespace
        text = NEWLINE_THEN_WHITESPACE.sub("\n", element)
        text = text.strip()
        if text:
            print(text)