#!/usr/bin/python3

# written by andrewt@unsw.edu.au as a COMP(2041|9044) example
# fetch and print the text of a web page
# using HTML parser BeautifulSoup

import re
import sys
import urllib.request

import bs4 as BeautifulSoup

# Text nodes whose direct parent is one of these elements are not printed.
IGNORE_WEBPAGE_ELEMENTS = set("[document] head meta style script title".split())

# A newline followed by any run of whitespace (which may include further
# newlines); compiled once because it is applied to every text node.
_NEWLINE_THEN_WHITESPACE = re.compile(r"\n\s+")


def _fetch(url: str) -> str:
    """Download *url* and return its body decoded to text.

    The charset announced in the response headers is used when there is
    one, otherwise UTF-8.  Bytes that cannot be decoded are replaced
    rather than aborting the whole run.

    Raises whatever ``urllib.request.urlopen`` raises for a bad or
    unreachable URL.
    """
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(url) as response:
        raw = response.read()
        charset = response.headers.get_content_charset() or "utf-8"
    try:
        return raw.decode(charset, errors="replace")
    except LookupError:
        # The server named an encoding Python does not know about.
        return raw.decode("utf-8", errors="replace")


def _clean(text: str) -> str:
    """Remove empty lines and leading whitespace, then strip both ends."""
    return _NEWLINE_THEN_WHITESPACE.sub("\n", text).strip()


def _page_text(webpage: str):
    """Yield each non-empty, cleaned text node of the HTML in *webpage*.

    Nodes directly inside an element listed in IGNORE_WEBPAGE_ELEMENTS
    are skipped.
    """
    soup = BeautifulSoup.BeautifulSoup(webpage, "html5lib")
    for element in soup.find_all(string=True):
        if element.parent.name.lower() in IGNORE_WEBPAGE_ELEMENTS:
            continue
        # Each result is already a string node, so its text is the node itself.
        text = _clean(str(element))
        if text:
            yield text


def main(urls) -> None:
    """Print the text of every web page whose URL is in *urls*."""
    for url in urls:
        for text in _page_text(_fetch(url)):
            print(text)


if __name__ == "__main__":
    main(sys.argv[1:])