diff --git a/README.md b/README.md
index 7b36697..aa35f68 100644
--- a/README.md
+++ b/README.md
@@ -48,6 +48,7 @@ You do need:
 
 - [python](http://www.python.org/) >= 2.6 (python 3 is supported)
 - [lxml](http://lxml.de/) for xml parsing
+- [bs4](https://pypi.org/project/bs4/) to parse badly-formatted html pages
 - [dateutil](http://labix.org/python-dateutil) to parse feed dates
 - [chardet](https://pypi.python.org/pypi/chardet)
 - [six](https://pypi.python.org/pypi/six), a dependency of chardet
diff --git a/morss/morss.py b/morss/morss.py
index 37a7185..8526320 100644
--- a/morss/morss.py
+++ b/morss/morss.py
@@ -10,6 +10,7 @@ import re
 
 import lxml.etree
 import lxml.html
+from bs4 import BeautifulSoup
 
 from . import feeds
 from . import feedify
@@ -651,7 +652,45 @@ def cgi_file_handler(environ, start_response, app):
 
     return app(environ, start_response)
 
 
+def cgi_page(environ, start_response):
+    url, options = cgi_parse_environ(environ)
+
+    # fetch the page, defaulting to http:// when no scheme is given
+    PROTOCOL = ['http', 'https']
+
+    if urlparse(url).scheme not in PROTOCOL:
+        url = 'http://' + url
+
+    con = crawler.custom_handler().open(url)
+    data = con.read()
+
+    contenttype = con.info().get('Content-Type', '').split(';')[0]
+
+    if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
+        # let BeautifulSoup repair the tag soup before handing it to lxml
+        html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())
+        html.make_links_absolute(url)
+
+        # strip active/embedded content
+        kill_tags = ['script', 'iframe', 'noscript']
+
+        for tag in kill_tags:
+            for elem in html.xpath('//' + tag):
+                elem.getparent().remove(elem)
+
+        output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')
+
+    else:
+        # not html: pass the raw bytes through (None would break WSGI)
+        output = data
+
+    # return html page
+    start_response('200 OK', [('content-type', 'text/html')])
+    return [output]
+
+
 dispatch_table = {
+    'getpage': cgi_page
 }
 
diff --git a/requirements.txt b/requirements.txt
index f569843..24a03c2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 lxml
+bs4
 python-dateutil <= 1.5
 chardet
 pymysql
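
The heart of the patch is the BeautifulSoup-to-lxml round trip in `cgi_page`: BeautifulSoup (using its lxml tree builder) repairs broken markup that `lxml.html` alone would mis-parse, and `prettify()` then hands lxml a well-formed document, so the usual lxml API (link rewriting, XPath) still applies afterwards. Below is a standalone sketch of that pipeline, outside the WSGI plumbing; the sample markup and the `http://example.com/` base URL are made up for illustration, only the round trip itself mirrors the patch.

```python
import lxml.etree
import lxml.html
from bs4 import BeautifulSoup

# deliberately broken html: unclosed <p> and <a>, plus a stray script
data = b"<html><body><p>hello <a href='/page'>link<script>alert(1)</script>"

# BeautifulSoup repairs the tag soup; prettify() re-serialises it as
# well-formed markup that lxml can parse reliably
html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())

# resolve relative links against the page url, as cgi_page does
html.make_links_absolute('http://example.com/')

# drop active/embedded content, same kill list as the patch
for tag in ['script', 'iframe', 'noscript']:
    for elem in html.xpath('//' + tag):
        elem.getparent().remove(elem)

print(lxml.etree.tostring(html.getroottree(), encoding='utf-8').decode('utf-8'))
```

Parsing twice costs some speed, but it keeps the fast lxml API for everything after the cleanup, which is presumably why the patch prettifies back into lxml instead of staying inside BeautifulSoup.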
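
For an end-to-end check of the new handler, a minimal WSGI smoke test could look like the sketch below. It assumes a local checkout with this patch applied, that `cgi_parse_environ` takes the target URL from `PATH_INFO` the way the other morss CGI entry points do, and that the machine has network access; all three are assumptions, not part of the diff.

```python
from wsgiref.util import setup_testing_defaults

from morss.morss import cgi_page  # assumes this patch is applied locally

environ = {}
setup_testing_defaults(environ)
# assumption: cgi_parse_environ reads the target url from PATH_INFO
environ['PATH_INFO'] = '/http://example.com/'

sent = {}

def start_response(status, headers):
    # record what the handler sends so it can be inspected below
    sent['status'] = status
    sent['headers'] = headers

body = cgi_page(environ, start_response)  # needs network access

print(sent['status'])    # expected: 200 OK
print(body[0][:120])     # start of the cleaned-up, absolutised html
```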