morss: get_page to act as a basic proxy (for iframes)
This commit is contained in:
		@@ -48,6 +48,7 @@ You do need:
 | 
			
		||||
 | 
			
		||||
- [python](http://www.python.org/) >= 2.6 (python 3 is supported)
 | 
			
		||||
- [lxml](http://lxml.de/) for xml parsing
 | 
			
		||||
- [bs4](https://pypi.org/project/bs4/) for badly-formatted html pages
 | 
			
		||||
- [dateutil](http://labix.org/python-dateutil) to parse feed dates
 | 
			
		||||
- [chardet](https://pypi.python.org/pypi/chardet) for character-encoding detection
 | 
			
		||||
- [six](https://pypi.python.org/pypi/six), a dependency of chardet
 | 
			
		||||
 
 | 
			
		||||
@@ -10,6 +10,7 @@ import re
 | 
			
		||||
 | 
			
		||||
import lxml.etree
 | 
			
		||||
import lxml.html
 | 
			
		||||
from bs4 import BeautifulSoup
 | 
			
		||||
 | 
			
		||||
from . import feeds
 | 
			
		||||
from . import feedify
 | 
			
		||||
@@ -651,7 +652,43 @@ def cgi_file_handler(environ, start_response, app):
 | 
			
		||||
        return app(environ, start_response)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def cgi_page(environ, start_response):
    """WSGI handler acting as a basic proxy (e.g. for iframes).

    Fetches the URL taken from the request, strips tags that could run
    code or embed other pages from (x)html content, and returns the
    cleaned page as a 'text/html' response.

    Parameters:
        environ: WSGI environ dict; the target url is parsed out of it.
        start_response: WSGI start_response callable.

    Returns:
        A one-element list holding the response body as bytes.
    """
    url, options = cgi_parse_environ(environ)

    # get page; only plain http(s) is proxied
    PROTOCOL = ['http', 'https']

    # use the constant instead of repeating the literal list (it was
    # previously defined but unused)
    if urlparse(url).scheme not in PROTOCOL:
        url = 'http://' + url

    con = crawler.custom_handler().open(url)

    try:
        data = con.read()
        contenttype = con.info().get('Content-Type', '').split(';')[0]
    finally:
        # close the connection even if reading fails (was leaked before)
        con.close()

    if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
        # parse via bs4 first to survive badly-formed html, then hand the
        # prettified markup to lxml for manipulation
        html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())
        html.make_links_absolute(url)

        # drop tags that could execute code or nest further frames
        kill_tags = ['script', 'iframe', 'noscript']

        for tag in kill_tags:
            for elem in html.xpath('//' + tag):
                elem.getparent().remove(elem)

        output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')

    else:
        # non-(x)html content: return an empty body — yielding None from a
        # WSGI iterable is invalid (body items must be bytes)
        output = b''

    # return html page
    headers = {'status': '200 OK', 'content-type': 'text/html'}
    start_response(headers['status'], list(headers.items()))
    return [output]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# maps the action name extracted from the request onto its WSGI handler
dispatch_table = dict(
    getpage=cgi_page,
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1,4 +1,5 @@
 | 
			
		||||
lxml
 | 
			
		||||
bs4
 | 
			
		||||
python-dateutil <= 1.5
 | 
			
		||||
chardet
 | 
			
		||||
pymysql
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user