Compare commits
6 Commits: 7c3091d64c ... 4a88886767
| Author | SHA1 | Date |
|---|---|---|
| pictuga | 4a88886767 | |
| pictuga | 1653394cf7 | |
| pictuga | a8a90cf414 | |
| pictuga | bdbaf0f8a7 | |
| pictuga | d0e447a2a6 | |
| pictuga | e6817e01b4 | |
```diff
@@ -48,6 +48,7 @@ You do need:
 
 - [python](http://www.python.org/) >= 2.6 (python 3 is supported)
 - [lxml](http://lxml.de/) for xml parsing
+- [bs4](https://pypi.org/project/bs4/) for badly-formatted html pages
 - [dateutil](http://labix.org/python-dateutil) to parse feed dates
 - [chardet](https://pypi.python.org/pypi/chardet)
 - [six](https://pypi.python.org/pypi/six), a dependency of chardet
```
```diff
@@ -10,6 +10,7 @@ import re
 
 import lxml.etree
 import lxml.html
+from bs4 import BeautifulSoup
 
 from . import feeds
 from . import feedify
```
```diff
@@ -25,13 +26,13 @@ try:
     # python 2
     from Queue import Queue
     from httplib import HTTPException
     from urllib import quote_plus
+    from urllib import unquote
     from urlparse import urlparse, urljoin, parse_qs
 except ImportError:
     # python 3
     from queue import Queue
     from http.client import HTTPException
     from urllib.parse import quote_plus
+    from urllib.parse import unquote
     from urllib.parse import urlparse, urljoin, parse_qs
 
 LIM_ITEM = 100 # deletes what's beyond
```
```diff
@@ -170,6 +171,11 @@ def ItemFix(item, feedurl='/'):
         item.link = parse_qs(urlparse(item.link).query)['url'][0]
         log(item.link)
 
+    # pocket
+    if fnmatch(item.link, 'https://getpocket.com/redirect?url=*'):
+        item.link = parse_qs(urlparse(item.link).query)['url'][0]
+        log(item.link)
+
     # facebook
     if fnmatch(item.link, 'https://www.facebook.com/l.php?u=*'):
         item.link = parse_qs(urlparse(item.link).query)['u'][0]
```
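The pocket rule added here mirrors the existing google and facebook ones: the service wraps the real article URL in a query parameter, and `parse_qs` recovers it. A standalone sketch of the technique (the Pocket URL below is made up for illustration):

```python
try:
    from urlparse import urlparse, parse_qs  # python 2
except ImportError:
    from urllib.parse import urlparse, parse_qs  # python 3

# hypothetical Pocket redirect wrapping the real article URL
link = 'https://getpocket.com/redirect?url=https%3A%2F%2Fexample.com%2Farticle'

# parse_qs percent-decodes the query string and maps each key to a list
# of values; the wrapped target sits under 'url' ('u' for facebook's l.php)
target = parse_qs(urlparse(link).query)['url'][0]
print(target)  # https://example.com/article
```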
```diff
@@ -503,8 +509,9 @@ def process(url, cache=None, options=None):
     return FeedFormat(rss, options)
 
 
-def cgi_app(environ, start_response):
+def cgi_parse_environ(environ):
     # get options
 
     if 'REQUEST_URI' in environ:
         url = environ['REQUEST_URI'][1:]
     else:
```
```diff
@@ -518,7 +525,7 @@ def cgi_app(environ, start_response):
     if url.startswith(':'):
         split = url.split('/', 1)
 
-        raw_options = split[0].replace('|', '/').replace('\\\'', '\'').split(':')[1:]
+        raw_options = unquote(split[0]).replace('|', '/').replace('\\\'', '\'').split(':')[1:]
 
         if len(split) > 1:
             url = split[1]
```
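The only change in this hunk is wrapping `split[0]` in `unquote()`, so the options survive percent-encoding by browsers or proxies. A rough sketch of the difference (the option names are made up):

```python
try:
    from urllib import unquote  # python 2
except ImportError:
    from urllib.parse import unquote  # python 3

# a ':' percent-encoded as '%3A' somewhere along the way
url = ':json%3Acors/http://example.com/feed'
split = url.split('/', 1)

# without unquote(), the encoded ':' would not split into two options
raw_options = unquote(split[0]).replace('|', '/').split(':')[1:]
print(raw_options)  # ['json', 'cors']
print(split[1])     # http://example.com/feed
```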
```diff
@@ -530,11 +537,18 @@
 
     # init
     options = Options(filterOptions(parseOptions(raw_options)))
-    headers = {}
 
     global DEBUG
     DEBUG = options.debug
 
+    return (url, options)
+
+
+def cgi_app(environ, start_response):
+    url, options = cgi_parse_environ(environ)
+
+    headers = {}
+
     # headers
     headers['status'] = '200 OK'
     headers['cache-control'] = 'max-age=%s' % DELAY
```
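Moving the option/URL parsing into `cgi_parse_environ` lets the new `cgi_page` and `cgi_dispatcher` below reuse it, and makes it testable on its own. Roughly like this (the import path and environ values are assumptions, not from this diff):

```python
from morss.morss import cgi_parse_environ  # assumed module layout

# minimal hand-built WSGI environ; real requests carry many more keys
environ = {'REQUEST_URI': '/:json/https://example.com/feed'}

url, options = cgi_parse_environ(environ)
# expected, roughly: url == 'https://example.com/feed'
# and options has 'json' set
```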
```diff
@@ -638,6 +652,57 @@ def cgi_file_handler(environ, start_response, app):
     return app(environ, start_response)
 
 
+def cgi_page(environ, start_response):
+    url, options = cgi_parse_environ(environ)
+
+    # get page
+    PROTOCOL = ['http', 'https']
+
+    if urlparse(url).scheme not in ['http', 'https']:
+        url = 'http://' + url
+
+    con = crawler.custom_handler().open(url)
+    data = con.read()
+
+    contenttype = con.info().get('Content-Type', '').split(';')[0]
+
+    if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
+        html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())
+        html.make_links_absolute(url)
+
+        kill_tags = ['script', 'iframe', 'noscript']
+
+        for tag in kill_tags:
+            for elem in html.xpath('//'+tag):
+                elem.getparent().remove(elem)
+
+        output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')
+
+    else:
+        output = None
+
+    # return html page
+    headers = {'status': '200 OK', 'content-type': 'text/html'}
+    start_response(headers['status'], list(headers.items()))
+    return [output]
+
+
+dispatch_table = {
+    'getpage': cgi_page
+}
+
+
+@middleware
+def cgi_dispatcher(environ, start_response, app):
+    url, options = cgi_parse_environ(environ)
+
+    for key in dispatch_table.keys():
+        if key in options:
+            return dispatch_table[key](environ, start_response)
+
+    return app(environ, start_response)
+
+
 @middleware
 def cgi_error_handler(environ, start_response, app):
     try:
```
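`cgi_page` relies on a bs4-to-lxml round trip: BeautifulSoup's lenient parser absorbs broken markup, `prettify()` re-serializes it as well-formed HTML, and lxml then handles the link rewriting and tag removal. A self-contained sketch of that pipeline:

```python
import lxml.etree
import lxml.html
from bs4 import BeautifulSoup

# deliberately broken markup, standing in for a scraped page
data = b'<p>unclosed paragraph <script>alert(1)</script><iframe src="x">'

# BeautifulSoup tolerates the breakage; prettify() emits well-formed HTML
html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())
html.make_links_absolute('http://example.com/')

# same clean-up as cgi_page: strip active or embedded content
for tag in ['script', 'iframe', 'noscript']:
    for elem in html.xpath('//' + tag):
        elem.getparent().remove(elem)

print(lxml.etree.tostring(html.getroottree(), encoding='utf-8'))
```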
```diff
@@ -693,6 +758,7 @@ def main():
         # mod_cgi
 
         app = cgi_app
+        app = cgi_dispatcher(app)
         app = cgi_error_handler(app)
         app = cgi_encode(app)
 
```
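The `@middleware` decorator itself is defined elsewhere in morss.py and is not part of this diff. For the chaining above to work, it presumably curries the `app` argument, along these lines (a reconstruction, not the verbatim implementation):

```python
def middleware(func):
    # turns func(environ, start_response, app) into a WSGI middleware factory
    def app_builder(app):
        def app_wrap(environ, start_response):
            # each layer handles the request itself or defers to `app`
            return func(environ, start_response, app)
        return app_wrap
    return app_builder
```

With that shape, each `app = cgi_dispatcher(app)` line wraps the previous app: `cgi_dispatcher` either routes the request to a `dispatch_table` entry (here `getpage` → `cgi_page`) or passes it through unchanged.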
```diff
@@ -714,6 +780,7 @@ def main():
 
         app = cgi_app
         app = cgi_file_handler(app)
+        app = cgi_dispatcher(app)
         app = cgi_error_handler(app)
         app = cgi_encode(app)
 
```
```diff
@@ -1,4 +1,5 @@
 lxml
+bs4
 python-dateutil <= 1.5
 chardet
 pymysql
```
```diff
@@ -13,6 +13,7 @@
 body {
     overflow-wrap: anywhere;
+    word-wrap: anywhere;
     font-family: sans;
 }
 
 #url {
```