Compare commits

..

6 Commits

Author SHA1 Message Date
pictuga 4a88886767 morss: get_page to act as a basic proxy (for iframes) 2020-04-04 16:37:15 +02:00
pictuga 1653394cf7 morss: cgi_dispatcher to be able to create extra functions 2020-04-04 16:35:16 +02:00
pictuga a8a90cf414 morss: move url/options parsing to own function
For future re-use
2020-04-04 16:33:52 +02:00
pictuga bdbaf0f8a7 morss/cgi: fix handling of special chars in url 2020-04-04 16:21:37 +02:00
pictuga d0e447a2a6 ItemFix: clean up Pocket links 2020-04-04 16:20:39 +02:00
pictuga e6817e01b4 sheet.xsl: set font to "sans"
Browsers don't all have the same default font. Overriding for consistency
2020-04-03 17:47:19 +02:00
4 changed files with 75 additions and 5 deletions

View File

@ -48,6 +48,7 @@ You do need:
- [python](http://www.python.org/) >= 2.6 (python 3 is supported) - [python](http://www.python.org/) >= 2.6 (python 3 is supported)
- [lxml](http://lxml.de/) for xml parsing - [lxml](http://lxml.de/) for xml parsing
- [bs4](https://pypi.org/project/bs4/) for badly-formatted html pages
- [dateutil](http://labix.org/python-dateutil) to parse feed dates - [dateutil](http://labix.org/python-dateutil) to parse feed dates
- [chardet](https://pypi.python.org/pypi/chardet) - [chardet](https://pypi.python.org/pypi/chardet)
- [six](https://pypi.python.org/pypi/six), a dependency of chardet - [six](https://pypi.python.org/pypi/six), a dependency of chardet

View File

@ -10,6 +10,7 @@ import re
import lxml.etree import lxml.etree
import lxml.html import lxml.html
from bs4 import BeautifulSoup
from . import feeds from . import feeds
from . import feedify from . import feedify
@ -25,13 +26,13 @@ try:
# python 2 # python 2
from Queue import Queue from Queue import Queue
from httplib import HTTPException from httplib import HTTPException
from urllib import quote_plus from urllib import unquote
from urlparse import urlparse, urljoin, parse_qs from urlparse import urlparse, urljoin, parse_qs
except ImportError: except ImportError:
# python 3 # python 3
from queue import Queue from queue import Queue
from http.client import HTTPException from http.client import HTTPException
from urllib.parse import quote_plus from urllib.parse import unquote
from urllib.parse import urlparse, urljoin, parse_qs from urllib.parse import urlparse, urljoin, parse_qs
LIM_ITEM = 100 # deletes what's beyond LIM_ITEM = 100 # deletes what's beyond
@ -170,6 +171,11 @@ def ItemFix(item, feedurl='/'):
item.link = parse_qs(urlparse(item.link).query)['url'][0] item.link = parse_qs(urlparse(item.link).query)['url'][0]
log(item.link) log(item.link)
# pocket
if fnmatch(item.link, 'https://getpocket.com/redirect?url=*'):
item.link = parse_qs(urlparse(item.link).query)['url'][0]
log(item.link)
# facebook # facebook
if fnmatch(item.link, 'https://www.facebook.com/l.php?u=*'): if fnmatch(item.link, 'https://www.facebook.com/l.php?u=*'):
item.link = parse_qs(urlparse(item.link).query)['u'][0] item.link = parse_qs(urlparse(item.link).query)['u'][0]
@ -503,8 +509,9 @@ def process(url, cache=None, options=None):
return FeedFormat(rss, options) return FeedFormat(rss, options)
def cgi_app(environ, start_response): def cgi_parse_environ(environ):
# get options # get options
if 'REQUEST_URI' in environ: if 'REQUEST_URI' in environ:
url = environ['REQUEST_URI'][1:] url = environ['REQUEST_URI'][1:]
else: else:
@ -518,7 +525,7 @@ def cgi_app(environ, start_response):
if url.startswith(':'): if url.startswith(':'):
split = url.split('/', 1) split = url.split('/', 1)
raw_options = split[0].replace('|', '/').replace('\\\'', '\'').split(':')[1:] raw_options = unquote(split[0]).replace('|', '/').replace('\\\'', '\'').split(':')[1:]
if len(split) > 1: if len(split) > 1:
url = split[1] url = split[1]
@ -530,11 +537,18 @@ def cgi_app(environ, start_response):
# init # init
options = Options(filterOptions(parseOptions(raw_options))) options = Options(filterOptions(parseOptions(raw_options)))
headers = {}
global DEBUG global DEBUG
DEBUG = options.debug DEBUG = options.debug
return (url, options)
def cgi_app(environ, start_response):
url, options = cgi_parse_environ(environ)
headers = {}
# headers # headers
headers['status'] = '200 OK' headers['status'] = '200 OK'
headers['cache-control'] = 'max-age=%s' % DELAY headers['cache-control'] = 'max-age=%s' % DELAY
@ -638,6 +652,57 @@ def cgi_file_handler(environ, start_response, app):
return app(environ, start_response) return app(environ, start_response)
def cgi_page(environ, start_response):
    """Fetch the requested page and serve a cleaned-up copy of it.

    Acts as a basic proxy (intended for iframes): the target url is taken
    from the request via `cgi_parse_environ`, downloaded, and, when it is an
    HTML/XML document, re-serialised with absolute links and without any
    `script`, `iframe` or `noscript` element.

    environ: WSGI environment of the incoming request
    start_response: WSGI callable used to send the status and headers

    Returns a one-item list holding the utf-8 encoded document, or `None`
    when the remote content type is not one of the handled ones.
    """

    # also sets the global DEBUG flag as a side effect
    url, options = cgi_parse_environ(environ)

    # get page
    PROTOCOL = ['http', 'https']

    if urlparse(url).scheme not in PROTOCOL:
        # scheme-less urls are assumed to be plain http
        url = 'http://' + url

    con = crawler.custom_handler().open(url)

    try:
        data = con.read()

        # drop parameters such as "; charset=utf-8"
        contenttype = con.info().get('Content-Type', '').split(';')[0]

    finally:
        # do not leak the connection, even if reading fails
        con.close()

    if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
        # bs4 is used first to cope with badly-formatted html pages
        html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())
        html.make_links_absolute(url)

        kill_tags = ['script', 'iframe', 'noscript']

        for tag in kill_tags:
            for elem in html.xpath('//'+tag):
                elem.getparent().remove(elem)

        output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')

    else:
        # NOTE(review): non-HTML responses yield a [None] body, still
        # announced as text/html -- confirm a later middleware copes with it
        output = None

    # return html page
    headers = {'status': '200 OK', 'content-type': 'text/html'}
    start_response(headers['status'], list(headers.items()))
    return [output]
# Maps an option name to the CGI handler that serves requests carrying it
dispatch_table = {'getpage': cgi_page}
@middleware
def cgi_dispatcher(environ, start_response, app):
    """Route the request to an extra handler when one of its options asks for it.

    The options parsed from the request are checked against the keys of
    `dispatch_table`; the handler of the first matching key serves the
    request. Without any match, the wrapped `app` is called as usual.
    """

    # only the options matter here, the url is re-parsed by the handler
    _, options = cgi_parse_environ(environ)

    for name, handler in dispatch_table.items():
        if name in options:
            return handler(environ, start_response)

    return app(environ, start_response)
@middleware @middleware
def cgi_error_handler(environ, start_response, app): def cgi_error_handler(environ, start_response, app):
try: try:
@ -693,6 +758,7 @@ def main():
# mod_cgi # mod_cgi
app = cgi_app app = cgi_app
app = cgi_dispatcher(app)
app = cgi_error_handler(app) app = cgi_error_handler(app)
app = cgi_encode(app) app = cgi_encode(app)
@ -714,6 +780,7 @@ def main():
app = cgi_app app = cgi_app
app = cgi_file_handler(app) app = cgi_file_handler(app)
app = cgi_dispatcher(app)
app = cgi_error_handler(app) app = cgi_error_handler(app)
app = cgi_encode(app) app = cgi_encode(app)

View File

@ -1,4 +1,5 @@
lxml lxml
bs4
python-dateutil <= 1.5 python-dateutil <= 1.5
chardet chardet
pymysql pymysql

View File

@ -13,6 +13,7 @@
body { body {
overflow-wrap: anywhere; overflow-wrap: anywhere;
word-wrap: anywhere; word-wrap: anywhere;
font-family: sans;
} }
#url { #url {