Compare commits

..

8 Commits

Author SHA1 Message Date
pictuga 9e7b9d95ee feeds: properly use html template 2020-04-09 20:00:51 +02:00
pictuga 987a719c4e feeds: try all parsers regardless of contenttype
It turns out that some websites send the wrong content type (JSON for HTML, HTML for XML, etc.)
2020-04-09 19:17:51 +02:00
pictuga 47b33f4baa morss: specify server output encoding 2020-04-09 19:10:45 +02:00
pictuga 3c7f512583 feeds: handle several errors 2020-04-09 19:09:10 +02:00
pictuga a32f5a8536 readabilite: add debug option (also used by :get) 2020-04-09 19:08:13 +02:00
pictuga 63a06524b7 morss: various encoding fixes 2020-04-09 19:06:51 +02:00
pictuga b0f80c6d3c morss: fix csv output encoding 2020-04-09 19:05:50 +02:00
pictuga 78cea10ead morss: replace :getpage with :get
Also provides readabilite debugging
2020-04-09 18:43:20 +02:00
5 changed files with 66 additions and 62 deletions

View File

@ -99,7 +99,7 @@ item_link = ./a/@href
item_desc = ./div[class=desc] item_desc = ./div[class=desc]
item_content = ./div[class=content] item_content = ./div[class=content]
base = <!DOCTYPE html> <html> <head> <title>Feed reader by morss</title> <meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" /> </head> <body> <div id="header"> <h1>@feed.title</h1> <h2>@feed.desc</h2> <p>- via morss</p> </div> <div id="content"> <div class="item"> <a class="title link" href="@item.link" target="_blank">@item.title</a> <div class="desc">@item.desc</div> <div class="content">@item.content</div> </div> </div> <script> var items = document.getElementsByClassName('item') for (var i in items) items[i].onclick = function() { this.classList.toggle('active') document.body.classList.toggle('noscroll') } </script> </body> </html> base = file:reader.html.template
[twitter] [twitter]
mode = html mode = html

View File

@ -47,7 +47,11 @@ def parse_rules(filename=None):
for section in rules.keys(): for section in rules.keys():
for arg in rules[section].keys(): for arg in rules[section].keys():
if '\n' in rules[section][arg]: if rules[section][arg].startswith('file:'):
import_file = os.path.join(os.path.dirname(__file__), rules[section][arg][5:])
rules[section][arg] = open(import_file).read()
elif '\n' in rules[section][arg]:
rules[section][arg] = rules[section][arg].split('\n')[1:] rules[section][arg] = rules[section][arg].split('\n')[1:]
return rules return rules
@ -69,19 +73,13 @@ def parse(data, url=None, mimetype=None, encoding=None):
parser = [x for x in parsers if x.mode == ruleset['mode']][0] parser = [x for x in parsers if x.mode == ruleset['mode']][0]
return parser(data, ruleset, encoding=encoding) return parser(data, ruleset, encoding=encoding)
# 2) Look for a parser based on mimetype # 2) Try each and every parser
if mimetype is not None:
parser_candidates = [x for x in parsers if mimetype in x.mimetype]
if mimetype is None or len(parser_candidates) == 0:
parser_candidates = parsers
# 3) Look for working ruleset for given parser # 3) Look for working ruleset for given parser
# 3a) See if parsing works # 3a) See if parsing works
# 3b) See if .items matches anything # 3b) See if .items matches anything
for parser in parser_candidates: for parser in parsers:
ruleset_candidates = [x for x in rulesets.values() if x['mode'] == parser.mode and 'path' not in x] ruleset_candidates = [x for x in rulesets.values() if x['mode'] == parser.mode and 'path' not in x]
# 'path' as they should have been caught beforehand # 'path' as they should have been caught beforehand
@ -150,15 +148,15 @@ class ParserBase(object):
c = csv.writer(out, dialect=csv.excel) c = csv.writer(out, dialect=csv.excel)
for item in self.items: for item in self.items:
row = [getattr(item, x) for x in item.dic] c.writerow([getattr(item, x) for x in item.dic])
if encoding != 'unicode':
row = [x.encode(encoding) if isinstance(x, unicode) else x for x in row]
c.writerow(row)
out.seek(0) out.seek(0)
return out.read() out = out.read()
if encoding != 'unicode':
out = out.encode(encoding)
return out
def tohtml(self, **k): def tohtml(self, **k):
return self.convert(FeedHTML).tostring(**k) return self.convert(FeedHTML).tostring(**k)
@ -269,8 +267,15 @@ class ParserBase(object):
except AttributeError: except AttributeError:
# does not exist, have to create it # does not exist, have to create it
self.rule_create(self.rules[rule_name]) try:
self.rule_set(self.rules[rule_name], value) self.rule_create(self.rules[rule_name])
except AttributeError:
# no way to create it, give up
pass
else:
self.rule_set(self.rules[rule_name], value)
def rmv(self, rule_name): def rmv(self, rule_name):
# easy deleter # easy deleter
@ -469,6 +474,9 @@ class ParserHTML(ParserXML):
element = deepcopy(match) element = deepcopy(match)
match.getparent().append(element) match.getparent().append(element)
else:
raise AttributeError('no way to create item')
def parse_time(value): def parse_time(value):
if value is None or value == 0: if value is None or value == 0:

View File

@ -471,10 +471,10 @@ def FeedFormat(rss, options, encoding='utf-8'):
else: else:
if options.indent: if options.indent:
return rss.torss(xml_declaration=True, encoding=encoding, pretty_print=True) return rss.torss(xml_declaration=(not encoding == 'unicode'), encoding=encoding, pretty_print=True)
else: else:
return rss.torss(xml_declaration=True, encoding=encoding) return rss.torss(xml_declaration=(not encoding == 'unicode'), encoding=encoding)
def process(url, cache=None, options=None): def process(url, cache=None, options=None):
@ -554,6 +554,8 @@ def cgi_app(environ, start_response):
else: else:
headers['content-type'] = 'text/xml' headers['content-type'] = 'text/xml'
headers['content-type'] += '; charset=utf-8'
crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db')) crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))
# get the work done # get the work done
@ -636,7 +638,7 @@ def cgi_file_handler(environ, start_response, app):
return app(environ, start_response) return app(environ, start_response)
def cgi_page(environ, start_response): def cgi_get(environ, start_response):
url, options = cgi_parse_environ(environ) url, options = cgi_parse_environ(environ)
# get page # get page
@ -648,28 +650,35 @@ def cgi_page(environ, start_response):
data, con, contenttype, encoding = crawler.adv_get(url=url) data, con, contenttype, encoding = crawler.adv_get(url=url)
if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']: if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
html = readabilite.parse(data, encoding=encoding) if options.get == 'page':
html.make_links_absolute(con.geturl()) html = readabilite.parse(data, encoding=encoding)
html.make_links_absolute(con.geturl())
kill_tags = ['script', 'iframe', 'noscript'] kill_tags = ['script', 'iframe', 'noscript']
for tag in kill_tags: for tag in kill_tags:
for elem in html.xpath('//'+tag): for elem in html.xpath('//'+tag):
elem.getparent().remove(elem) elem.getparent().remove(elem)
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8') output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')
elif options.get == 'article':
output = readabilite.get_article(data, url=con.geturl(), encoding=encoding, debug=options.debug)
else:
raise MorssException('no :get option passed')
else: else:
output = None output = data
# return html page # return html page
headers = {'status': '200 OK', 'content-type': 'text/html'} headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8'}
start_response(headers['status'], list(headers.items())) start_response(headers['status'], list(headers.items()))
return [output] return [output]
dispatch_table = { dispatch_table = {
'getpage': cgi_page 'get': cgi_get,
} }
@ -717,10 +726,10 @@ def cli_app():
url = UrlFix(url) url = UrlFix(url)
rss = FeedFetch(url, options) rss = FeedFetch(url, options)
rss = FeedGather(rss, url, options) rss = FeedGather(rss, url, options)
out = FeedFormat(rss, options) out = FeedFormat(rss, options, 'unicode')
if not options.silent: if not options.silent:
print(out.decode('utf-8', 'replace') if isinstance(out, bytes) else out) print(out)
log('done') log('done')

View File

@ -307,7 +307,7 @@ def get_best_node(ranked_grades):
return lowest return lowest
def get_article(data, url=None, encoding=None): def get_article(data, url=None, encoding=None, debug=False):
" Input a raw html string, returns a raw html string of the article " " Input a raw html string, returns a raw html string of the article "
html = parse(data, encoding) html = parse(data, encoding)
@ -319,16 +319,17 @@ def get_article(data, url=None, encoding=None):
best = get_best_node(scores) best = get_best_node(scores)
keep_threshold = percentile([x[1] for x in scores], 0.1) if not debug:
clean_root(best, keep_threshold) keep_threshold = percentile([x[1] for x in scores], 0.1)
clean_root(best, keep_threshold)
wc = count_words(best.text_content()) wc = count_words(best.text_content())
wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')])) wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))
if wc - wca < 50 or float(wca) / wc > 0.3: if not debug and (wc - wca < 50 or float(wca) / wc > 0.3):
return None return None
if url: if url:
best.make_links_absolute(url) best.make_links_absolute(url)
return lxml.etree.tostring(best, pretty_print=True) return lxml.etree.tostring(best if not debug else html, pretty_print=True)

View File

@ -1,11 +1,9 @@
@require(feed)
<!DOCTYPE html> <!DOCTYPE html>
<html> <html>
<head> <head>
<title>@feed.title &#8211; via morss</title> <title>Feed reader by morss</title>
<meta charset="UTF-8" />
<meta name="description" content="@feed.desc (via morss)" />
<meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" /> <meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" />
<meta name="robots" content="noindex" />
<style type="text/css"> <style type="text/css">
/* columns - from https://thisisdallas.github.io/Simple-Grid/simpleGrid.css */ /* columns - from https://thisisdallas.github.io/Simple-Grid/simpleGrid.css */
@ -32,7 +30,7 @@
padding-right: 20px; /* column-space */ padding-right: 20px; /* column-space */
} }
@@media handheld, only screen and (max-width: 767px) { /* @@ to escape from the template engine */ @media handheld, only screen and (max-width: 767px) {
#content { #content {
width: 100%; width: 100%;
min-width: 0; min-width: 0;
@ -82,6 +80,7 @@
#content { #content {
text-align: justify; text-align: justify;
line-height: 1.5em;
} }
.item .title { .item .title {
@ -171,30 +170,17 @@
<body> <body>
<div id="header"> <div id="header">
<h1>@feed.title</h1> <h1>RSS feed</h1>
@if feed.desc: <h2>with full text articles</h2>
<h2>@feed.desc</h2>
@end
<p>- via morss</p> <p>- via morss</p>
</div> </div>
<div id="content"> <div id="content">
@for item in feed.items:
<div class="item"> <div class="item">
@if item.link: <a class="title link" href="@item.link" target="_blank"></a>
<a class="title link" href="@item.link" target="_blank">@item.title</a> <div class="desc"></div>
@else: <div class="content"></div>
<span class="title">@item.title</span>
@end
<div class="article">
@if item.content:
@item.content
@else:
@item.desc
@end
</div>
</div> </div>
@end
</div> </div>
<script> <script>