Compare commits


8 Commits

Author SHA1 Message Date
pictuga d24734110a morss: convert all feeds to RSS
As HTML feeds might not contain some fields, leading to data loss
2020-03-20 12:26:34 +01:00
pictuga a41c2a3a62 morss: fix twitter link detection 2020-03-20 12:26:19 +01:00
pictuga dd2651061f feeds & morss: clean up comments/empty lines 2020-03-20 12:25:48 +01:00
pictuga 912c323c40 feeds: make function output more consistent
e.g. setters return nothing, getters return something relevant or None (i.e. no empty strings)
2020-03-20 12:23:15 +01:00
pictuga 5705a0be17 feeds: fix delete/rmv code 2020-03-20 12:22:07 +01:00
pictuga 4735ffba45 feeds: fix .convert auto-convert
To fix an inheritance loophole
2020-03-20 12:20:41 +01:00
pictuga 08e39f5631 feeds: give simpler name to helper functions 2020-03-20 12:20:15 +01:00
pictuga 765a43511e feeds: remove unused import 2020-03-20 12:19:08 +01:00
2 changed files with 49 additions and 41 deletions

feeds.py

@@ -21,12 +21,10 @@ json.encoder.c_make_encoder = None
 try:
     # python 2
     from StringIO import StringIO
-    from urllib2 import urlopen
     from ConfigParser import RawConfigParser

 except ImportError:
     # python 3
     from io import StringIO
-    from urllib.request import urlopen
     from configparser import RawConfigParser

 try:
@@ -164,7 +162,7 @@ class ParserBase(object):
         return self.convert(FeedHTML).tostring(**k)

     def convert(self, TargetParser):
-        if isinstance(self, TargetParser):
+        if type(self) == TargetParser:
             return self

         target = TargetParser()
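
The commit "feeds: fix .convert auto-convert" in the list above is about this exact check: isinstance() is satisfied by subclasses, so asking a subclass instance to convert to its parent class returned self unconverted instead of building a fresh target parser. A minimal standalone sketch of the difference, using hypothetical classes rather than the real parser hierarchy:

    class Base(object):
        pass

    class Child(Base):
        pass

    obj = Child()

    # isinstance() accepts subclasses, so the old check let a Child
    # instance pass itself off as an already-converted Base -- the loophole
    print(isinstance(obj, Base))  # True
    print(type(obj) == Base)      # False: only an exact Base skips the conversion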
@@ -208,11 +206,11 @@ class ParserBase(object):
         pass

     def rule_remove(self, rule):
-        # remove node from its parent
+        # remove node from its parent. Returns nothing
         pass

     def rule_set(self, rule, value):
-        # value is always a str?
+        # set the value. Returns nothing
         pass

     def rule_str(self, rule):
@@ -247,25 +245,30 @@ class ParserBase(object):
         return self.rule_search_all(self.rules[rule_name])

-    def get_str(self, rule_name):
+    def get(self, rule_name):
         # simple function to get nice text from the rule name
-        # for use in @property, ie. self.get_str('title')
+        # for use in @property, ie. self.get('title')

         if rule_name not in self.rules:
             return None

-        return self.rule_str(self.rules[rule_name])
+        return self.rule_str(self.rules[rule_name]) or None

-    def set_str(self, rule_name, value):
+    def set(self, rule_name, value):
+        # simple function to set nice text from the rule name. Returns nothing

         if rule_name not in self.rules:
-            return None
+            return
+
+        if value is None:
+            self.rmv(rule_name)
+            return

         try:
-            return self.rule_set(self.rules[rule_name], value)
+            self.rule_set(self.rules[rule_name], value)

         except AttributeError:
             # does not exist, have to create it
             self.rule_create(self.rules[rule_name])
-            return self.rule_set(self.rules[rule_name], value)
+            self.rule_set(self.rules[rule_name], value)

     def rmv(self, rule_name):
         # easy deleter
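
Per the commit "feeds: make function output more consistent", the new contract is: get() returns something relevant or None (empty strings are coerced to None by the trailing `or None`), set() returns nothing, and set(None) amounts to deletion via rmv(). A small self-contained sketch of that contract, with a hypothetical Box class standing in for the rule machinery:

    class Box(object):
        def __init__(self):
            self.data = {}

        def get(self, key):
            # something relevant or None -- never an empty string
            return self.data.get(key) or None

        def set(self, key, value):
            # setter returns nothing; setting None deletes the entry
            if value is None:
                self.data.pop(key, None)
                return

            self.data[key] = value

    box = Box()
    box.set('title', '')
    print(box.get('title'))  # None, not ''
    box.set('title', None)   # removes the entry instead of storing None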
@@ -369,10 +372,6 @@ class ParserXML(ParserBase):
             match.getparent().append(element)
             return element

-        # try duplicating from template
-        # FIXME
-        # >>> self.xml.getroottree().getpath(ff.find('a'))
-
         return None

     def rule_remove(self, rule):
@@ -432,7 +431,7 @@ class ParserXML(ParserBase):
             return etree.tostring(match, method='text', encoding='unicode').strip()

         else:
-            return match or ""
+            return match # might be None is no match


 class ParserHTML(ParserXML):
@@ -468,8 +467,6 @@ class ParserHTML(ParserXML):
             element = deepcopy(match)
             match.getparent().append(element)

-    # TODO def rule_set for the html part
-

 def parse_time(value):
     if isinstance(value, basestring):
@@ -484,6 +481,7 @@ def parse_time(value):
     elif isinstance(value, datetime):
         return value
+
     else:
         return False
@@ -497,8 +495,9 @@ class ParserJSON(ParserBase):
         return json.loads(raw)

     def remove(self):
-        # delete oneself FIXME
-        pass
+        # impossible to "delete" oneself per se but can clear all its items
+        for attr in self.root:
+            del self.root[attr]

     def tostring(self, encoding='unicode', **k):
         dump = json.dumps(self.root, ensure_ascii=False, **k) # ensure_ascii = False to have proper (unicode) string and not \u00
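
The new remove() empties the root JSON object rather than deleting the parser itself. Note that deleting keys from a dict while iterating it raises a RuntimeError on Python 3, so an equivalent standalone version (a sketch, not the committed code) has to snapshot the keys first:

    root = {'title': 'feed', 'items': []}

    # list() copies the keys so the dict can safely shrink during the loop
    for attr in list(root):
        del root[attr]

    print(root)  # {}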
@@ -558,11 +557,16 @@ class ParserJSON(ParserBase):
         rrule = self._rule_parse(rule)

         cur = self.root
-        for node in rrule[:-1]:
-            cur = cur[node]
-
-        del cur[rrule[-1]]
+        try:
+            for node in rrule[:-1]:
+                cur = cur[node]
+
+            del cur[rrule[-1]]
+
+        except KeyError:
+            # nothing to delete
+            pass

     def rule_set(self, rule, value):
         if '[]' in rule:
             raise ValueError('not supported') # FIXME
@@ -609,12 +613,12 @@ class Feed(object):
         return [itemsClass(x, self.rules, self) for x in items]

     title = property(
-        lambda f: f.get_str('title'),
-        lambda f,x: f.set_str('title', x),
+        lambda f: f.get('title'),
+        lambda f,x: f.set('title', x),
         lambda f: f.rmv('title') )

     description = desc = property(
-        lambda f: f.get_str('desc'),
-        lambda f,x: f.set_str('desc', x),
+        lambda f: f.get('desc'),
+        lambda f,x: f.set('desc', x),
         lambda f: f.rmv('desc') )

     items = property(
         lambda f: f )
@@ -661,28 +665,28 @@ class Item(Uniq):
         return id(xml)

     title = property(
-        lambda f: f.get_str('item_title'),
-        lambda f,x: f.set_str('item_title', x),
+        lambda f: f.get('item_title'),
+        lambda f,x: f.set('item_title', x),
         lambda f: f.rmv('item_title') )

     link = property(
-        lambda f: f.get_str('item_link'),
-        lambda f,x: f.set_str('item_link', x),
+        lambda f: f.get('item_link'),
+        lambda f,x: f.set('item_link', x),
         lambda f: f.rmv('item_link') )

     description = desc = property(
-        lambda f: f.get_str('item_desc'),
-        lambda f,x: f.set_str('item_desc', x),
+        lambda f: f.get('item_desc'),
+        lambda f,x: f.set('item_desc', x),
         lambda f: f.rmv('item_desc') )

     content = property(
-        lambda f: f.get_str('item_content'),
-        lambda f,x: f.set_str('item_content', x),
+        lambda f: f.get('item_content'),
+        lambda f,x: f.set('item_content', x),
         lambda f: f.rmv('item_content') )

     time = property(
-        lambda f: f.time_prs(f.get_str('item_time')),
-        lambda f,x: f.set_str('item_time', f.time_fmt(x)),
+        lambda f: f.time_prs(f.get('item_time')),
+        lambda f,x: f.set('item_time', f.time_fmt(x)),
         lambda f: f.rmv('item_time') )

     updated = property(
-        lambda f: f.time_prs(f.get_str('item_updated')),
-        lambda f,x: f.set_str('item_updated', f.time_fmt(x)),
+        lambda f: f.time_prs(f.get('item_updated')),
+        lambda f,x: f.set('item_updated', f.time_fmt(x)),
         lambda f: f.rmv('item_updated') )

morss.py

@@ -204,7 +204,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
     # twitter
     if urlparse(feedurl).netloc == 'twitter.com':
-        match = lxml.html.fromstring(item.content).xpath('//a/@data-expanded-url')
+        match = lxml.html.fromstring(item.desc).xpath('//a/@data-expanded-url')
         if len(match):
             link = match[0]
             log(link)
@@ -341,6 +341,8 @@ def FeedFetch(url, options):
     else:
         try:
             rss = feeds.parse(xml, url, contenttype)
+            rss = rss.convert(feeds.FeedXML)
+            # contains all fields, otherwise much-needed data can be lost

         except TypeError:
             log('random page')
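
This pairs with the top commit, "morss: convert all feeds to RSS": whatever parser feeds.parse() picks (XML, HTML or JSON), the result is normalized to FeedXML so that fields missing from the leaner formats are not silently dropped downstream. Roughly, in usage terms — a sketch assuming the flat module layout implied by this diff, with a made-up input file and content type:

    import feeds

    xml = open('page.html', 'rb').read()  # hypothetical input
    rss = feeds.parse(xml, 'http://example.com/feed', 'text/html')
    rss = rss.convert(feeds.FeedXML)      # the XML feed carries every field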
@@ -435,8 +437,10 @@ def FeedFormat(rss, options):
     if options.callback:
         if re.match(r'^[a-zA-Z0-9\.]+$', options.callback) is not None:
             return '%s(%s)' % (options.callback, rss.tojson())
+
         else:
             raise MorssException('Invalid callback var name')
+
     elif options.json:
         if options.indent:
             return rss.tojson(encoding='UTF-8', indent=4)