# Extracted from patch 6d28323e3aeb3088e9d85953d16e731e50470991
# Author: pictuga -- Date: Fri, 9 Nov 2018 22:04:08 +0100
# Subject: feeds: add XML support for merger
# Target: morss/feeds.py (1 file changed, 148 insertions)
#
# Placement according to the patch hunks:
#   * ParserXML goes right after ParserBase (before FeedBase)
#   * FeedXML goes right after Feed (before FeedParser)
#   * ItemXML goes right after Item (before FeedItem)
#
# Names expected from the enclosing module: re, etree (lxml), deepcopy, NSMAP,
# parse_time, ParserBase, Feed, Item.
# NOTE(review): self.root, self.rules, self.rule_search and
# self.rule_search_last are not defined here; presumably they come from
# ParserBase -- confirm.


class ParserXML(ParserBase):
    """ XML flavour of the parser: rules are XPath expressions run with lxml """

    def parse(self, raw):
        """ Parse `raw` into an lxml element, recovering from broken markup """
        parser = etree.XMLParser(recover=True)
        return etree.fromstring(raw, parser)

    def tostring(self, **k):
        """ Serialize self.root; keyword arguments go to etree.tostring """
        return etree.tostring(self.root, **k)

    def _rule_parse(self, rule):
        """
        Split a rule into (xpath, attribute name), eg. '//div/a/@href' gives
        ('//div/a', 'href'). Rules without a trailing '/@attr' give
        (rule, None). Only all-lowercase ascii attribute names are recognised.
        """
        test = re.search(r'^(.*)/@([a-z]+)$', rule)  # to match //div/a/@href
        return test.groups() if test else (rule, None)

    def _resolve_ns(self, rule):
        """
        Turn a 'prefix:name' step into lxml's '{namespace-uri}name' notation
        when the prefix is listed in NSMAP (the local name gets lowercased).
        Anything else is returned untouched.
        """
        match = re.search(r'^([^:]+):([^:]+)$', rule)

        if match:
            prefix, name = match.groups()

            if prefix in NSMAP:
                return "{%s}%s" % (NSMAP[prefix], name.lower())

        return rule

    def rule_search_all(self, rule):
        """ Return every xpath match for `rule`, or [] if the rule is invalid """
        try:
            return self.root.xpath(rule, namespaces=NSMAP)

        except etree.XPathEvalError:
            return []

    def rule_create(self, rule):
        """
        Create the node designated by `rule` and return it.

        Basic slash-separated rules (ie. plain RSS) are rebuilt step by step
        from the root: existing intermediate elements are reused, missing ones
        are created, and the last step always gets a freshly appended element.
        For other rules, the last existing match is duplicated next to itself.
        Returns None when neither strategy applies.
        """
        # duplicate, copy from template or create from scratch
        rule = self._rule_parse(rule)[0]

        # try recreating based on the rule (for really basic rules, ie. plain RSS)
        if re.search(r'^[a-zA-Z0-9/:]+$', rule):
            chain = rule.strip('/').split('/')
            current = self.root

            if rule[0] == '/':
                # absolute path: the first step is the root element itself
                chain = chain[1:]

            last = len(chain) - 1

            for (i, node) in enumerate(chain):
                tag = self._resolve_ns(node)
                test = current.find(tag)

                # compare with None: lxml elements without children are falsy,
                # so a plain truth test would duplicate existing empty nodes
                if test is not None and i < last:
                    # yay, go on
                    current = test

                else:
                    # oops, need to create
                    element = etree.Element(tag)
                    current.append(element)
                    current = element

            return current

        # try duplicating from existing (works well with irregular structures)
        match = self.rule_search_last(rule)

        # only real elements can be copied (xpath may also yield strings)
        if isinstance(match, etree._Element):
            parent = match.getparent()

            if parent is not None:
                element = deepcopy(match)
                parent.append(element)
                return element

        # try duplicating from template
        # FIXME
        # >>> self.xml.getroottree().getpath(ff.find('a'))

        return None

    def rule_remove(self, rule):
        """
        Remove what `rule` designates: the attribute for '.../@attr' rules,
        the whole element otherwise.

        No guard is applied to the search result: a failed search surfaces as
        AttributeError (presumably rule_search returns None then -- confirm),
        and a missing attribute as KeyError.
        """
        rule, key = self._rule_parse(rule)

        match = self.rule_search(rule)

        if key is not None:
            del match.attrib[key]

        else:
            match.getparent().remove(match)

    def rule_set(self, rule, value):
        """
        Write `value` into what `rule` designates: the attribute for
        '.../@attr' rules, the element text otherwise.

        No guard is applied to the search result, so a failed search surfaces
        as AttributeError; set_str relies on that to create missing nodes.
        """
        rule, key = self._rule_parse(rule)

        match = self.rule_search(rule)

        if key is not None:
            match.attrib[key] = value

        else:
            match.text = value

    def rule_str(self, rule):
        """
        Return the text of the match for `rule`: the element's text if the
        match is an element, the match itself otherwise. Falsy results are
        turned into "".
        """
        match = self.rule_search(rule)

        if isinstance(match, etree._Element):
            return match.text or ""

        return match or ""

    def bool_prs(self, x):
        """ Parse a boolean: anything but 'false' (case-insensitive) is True """
        return (x or '').lower() != 'false'

    def bool_fmt(self, x):
        """ Format a boolean as 'true' or 'false' """
        return 'true' if x else 'false'

    def time_prs(self, x):
        """ Parse `x` with parse_time; None if it raises ValueError """
        try:
            return parse_time(x)

        except ValueError:
            return None

    def time_fmt(self, x):
        """
        Parse `x` with parse_time and format it with self.rules['timeformat'].
        Returns None if a ValueError is raised along the way.
        """
        try:
            parsed = parse_time(x)
            return parsed.strftime(self.rules['timeformat'])

        except ValueError:
            return None

    def get_raw(self, rule_name):
        """ Return all the raw matches for the rule called `rule_name` """
        return self.rule_search_all(self.rules[rule_name])

    def get_str(self, rule_name):
        """ Return the string value for the rule called `rule_name` """
        return self.rule_str(self.rules[rule_name])

    def set_str(self, rule_name, value):
        """ Set the value for the rule called `rule_name`, creating the node if needed """
        try:
            return self.rule_set(self.rules[rule_name], value)

        except AttributeError:
            # does not exist, have to create it
            self.rule_create(self.rules[rule_name])
            return self.rule_set(self.rules[rule_name], value)

    def remove(self, rule_name):
        """ Remove what the rule called `rule_name` designates """
        self.rule_remove(self.rules[rule_name])


class FeedXML(Feed, ParserXML):
    """ Feed backed by the XML parser """

    # class name as a string; how it gets resolved is not visible here
    itemsClass = 'ItemXML'

    def tostring(self, **k):
        """ Serialize the root's whole tree (getroottree), not just self.root """
        return etree.tostring(self.root.getroottree(), **k)


class ItemXML(Item, ParserXML):
    """ Item backed by the XML parser """
    pass