feeds: fix atom xhtml handling

2018-11-11 15:21:06 +01:00 · 2018-11-11 15:21:06 +01:00 · 401dfbc1ff
commit 401dfbc1ff
parent 8aceda4957
1 changed files with 21 additions and 9 deletions
--- a/morss/feeds.py
+++ b/morss/feeds.py
@ -77,14 +77,6 @@ def tag_NS(tag, nsmap=NSMAP):
    return tag


-def inner_html(xml):
-    return (xml.text or '') + ''.join([etree.tostring(child) for child in xml])
-
-
-def clean_node(xml):
-    [xml.remove(child) for child in xml]
-
-
 def parse_rules(filename=None):
    if not filename:
        filename = os.path.join(os.path.dirname(__file__), 'feedify.ini')
@ -266,6 +258,14 @@ class ParserXML(ParserBase):

        return rule

+    @staticmethod
+    def _inner_html(xml):  # node text + serialized children, always as text (tostring() alone yields bytes on py3)
+        return (xml.text or '') + ''.join([etree.tostring(child, encoding='unicode') for child in xml])
+
+    @staticmethod
+    def _clean_node(xml):  # drop every child element in place; the node's own text and attributes stay
+        del xml[:]  # slice delete: no mutation-while-iterating, no throwaway list
+
    def rule_search_all(self, rule):
        try:
            return self.root.xpath(rule, namespaces=NSMAP)
@ -333,13 +333,25 @@ class ParserXML(ParserBase):
            match.attrib[key] = value

        else:
+            if len(match):
+                # atom: the node holds child elements (e.g. xhtml content); drop them before setting plain text
+                self._clean_node(match)
+
+                if match.attrib.get('type', '') == 'xhtml':
+                    match.attrib['type'] = 'html'
+
            match.text = value

    def rule_str(self, rule):
        match = self.rule_search(rule)

        if isinstance(match, etree._Element):
-            return match.text or ""
+            if len(match):
+                # atom: the node holds child elements (e.g. xhtml content); return them serialized
+                return self._inner_html(match)
+
+            else:
+                return match.text or ""

        else:
            return match or ""