feeds: add html support, adapt .tohtml()

master
pictuga 2020-03-18 16:33:10 +01:00
parent e3528a8f36
commit c2f85da94a
1 changed file with 47 additions and 9 deletions

View File

@ -14,9 +14,7 @@ from dateutil import tz
import dateutil.parser import dateutil.parser
from copy import deepcopy from copy import deepcopy
from wheezy.template.engine import Engine import lxml.html
from wheezy.template.loader import DictLoader
from wheezy.template.ext.core import CoreExtension
json.encoder.c_make_encoder = None json.encoder.c_make_encoder = None
@ -99,12 +97,7 @@ class ParserBase(object):
return out.read() return out.read()
def tohtml(self): def tohtml(self):
# TODO temporary return self.convert(FeedHTML).tostring()
path = os.path.join(os.path.dirname(__file__), 'reader.html.template')
loader = DictLoader({'reader': open(path).read()})
engine = Engine(loader=loader, extensions=[CoreExtension()])
template = engine.get_template('reader')
return template.render({'feed': self}).encode('utf-8')
def convert(self, TargetParser): def convert(self, TargetParser):
target = TargetParser() target = TargetParser()
@ -344,6 +337,43 @@ class ParserXML(ParserBase):
return match or "" return match or ""
class ParserHTML(ParserXML):
    """XML-style parser specialised for HTML documents, backed by ``lxml.html``.

    Rules are XPath expressions evaluated against ``self.root``. As a
    convenience, a ``[class=name]`` predicate in a rule is expanded into a
    proper whitespace-separated class-token test (see ``rule_search_all``).
    """

    default_ruleset = 'html'
    mode = 'html'
    mimetype = ['text/html', 'application/xhtml+xml']

    def parse(self, raw):
        """Parse the HTML document ``raw`` and return the resulting lxml element."""
        return lxml.html.fromstring(raw)

    def tostring(self, **k):
        """Serialise ``self.root`` as HTML.

        Any keyword arguments are forwarded unchanged to
        ``lxml.html.tostring``.
        """
        return lxml.html.tostring(self.root, **k)

    @staticmethod
    def _inner_html(xml):
        """Return the inner HTML of element ``xml`` as bytes.

        The result is the element's leading text followed by the serialised
        form of each child element.
        """
        # lxml hands back ``.text`` as text (``str`` on Python 3) whereas
        # ``lxml.html.tostring`` yields bytes by default; encode the text
        # first so the concatenation below cannot raise TypeError.
        leading = xml.text or b''
        if not isinstance(leading, bytes):
            leading = leading.encode('utf-8')

        return leading + b''.join(lxml.html.tostring(child) for child in xml)

    def rule_search_all(self, rule):
        """Evaluate XPath ``rule`` against ``self.root`` and return the result.

        Every ``[class=name]`` predicate is first rewritten so that it matches
        elements having ``name`` as one of their space-separated class tokens.
        Returns an empty list when lxml reports an XPath evaluation error.
        """
        # do proper "class" matching (too "heavy" to type as-is in rules)
        pattern = r'\[class=([^\]]+)\]'
        repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
        expanded = re.sub(pattern, repl, rule)

        # only the XPath evaluation itself is expected to raise here
        try:
            return self.root.xpath(expanded)

        except etree.XPathEvalError:
            return []

    def rule_create(self, rule):
        """Create a new node for ``rule`` by cloning the last existing match.

        The last match is deep-copied and the copy appended to the match's
        parent. When nothing matches, nothing is created.
        """
        # try duplicating from existing (works well with malformed structures)
        # NOTE(review): ``rrule`` and ``key`` are computed but never used, and
        # the lookup below uses the unparsed ``rule`` -- confirm whether the
        # search was meant to use ``rrule``.
        rrule, key = self._rule_parse(rule)

        match = self.rule_search_last(rule)

        if match is not None:
            element = deepcopy(match)
            match.getparent().append(element)

    # TODO def rule_set for the html part
def parse_time(value): def parse_time(value):
@ -565,6 +595,14 @@ class ItemXML(Item, ParserXML):
pass pass
class FeedHTML(Feed, ParserHTML):
    """Feed combined with the HTML parser."""

    # Item class referenced by name; presumably resolved by ``Feed`` -- TODO confirm.
    itemsClass = 'ItemHTML'
class ItemHTML(Item, ParserHTML):
    """Item combined with the HTML parser; adds nothing of its own."""
class FeedJSON(Feed, ParserJSON): class FeedJSON(Feed, ParserJSON):
itemsClass = 'ItemJSON' itemsClass = 'ItemJSON'