feeds: auto-parse()

master
pictuga 2020-03-18 16:34:40 +01:00
parent c2f85da94a
commit 4a70aa9dfa
1 changed file with 61 additions and 0 deletions

@@ -9,6 +9,8 @@ import re
import json
import csv
from fnmatch import fnmatch
from lxml import etree
from dateutil import tz
import dateutil.parser
@@ -51,6 +53,65 @@ def parse_rules(filename=None):
    return rules
def parse(data, url=None, mimetype=None):
    " Determine which ruleset to use "

    rulesets = parse_rules()
    parsers = [FeedXML, FeedHTML, FeedJSON]

    # 1) Look for a ruleset based on path

    if url is not None:
        for ruleset in rulesets.values():
            if 'path' in ruleset:
                for path in ruleset['path']:
                    if fnmatch(url, path):
                        parser = [x for x in parsers if x.mode == ruleset['mode']][0]
                        return parser(data, ruleset)

    # 2) Look for a parser based on mimetype

    if mimetype is not None:
        parser_candidates = [x for x in parsers if mimetype in x.mimetype]

    if mimetype is None or not parser_candidates:
        parser_candidates = parsers

    # 3) Look for a working ruleset for the given parser
    # 3a) See if parsing works
    # 3b) See if .items matches anything

    for parser in parser_candidates:
        # skip rulesets with 'path' as they should have been caught beforehand
        ruleset_candidates = [x for x in rulesets.values() if x['mode'] == parser.mode and 'path' not in x]

        try:
            feed = parser(data)

        except ValueError:
            # parsing did not work
            pass

        else:
            # parsing worked, now we try the rulesets

            for ruleset in ruleset_candidates:
                feed.rules = ruleset

                try:
                    feed.items[0]

                except (AttributeError, IndexError):
                    # parsing and/or item picking did not work out
                    pass

                else:
                    # it worked!
                    return feed

    raise Exception('no way to handle this feed')
class ParserBase(object):
    def __init__(self, data=None, rules=None, parent=None):
        if rules is None:
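
For context, a minimal sketch of how the new parse() entry point might be called. The download step, the example URL, the import path, and the Content-Type handling below are illustrative assumptions and not part of this commit; only the parse(data, url=..., mimetype=...) signature and the .items attribute come from the patched code.

# Hypothetical caller for feeds.parse() -- the fetching code and URL are illustrative only
import urllib.request

import feeds  # assumed import path for the module patched above

url = 'https://example.com/rss'  # placeholder feed URL

with urllib.request.urlopen(url) as response:
    data = response.read()
    # strip charset parameters, e.g. 'text/xml; charset=utf-8' -> 'text/xml'
    mimetype = response.headers.get('Content-Type', '').split(';')[0].strip()

# parse() tries path-based rulesets first, then mimetype-matched parsers,
# and finally falls back to trying every remaining parser/ruleset combination
feed = feeds.parse(data, url=url, mimetype=mimetype)
print(feed.items[0])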