Compare commits


2 Commits

Author   SHA1        Message                                                    Date
pictuga  e88a823ada  feeds: better handle rulesets without a 'mode' specified   2022-01-19 13:08:33 +01:00
                     (CI: continuous-integration/drone/push, build is failing)
pictuga  750850c162  crawler: avoid too many .append()                          2022-01-19 13:04:33 +01:00
2 changed files with 16 additions and 16 deletions

morss/crawler.py

@@ -111,8 +111,6 @@ def adv_get(url, post=None, timeout=None, *args, **kwargs):
 def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
-    handlers = []
-
     # as per urllib2 source code, these Handelers are added first
     # *unless* one of the custom handlers inherits from one of them
     #
@@ -130,16 +128,18 @@ def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
     # http_error_* are run until sth is returned (other than None). If they all
     # return nothing, a python error is raised
 
-    #handlers.append(DebugHandler())
-    handlers.append(SizeLimitHandler(500*1024)) # 500KiB
-    handlers.append(HTTPCookieProcessor())
-    handlers.append(GZIPHandler())
-    handlers.append(HTTPAllRedirectHandler())
-    handlers.append(HTTPEquivHandler())
-    handlers.append(HTTPRefreshHandler())
-    handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
-    handlers.append(BrowserlyHeaderHandler())
-    handlers.append(EncodingFixHandler())
+    handlers = [
+        #DebugHandler(),
+        SizeLimitHandler(500*1024), # 500KiB
+        HTTPCookieProcessor(),
+        GZIPHandler(),
+        HTTPAllRedirectHandler(),
+        HTTPEquivHandler(),
+        HTTPRefreshHandler(),
+        UAHandler(random.choice(DEFAULT_UAS)),
+        BrowserlyHeaderHandler(),
+        EncodingFixHandler(),
+    ]
 
     if follow:
         handlers.append(AlternateHandler(MIMETYPE[follow]))
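
The refactor above collapses a run of handlers.append(...) calls into a single
list literal; only the conditional handlers are still appended afterwards. A
minimal sketch of the same pattern, using only stock urllib.request handlers
(the name build_custom_opener is made up here, and the morss-specific handlers
are left out):

    import urllib.request
    from urllib.request import HTTPCookieProcessor, ProxyHandler

    def build_custom_opener(proxies=None):
        # unconditional handlers go straight into one list literal
        handlers = [
            HTTPCookieProcessor(),  # keep cookies across requests
        ]

        # conditional handlers are appended one by one, as before
        if proxies:
            handlers.append(ProxyHandler(proxies))

        return urllib.request.build_opener(*handlers)

    opener = build_custom_opener({'http': 'http://127.0.0.1:8080'})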

morss/feeds.py

@@ -93,7 +93,7 @@ def parse(data, url=None, encoding=None, ruleset=None):
         if 'path' in ruleset:
             for path in ruleset['path']:
                 if fnmatch(url, path):
-                    parser = [x for x in parsers if x.mode == ruleset['mode']][0]
+                    parser = [x for x in parsers if x.mode == ruleset.get('mode')][0] # FIXME what if no mode specified?
                     return parser(data, ruleset, encoding=encoding)
 
     # 2) Try each and every parser
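
Switching from ruleset['mode'] to ruleset.get('mode') is what lets rulesets
without a 'mode' key survive this lookup: subscripting raises KeyError, while
.get() quietly returns None. A small illustration (the ruleset dict is made
up):

    ruleset = {'path': ['http://example.com/*']}  # hypothetical ruleset, no 'mode' key

    try:
        ruleset['mode']
    except KeyError:
        print("['mode'] raises KeyError")

    print(ruleset.get('mode'))  # prints None, no exception

As the FIXME notes, this is only half a fix at this spot: if no parser has a
mode equal to None, the [ ... ][0] comprehension still fails, just with an
IndexError instead of a KeyError.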
@@ -113,7 +113,7 @@ def parse(data, url=None, encoding=None, ruleset=None):
         else:
             # parsing worked, now we try the rulesets
 
-            ruleset_candidates = [x for x in rulesets if x.get('mode', None) in (parser.mode, None) and 'path' not in x]
+            ruleset_candidates = [x for x in rulesets if x.get('mode') in (parser.mode, None) and 'path' not in x]
                 # 'path' as they should have been caught beforehands
                 # try anyway if no 'mode' specified
@@ -428,7 +428,7 @@ class ParserXML(ParserBase):
         match = self.rule_search(rrule)
 
-        html_rich = ('atom' in rule or self.rules['mode'] == 'html') \
+        html_rich = ('atom' in rule or self.rules.get('mode') == 'html') \
                 and rule in [self.rules.get('item_desc'), self.rules.get('item_content')]
 
         if key is not None:
@@ -439,7 +439,7 @@ class ParserXML(ParserBase):
             self._clean_node(match)
             match.append(lxml.html.fragment_fromstring(value, create_parent='div'))
 
-            if self.rules['mode'] == 'html':
+            if self.rules.get('mode') == 'html':
                 match.find('div').drop_tag() # not supported by lxml.etree
 
             else: # i.e. if atom
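
The same .get('mode') pattern drives the ruleset_candidates filter and the two
ParserXML checks above: a missing 'mode' yields None, and None is deliberately
part of the membership test, so mode-less rulesets are tried with whichever
parser succeeded ("try anyway if no 'mode' specified"). A sketch of that
filter with made-up rulesets:

    rulesets = [
        {'mode': 'xml', 'rules': {}},   # only considered for the xml parser
        {'mode': 'json', 'rules': {}},  # only considered for the json parser
        {'rules': {}},                  # no 'mode': considered for any parser
    ]

    parser_mode = 'xml'  # mode of the parser that managed to parse the data

    candidates = [x for x in rulesets
                  if x.get('mode') in (parser_mode, None) and 'path' not in x]

    print(candidates)  # keeps the 'xml' ruleset and the mode-less one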