Compare commits
No commits in common. "e88a823ada3e16df34aa18421cb1f00b7b2ee296" and "c8669002e49671a3fa18573c0ce3fa32539ed1c0" have entirely different histories.
e88a823ada
...
c8669002e4
|
@ -111,6 +111,8 @@ def adv_get(url, post=None, timeout=None, *args, **kwargs):
|
||||||
|
|
||||||
|
|
||||||
def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
|
def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
|
||||||
|
handlers = []
|
||||||
|
|
||||||
# as per urllib2 source code, these Handlers are added first
|
# as per urllib2 source code, these Handlers are added first
|
||||||
# *unless* one of the custom handlers inherits from one of them
|
# *unless* one of the custom handlers inherits from one of them
|
||||||
#
|
#
|
||||||
|
@ -128,18 +130,16 @@ def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
|
||||||
# http_error_* are run until something is returned (other than None). If they all
|
# http_error_* are run until something is returned (other than None). If they all
|
||||||
# return nothing, a python error is raised
|
# return nothing, a python error is raised
|
||||||
|
|
||||||
handlers = [
|
#handlers.append(DebugHandler())
|
||||||
#DebugHandler(),
|
handlers.append(SizeLimitHandler(500*1024)) # 500KiB
|
||||||
SizeLimitHandler(500*1024)) # 500KiB
|
handlers.append(HTTPCookieProcessor())
|
||||||
HTTPCookieProcessor(),
|
handlers.append(GZIPHandler())
|
||||||
GZIPHandler(),
|
handlers.append(HTTPAllRedirectHandler())
|
||||||
HTTPAllRedirectHandler(),
|
handlers.append(HTTPEquivHandler())
|
||||||
HTTPEquivHandler(),
|
handlers.append(HTTPRefreshHandler())
|
||||||
HTTPRefreshHandler(),
|
handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
|
||||||
UAHandler(random.choice(DEFAULT_UAS)),
|
handlers.append(BrowserlyHeaderHandler())
|
||||||
BrowserlyHeaderHandler(),
|
handlers.append(EncodingFixHandler())
|
||||||
EncodingFixHandler(),
|
|
||||||
]
|
|
||||||
|
|
||||||
if follow:
|
if follow:
|
||||||
handlers.append(AlternateHandler(MIMETYPE[follow]))
|
handlers.append(AlternateHandler(MIMETYPE[follow]))
|
||||||
|
|
|
@ -93,7 +93,7 @@ def parse(data, url=None, encoding=None, ruleset=None):
|
||||||
if 'path' in ruleset:
|
if 'path' in ruleset:
|
||||||
for path in ruleset['path']:
|
for path in ruleset['path']:
|
||||||
if fnmatch(url, path):
|
if fnmatch(url, path):
|
||||||
parser = [x for x in parsers if x.mode == ruleset.get('mode')][0] # FIXME what if no mode specified?
|
parser = [x for x in parsers if x.mode == ruleset['mode']][0]
|
||||||
return parser(data, ruleset, encoding=encoding)
|
return parser(data, ruleset, encoding=encoding)
|
||||||
|
|
||||||
# 2) Try each and every parser
|
# 2) Try each and every parser
|
||||||
|
@ -113,7 +113,7 @@ def parse(data, url=None, encoding=None, ruleset=None):
|
||||||
else:
|
else:
|
||||||
# parsing worked, now we try the rulesets
|
# parsing worked, now we try the rulesets
|
||||||
|
|
||||||
ruleset_candidates = [x for x in rulesets if x.get('mode') in (parser.mode, None) and 'path' not in x]
|
ruleset_candidates = [x for x in rulesets if x.get('mode', None) in (parser.mode, None) and 'path' not in x]
|
||||||
# 'path' as they should have been caught beforehand
|
# 'path' as they should have been caught beforehand
|
||||||
# try anyway if no 'mode' specified
|
# try anyway if no 'mode' specified
|
||||||
|
|
||||||
|
@ -428,7 +428,7 @@ class ParserXML(ParserBase):
|
||||||
|
|
||||||
match = self.rule_search(rrule)
|
match = self.rule_search(rrule)
|
||||||
|
|
||||||
html_rich = ('atom' in rule or self.rules.get('mode') == 'html') \
|
html_rich = ('atom' in rule or self.rules['mode'] == 'html') \
|
||||||
and rule in [self.rules.get('item_desc'), self.rules.get('item_content')]
|
and rule in [self.rules.get('item_desc'), self.rules.get('item_content')]
|
||||||
|
|
||||||
if key is not None:
|
if key is not None:
|
||||||
|
@ -439,7 +439,7 @@ class ParserXML(ParserBase):
|
||||||
self._clean_node(match)
|
self._clean_node(match)
|
||||||
match.append(lxml.html.fragment_fromstring(value, create_parent='div'))
|
match.append(lxml.html.fragment_fromstring(value, create_parent='div'))
|
||||||
|
|
||||||
if self.rules.get('mode') == 'html':
|
if self.rules['mode'] == 'html':
|
||||||
match.find('div').drop_tag() # not supported by lxml.etree
|
match.find('div').drop_tag() # not supported by lxml.etree
|
||||||
|
|
||||||
else: # i.e. if atom
|
else: # i.e. if atom
|
||||||
|
|
Loading…
Reference in New Issue