Compare commits
No commits in common. "37f5a92b05764356baff305847a702c70afab2cd" and "f9d7794bcca38ebfa9b04042ab9259efb694a377" have entirely different histories.
37f5a92b05
...
f9d7794bcc
|
@ -85,10 +85,10 @@ def adv_get(url, timeout=None, *args, **kwargs):
|
||||||
url = sanitize_url(url)
|
url = sanitize_url(url)
|
||||||
|
|
||||||
if timeout is None:
|
if timeout is None:
|
||||||
con = custom_opener(*args, **kwargs).open(url)
|
con = custom_handler(*args, **kwargs).open(url)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
con = custom_opener(*args, **kwargs).open(url, timeout=timeout)
|
con = custom_handler(*args, **kwargs).open(url, timeout=timeout)
|
||||||
|
|
||||||
data = con.read()
|
data = con.read()
|
||||||
|
|
||||||
|
@ -104,7 +104,7 @@ def adv_get(url, timeout=None, *args, **kwargs):
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def custom_opener(follow=None, delay=None):
|
def custom_handler(follow=None, delay=None):
|
||||||
handlers = []
|
handlers = []
|
||||||
|
|
||||||
# as per urllib2 source code, these Handelers are added first
|
# as per urllib2 source code, these Handelers are added first
|
||||||
|
@ -346,8 +346,6 @@ class BrowserlyHeaderHandler(BaseHandler):
|
||||||
|
|
||||||
|
|
||||||
def iter_html_tag(html_str, tag_name):
|
def iter_html_tag(html_str, tag_name):
|
||||||
" To avoid parsing whole pages when looking for a simple tag "
|
|
||||||
|
|
||||||
re_tag = r'<%s(\s*[^>])*>' % tag_name
|
re_tag = r'<%s(\s*[^>])*>' % tag_name
|
||||||
re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'
|
re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'
|
||||||
|
|
||||||
|
|
|
@ -88,21 +88,16 @@ def parse_rules(filename=None):
|
||||||
return rules
|
return rules
|
||||||
|
|
||||||
|
|
||||||
def parse(data, url=None, encoding=None, ruleset=None):
|
def parse(data, url=None, encoding=None):
|
||||||
" Determine which ruleset to use "
|
" Determine which ruleset to use "
|
||||||
|
|
||||||
if ruleset is not None:
|
rulesets = parse_rules()
|
||||||
rulesets = [ruleset]
|
|
||||||
|
|
||||||
else:
|
|
||||||
rulesets = parse_rules().values()
|
|
||||||
|
|
||||||
parsers = [FeedXML, FeedHTML, FeedJSON]
|
parsers = [FeedXML, FeedHTML, FeedJSON]
|
||||||
|
|
||||||
# 1) Look for a ruleset based on path
|
# 1) Look for a ruleset based on path
|
||||||
|
|
||||||
if url is not None:
|
if url is not None:
|
||||||
for ruleset in rulesets:
|
for ruleset in rulesets.values():
|
||||||
if 'path' in ruleset:
|
if 'path' in ruleset:
|
||||||
for path in ruleset['path']:
|
for path in ruleset['path']:
|
||||||
if fnmatch(url, path):
|
if fnmatch(url, path):
|
||||||
|
@ -116,6 +111,9 @@ def parse(data, url=None, encoding=None, ruleset=None):
|
||||||
# 3b) See if .items matches anything
|
# 3b) See if .items matches anything
|
||||||
|
|
||||||
for parser in parsers:
|
for parser in parsers:
|
||||||
|
ruleset_candidates = [x for x in rulesets.values() if x['mode'] == parser.mode and 'path' not in x]
|
||||||
|
# 'path' as they should have been caught beforehands
|
||||||
|
|
||||||
try:
|
try:
|
||||||
feed = parser(data, encoding=encoding)
|
feed = parser(data, encoding=encoding)
|
||||||
|
|
||||||
|
@ -126,17 +124,13 @@ def parse(data, url=None, encoding=None, ruleset=None):
|
||||||
else:
|
else:
|
||||||
# parsing worked, now we try the rulesets
|
# parsing worked, now we try the rulesets
|
||||||
|
|
||||||
ruleset_candidates = [x for x in rulesets if x.get('mode', None) in (parser.mode, None) and 'path' not in x]
|
|
||||||
# 'path' as they should have been caught beforehands
|
|
||||||
# try anyway if no 'mode' specified
|
|
||||||
|
|
||||||
for ruleset in ruleset_candidates:
|
for ruleset in ruleset_candidates:
|
||||||
feed.rules = ruleset
|
feed.rules = ruleset
|
||||||
|
|
||||||
try:
|
try:
|
||||||
feed.items[0]
|
feed.items[0]
|
||||||
|
|
||||||
except (AttributeError, IndexError, TypeError):
|
except (AttributeError, IndexError):
|
||||||
# parsing and or item picking did not work out
|
# parsing and or item picking did not work out
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -462,7 +456,7 @@ class ParserXML(ParserBase):
|
||||||
def rule_str(self, rule):
|
def rule_str(self, rule):
|
||||||
match = self.rule_search(rule)
|
match = self.rule_search(rule)
|
||||||
|
|
||||||
html_rich = ('atom' in rule or self.mode == 'html') \
|
html_rich = ('atom' in rule or self.rules['mode'] == 'html') \
|
||||||
and rule in [self.rules.get('item_desc'), self.rules.get('item_content')]
|
and rule in [self.rules.get('item_desc'), self.rules.get('item_content')]
|
||||||
|
|
||||||
if isinstance(match, etree._Element):
|
if isinstance(match, etree._Element):
|
||||||
|
|
|
@ -96,7 +96,7 @@ class Options:
|
||||||
return self.options[key]
|
return self.options[key]
|
||||||
|
|
||||||
else:
|
else:
|
||||||
return None
|
return False
|
||||||
|
|
||||||
def __setitem__(self, key, value):
|
def __setitem__(self, key, value):
|
||||||
self.options[key] = value
|
self.options[key] = value
|
||||||
|
@ -104,13 +104,6 @@ class Options:
|
||||||
def __contains__(self, key):
|
def __contains__(self, key):
|
||||||
return key in self.options
|
return key in self.options
|
||||||
|
|
||||||
def get(self, key, default=None):
|
|
||||||
if key in self.options:
|
|
||||||
return self.options[key]
|
|
||||||
|
|
||||||
else:
|
|
||||||
return default
|
|
||||||
|
|
||||||
|
|
||||||
def ItemFix(item, options, feedurl='/'):
|
def ItemFix(item, options, feedurl='/'):
|
||||||
""" Improves feed items (absolute links, resolve feedburner links, etc) """
|
""" Improves feed items (absolute links, resolve feedburner links, etc) """
|
||||||
|
@ -283,23 +276,22 @@ def FeedFetch(url, options):
|
||||||
|
|
||||||
if options.items:
|
if options.items:
|
||||||
# using custom rules
|
# using custom rules
|
||||||
ruleset = {}
|
rss = feeds.FeedHTML(req['data'], encoding=req['encoding'])
|
||||||
|
|
||||||
ruleset['items'] = options.items
|
rss.rules['title'] = options.title if options.title else '//head/title'
|
||||||
|
rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content'
|
||||||
|
|
||||||
ruleset['title'] = options.get('title', '//head/title')
|
rss.rules['items'] = options.items
|
||||||
ruleset['desc'] = options.get('desc', '//head/meta[@name="description"]/@content')
|
|
||||||
|
|
||||||
ruleset['item_title'] = options.get('item_title', '.')
|
rss.rules['item_title'] = options.item_title if options.item_title else '.'
|
||||||
ruleset['item_link'] = options.get('item_link', './@href|.//a/@href|ancestor::a/@href')
|
rss.rules['item_link'] = options.item_link if options.item_link else './@href|.//a/@href|ancestor::a/@href'
|
||||||
|
|
||||||
if options.item_content:
|
if options.item_content:
|
||||||
ruleset['item_content'] = options.item_content
|
rss.rules['item_content'] = options.item_content
|
||||||
|
|
||||||
if options.item_time:
|
if options.item_time:
|
||||||
ruleset['item_time'] = options.item_time
|
rss.rules['item_time'] = options.item_time
|
||||||
|
|
||||||
rss = feeds.parse(req['data'], encoding=req['encoding'], ruleset=ruleset)
|
|
||||||
rss = rss.convert(feeds.FeedXML)
|
rss = rss.convert(feeds.FeedXML)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -50,7 +50,7 @@ def parse_options(options):
|
||||||
split = option.split('=', 1)
|
split = option.split('=', 1)
|
||||||
|
|
||||||
if len(split) > 1:
|
if len(split) > 1:
|
||||||
out[split[0]] = unquote(split[1]).replace('|', '/') # | -> / for backward compatibility (and Apache)
|
out[split[0]] = split[1]
|
||||||
|
|
||||||
else:
|
else:
|
||||||
out[split[0]] = True
|
out[split[0]] = True
|
||||||
|
@ -58,18 +58,14 @@ def parse_options(options):
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
def request_uri(environ):
|
def get_path(environ):
|
||||||
if 'REQUEST_URI' in environ:
|
if 'REQUEST_URI' in environ:
|
||||||
# when running on Apache/uwsgi
|
# when running on Apache
|
||||||
url = environ['REQUEST_URI']
|
url = unquote(environ['REQUEST_URI'][1:])
|
||||||
|
|
||||||
elif 'RAW_URI' in environ:
|
|
||||||
# gunicorn
|
|
||||||
url = environ['RAW_URI']
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# when using other servers
|
# when using internal server
|
||||||
url = environ['PATH_INFO']
|
url = environ['PATH_INFO'][1:]
|
||||||
|
|
||||||
if environ['QUERY_STRING']:
|
if environ['QUERY_STRING']:
|
||||||
url += '?' + environ['QUERY_STRING']
|
url += '?' + environ['QUERY_STRING']
|
||||||
|
@ -80,13 +76,19 @@ def request_uri(environ):
|
||||||
def cgi_parse_environ(environ):
|
def cgi_parse_environ(environ):
|
||||||
# get options
|
# get options
|
||||||
|
|
||||||
url = request_uri(environ)[1:]
|
url = get_path(environ)
|
||||||
url = re.sub(r'^(cgi/)?(morss.py|main.py)/', '', url)
|
url = re.sub(r'^/?(cgi/)?(morss.py|main.py)/', '', url)
|
||||||
|
|
||||||
if url.startswith(':'):
|
if url.startswith(':'):
|
||||||
parts = url.split('/', 1)
|
split = url.split('/', 1)
|
||||||
raw_options = parts[0].split(':')[1:]
|
|
||||||
url = parts[1] if len(parts) > 1 else ''
|
raw_options = split[0].replace('|', '/').replace('\\\'', '\'').split(':')[1:]
|
||||||
|
|
||||||
|
if len(split) > 1:
|
||||||
|
url = split[1]
|
||||||
|
|
||||||
|
else:
|
||||||
|
url = ''
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raw_options = []
|
raw_options = []
|
||||||
|
@ -162,7 +164,7 @@ def middleware(func):
|
||||||
def cgi_file_handler(environ, start_response, app):
|
def cgi_file_handler(environ, start_response, app):
|
||||||
" Simple HTTP server to serve static files (.html, .css, etc.) "
|
" Simple HTTP server to serve static files (.html, .css, etc.) "
|
||||||
|
|
||||||
url = request_uri(environ)[1:]
|
url = get_path(environ)
|
||||||
|
|
||||||
if url == '':
|
if url == '':
|
||||||
url = 'index.html'
|
url = 'index.html'
|
||||||
|
@ -281,18 +283,11 @@ def cgi_handle_request():
|
||||||
wsgiref.handlers.CGIHandler().run(app)
|
wsgiref.handlers.CGIHandler().run(app)
|
||||||
|
|
||||||
|
|
||||||
class WSGIRequestHandlerRequestUri(wsgiref.simple_server.WSGIRequestHandler):
|
|
||||||
def get_environ(self):
|
|
||||||
env = super().get_environ()
|
|
||||||
env['REQUEST_URI'] = self.path
|
|
||||||
return env
|
|
||||||
|
|
||||||
|
|
||||||
def cgi_start_server():
|
def cgi_start_server():
|
||||||
crawler.default_cache.autotrim()
|
crawler.default_cache.autotrim()
|
||||||
|
|
||||||
print('Serving http://localhost:%s/' % PORT)
|
print('Serving http://localhost:%s/' % PORT)
|
||||||
httpd = wsgiref.simple_server.make_server('', PORT, application, handler_class=WSGIRequestHandlerRequestUri)
|
httpd = wsgiref.simple_server.make_server('', PORT, application)
|
||||||
httpd.serve_forever()
|
httpd.serve_forever()
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue