Compare commits
3 Commits
fdf9acd32b
...
4d6d3c9239
Author | SHA1 | Date |
---|---|---|
pictuga | 4d6d3c9239 | |
pictuga | e81f6b173f | |
pictuga | fe5dbf1ce0 |
|
@ -59,7 +59,9 @@ except NameError:
|
||||||
MIMETYPE = {
|
MIMETYPE = {
|
||||||
'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
|
'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
|
||||||
'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
|
'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
|
||||||
'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
|
'html': ['text/html', 'application/xhtml+xml', 'application/xml'],
|
||||||
|
'json': ['application/json'],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_UAS = [
|
DEFAULT_UAS = [
|
||||||
|
|
|
@ -207,8 +207,10 @@ def clean_root(root, keep_threshold=None):
|
||||||
def clean_node(node, keep_threshold=None):
|
def clean_node(node, keep_threshold=None):
|
||||||
parent = node.getparent()
|
parent = node.getparent()
|
||||||
|
|
||||||
|
# remove comments
|
||||||
if (isinstance(node, lxml.html.HtmlComment)
|
if (isinstance(node, lxml.html.HtmlComment)
|
||||||
or isinstance(node, lxml.html.HtmlProcessingInstruction)):
|
or isinstance(node, lxml.html.HtmlProcessingInstruction)):
|
||||||
|
parent.remove(node)
|
||||||
return
|
return
|
||||||
|
|
||||||
if parent is None:
|
if parent is None:
|
||||||
|
@ -242,11 +244,6 @@ def clean_node(node, keep_threshold=None):
|
||||||
parent.remove(node)
|
parent.remove(node)
|
||||||
return
|
return
|
||||||
|
|
||||||
# remove comments
|
|
||||||
if isinstance(node, lxml.html.HtmlComment) or isinstance(node, lxml.html.HtmlProcessingInstruction):
|
|
||||||
parent.remove(node)
|
|
||||||
return
|
|
||||||
|
|
||||||
# remove if too many kids & too high link density
|
# remove if too many kids & too high link density
|
||||||
wc = count_words(node.text_content())
|
wc = count_words(node.text_content())
|
||||||
if wc != 0 and len(list(node.iter())) > 3:
|
if wc != 0 and len(list(node.iter())) > 3:
|
||||||
|
|
|
@ -192,9 +192,10 @@ def cgi_get(environ, start_response):
|
||||||
url, options = cgi_parse_environ(environ)
|
url, options = cgi_parse_environ(environ)
|
||||||
|
|
||||||
# get page
|
# get page
|
||||||
|
if options['get'] in ('page', 'article'):
|
||||||
req = crawler.adv_get(url=url, timeout=TIMEOUT)
|
req = crawler.adv_get(url=url, timeout=TIMEOUT)
|
||||||
|
|
||||||
if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']:
|
if req['contenttype'] in crawler.MIMETYPE['html']:
|
||||||
if options['get'] == 'page':
|
if options['get'] == 'page':
|
||||||
html = readabilite.parse(req['data'], encoding=req['encoding'])
|
html = readabilite.parse(req['data'], encoding=req['encoding'])
|
||||||
html.make_links_absolute(req['url'])
|
html.make_links_absolute(req['url'])
|
||||||
|
@ -207,17 +208,20 @@ def cgi_get(environ, start_response):
|
||||||
|
|
||||||
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')
|
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')
|
||||||
|
|
||||||
elif options['get'] == 'article':
|
else: # i.e. options['get'] == 'article'
|
||||||
output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
|
output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
|
||||||
|
|
||||||
|
elif req['contenttype'] in crawler.MIMETYPE['xml'] + crawler.MIMETYPE['rss'] + crawler.MIMETYPE['json']:
|
||||||
|
output = req['data']
|
||||||
|
|
||||||
|
else:
|
||||||
|
raise MorssException('unsupported mimetype')
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise MorssException('no :get option passed')
|
raise MorssException('no :get option passed')
|
||||||
|
|
||||||
else:
|
|
||||||
output = req['data']
|
|
||||||
|
|
||||||
# return html page
|
# return html page
|
||||||
headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8', 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse
|
headers = {'status': '200 OK', 'content-type': req['contenttype'], 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse
|
||||||
start_response(headers['status'], list(headers.items()))
|
start_response(headers['status'], list(headers.items()))
|
||||||
return [output]
|
return [output]
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue