Compare commits


9 Commits

Author SHA1 Message Date
pictuga 038f267ea2 Rename :theforce into :force 2020-05-13 11:49:15 +02:00
pictuga 22005065e8 Use etree.tostring 'method' arg
Gives appropriately formatted html code.
Some pages might otherwise be rendered as blank.
2020-05-13 11:44:34 +02:00
pictuga 7d0d416610 morss: cache articles for 24hrs
Also make it possible to refetch articles, regardless of cache
2020-05-12 21:10:31 +02:00
pictuga 5dac4c69a1 crawler: more code comments 2020-05-12 20:44:25 +02:00
pictuga 36e2a1c3fd crawler: increase size limit from 100KiB to 500KiB
I'm looking at you, worldbankgroup.csod.com/ats/careersite/search.aspx
2020-05-12 19:34:16 +02:00
pictuga 83dd2925d3 readabilite: better parsing
Keeping blank_text keeps the tree more as-is, making the final output closer to expectations
2020-05-12 14:15:53 +02:00
pictuga e09d0abf54 morss: remove deprecated piece of code 2020-05-07 16:05:30 +02:00
pictuga ff26a560cb Shift safari work around to morss.py 2020-05-07 16:04:54 +02:00
pictuga 74d7a1eca2 sheet.xsl: fix word wrap 2020-05-06 16:58:28 +02:00
7 changed files with 60 additions and 25 deletions

View File

@@ -89,7 +89,7 @@ The arguments are:
 - `noref`: drop items' link
 - `cache`: only take articles from the cache (ie. don't grab new articles' content), so as to save time
 - `debug`: to have some feedback from the script execution. Useful for debugging
-- `theforce`: force download the rss feed and ignore cached http errros
+- `force`: force refetch the rss feed and articles
 - `silent`: don't output the final RSS (useless on its own, but can be nice when debugging)
 - http server only
 - `callback=NAME`: for JSONP calls
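As a hedged usage sketch: the renamed option is passed like any other, e.g. as a `:option` path segment when morss runs as an HTTP server. The host, port and feed URL below are placeholders, not taken from this repository.

    import urllib.request

    # placeholder endpoint and feed; ':force' forces a refetch of the feed and its articles
    cached = urllib.request.urlopen('http://localhost:8080/https://example.org/feed.xml')
    forced = urllib.request.urlopen('http://localhost:8080/:force/https://example.org/feed.xml')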

View File

@@ -93,7 +93,7 @@ def custom_handler(follow=None, delay=None, encoding=None):
     # & HTTPSHandler
     #handlers.append(DebugHandler())
-    handlers.append(SizeLimitHandler(100*1024)) # 100KiB
+    handlers.append(SizeLimitHandler(500*1024)) # 500KiB
     handlers.append(HTTPCookieProcessor())
     handlers.append(GZIPHandler())
     handlers.append(HTTPEquivHandler())
@@ -387,12 +387,28 @@ default_cache = {}

 class CacheHandler(BaseHandler):
     " Cache based on etags/last-modified "

-    private_cache = False # False to behave like a CDN (or if you just don't care), True like a PC
+    private_cache = False # Websites can indicate whether the page should be
+                          # cached by CDNs (e.g. shouldn't be the case for
+                          # private/confidential/user-specific pages).
+                          # With this setting, decide whether (False) you want
+                          # the cache to behave like a CDN (i.e. don't cache
+                          # private pages), or (True) to behave like an end-user
+                          # cache and also cache private pages. If unsure,
+                          # False is the safest bet.

     handler_order = 499

     def __init__(self, cache=None, force_min=None):
         self.cache = cache or default_cache
-        self.force_min = force_min # force_min (seconds) to bypass http headers, -1 forever, 0 never, -2 do nothing if not in cache
+        self.force_min = force_min
+        # Servers indicate how long they think their content is "valid".
+        # With this parameter (force_min, expressed in seconds), we can
+        # override the validity period (i.e. bypassing http headers)
+        # Special values:
+        #   -1: valid forever, i.e. use the cache no matter what (and fetch
+        #       the page online if not present in cache)
+        #    0: valid for zero seconds, i.e. force refresh
+        #   -2: same as -1, i.e. use the cache no matter what, but do NOT
+        #       fetch the page online if not present in cache, throw an
+        #       error instead

     def load(self, url):
         try:
@@ -422,6 +438,10 @@ class CacheHandler(BaseHandler):
         return req

     def http_open(self, req):
+        # Reminder of how/when this function is called by urllib2:
+        # If 'None' is returned, try your chance with the next-available handler
+        # If a 'resp' is returned, stop there, and proceed with 'http_response'
+
         (code, msg, headers, data, timestamp) = self.load(req.get_full_url())

         # some info needed to process everything
@@ -444,6 +464,7 @@ class CacheHandler(BaseHandler):
                 pass

             else:
+                # raise an error, via urllib handlers
                 headers['Morss'] = 'from_cache'
                 resp = addinfourl(BytesIO(), headers, req.get_full_url(), 409)
                 resp.msg = 'Conflict'
@@ -462,14 +483,18 @@ class CacheHandler(BaseHandler):
             return None

         elif code == 301 and cache_age < 7*24*3600:
-            # "301 Moved Permanently" has to be cached...as long as we want (awesome HTTP specs), let's say a week (why not?)
-            # use force_min=0 if you want to bypass this (needed for a proper refresh)
+            # "301 Moved Permanently" has to be cached...as long as we want
+            # (awesome HTTP specs), let's say a week (why not?). Use force_min=0
+            # if you want to bypass this (needed for a proper refresh)
             pass

         elif self.force_min is None and ('no-cache' in cc_list
                 or 'no-store' in cc_list
                 or ('private' in cc_list and not self.private_cache)):
             # kindly follow web servers indications, refresh
+            # if the same settings are used all along, this section shouldn't be
+            # of any use, since the page wouldn't be cached in the first place
+            # the check is only performed "just in case"
             return None

         elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age:
@@ -484,7 +509,7 @@ class CacheHandler(BaseHandler):
             # according to the www, we have to refresh when nothing is said
             return None

-        # return the cache as a response
+        # return the cache as a response. This code is reached with 'pass' above
         headers['morss'] = 'from_cache' # TODO delete the morss header from incoming pages, to avoid websites messing up with us
         resp = addinfourl(BytesIO(data), headers, req.get_full_url(), code)
         resp.msg = msg
@@ -515,6 +540,8 @@ class CacheHandler(BaseHandler):
             data = resp.read()
             self.save(req.get_full_url(), resp.code, resp.msg, resp.headers, data, time.time())

+            # the below is only needed because of 'resp.read()' above, as we can't
+            # seek(0) on arbitrary file-like objects (e.g. sockets)
             fp = BytesIO(data)
             old_resp = resp
             resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
@@ -534,10 +561,14 @@ class CacheHandler(BaseHandler):
                           unverifiable=True)

             new.add_unredirected_header('Morss', 'from_304')

+            # create a "fake" new request to just re-run through the various
+            # handlers
             return self.parent.open(new, timeout=req.timeout)

-        return None
+        return None # when returning 'None', the next-available handler is used
+                    # the 'HTTPRedirectHandler' has no 'handler_order', i.e.
+                    # uses the default of 500, therefore executed after this

     https_request = http_request
     https_open = http_open
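The delay / force_min values documented above drive how adv_get() (used by morss.py below) decides between cache and network. A minimal illustrative sketch; the import path and the assumption that adv_get's delay is forwarded to CacheHandler's force_min are inferred from this diff, and the article URL is a placeholder:

    from morss import crawler  # assumed import path

    url = 'https://example.org/article'  # placeholder

    req = crawler.adv_get(url=url, delay=24*60*60)  # accept a cached copy up to 24h old
    req = crawler.adv_get(url=url, delay=0)         #  0: force a refresh, ignore the cache
    req = crawler.adv_get(url=url, delay=-1)        # -1: use the cache no matter what, fetch if missing
    req = crawler.adv_get(url=url, delay=-2)        # -2: cache only, error instead of going online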

View File

@@ -319,7 +319,7 @@ class ParserXML(ParserBase):
         return self.root.getparent().remove(self.root)

     def tostring(self, encoding='unicode', **k):
-        return etree.tostring(self.root, encoding=encoding, **k)
+        return etree.tostring(self.root, encoding=encoding, method='xml', **k)

     def _rule_parse(self, rule):
         test = re.search(r'^(.*)/@([a-z]+)$', rule) # to match //div/a/@href
@@ -463,7 +463,7 @@ class ParserHTML(ParserXML):
         return html_parse(raw, encoding=self.encoding)

     def tostring(self, encoding='unicode', **k):
-        return lxml.html.tostring(self.root, encoding=encoding, **k)
+        return lxml.html.tostring(self.root, encoding=encoding, method='html', **k)

     def rule_search_all(self, rule):
         try:
@@ -724,7 +724,7 @@ class FeedXML(Feed, ParserXML):
         if self.root.getprevious() is None:
             self.root.addprevious(etree.PI('xml-stylesheet', 'type="text/xsl" href="/sheet.xsl"'))

-        return etree.tostring(self.root.getroottree(), encoding=encoding, **k)
+        return etree.tostring(self.root.getroottree(), encoding=encoding, method='xml', **k)


 class ItemXML(Item, ParserXML):
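The effect of pinning the serialisation method (see the "Use etree.tostring 'method' arg" commit above) can be reproduced with a standalone snippet; an empty element such as <script> is the typical cause of the blank pages mentioned in that commit message:

    import lxml.html
    import lxml.etree

    frag = lxml.html.fromstring('<div>text<script src="app.js"></script></div>')

    lxml.etree.tostring(frag)
    # b'<div>text<script src="app.js"/></div>' -- the default XML serialiser
    # self-closes the empty <script>, which browsers treat as an unterminated script tag

    lxml.etree.tostring(frag, method='html')
    # b'<div>text<script src="app.js"></script></div>' -- valid HTML output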

View File

@@ -243,10 +243,17 @@ def ItemFill(item, options, feedurl='/', fast=False):
     # download
     delay = -1

-    if fast:
-        # super-fast mode
+    if fast or options.fast:
+        # force cache, don't fetch
         delay = -2

+    elif options.force:
+        # force refresh
+        delay = 0
+
+    else:
+        delay = 24*60*60 # 24h
+
     try:
         req = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)
@@ -287,7 +294,7 @@ def ItemAfter(item, options):
         for link in content.xpath('//a'):
             log(link.text_content())
             link.drop_tag()

-        item.content = lxml.etree.tostring(content)
+        item.content = lxml.etree.tostring(content, method='html')

     if options.noref:
         item.link = ''
@@ -299,7 +306,7 @@ def FeedFetch(url, options):
     # fetch feed
     delay = DELAY

-    if options.theforce:
+    if options.force:
         delay = 0

     try:
@@ -487,6 +494,7 @@ def cgi_app(environ, start_response):
     # headers
     headers['status'] = '200 OK'
     headers['cache-control'] = 'max-age=%s' % DELAY
+    headers['x-content-type-options'] = 'nosniff' # safari work around

     if options.cors:
         headers['access-control-allow-origin'] = '*'
@@ -512,9 +520,6 @@ def cgi_app(environ, start_response):
     # get the work done
     url, rss = FeedFetch(url, options)

-    if headers['content-type'] == 'text/xml':
-        headers['content-type'] = rss.mimetype[0]
-
     start_response(headers['status'], list(headers.items()))

     rss = FeedGather(rss, url, options)
@@ -607,7 +612,7 @@ def cgi_get(environ, start_response):
            for elem in html.xpath('//'+tag):
                elem.getparent().remove(elem)

-        output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')
+        output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')

     elif options.get == 'article':
         output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
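Since the Safari work-around now lives in cgi_app() instead of .htaccess (see the .htaccess diff further down), one way to sanity-check a running instance is a quick request; host, port and feed URL are placeholders:

    import urllib.request

    # placeholder endpoint; any feed served by the local morss instance will do
    resp = urllib.request.urlopen('http://localhost:8080/https://example.org/feed.xml')

    print(resp.headers.get('x-content-type-options'))  # expected: nosniff
    print(resp.headers.get('cache-control'))           # expected: max-age=<DELAY>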

View File

@@ -11,7 +11,7 @@ def parse(data, encoding=None):
     else:
         data = BeautifulSoup(data, 'lxml').prettify('utf-8')

-    parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding='utf-8')
+    parser = lxml.html.HTMLParser(remove_comments=True, encoding='utf-8')

     return lxml.html.fromstring(data, parser=parser)
@@ -101,7 +101,7 @@ def score_node(node):
     " Score individual node "

     score = 0
-    class_id = node.get('class', '') + node.get('id', '')
+    class_id = (node.get('class') or '') + (node.get('id') or '')

     if (isinstance(node, lxml.html.HtmlComment)
             or isinstance(node, lxml.html.HtmlProcessingInstruction)):
@@ -341,7 +341,7 @@ def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=
     if url:
         best.make_links_absolute(url)

-    return lxml.etree.tostring(best if not debug else html, pretty_print=True, encoding=encoding_out)
+    return lxml.etree.tostring(best if not debug else html, method='html', encoding=encoding_out)


 if __name__ == '__main__':
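The "readabilite: better parsing" commit message refers to lxml's remove_blank_text option being dropped above. A minimal sketch of why; the exact behaviour depends on lxml's whitespace heuristics, so treat this as illustrative:

    import lxml.html

    raw = b'<p><a href="#">hello</a> <b>world</b></p>'

    keep  = lxml.html.HTMLParser(remove_comments=True, encoding='utf-8')
    strip = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding='utf-8')

    lxml.html.tostring(lxml.html.fromstring(raw, parser=keep))
    # b'<p><a href="#">hello</a> <b>world</b></p>' -- the space survives

    lxml.html.tostring(lxml.html.fromstring(raw, parser=strip))
    # the whitespace-only text node between </a> and <b> may be stripped,
    # gluing "hello" and "world" together once the page is rendered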

View File

@@ -4,9 +4,6 @@ ErrorDocument 403 "Access forbidden"
 ErrorDocument 404 /cgi/main.py
 ErrorDocument 500 "A very nasty bug found his way onto this very server"

-# Work around for Safari
-Header set X-Content-Type-Options "nosniff"
-
 <Files ~ "\.(py|pyc|db|log)$">
     deny from all
 </Files>

View File

@@ -21,6 +21,8 @@
     body {
         overflow-wrap: anywhere;
         word-wrap: anywhere;
+        word-break: break-word;
         font-family: sans-serif;
     }