Rename :theforce into :force

Use etree.tostring 'method' arg
Gives appropriately formatted html code. Some pages might otherwise be rendered as blank.
2020-05-13 11:49:15 +02:00 · 2020-05-13 11:44:34 +02:00 · 2020-05-12 21:10:31 +02:00 · 2020-05-12 20:44:25 +02:00 · 2020-05-12 19:34:16 +02:00 · 2020-05-12 14:15:53 +02:00
7 changed files with 60 additions and 25 deletions
--- a/README.md
+++ b/README.md
@@ -89,7 +89,7 @@ The arguments are:
 	- `noref`: drop items' link
 	- `cache`: only take articles from the cache (ie. don't grab new articles' content), so as to save time
 	- `debug`: to have some feedback from the script execution. Useful for debugging
-	- `theforce`: force download the rss feed and ignore cached http errros
+	- `force`: force refetch the rss feed and articles
 	- `silent`: don't output the final RSS (useless on its own, but can be nice when debugging)
 - http server only
 	- `callback=NAME`: for JSONP calls
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -93,7 +93,7 @@ def custom_handler(follow=None, delay=None, encoding=None):
    # & HTTPSHandler

    #handlers.append(DebugHandler())
-    handlers.append(SizeLimitHandler(100*1024)) # 100KiB
+    handlers.append(SizeLimitHandler(500*1024)) # 500KiB
    handlers.append(HTTPCookieProcessor())
    handlers.append(GZIPHandler())
    handlers.append(HTTPEquivHandler())
@@ -387,12 +387,28 @@ default_cache = {}
 class CacheHandler(BaseHandler):
    " Cache based on etags/last-modified "

-    private_cache = False # False to behave like a CDN (or if you just don't care), True like a PC
+    private_cache = False # Websites can indicate whether the page should be
+                          # cached by CDNs (e.g. shouldn't be the case for
+                          # private/confidential/user-specific pages).
+                          # With this setting, decide whether (False) you want
+                          # the cache to behave like a CDN (i.e. don't cache
+                          # private pages), or (True) like an end-user cache
+                          # (private pages are cached). If unsure, False is the safest bet.
    handler_order = 499

    def __init__(self, cache=None, force_min=None):
        self.cache = cache or default_cache
-        self.force_min = force_min # force_min (seconds) to bypass http headers, -1 forever, 0 never, -2 do nothing if not in cache
+        self.force_min = force_min
+            # Servers indicate how long they think their content is "valid".
+            # With this parameter (force_min, expressed in seconds), we can
+            # override the validity period (i.e. bypassing http headers)
+            # Special values:
+            #   -1: valid forever, i.e. use the cache no matter what (and fetch
+            #       the page online if not present in cache)
+            #    0: valid zero second, i.e. force refresh
+            #   -2: same as -1, i.e. use the cache no matter what, but do NOT
+            #       fetch the page online if not present in cache, throw an
+            #       error instead

    def load(self, url):
        try:
@@ -422,6 +438,10 @@ class CacheHandler(BaseHandler):
        return req

    def http_open(self, req):
+        # Reminder of how/when this function is called by urllib2:
+        # If 'None' is returned, try your chance with the next-available handler
+        # If a 'resp' is returned, stop there, and proceed with 'http_response'
+
        (code, msg, headers, data, timestamp) = self.load(req.get_full_url())

        # some info needed to process everything
@@ -444,6 +464,7 @@ class CacheHandler(BaseHandler):
                pass

            else:
+                # raise an error, via urllib handlers
                headers['Morss'] = 'from_cache'
                resp = addinfourl(BytesIO(), headers, req.get_full_url(), 409)
                resp.msg = 'Conflict'
@@ -462,14 +483,18 @@ class CacheHandler(BaseHandler):
            return None

        elif code == 301 and cache_age < 7*24*3600:
-            # "301 Moved Permanently" has to be cached...as long as we want (awesome HTTP specs), let's say a week (why not?)
-            # use force_min=0 if you want to bypass this (needed for a proper refresh)
+            # "301 Moved Permanently" has to be cached...as long as we want
+            # (awesome HTTP specs), let's say a week (why not?). Use force_min=0
+            # if you want to bypass this (needed for a proper refresh)
            pass

        elif  self.force_min is None and ('no-cache' in cc_list
                                        or 'no-store' in cc_list
                                        or ('private' in cc_list and not self.private_cache)):
            # kindly follow web servers indications, refresh
+            # if the same settings are used all along, this section shouldn't be
+            # of any use, since the page wouldn't be cached in the first place
+            # the check is only performed "just in case"
            return None

        elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age:
@@ -484,7 +509,7 @@ class CacheHandler(BaseHandler):
            # according to the www, we have to refresh when nothing is said
            return None

-        # return the cache as a response
+        # return the cache as a response. This code is reached with 'pass' above
        headers['morss'] = 'from_cache' # TODO delete the morss header from incoming pages, to avoid websites messing up with us
        resp = addinfourl(BytesIO(data), headers, req.get_full_url(), code)
        resp.msg = msg
@@ -515,6 +540,8 @@ class CacheHandler(BaseHandler):
        data = resp.read()
        self.save(req.get_full_url(), resp.code, resp.msg, resp.headers, data, time.time())

+        # the below is only needed because of 'resp.read()' above, as we can't
+        # seek(0) on arbitrary file-like objects (e.g. sockets)
        fp = BytesIO(data)
        old_resp = resp
        resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
@@ -534,10 +561,14 @@ class CacheHandler(BaseHandler):
                           unverifiable=True)

            new.add_unredirected_header('Morss', 'from_304')
+                # create a "fake" new request to just re-run through the various
+                # handlers

            return self.parent.open(new, timeout=req.timeout)

-        return None
+        return None # when returning 'None', the next-available handler is used
+                    # the 'HTTPRedirectHandler' has no 'handler_order', i.e.
+                    # uses the default of 500, therefore executed after this

    https_request = http_request
    https_open = http_open
--- a/morss/feeds.py
+++ b/morss/feeds.py
@@ -319,7 +319,7 @@ class ParserXML(ParserBase):
        return self.root.getparent().remove(self.root)

    def tostring(self, encoding='unicode', **k):
-        return etree.tostring(self.root, encoding=encoding, **k)
+        return etree.tostring(self.root, encoding=encoding, method='xml', **k)

    def _rule_parse(self, rule):
        test = re.search(r'^(.*)/@([a-z]+)$', rule) # to match //div/a/@href
@@ -463,7 +463,7 @@ class ParserHTML(ParserXML):
        return html_parse(raw, encoding=self.encoding)

    def tostring(self, encoding='unicode', **k):
-        return lxml.html.tostring(self.root, encoding=encoding, **k)
+        return lxml.html.tostring(self.root, encoding=encoding, method='html', **k)

    def rule_search_all(self, rule):
        try:
@@ -724,7 +724,7 @@ class FeedXML(Feed, ParserXML):
        if self.root.getprevious() is None:
            self.root.addprevious(etree.PI('xml-stylesheet', 'type="text/xsl" href="/sheet.xsl"'))

-        return etree.tostring(self.root.getroottree(), encoding=encoding, **k)
+        return etree.tostring(self.root.getroottree(), encoding=encoding, method='xml', **k)


 class ItemXML(Item, ParserXML):
--- a/morss/morss.py
+++ b/morss/morss.py
@@ -243,10 +243,17 @@ def ItemFill(item, options, feedurl='/', fast=False):
    # download
    delay = -1

-    if fast:
-        # super-fast mode
+    if fast or options.fast:
+        # force cache, don't fetch
        delay = -2

+    elif options.force:
+        # force refresh
+        delay = 0
+
+    else:
+        delay = 24*60*60 # 24h
+
    try:
        req = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)

@@ -287,7 +294,7 @@ def ItemAfter(item, options):
        for link in content.xpath('//a'):
            log(link.text_content())
            link.drop_tag()
-        item.content = lxml.etree.tostring(content)
+        item.content = lxml.etree.tostring(content, method='html')

    if options.noref:
        item.link = ''
@@ -299,7 +306,7 @@ def FeedFetch(url, options):
    # fetch feed
    delay = DELAY

-    if options.theforce:
+    if options.force:
        delay = 0

    try:
@@ -487,6 +494,7 @@ def cgi_app(environ, start_response):
    # headers
    headers['status'] = '200 OK'
    headers['cache-control'] = 'max-age=%s' % DELAY
+    headers['x-content-type-options'] = 'nosniff' # safari work around

    if options.cors:
        headers['access-control-allow-origin'] = '*'
@@ -512,9 +520,6 @@ def cgi_app(environ, start_response):
    # get the work done
    url, rss = FeedFetch(url, options)

-    if headers['content-type'] == 'text/xml':
-        headers['content-type'] = rss.mimetype[0]
-
    start_response(headers['status'], list(headers.items()))

    rss = FeedGather(rss, url, options)
@@ -607,7 +612,7 @@ def cgi_get(environ, start_response):
                for elem in html.xpath('//'+tag):
                    elem.getparent().remove(elem)

-            output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')
+            output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')

        elif options.get == 'article':
            output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
--- a/morss/readabilite.py
+++ b/morss/readabilite.py
@@ -11,7 +11,7 @@ def parse(data, encoding=None):
    else:
        data = BeautifulSoup(data, 'lxml').prettify('utf-8')

-    parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding='utf-8')
+    parser = lxml.html.HTMLParser(remove_comments=True, encoding='utf-8')

    return lxml.html.fromstring(data, parser=parser)

@@ -101,7 +101,7 @@ def score_node(node):
    " Score individual node "

    score = 0
-    class_id = node.get('class', '') + node.get('id', '')
+    class_id = (node.get('class') or '') + (node.get('id') or '')

    if (isinstance(node, lxml.html.HtmlComment)
            or isinstance(node, lxml.html.HtmlProcessingInstruction)):
@@ -341,7 +341,7 @@ def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=
    if url:
        best.make_links_absolute(url)

-    return lxml.etree.tostring(best if not debug else html, pretty_print=True, encoding=encoding_out)
+    return lxml.etree.tostring(best if not debug else html, method='html', encoding=encoding_out)


 if __name__ == '__main__':
--- a/www/.htaccess
+++ b/www/.htaccess
@@ -4,9 +4,6 @@ ErrorDocument 403 "Access forbidden"
 ErrorDocument 404 /cgi/main.py
 ErrorDocument 500 "A very nasty bug found his way onto this very server"

-# Work around for Safari
-Header set X-Content-Type-Options "nosniff"
-
 <Files ~ "\.(py|pyc|db|log)$">
 	deny from all
 </Files>
--- a/www/sheet.xsl
+++ b/www/sheet.xsl
@@ -21,6 +21,8 @@
 				body {
 					overflow-wrap: anywhere;
 					word-wrap: anywhere;
+					word-break: break-word;
+
 					font-family: sans-serif;
 				}
Author	SHA1	Message	Date
pictuga	038f267ea2	Rename :theforce into :force	2020-05-13 11:49:15 +02:00
pictuga	22005065e8	Use etree.tostring 'method' arg Gives appropriately formatted html code. Some pages might otherwise be rendered as blank.	2020-05-13 11:44:34 +02:00
pictuga	7d0d416610	morss: cache articles for 24hrs Also make it possible to refetch articles, regardless of cache	2020-05-12 21:10:31 +02:00
pictuga	5dac4c69a1	crawler: more code comments	2020-05-12 20:44:25 +02:00
pictuga	36e2a1c3fd	crawler: increase size limit from 100KiB to 500 I'm looking at you, worldbankgroup.csod.com/ats/careersite/search.aspx	2020-05-12 19:34:16 +02:00
pictuga	83dd2925d3	readabilite: better parsing Keeping blank_text keeps the tree more as-it, making the final output closer to expectations	2020-05-12 14:15:53 +02:00
pictuga	e09d0abf54	morss: remove deprecated peace of code	2020-05-07 16:05:30 +02:00
pictuga	ff26a560cb	Shift safari work around to morss.py	2020-05-07 16:04:54 +02:00
pictuga	74d7a1eca2	sheet.xsl: fix word wrap	2020-05-06 16:58:28 +02:00