crawler: make AutoRefererHandler match http(s) type

crawler: replace ContentNegoHandler with AlternateHandler
More basic: it sends the same headers no matter what, which makes requests more "replicable". Also drops "text/xml" from the RSS content types, since it is too broad and matches garbage.
2020-04-05 16:07:51 +02:00 · 2020-04-05 16:05:59 +02:00 · 2020-04-05 16:03:06 +02:00
2 changed files with 14 additions and 32 deletions
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -27,13 +27,14 @@ except NameError:

 MIMETYPE = {
    'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
+    'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
    'html': ['text/html', 'application/xhtml+xml', 'application/xml']}


 DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'


-def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=False):
+def custom_handler(follow=None, delay=None, encoding=None):
    handlers = []

    # as per urllib2 source code, these Handlers are added first
@@ -51,14 +52,12 @@ def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=F
    handlers.append(HTTPEquivHandler())
    handlers.append(HTTPRefreshHandler())
    handlers.append(UAHandler(DEFAULT_UA))
-
-    if not basic:
    handlers.append(AutoRefererHandler())

    handlers.append(EncodingFixHandler(encoding))

-    if accept:
-        handlers.append(ContentNegociationHandler(MIMETYPE[accept], strict))
+    if follow:
+        handlers.append(AlternateHandler(MIMETYPE[follow]))

    handlers.append(CacheHandler(force_min=delay))

@@ -198,43 +197,28 @@ class UAHandler(BaseHandler):

 class AutoRefererHandler(BaseHandler):
    def http_request(self, req):
-        req.add_unredirected_header('Referer', 'http://%s' % req.host)
+        req.add_unredirected_header('Referer', '%s://%s' % (req.type, req.host))
        return req

    https_request = http_request


-class ContentNegociationHandler(BaseHandler):
-    " Handler for content negociation. Also parses <link rel='alternate' type='application/rss+xml' href='...' /> "
+class AlternateHandler(BaseHandler):
+    " Follow <link rel='alternate' type='application/rss+xml' href='...' /> "

-    def __init__(self, accept=None, strict=False):
-        self.accept = accept
-        self.strict = strict
-
-    def http_request(self, req):
-        if self.accept is not None:
-            if isinstance(self.accept, basestring):
-                self.accept = (self.accept,)
-
-            string = ','.join(self.accept)
-
-            if self.strict:
-                string += ',*/*;q=0.9'
-
-            req.add_unredirected_header('Accept', string)
-
-        return req
+    def __init__(self, follow=None):
+        self.follow = follow or []

    def http_response(self, req, resp):
        contenttype = resp.info().get('Content-Type', '').split(';')[0]
-        if 200 <= resp.code < 300 and self.accept is not None and self.strict and contenttype in MIMETYPE['html'] and contenttype not in self.accept:
+        if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow:
            # oops, not what we were looking for; let's see if the html page suggests an alternate page of the right type

            data = resp.read()
            links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')

            for link in links:
-                if link.get('type', '') in self.accept:
+                if link.get('type', '') in self.follow:
                    resp.code = 302
                    resp.msg = 'Moved Temporarily'
                    resp.headers['location'] = link.get('href')
@@ -246,7 +230,6 @@ class ContentNegociationHandler(BaseHandler):

        return resp

-    https_request = http_request
    https_response = http_response


--- a/morss/morss.py
+++ b/morss/morss.py
@@ -252,7 +252,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
        delay = -2

    try:
-        con = crawler.custom_handler('html', False, delay, options.encoding).open(link, timeout=TIMEOUT)
+        con = crawler.custom_handler(delay=delay, encoding=options.encoding).open(link, timeout=TIMEOUT)
        data = con.read()

    except (IOError, HTTPException) as e:
@@ -335,8 +335,7 @@ def FeedFetch(url, options):
        delay = 0

    try:
-        con = crawler.custom_handler(accept='xml', strict=True, delay=delay,
-            encoding=options.encoding, basic=not options.items) \
+        con = crawler.custom_handler(follow='rss', delay=delay, encoding=options.encoding) \
            .open(url, timeout=TIMEOUT * 2)
        xml = con.read()
Author	SHA1	Message	Date
pictuga	bf86c1e962	crawler: make AutoUA match http(s) type	2020-04-05 16:07:51 +02:00
pictuga	d20f6237bd	crawler: replace ContentNegoHandler with AlternateHandler More basic. Sends the same headers no matter what. Make requests more "replicable". Also, drop "text/xml" from RSS contenttype, too broad, matches garbage	2020-04-05 16:05:59 +02:00
pictuga	8a4d68d72c	crawler: drop 'basic' toggle Can't even remember the use case	2020-04-05 16:03:06 +02:00