Compare commits


11 Commits

Author SHA1 Message Date
pictuga e5a82ff1f4 crawler: drop auto-referer
Was solving some issues. But creating even more issues.
2020-04-07 10:39:21 +02:00
pictuga f3d1f92b39 Detect encoding everytime 2020-04-07 10:38:36 +02:00
pictuga 7691df5257 Use wrapper for http calls 2020-04-07 10:30:17 +02:00
pictuga 0ae0dbc175 README: mention csv output 2020-04-07 09:24:32 +02:00
pictuga f1d0431e68 morss: drop :html, replaced with :reader
README updated accordingly
2020-04-07 09:23:29 +02:00
pictuga a09831415f feeds: fix bug when mimetype matches nothing 2020-04-06 18:53:07 +02:00
pictuga bfad6b7a4a readabilite: clean before counting
To remove links which are not kept anyway
2020-04-06 16:55:39 +02:00
pictuga 6b8c3e51e7 readabilite: fix threshold feature
Awkward typo...
2020-04-06 16:52:06 +02:00
pictuga dc9e425247 readabilite: don't clean-out the top 10% nodes
Loosen up the code once again to limit over-kill
2020-04-06 14:26:28 +02:00
pictuga 2f48e18bb1 readabilite: put scores directly in html node
Probably slower but makes code somewhat cleaner...
2020-04-06 14:21:41 +02:00
pictuga 31cac921c7 README: remove ref to iTunes 2020-04-05 22:20:33 +02:00
5 changed files with 97 additions and 72 deletions

View File: README.md

@ -24,15 +24,13 @@ hand-written rules (ie. there's no automatic detection of links to build feeds).
Please mind that feeds based on html files may stop working unexpectedly, due to
html structure changes on the target website.
Additionally morss can grab the source xml feed of iTunes podcast, and detect
rss feeds in html pages' `<meta>`.
Additionally morss can detect rss feeds in html pages' `<meta>`.
You can use this program online for free at **[morss.it](https://morss.it/)**.
Some features of morss:
- Read RSS/Atom feeds
- Create RSS feeds from json/html pages
- Convert iTunes podcast links into xml links
- Export feeds as RSS/JSON/CSV/HTML
- Fetch full-text content of feed items
- Follow 301/meta redirects
@ -75,6 +73,8 @@ The arguments are:
- Change what morss does
- `json`: output as JSON
- `html`: output as HTML
- `csv`: output as CSV
- `proxy`: doesn't fill the articles
- `clip`: stick the full article content under the original feed content (useful for twitter)
- `search=STRING`: does a basic case-sensitive search in the feed
@ -88,11 +88,9 @@ The arguments are:
- `mono`: disable multithreading while fetching, makes debugging easier
- `theforce`: force download the rss feed and ignore cached http errors
- `silent`: don't output the final RSS (useless on its own, but can be nice when debugging)
- `encoding=ENCODING`: overrides the encoding auto-detection of the crawler. Some web developers did not quite understand the importance of setting charset/encoding tags correctly...
- http server only
- `callback=NAME`: for JSONP calls
- `cors`: allow Cross-origin resource sharing (allows XHR calls from other servers)
- `html`: changes the http content-type to html, so that python cgi errors (written in html) are readable in a web browser
- `txt`: changes the http content-type to txt (for faster "`view-source:`")
- Custom feeds: you can turn any HTML page into an RSS feed using morss, using xpath rules. The article content will be fetched as usual (with readabilite). Please note that you will have to **replace** any `/` in your rule with a `|` when using morss as a webserver
- `items`: (**mandatory** to activate the custom feeds function) xpath rule to match all the RSS entries
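
The output switches above (`json`, `html`, and the newly documented `csv`) behave like any other argument. Below is a minimal sketch of requesting the CSV output over HTTP; the host and the colon-prefixed argument path are assumptions based on this README's webserver conventions, not something shown in the diff:

```python
# Hedged sketch: fetch a feed through a morss instance with the `csv` argument.
# MORSS_HOST and FEED_URL are placeholders; the ':csv/FEEDURL' path layout is
# assumed from the README's webserver usage.
from urllib.request import urlopen

MORSS_HOST = 'https://morss.it'              # assumed public instance
FEED_URL = 'https://example.com/feed.xml'    # placeholder feed URL

with urlopen('%s/:csv/%s' % (MORSS_HOST, FEED_URL)) as resp:
    print(resp.read().decode('utf-8'))       # CSV rows, one per feed item
```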

View File: crawler.py

@ -34,6 +34,25 @@ MIMETYPE = {
DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
def get(*args, **kwargs):
return adv_get(*args, **kwargs)[0]
def adv_get(url, timeout=None, *args, **kwargs):
if timeout is None:
con = custom_handler(*args, **kwargs).open(url)
else:
con = custom_handler(*args, **kwargs).open(url, timeout=timeout)
data = con.read()
contenttype = con.info().get('Content-Type', '').split(';')[0]
encoding = detect_encoding(data, con)
return data, con, contenttype, encoding
def custom_handler(follow=None, delay=None, encoding=None):
handlers = []
@ -199,7 +218,6 @@ class BrowserlyHeaderHandler(BaseHandler):
""" Add more headers to look less suspicious """
def http_request(self, req):
req.add_unredirected_header('Referer', '%s://%s' % (req.type, req.host))
req.add_unredirected_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
req.add_unredirected_header('Accept-Language', 'en-US,en;q=0.5')
return req
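
The new `adv_get()` wrapper above bundles the open/read/Content-Type/encoding steps that every caller used to repeat. A minimal usage sketch, assuming morss is importable as a package and using a placeholder URL:

```python
# Sketch only: extra keyword arguments are forwarded to custom_handler(),
# so follow/delay/encoding work exactly as before.
from morss import crawler

data, con, contenttype, encoding = crawler.adv_get(
    'https://example.com/feed.xml',   # placeholder url
    timeout=10,
    follow='rss',                     # forwarded to custom_handler()
    delay=0,
)

print(contenttype, encoding)             # bare mimetype and detected charset
print(con.geturl())                      # final url after redirects
text = data.decode(encoding or 'utf-8')  # decode with the detected encoding
```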

View File: feeds.py

@ -15,7 +15,7 @@ import dateutil.parser
from copy import deepcopy
import lxml.html
from bs4 import BeautifulSoup
from .readabilite import parse as html_parse
json.encoder.c_make_encoder = None
@ -53,7 +53,7 @@ def parse_rules(filename=None):
return rules
def parse(data, url=None, mimetype=None):
def parse(data, url=None, mimetype=None, encoding=None):
" Determine which ruleset to use "
rulesets = parse_rules()
@ -67,14 +67,14 @@ def parse(data, url=None, mimetype=None):
for path in ruleset['path']:
if fnmatch(url, path):
parser = [x for x in parsers if x.mode == ruleset['mode']][0]
return parser(data, ruleset)
return parser(data, ruleset, encoding=encoding)
# 2) Look for a parser based on mimetype
if mimetype is not None:
parser_candidates = [x for x in parsers if mimetype in x.mimetype]
if mimetype is None or parser_candidates is None:
if mimetype is None or len(parser_candidates) == 0:
parser_candidates = parsers
# 3) Look for working ruleset for given parser
@ -86,7 +86,7 @@ def parse(data, url=None, mimetype=None):
# 'path' as they should have been caught beforehand
try:
feed = parser(data)
feed = parser(data, encoding=encoding)
except (ValueError):
# parsing did not work
@ -113,7 +113,7 @@ def parse(data, url=None, mimetype=None):
class ParserBase(object):
def __init__(self, data=None, rules=None, parent=None):
def __init__(self, data=None, rules=None, parent=None, encoding=None):
if rules is None:
rules = parse_rules()[self.default_ruleset]
@ -122,9 +122,10 @@ class ParserBase(object):
if data is None:
data = rules['base']
self.root = self.parse(data)
self.parent = parent
self.encoding = encoding
self.root = self.parse(data)
def parse(self, raw):
pass
@ -442,8 +443,7 @@ class ParserHTML(ParserXML):
mimetype = ['text/html', 'application/xhtml+xml']
def parse(self, raw):
parser = etree.HTMLParser(remove_blank_text=True) # remove_blank_text needed for pretty_print
return etree.fromstring(BeautifulSoup(raw, 'lxml').prettify('utf-8'), parser)
return html_parse(raw, encoding=self.encoding)
def tostring(self, encoding='unicode', **k):
return lxml.html.tostring(self.root, encoding=encoding, **k)
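
With `encoding` threaded through `parse()` and the parser classes, the charset detected by the crawler can be reused instead of re-guessed from the markup. A sketch mirroring what `FeedFetch()` does in morss.py (below); the package imports and the URL are assumptions:

```python
from morss import crawler, feeds

xml, con, contenttype, encoding = crawler.adv_get('https://example.com/feed.xml', timeout=10)

rss = feeds.parse(xml, url=con.geturl(), mimetype=contenttype, encoding=encoding)
rss = rss.convert(feeds.FeedXML)       # normalise to RSS/XML, as FeedFetch does

csv_out = rss.tocsv(encoding='utf-8')  # matches the new `csv` output option
```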

View File: morss.py

@ -10,7 +10,6 @@ import re
import lxml.etree
import lxml.html
from bs4 import BeautifulSoup
from . import feeds
from . import crawler
@ -251,19 +250,17 @@ def ItemFill(item, options, feedurl='/', fast=False):
delay = -2
try:
con = crawler.custom_handler(delay=delay, encoding=options.encoding).open(link, timeout=TIMEOUT)
data = con.read()
data, con, contenttype, encoding = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)
except (IOError, HTTPException) as e:
log('http error')
return False # let's just delete errors stuff when in cache mode
contenttype = con.info().get('Content-Type', '').split(';')[0]
if contenttype not in crawler.MIMETYPE['html'] and contenttype != 'text/plain':
log('non-text page')
return True
out = readabilite.get_article(data, link, options.encoding or crawler.detect_encoding(data, con))
out = readabilite.get_article(data, url=con.geturl(), encoding=encoding)
if out is not None:
item.content = out
@ -324,18 +321,14 @@ def FeedFetch(url, options):
delay = 0
try:
con = crawler.custom_handler(follow='rss', delay=delay, encoding=options.encoding) \
.open(url, timeout=TIMEOUT * 2)
xml = con.read()
xml, con, contenttype, encoding = crawler.adv_get(url=url, follow='rss', delay=delay, timeout=TIMEOUT * 2)
except (IOError, HTTPException):
raise MorssException('Error downloading feed')
contenttype = con.info().get('Content-Type', '').split(';')[0]
if options.items:
# using custom rules
rss = feeds.FeedHTML(xml)
rss = feeds.FeedHTML(xml, encoding=encoding)
rss.rules['title'] = options.title if options.title else '//head/title'
rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content'
@ -355,7 +348,7 @@ def FeedFetch(url, options):
else:
try:
rss = feeds.parse(xml, url, contenttype)
rss = feeds.parse(xml, url, contenttype, encoding=encoding)
rss = rss.convert(feeds.FeedXML)
# contains all fields, otherwise much-needed data can be lost
@ -469,7 +462,7 @@ def FeedFormat(rss, options, encoding='utf-8'):
elif options.csv:
return rss.tocsv(encoding=encoding)
elif options.reader:
elif options.html:
if options.indent:
return rss.tohtml(encoding=encoding, pretty_print=True)
@ -547,7 +540,7 @@ def cgi_app(environ, start_response):
if options.cors:
headers['access-control-allow-origin'] = '*'
if options.html or options.reader:
if options.html:
headers['content-type'] = 'text/html'
elif options.txt or options.silent:
headers['content-type'] = 'text/plain'
@ -652,13 +645,10 @@ def cgi_page(environ, start_response):
if urlparse(url).scheme not in ['http', 'https']:
url = 'http://' + url
con = crawler.custom_handler().open(url)
data = con.read()
contenttype = con.info().get('Content-Type', '').split(';')[0]
data, con, contenttype, encoding = crawler.adv_get(url=url)
if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())
html = readabilite.parse(data, encoding=encoding)
html.make_links_absolute(con.geturl())
kill_tags = ['script', 'iframe', 'noscript']
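
After this change, the `cgi_page()` reader path boils down to one `adv_get()` call plus `readabilite.parse()` with the detected encoding, instead of the old BeautifulSoup round-trip. A minimal sketch of that flow, with a placeholder URL and package-style imports assumed:

```python
from morss import crawler, readabilite

data, con, contenttype, encoding = crawler.adv_get(url='https://example.com/article')

if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
    html = readabilite.parse(data, encoding=encoding)
    html.make_links_absolute(con.geturl())   # resolve relative links against the final url
```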

View File: readabilite.py

@ -6,11 +6,14 @@ import re
def parse(data, encoding=None):
if encoding:
parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding=encoding)
else:
parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)
data = BeautifulSoup(data, 'lxml', from_encoding=encoding).prettify('utf-8')
return lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify('utf-8'), parser=parser)
else:
data = BeautifulSoup(data, 'lxml').prettify('utf-8')
parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding='utf-8')
return lxml.html.fromstring(data, parser=parser)
def count_words(string):
@ -44,6 +47,12 @@ def count_content(node):
return count_words(node.text_content()) + len(node.findall('.//img'))
def percentile(N, P):
# https://stackoverflow.com/a/7464107
n = max(int(round(P * len(N) + 0.5)), 2)
return N[n-2]
class_bad = ['comment', 'community', 'extra', 'foot',
'sponsor', 'pagination', 'pager', 'tweet', 'twitter', 'com-', 'masthead',
'media', 'meta', 'related', 'shopping', 'tags', 'tool', 'author', 'about',
@ -123,33 +132,42 @@ def score_node(node):
return score
def score_all(node, grades=None):
def score_all(node):
" Fairly dumb loop to score all worthwhile nodes. Tries to be fast "
if grades is None:
grades = {}
for child in node:
score = score_node(child)
child.attrib['seen'] = 'yes, ' + str(int(score))
if score > 0 or not len(grades):
spread_score(child, score, grades)
score_all(child, grades)
return grades
if score > 0 or len(list(child.iterancestors())) <= 2:
spread_score(child, score)
score_all(child)
def spread_score(node, score, grades):
def set_score(node, value):
node.attrib['morss_score'] = str(float(value))
def get_score(node):
return float(node.attrib.get('morss_score', 0))
def incr_score(node, delta):
set_score(node, get_score(node) + delta)
def get_all_scores(node):
return {x:get_score(x) for x in list(node.iter()) if get_score(x) != 0}
def spread_score(node, score):
" Spread the node's score to its parents, on a linear way "
delta = score / 2
for ancestor in [node,] + list(node.iterancestors()):
if score >= 1 or ancestor is node:
try:
grades[ancestor] += score
except KeyError:
grades[ancestor] = score
incr_score(ancestor, score)
score -= delta
@ -157,26 +175,24 @@ def spread_score(node, score, grades):
break
def write_score_all(root, grades):
" Useful for debugging "
for node in root.iter():
node.attrib['score'] = str(int(grades.get(node, 0)))
def clean_root(root):
def clean_root(root, keep_threshold=None):
for node in list(root):
clean_root(node)
clean_node(node)
# bottom-up approach, i.e. starting with children before cleaning current node
clean_root(node, keep_threshold)
clean_node(node, keep_threshold)
def clean_node(node):
def clean_node(node, keep_threshold=None):
parent = node.getparent()
if parent is None:
# this is <html/> (or a removed element waiting for GC)
return
if keep_threshold is not None and get_score(node) >= keep_threshold:
# high score, so keep
return
gdparent = parent.getparent()
# remove shitty tags
@ -275,18 +291,18 @@ def lowest_common_ancestor(nodeA, nodeB, max_depth=None):
return nodeA # should always find one though, at least <html/>, but needed for max_depth
def rank_nodes(grades):
def rank_grades(grades):
# largest score to smallest
return sorted(grades.items(), key=lambda x: x[1], reverse=True)
def get_best_node(grades):
def get_best_node(ranked_grades):
" To pick the best (raw) node. Another function will clean it "
if len(grades) == 1:
return grades[0]
if len(ranked_grades) == 1:
return ranked_grades[0]
top = rank_nodes(grades)
lowest = lowest_common_ancestor(top[0][0], top[1][0], 3)
lowest = lowest_common_ancestor(ranked_grades[0][0], ranked_grades[1][0], 3)
return lowest
@ -295,12 +311,17 @@ def get_article(data, url=None, encoding=None):
" Input a raw html string, returns a raw html string of the article "
html = parse(data, encoding)
scores = score_all(html)
score_all(html)
scores = rank_grades(get_all_scores(html))
if not len(scores):
return None
best = get_best_node(scores)
keep_threshold = percentile([x[1] for x in scores], 0.1)
clean_root(best, keep_threshold)
wc = count_words(best.text_content())
wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))
@ -310,6 +331,4 @@ def get_article(data, url=None, encoding=None):
if url:
best.make_links_absolute(url)
clean_root(best)
return lxml.etree.tostring(best, pretty_print=True)
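
Taken together, the readabilite changes store scores in a `morss_score` attribute on each node and let `clean_root()` keep the highest-scoring nodes (roughly the top 10%, per the commit message). A usage sketch of the public entry point, with a placeholder file name and an assumed utf-8 input:

```python
from morss import readabilite

with open('page.html', 'rb') as f:       # placeholder: any saved html page
    raw = f.read()

article = readabilite.get_article(raw, url='https://example.com/page', encoding='utf-8')

if article is not None:                  # None when no usable article is found
    print(article.decode('utf-8'))       # get_article returns raw html bytes
```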