readabilite: custom xpath for article detection

cache: avoid name collision
readability: better var names
2021-09-18 16:16:34 +02:00 · 2021-09-18 16:08:01 +02:00 · 2021-09-16 07:40:58 +02:00
7 changed files with 50 additions and 29 deletions
--- a/README.md
+++ b/README.md
@@ -262,11 +262,12 @@ arguments to morss is explained in Run above.
 The list of arguments can be obtained by running `morss --help`

 ```
-usage: morss [-h] [--post STRING] [--format {rss,json,html,csv}]
-             [--search STRING] [--clip] [--indent] [--cache] [--force]
-             [--proxy] [--newest] [--firstlink] [--resolve] [--items XPATH]
-             [--item_link XPATH] [--item_title XPATH] [--item_content XPATH]
-             [--item_time XPATH] [--nolink] [--noref] [--silent]
+usage: morss [-h] [--post STRING] [--xpath XPATH]
+             [--format {rss,json,html,csv}] [--search STRING] [--clip]
+             [--indent] [--cache] [--force] [--proxy] [--newest] [--firstlink]
+             [--resolve] [--items XPATH] [--item_link XPATH]
+             [--item_title XPATH] [--item_content XPATH] [--item_time XPATH]
+             [--nolink] [--noref] [--silent]
             url

 Get full-text RSS feeds
@@ -277,6 +278,7 @@ positional arguments:
 optional arguments:
  -h, --help            show this help message and exit
  --post STRING         POST request
+  --xpath XPATH         xpath rule to manually detect the article

 output:
  --format {rss,json,html,csv}
--- a/morss/caching.py
+++ b/morss/caching.py
--- a/morss/cli.py
+++ b/morss/cli.py
@@ -32,6 +32,7 @@ def cli_app():
    parser.add_argument('url', help='feed url')

    parser.add_argument('--post', action='store', type=str, metavar='STRING', help='POST request')
+    parser.add_argument('--xpath', action='store', type=str, metavar='XPATH', help='xpath rule to manually detect the article')

    group = parser.add_argument_group('output')
    group.add_argument('--format', default='rss', choices=('rss', 'json', 'html', 'csv'), help='output format')
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -28,7 +28,7 @@ from io import BytesIO, StringIO

 import chardet

-from .cache import default_cache
+from .caching import default_cache

 try:
    # python 2
--- a/morss/morss.py
+++ b/morss/morss.py
@@ -25,7 +25,7 @@ import lxml.etree
 import lxml.html
 from dateutil import tz

-from . import crawler, feeds, readabilite
+from . import caching, crawler, feeds, readabilite

 try:
    # python 2
@@ -222,7 +222,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
        log('empty page')
        return True

-    out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
+    out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode', xpath=options.xpath)

    if out is not None:
        item.content = out
@@ -411,7 +411,7 @@ def process(url, cache=None, options=None):
    options = Options(options)

    if cache:
-        crawler.default_cache = crawler.SQLiteCache(cache)
+        caching.default_cache = caching.SQLiteCache(cache)

    url, rss = FeedFetch(url, options)
    rss = FeedGather(rss, url, options)
--- a/morss/readabilite.py
+++ b/morss/readabilite.py
@@ -211,7 +211,7 @@ def clean_node(node, keep_threshold=None):
        return

    # high score, so keep
-    if keep_threshold is not None and get_score(node) >= keep_threshold:
+    if keep_threshold is not None and keep_threshold > 0 and get_score(node) >= keep_threshold:
        return

    gdparent = parent.getparent()
@@ -294,28 +294,26 @@ def clean_node(node, keep_threshold=None):
            gdparent.insert(gdparent.index(parent)+1, new_node)


-def lowest_common_ancestor(nodeA, nodeB, max_depth=None):
-    ancestorsA = list(nodeA.iterancestors())
-    ancestorsB = list(nodeB.iterancestors())
+def lowest_common_ancestor(node_a, node_b, max_depth=None):
+    ancestors_a = list(node_a.iterancestors())
+    ancestors_b = list(node_b.iterancestors())

    if max_depth is not None:
-        ancestorsA = ancestorsA[:max_depth]
-        ancestorsB = ancestorsB[:max_depth]
+        ancestors_a = ancestors_a[:max_depth]
+        ancestors_b = ancestors_b[:max_depth]

-    ancestorsA.insert(0, nodeA)
-    ancestorsB.insert(0, nodeB)
+    ancestors_a.insert(0, node_a)
+    ancestors_b.insert(0, node_b)

-    for ancestorA in ancestorsA:
-        if ancestorA in ancestorsB:
-            return ancestorA
+    for ancestor_a in ancestors_a:
+        if ancestor_a in ancestors_b:
+            return ancestor_a

-    return nodeA # should always find one tho, at least <html/>, but needed for max_depth
+    return node_a # should always find one tho, at least <html/>, but needed for max_depth


-def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5):
-    " Input a raw html string, returns a raw html string of the article "
-
-    html = parse(data, encoding_in)
+def get_best_node(html, threshold=5):
+    # score all nodes
    score_all(html)

    # rank all nodes (largest to smallest)
@@ -332,9 +330,29 @@ def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=
    else:
        best = ranked_nodes[0]

+    return best
+
+
+def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5, xpath=None):
+    " Input a raw html string, returns a raw html string of the article "
+
+    html = parse(data, encoding_in)
+
+    if xpath is not None:
+        xpath_match = html.xpath(xpath)
+
+        if len(xpath_match):
+            best = xpath_match[0]
+
+        else:
+            best = get_best_node(html, threshold)
+
+    else:
+        best = get_best_node(html, threshold)
+
    # clean up
    if not debug:
-        keep_threshold = get_score(ranked_nodes[0]) * 3/4
+        keep_threshold = get_score(best) * 3/4
        clean_root(best, keep_threshold)

    # check for spammy content (links only)
--- a/morss/wsgi.py
+++ b/morss/wsgi.py
@@ -33,7 +33,7 @@ except ImportError:
    # python 3
    from urllib.parse import unquote

-from . import crawler, readabilite
+from . import caching, crawler, readabilite
 from .morss import (DELAY, TIMEOUT, FeedFetch, FeedFormat, FeedGather,
                    MorssException, Options, log)

@@ -287,7 +287,7 @@ class WSGIRequestHandlerRequestUri(wsgiref.simple_server.WSGIRequestHandler):


 def cgi_start_server():
-    crawler.default_cache.autotrim()
+    caching.default_cache.autotrim()

    print('Serving http://localhost:%s/' % PORT)
    httpd = wsgiref.simple_server.make_server('', PORT, application, handler_class=WSGIRequestHandlerRequestUri)
@@ -295,4 +295,4 @@ def cgi_start_server():


 if 'gunicorn' in os.getenv('SERVER_SOFTWARE', ''):
-    crawler.default_cache.autotrim()
+    caching.default_cache.autotrim()
Author	SHA1	Message	Date
pictuga	ef6efd981c	readabilite: custom xpath for article detection (CI status: continuous-integration/drone/push — build is passing)	2021-09-18 16:16:34 +02:00
pictuga	70d6ee02d3	cache: avoid name collision	2021-09-18 16:08:01 +02:00
pictuga	3c551d77bb	readability: better var names	2021-09-16 07:40:58 +02:00