From 0365232a73401766720158a77471e277573958b9 Mon Sep 17 00:00:00 2001
From: pictuga <contact@pictuga.com>
Date: Sat, 18 Sep 2021 16:16:34 +0200
Subject: [PATCH] readabilite: custom xpath for article detection

---
 README.md            | 12 +++++++-----
 morss/cli.py         |  1 +
 morss/morss.py       |  2 +-
 morss/readabilite.py | 30 ++++++++++++++++++++++++------
 4 files changed, 33 insertions(+), 12 deletions(-)
diff --git a/README.md b/README.md
index 4955eb5..6990f57 100644
--- a/README.md
+++ b/README.md
@@ -262,11 +262,12 @@ arguments to morss is explained in Run above.
 The list of arguments can be obtained by running `morss --help`
 
 ```
-usage: morss [-h] [--post STRING] [--format {rss,json,html,csv}]
-             [--search STRING] [--clip] [--indent] [--cache] [--force]
-             [--proxy] [--newest] [--firstlink] [--resolve] [--items XPATH]
-             [--item_link XPATH] [--item_title XPATH] [--item_content XPATH]
-             [--item_time XPATH] [--nolink] [--noref] [--silent]
+usage: morss [-h] [--post STRING] [--xpath XPATH]
+             [--format {rss,json,html,csv}] [--search STRING] [--clip]
+             [--indent] [--cache] [--force] [--proxy] [--newest] [--firstlink]
+             [--resolve] [--items XPATH] [--item_link XPATH]
+             [--item_title XPATH] [--item_content XPATH] [--item_time XPATH]
+             [--nolink] [--noref] [--silent]
              url
 
 Get full-text RSS feeds
@@ -277,6 +278,7 @@ positional arguments:
 optional arguments:
   -h, --help            show this help message and exit
   --post STRING         POST request
+  --xpath XPATH         xpath rule to manually detect the article
 
 output:
   --format {rss,json,html,csv}
diff --git a/morss/cli.py b/morss/cli.py
index 94c22fd..f43c5dd 100644
--- a/morss/cli.py
+++ b/morss/cli.py
@@ -32,6 +32,7 @@ def cli_app():
     parser.add_argument('url', help='feed url')
 
     parser.add_argument('--post', action='store', type=str, metavar='STRING', help='POST request')
+    parser.add_argument('--xpath', action='store', type=str, metavar='XPATH', help='xpath rule to manually detect the article')
 
     group = parser.add_argument_group('output')
     group.add_argument('--format', default='rss', choices=('rss', 'json', 'html', 'csv'), help='output format')
diff --git a/morss/morss.py b/morss/morss.py
index b24b600..b34dabf 100644
--- a/morss/morss.py
+++ b/morss/morss.py
@@ -222,7 +222,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         log('empty page')
         return True
 
-    out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
+    out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode', xpath=options.xpath)
 
     if out is not None:
         item.content = out
diff --git a/morss/readabilite.py b/morss/readabilite.py
index 709a033..4ea17cb 100644
--- a/morss/readabilite.py
+++ b/morss/readabilite.py
@@ -211,7 +211,7 @@ def clean_node(node, keep_threshold=None):
         return
 
     # high score, so keep
-    if keep_threshold is not None and get_score(node) >= keep_threshold:
+    if keep_threshold is not None and keep_threshold > 0 and get_score(node) >= keep_threshold:
         return
 
     gdparent = parent.getparent()
@@ -312,10 +312,8 @@ def lowest_common_ancestor(node_a, node_b, max_depth=None):
     return node_a # should always find one tho, at least <html/>, but needed for max_depth
 
 
-def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5):
-    " Input a raw html string, returns a raw html string of the article "
-
-    html = parse(data, encoding_in)
+def get_best_node(html, threshold=5):
+    # score all nodes
     score_all(html)
 
     # rank all nodes (largest to smallest)
@@ -332,9 +330,29 @@ def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=
     else:
         best = ranked_nodes[0]
 
+    return best
+
+
+def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5, xpath=None):
+    " Input a raw html string, returns a raw html string of the article "
+
+    html = parse(data, encoding_in)
+
+    if xpath is not None:
+        xpath_match = html.xpath(xpath)
+
+        if len(xpath_match):
+            best = xpath_match[0]
+
+        else:
+            best = get_best_node(html, threshold)
+
+    else:
+        best = get_best_node(html, threshold)
+
     # clean up
     if not debug:
-        keep_threshold = get_score(ranked_nodes[0]) * 3/4
+        keep_threshold = get_score(best) * 3/4
         clean_root(best, keep_threshold)
 
     # check for spammy content (links only)