readabilite: custom xpath for article detection
continuous-integration/drone/push Build is failing
Details
continuous-integration/drone/push Build is failing
Details
parent
a523518ae8
commit
0365232a73
12
README.md
12
README.md
|
@@ -262,11 +262,12 @@ arguments to morss is explained in Run above.
|
|||
The list of arguments can be obtained by running `morss --help`
|
||||
|
||||
```
|
||||
usage: morss [-h] [--post STRING] [--format {rss,json,html,csv}]
|
||||
[--search STRING] [--clip] [--indent] [--cache] [--force]
|
||||
[--proxy] [--newest] [--firstlink] [--resolve] [--items XPATH]
|
||||
[--item_link XPATH] [--item_title XPATH] [--item_content XPATH]
|
||||
[--item_time XPATH] [--nolink] [--noref] [--silent]
|
||||
usage: morss [-h] [--post STRING] [--xpath XPATH]
|
||||
[--format {rss,json,html,csv}] [--search STRING] [--clip]
|
||||
[--indent] [--cache] [--force] [--proxy] [--newest] [--firstlink]
|
||||
[--resolve] [--items XPATH] [--item_link XPATH]
|
||||
[--item_title XPATH] [--item_content XPATH] [--item_time XPATH]
|
||||
[--nolink] [--noref] [--silent]
|
||||
url
|
||||
|
||||
Get full-text RSS feeds
|
||||
|
@@ -277,6 +278,7 @@ positional arguments:
|
|||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--post STRING POST request
|
||||
--xpath XPATH xpath rule to manually detect the article
|
||||
|
||||
output:
|
||||
--format {rss,json,html,csv}
|
||||
|
|
|
@@ -32,6 +32,7 @@ def cli_app():
|
|||
parser.add_argument('url', help='feed url')
|
||||
|
||||
parser.add_argument('--post', action='store', type=str, metavar='STRING', help='POST request')
|
||||
parser.add_argument('--xpath', action='store', type=str, metavar='XPATH', help='xpath rule to manually detect the article')
|
||||
|
||||
group = parser.add_argument_group('output')
|
||||
group.add_argument('--format', default='rss', choices=('rss', 'json', 'html', 'csv'), help='output format')
|
||||
|
|
|
@@ -222,7 +222,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
|
|||
log('empty page')
|
||||
return True
|
||||
|
||||
out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
|
||||
out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode', xpath=options.xpath)
|
||||
|
||||
if out is not None:
|
||||
item.content = out
|
||||
|
|
|
@@ -211,7 +211,7 @@ def clean_node(node, keep_threshold=None):
|
|||
return
|
||||
|
||||
# high score, so keep
|
||||
if keep_threshold is not None and get_score(node) >= keep_threshold:
|
||||
if keep_threshold is not None and keep_threshold > 0 and get_score(node) >= keep_threshold:
|
||||
return
|
||||
|
||||
gdparent = parent.getparent()
|
||||
|
@@ -312,10 +312,8 @@ def lowest_common_ancestor(node_a, node_b, max_depth=None):
|
|||
return node_a # should always find one tho, at least <html/>, but needed for max_depth
|
||||
|
||||
|
||||
def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5):
|
||||
" Input a raw html string, returns a raw html string of the article "
|
||||
|
||||
html = parse(data, encoding_in)
|
||||
def get_best_node(html, threshold=5):
|
||||
# score all nodes
|
||||
score_all(html)
|
||||
|
||||
# rank all nodes (largest to smallest)
|
||||
|
@@ -332,9 +330,29 @@ def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=
|
|||
else:
|
||||
best = ranked_nodes[0]
|
||||
|
||||
return best
|
||||
|
||||
|
||||
def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5, xpath=None):
|
||||
" Input a raw html string, returns a raw html string of the article "
|
||||
|
||||
html = parse(data, encoding_in)
|
||||
|
||||
if xpath is not None:
|
||||
xpath_match = html.xpath(xpath)
|
||||
|
||||
if len(xpath_match):
|
||||
best = xpath_match[0]
|
||||
|
||||
else:
|
||||
best = get_best_node(html, threshold)
|
||||
|
||||
else:
|
||||
best = get_best_node(html, threshold)
|
||||
|
||||
# clean up
|
||||
if not debug:
|
||||
keep_threshold = get_score(ranked_nodes[0]) * 3/4
|
||||
keep_threshold = get_score(best) * 3/4
|
||||
clean_root(best, keep_threshold)
|
||||
|
||||
# check for spammy content (links only)
|
||||
|
|
Loading…
Reference in New Issue