Compare commits: c3052a0619 ... ef6efd981c (3 commits)

| Author | SHA1 | Date |
|---|---|---|
| pictuga | ef6efd981c | |
| pictuga | 70d6ee02d3 | |
| pictuga | 3c551d77bb | |

README.md (12 changed lines)
@@ -262,11 +262,12 @@ arguments to morss is explained in Run above.
 The list of arguments can be obtained by running `morss --help`
 
 ```
-usage: morss [-h] [--post STRING] [--format {rss,json,html,csv}]
-             [--search STRING] [--clip] [--indent] [--cache] [--force]
-             [--proxy] [--newest] [--firstlink] [--resolve] [--items XPATH]
-             [--item_link XPATH] [--item_title XPATH] [--item_content XPATH]
-             [--item_time XPATH] [--nolink] [--noref] [--silent]
+usage: morss [-h] [--post STRING] [--xpath XPATH]
+             [--format {rss,json,html,csv}] [--search STRING] [--clip]
+             [--indent] [--cache] [--force] [--proxy] [--newest] [--firstlink]
+             [--resolve] [--items XPATH] [--item_link XPATH]
+             [--item_title XPATH] [--item_content XPATH] [--item_time XPATH]
+             [--nolink] [--noref] [--silent]
              url
 
 Get full-text RSS feeds
@@ -277,6 +278,7 @@ positional arguments:
 optional arguments:
   -h, --help            show this help message and exit
   --post STRING         POST request
+  --xpath XPATH         xpath rule to manually detect the article
 
 output:
   --format {rss,json,html,csv}
@@ -32,6 +32,7 @@ def cli_app():
     parser.add_argument('url', help='feed url')
 
     parser.add_argument('--post', action='store', type=str, metavar='STRING', help='POST request')
+    parser.add_argument('--xpath', action='store', type=str, metavar='XPATH', help='xpath rule to manually detect the article')
 
     group = parser.add_argument_group('output')
     group.add_argument('--format', default='rss', choices=('rss', 'json', 'html', 'csv'), help='output format')
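
A quick illustration of how the new flag behaves in isolation (a standalone `argparse` sketch with a hypothetical feed URL and XPath rule, not the real `cli_app()`): the option is optional and defaults to `None`, so downstream code can check `options.xpath is None` to choose between the explicit rule and the scoring heuristic.

```python
import argparse

# Stand-in parser mirroring the option added above; not the real cli_app().
parser = argparse.ArgumentParser(description='Get full-text RSS feeds')
parser.add_argument('url', help='feed url')
parser.add_argument('--xpath', action='store', type=str, metavar='XPATH',
                    help='xpath rule to manually detect the article')

# Hypothetical invocation with an explicit rule for the article container.
options = parser.parse_args(['--xpath', '//div[@class="article-body"]',
                             'https://example.com/feed.xml'])
print(options.xpath)  # //div[@class="article-body"]

# Without the flag the attribute is simply None.
print(parser.parse_args(['https://example.com/feed.xml']).xpath)  # None
```
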
@@ -28,7 +28,7 @@ from io import BytesIO, StringIO
 
 import chardet
 
-from .cache import default_cache
+from .caching import default_cache
 
 try:
     # python 2
@@ -25,7 +25,7 @@ import lxml.etree
 import lxml.html
 from dateutil import tz
 
-from . import crawler, feeds, readabilite
+from . import caching, crawler, feeds, readabilite
 
 try:
     # python 2
@@ -222,7 +222,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         log('empty page')
         return True
 
-    out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
+    out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode', xpath=options.xpath)
 
     if out is not None:
         item.content = out
@@ -411,7 +411,7 @@ def process(url, cache=None, options=None):
     options = Options(options)
 
     if cache:
-        crawler.default_cache = crawler.SQLiteCache(cache)
+        caching.default_cache = caching.SQLiteCache(cache)
 
     url, rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
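
Taken together with the `--xpath` plumbing above, `process()` can now be driven like this. A sketch only: the feed URL and cache path are hypothetical, and it assumes `Options` accepts a plain dict keyed by option name; only the `process(url, cache=None, options=None)` signature and the `caching.SQLiteCache` call are actually shown in this diff.

```python
from morss.morss import process

# Hypothetical feed URL and cache location.
output = process(
    'https://example.com/feed.xml',
    cache='/tmp/morss-cache.db',                       # installs caching.SQLiteCache
    options={'xpath': '//div[@class="article-body"]'}  # assumed to surface as options.xpath
)

print(output)
```
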
@@ -211,7 +211,7 @@ def clean_node(node, keep_threshold=None):
         return
 
     # high score, so keep
-    if keep_threshold is not None and get_score(node) >= keep_threshold:
+    if keep_threshold is not None and keep_threshold > 0 and get_score(node) >= keep_threshold:
         return
 
     gdparent = parent.getparent()
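
One plausible reading of the extra `keep_threshold > 0` check (inferred, the commits don't spell it out): `keep_threshold` is computed further down as three quarters of the best node's score, so a zero or negative best score used to make `get_score(node) >= keep_threshold` true for practically every node and the cleanup pass kept everything. A toy comparison of the two conditions:

```python
# Toy scores only; this is not morss's real scoring model.
def get_score(node):
    return node['score']

node = {'score': 0.0}
keep_threshold = 0.0  # e.g. 3/4 of a best score of 0

old_keep = keep_threshold is not None and get_score(node) >= keep_threshold
new_keep = (keep_threshold is not None and keep_threshold > 0
            and get_score(node) >= keep_threshold)

print(old_keep)  # True  -> node would have escaped cleanup
print(new_keep)  # False -> cleanup proceeds
```
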
@@ -294,28 +294,26 @@ def clean_node(node, keep_threshold=None):
         gdparent.insert(gdparent.index(parent)+1, new_node)
 
 
-def lowest_common_ancestor(nodeA, nodeB, max_depth=None):
-    ancestorsA = list(nodeA.iterancestors())
-    ancestorsB = list(nodeB.iterancestors())
+def lowest_common_ancestor(node_a, node_b, max_depth=None):
+    ancestors_a = list(node_a.iterancestors())
+    ancestors_b = list(node_b.iterancestors())
 
     if max_depth is not None:
-        ancestorsA = ancestorsA[:max_depth]
-        ancestorsB = ancestorsB[:max_depth]
+        ancestors_a = ancestors_a[:max_depth]
+        ancestors_b = ancestors_b[:max_depth]
 
-    ancestorsA.insert(0, nodeA)
-    ancestorsB.insert(0, nodeB)
+    ancestors_a.insert(0, node_a)
+    ancestors_b.insert(0, node_b)
 
-    for ancestorA in ancestorsA:
-        if ancestorA in ancestorsB:
-            return ancestorA
+    for ancestor_a in ancestors_a:
+        if ancestor_a in ancestors_b:
+            return ancestor_a
 
-    return nodeA # should always find one tho, at least <html/>, but needed for max_depth
+    return node_a # should always find one tho, at least <html/>, but needed for max_depth
 
 
-def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5):
-    " Input a raw html string, returns a raw html string of the article "
-
-    html = parse(data, encoding_in)
+def get_best_node(html, threshold=5):
+    # score all nodes
     score_all(html)
 
     # rank all nodes (largest to smallest)
@@ -332,9 +330,29 @@ def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=
     else:
         best = ranked_nodes[0]
 
+    return best
+
+
+def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5, xpath=None):
+    " Input a raw html string, returns a raw html string of the article "
+
+    html = parse(data, encoding_in)
+
+    if xpath is not None:
+        xpath_match = html.xpath(xpath)
+
+        if len(xpath_match):
+            best = xpath_match[0]
+
+        else:
+            best = get_best_node(html, threshold)
+
+    else:
+        best = get_best_node(html, threshold)
+
     # clean up
     if not debug:
-        keep_threshold = get_score(ranked_nodes[0]) * 3/4
+        keep_threshold = get_score(best) * 3/4
         clean_root(best, keep_threshold)
 
     # check for spammy content (links only)
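
A usage sketch of the refactored entry point (the HTML snippet and XPath rule are made up, and the exact return value still depends on the cleanup and link-spam checks that follow): when `xpath` matches, its first hit becomes the article container; otherwise, or when the rule matches nothing, `get_best_node()`'s scoring heuristic is used.

```python
from morss import readabilite

# Hypothetical page with one obvious article container.
html = """
<html><body>
  <div class="article-body">
    <p>First paragraph of the story, long enough to be worth keeping.</p>
    <p>Second paragraph with more body text.</p>
  </div>
  <div class="sidebar"><a href="/a">link</a> <a href="/b">link</a></div>
</body></html>
"""

# Explicit rule: select the node matched by the XPath instead of the best-scored one.
print(readabilite.get_article(html, xpath='//div[@class="article-body"]'))

# No rule: fall back to the scoring heuristic via get_best_node().
print(readabilite.get_article(html))
```
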
@@ -33,7 +33,7 @@ except ImportError:
     # python 3
     from urllib.parse import unquote
 
-from . import crawler, readabilite
+from . import caching, crawler, readabilite
 from .morss import (DELAY, TIMEOUT, FeedFetch, FeedFormat, FeedGather,
                     MorssException, Options, log)
 
@@ -287,7 +287,7 @@ class WSGIRequestHandlerRequestUri(wsgiref.simple_server.WSGIRequestHandler):
 
 
 def cgi_start_server():
-    crawler.default_cache.autotrim()
+    caching.default_cache.autotrim()
 
     print('Serving http://localhost:%s/' % PORT)
     httpd = wsgiref.simple_server.make_server('', PORT, application, handler_class=WSGIRequestHandlerRequestUri)
@@ -295,4 +295,4 @@ def cgi_start_server():
 
 
 if 'gunicorn' in os.getenv('SERVER_SOFTWARE', ''):
-    crawler.default_cache.autotrim()
+    caching.default_cache.autotrim()
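
For completeness, what the renamed calls amount to when wiring a server process by hand (hypothetical cache path; `SQLiteCache` and `autotrim()` are exactly the calls visible in this diff): swap the backend by assigning `caching.default_cache`, then trim it once at startup, as the CGI and gunicorn branches above now do through `caching` instead of `crawler`.

```python
from morss import caching

# Hypothetical on-disk cache location.
caching.default_cache = caching.SQLiteCache('/tmp/morss-cache.db')

# Same housekeeping call cgi_start_server() and the gunicorn branch make above.
caching.default_cache.autotrim()
```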