Compare commits
4 Commits: cb69e3167f ... 7f4589c578

| Author | SHA1 | Date |
|---|---|---|
| pictuga | 7f4589c578 | |
| pictuga | a1dc96cb50 | |
| pictuga | 749acc87fc | |
| pictuga | c186188557 | |
README.md

@@ -58,6 +58,10 @@ Simplest way to get these:
 pip install git+https://git.pictuga.com/pictuga/morss.git@master
 ```
 
+The dependency `lxml` is fairly long to install (especially on Raspberry Pi, as
+C code needs to be compiled). If possible on your distribution, try installing
+it with the system package manager.
+
 You may also need:
 
 - Apache, with python-cgi support, to run on a server
morss/crawler.py

@@ -51,12 +51,15 @@ DEFAULT_UAS = [
 ]
 
 
+PROTOCOL = ['http', 'https']
+
+
 def get(*args, **kwargs):
-    return adv_get(*args, **kwargs)[0]
+    return adv_get(*args, **kwargs)['data']
 
 
 def adv_get(url, timeout=None, *args, **kwargs):
-    url = encode_url(url)
+    url = sanitize_url(url)
 
     if timeout is None:
         con = custom_handler(*args, **kwargs).open(url)
@@ -69,7 +72,13 @@ def adv_get(url, timeout=None, *args, **kwargs):
     contenttype = con.info().get('Content-Type', '').split(';')[0]
     encoding= detect_encoding(data, con)
 
-    return data, con, contenttype, encoding
+    return {
+        'data':data,
+        'url': con.geturl(),
+        'con': con,
+        'contenttype': contenttype,
+        'encoding': encoding
+    }
 
 
 def custom_handler(follow=None, delay=None, encoding=None):
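With this change `adv_get()` hands back a dict keyed by `data`, `url`, `con`, `contenttype` and `encoding` instead of a positional tuple, and `get()` simply returns the `data` entry. A minimal usage sketch of the new return value (the target URL and timeout below are placeholders):

```python
from morss.crawler import adv_get

# adv_get() now returns a dict rather than a (data, con, contenttype, encoding) tuple
req = adv_get('https://morss.it', timeout=4)

body = req['data'].decode(req['encoding'])        # raw bytes decoded with the detected encoding
print(req['contenttype'], req['url'], len(body))  # req['url'] is the final URL, via con.geturl()
```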
@@ -113,8 +122,16 @@ def is_ascii(string):
         return True
 
 
-def encode_url(url):
-    " Escape non-ascii unicode characters "
+def sanitize_url(url):
+    if isinstance(url, bytes):
+        url = url.decode()
+
+    if url.split(':', 1)[0] not in PROTOCOL:
+        url = 'http://' + url
+
+    url = url.replace(' ', '%20')
+
+    # Escape non-ascii unicode characters
     # https://stackoverflow.com/a/4391299
     parts = list(urlparse(url))
 
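`encode_url()` becomes `sanitize_url()` and now also accepts `bytes`, prepends `http://` when the scheme is missing, and percent-encodes spaces before the existing non-ASCII escaping. A rough sketch of the expected behaviour, inferred from the hunk above (it assumes the unchanged tail of the function leaves plain-ASCII URLs alone):

```python
from morss.crawler import sanitize_url

sanitize_url(b'morss.it/feed')        # bytes decoded, scheme added -> 'http://morss.it/feed'
sanitize_url('https://morss.it/a b')  # spaces escaped              -> 'https://morss.it/a%20b'
```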
@@ -613,4 +630,4 @@ if __name__ == '__main__':
-    data, con, contenttype, encoding = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
+    req = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
 
     if not sys.flags.interactive:
-        print(data.decode(encoding))
+        print(req['data'].decode(req['encoding']))
morss/feeds.py

@@ -71,7 +71,7 @@ def parse_rules(filename=None):
     return rules
 
 
-def parse(data, url=None, mimetype=None, encoding=None):
+def parse(data, url=None, encoding=None):
     " Determine which ruleset to use "
 
     rulesets = parse_rules()
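`feeds.parse()` loses its `mimetype` parameter; callers that used to pass a content type now simply drop it. A hedged before/after sketch of a typical call, matching the updated `__main__` block below and the `FeedFetch()` change further down:

```python
from morss import crawler, feeds

req = crawler.adv_get('https://morss.it/', follow='rss')

# before: feeds.parse(data, url, contenttype, encoding=encoding)
# after:  the mimetype argument is gone
rss = feeds.parse(req['data'], url=req['url'], encoding=req['encoding'])
```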
@@ -759,8 +759,8 @@ class ItemJSON(Item, ParserJSON):
 if __name__ == '__main__':
     from . import crawler
 
-    data, con, contenttype, encoding = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://www.nytimes.com/', follow='rss')
-    feed = parse(data, url=con.geturl(), mimetype=contenttype, encoding=encoding)
+    req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://www.nytimes.com/', follow='rss')
+    feed = parse(req['data'], url=req['url'], encoding=req['encoding'])
 
     if not sys.flags.interactive:
         for item in feed.items:
morss/morss.py

@@ -44,8 +44,6 @@ TIMEOUT = 4 # http timeout (in sec)
 DEBUG = False
 PORT = 8080
 
-PROTOCOL = ['http', 'https']
-
 
 def filterOptions(options):
     return options
@@ -250,17 +248,17 @@ def ItemFill(item, options, feedurl='/', fast=False):
         delay = -2
 
     try:
-        data, con, contenttype, encoding = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)
+        req = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)
 
     except (IOError, HTTPException) as e:
         log('http error')
         return False # let's just delete errors stuff when in cache mode
 
-    if contenttype not in crawler.MIMETYPE['html'] and contenttype != 'text/plain':
+    if req['contenttype'] not in crawler.MIMETYPE['html'] and req['contenttype'] != 'text/plain':
         log('non-text page')
         return True
 
-    out = readabilite.get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='unicode')
+    out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
 
     if out is not None:
         item.content = out
@@ -297,22 +295,6 @@ def ItemAfter(item, options):
     return item
 
 
-def UrlFix(url):
-    if url is None:
-        raise MorssException('No url provided')
-
-    if isinstance(url, bytes):
-        url = url.decode()
-
-    if urlparse(url).scheme not in PROTOCOL:
-        url = 'http://' + url
-        log(url)
-
-    url = url.replace(' ', '%20')
-
-    return url
-
-
 def FeedFetch(url, options):
     # fetch feed
     delay = DELAY
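`UrlFix()` disappears because its work (decoding `bytes`, adding a missing scheme, escaping spaces) is now done by `crawler.sanitize_url()`, which `adv_get()` applies itself. A short sketch of the simplified call sites, under that assumption:

```python
from morss import crawler

# No UrlFix() pre-processing any more: adv_get() sanitizes the raw URL internally
req = crawler.adv_get(url='morss.it', timeout=4)  # scheme is added inside adv_get()
```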
@@ -321,14 +303,14 @@ def FeedFetch(url, options):
         delay = 0
 
     try:
-        xml, con, contenttype, encoding = crawler.adv_get(url=url, follow='rss', delay=delay, timeout=TIMEOUT * 2)
+        req = crawler.adv_get(url=url, follow='rss', delay=delay, timeout=TIMEOUT * 2)
 
     except (IOError, HTTPException):
         raise MorssException('Error downloading feed')
 
     if options.items:
         # using custom rules
-        rss = feeds.FeedHTML(xml, encoding=encoding)
+        rss = feeds.FeedHTML(req['data'], encoding=req['encoding'])
 
         rss.rules['title'] = options.title if options.title else '//head/title'
         rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content'
@@ -348,13 +330,13 @@ def FeedFetch(url, options):
 
     else:
         try:
-            rss = feeds.parse(xml, url, contenttype, encoding=encoding)
+            rss = feeds.parse(req['data'], url=url, encoding=req['encoding'])
             rss = rss.convert(feeds.FeedXML)
                 # contains all fields, otherwise much-needed data can be lost
 
         except TypeError:
             log('random page')
-            log(contenttype)
+            log(req['contenttype'])
             raise MorssException('Link provided is not a valid feed')
 
     return rss
@@ -456,7 +438,6 @@ def process(url, cache=None, options=None):
     if cache:
         crawler.default_cache = crawler.SQLiteCache(cache)
 
-    url = UrlFix(url)
     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
 
@@ -529,7 +510,6 @@ def cgi_app(environ, start_response):
     crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))
 
     # get the work done
-    url = UrlFix(url)
     rss = FeedFetch(url, options)
 
     if headers['content-type'] == 'text/xml':
@@ -614,17 +594,12 @@ def cgi_get(environ, start_response):
     url, options = cgi_parse_environ(environ)
 
     # get page
-    PROTOCOL = ['http', 'https']
+    req = crawler.adv_get(url=url, timeout=TIMEOUT)
 
-    if urlparse(url).scheme not in ['http', 'https']:
-        url = 'http://' + url
-
-    data, con, contenttype, encoding = crawler.adv_get(url=url, timeout=TIMEOUT)
-
-    if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
+    if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']:
         if options.get == 'page':
-            html = readabilite.parse(data, encoding=encoding)
-            html.make_links_absolute(con.geturl())
+            html = readabilite.parse(req['data'], encoding=req['encoding'])
+            html.make_links_absolute(req['url'])
 
             kill_tags = ['script', 'iframe', 'noscript']
 
@@ -635,13 +610,13 @@ def cgi_get(environ, start_response):
             output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')
 
         elif options.get == 'article':
-            output = readabilite.get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='utf-8', debug=options.debug)
+            output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
 
         else:
             raise MorssException('no :get option passed')
 
     else:
-        output = data
+        output = req['data']
 
     # return html page
     headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8'}
@@ -698,7 +673,6 @@ def cli_app():
 
     crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))
 
-    url = UrlFix(url)
     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
     out = FeedFormat(rss, options, 'unicode')
morss/readabilite.py

@@ -348,8 +348,8 @@ if __name__ == '__main__':
     import sys
     from . import crawler
 
-    data, con, contenttype, encoding = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
-    article = get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='unicode')
+    req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
+    article = get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
 
     if not sys.flags.interactive:
         print(article)