Compare commits


No commits in common. "7f4589c5784313a4e433c44fcb3c20556a24a109" and "cb69e3167f7d5e9a059e4fc3a7b71b4f0025f046" have entirely different histories.

5 changed files with 50 additions and 45 deletions

View File

@@ -58,10 +58,6 @@ Simplest way to get these:
 pip install git+https://git.pictuga.com/pictuga/morss.git@master
 ```
-The dependency `lxml` is fairly long to install (especially on Raspberry Pi, as
-C code needs to be compiled). If possible on your distribution, try installing
-it with the system package manager.
 You may also need:
 - Apache, with python-cgi support, to run on a server

View File

@@ -51,15 +51,12 @@ DEFAULT_UAS = [
 ]
-PROTOCOL = ['http', 'https']
 def get(*args, **kwargs):
-    return adv_get(*args, **kwargs)['data']
+    return adv_get(*args, **kwargs)[0]
 def adv_get(url, timeout=None, *args, **kwargs):
-    url = sanitize_url(url)
+    url = encode_url(url)
     if timeout is None:
         con = custom_handler(*args, **kwargs).open(url)
@@ -72,13 +69,7 @@ def adv_get(url, timeout=None, *args, **kwargs):
     contenttype = con.info().get('Content-Type', '').split(';')[0]
     encoding= detect_encoding(data, con)
-    return {
-        'data':data,
-        'url': con.geturl(),
-        'con': con,
-        'contenttype': contenttype,
-        'encoding': encoding
-    }
+    return data, con, contenttype, encoding
 def custom_handler(follow=None, delay=None, encoding=None):
@@ -122,16 +113,8 @@ def is_ascii(string):
         return True
-def sanitize_url(url):
-    if isinstance(url, bytes):
-        url = url.decode()
-
-    if url.split(':', 1)[0] not in PROTOCOL:
-        url = 'http://' + url
-
-    url = url.replace(' ', '%20')
-
-    # Escape non-ascii unicode characters
+def encode_url(url):
+    " Escape non-ascii unicode characters "
     # https://stackoverflow.com/a/4391299
     parts = list(urlparse(url))
@@ -630,4 +613,4 @@ if __name__ == '__main__':
-    req = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
+    data, con, contenttype, encoding = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
     if not sys.flags.interactive:
-        print(req['data'].decode(req['encoding']))
+        print(data.decode(encoding))
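The crawler.py hunks above switch `adv_get()` from returning a dict to returning a plain tuple. A minimal sketch of what that means for callers, based only on the call sites visible in this diff (the morss.it URL is simply the example already used in the repo's `__main__` blocks):

```python
# Sketch only: contrasts the two adv_get() return styles shown in this diff.
from morss import crawler

# Old side of the diff: adv_get() returned a dict
# req = crawler.adv_get('https://morss.it')
# body = req['data'].decode(req['encoding'])
# final_url = req['url']

# New side of the diff: adv_get() returns (data, con, contenttype, encoding)
data, con, contenttype, encoding = crawler.adv_get('https://morss.it')
body = data.decode(encoding)    # decoded page content
final_url = con.geturl()        # final URL after redirects, read off the connection object
print(contenttype, final_url)
```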

View File

@@ -71,7 +71,7 @@ def parse_rules(filename=None):
     return rules
-def parse(data, url=None, encoding=None):
+def parse(data, url=None, mimetype=None, encoding=None):
     " Determine which ruleset to use "
     rulesets = parse_rules()
@@ -759,8 +759,8 @@ class ItemJSON(Item, ParserJSON):
 if __name__ == '__main__':
     from . import crawler
-    req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://www.nytimes.com/', follow='rss')
-    feed = parse(req['data'], url=req['url'], encoding=req['encoding'])
+    data, con, contenttype, encoding = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://www.nytimes.com/', follow='rss')
+    feed = parse(data, url=con.geturl(), mimetype=contenttype, encoding=encoding)
     if not sys.flags.interactive:
         for item in feed.items:
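`feeds.parse()` gains a `mimetype` argument here, fed from the content type returned by `crawler.adv_get()`. A hedged usage sketch mirroring the `__main__` block above; the `item.title` attribute is an assumption, inferred from the `title`/`desc` rules referenced elsewhere in this diff:

```python
# Sketch of the updated feeds.parse() call shown above; not the project's own test code.
from morss import crawler, feeds

data, con, contenttype, encoding = crawler.adv_get('https://www.nytimes.com/', follow='rss')

# mimetype gives parse() a hint when choosing a ruleset ("Determine which ruleset to use")
feed = feeds.parse(data, url=con.geturl(), mimetype=contenttype, encoding=encoding)

for item in feed.items:
    print(item.title)  # 'title' assumed here from the rules used in this diff
```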

View File

@@ -44,6 +44,8 @@ TIMEOUT = 4 # http timeout (in sec)
 DEBUG = False
 PORT = 8080
+PROTOCOL = ['http', 'https']
 def filterOptions(options):
     return options
@@ -248,17 +250,17 @@ def ItemFill(item, options, feedurl='/', fast=False):
         delay = -2
     try:
-        req = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)
+        data, con, contenttype, encoding = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)
     except (IOError, HTTPException) as e:
         log('http error')
         return False # let's just delete errors stuff when in cache mode
-    if req['contenttype'] not in crawler.MIMETYPE['html'] and req['contenttype'] != 'text/plain':
+    if contenttype not in crawler.MIMETYPE['html'] and contenttype != 'text/plain':
         log('non-text page')
         return True
-    out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
+    out = readabilite.get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='unicode')
     if out is not None:
         item.content = out
@@ -295,6 +297,22 @@ def ItemAfter(item, options):
     return item
+def UrlFix(url):
+    if url is None:
+        raise MorssException('No url provided')
+
+    if isinstance(url, bytes):
+        url = url.decode()
+
+    if urlparse(url).scheme not in PROTOCOL:
+        url = 'http://' + url
+        log(url)
+
+    url = url.replace(' ', '%20')
+
+    return url
 def FeedFetch(url, options):
     # fetch feed
     delay = DELAY
@@ -303,14 +321,14 @@ def FeedFetch(url, options):
         delay = 0
     try:
-        req = crawler.adv_get(url=url, follow='rss', delay=delay, timeout=TIMEOUT * 2)
+        xml, con, contenttype, encoding = crawler.adv_get(url=url, follow='rss', delay=delay, timeout=TIMEOUT * 2)
     except (IOError, HTTPException):
         raise MorssException('Error downloading feed')
     if options.items:
         # using custom rules
-        rss = feeds.FeedHTML(req['data'], encoding=req['encoding'])
+        rss = feeds.FeedHTML(xml, encoding=encoding)
         rss.rules['title'] = options.title if options.title else '//head/title'
         rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content'
@@ -330,13 +348,13 @@ def FeedFetch(url, options):
     else:
         try:
-            rss = feeds.parse(req['data'], url=url, encoding=req['encoding'])
+            rss = feeds.parse(xml, url, contenttype, encoding=encoding)
             rss = rss.convert(feeds.FeedXML)
             # contains all fields, otherwise much-needed data can be lost
         except TypeError:
             log('random page')
-            log(req['contenttype'])
+            log(contenttype)
             raise MorssException('Link provided is not a valid feed')
     return rss
@@ -438,6 +456,7 @@ def process(url, cache=None, options=None):
     if cache:
         crawler.default_cache = crawler.SQLiteCache(cache)
+    url = UrlFix(url)
     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
@@ -510,6 +529,7 @@ def cgi_app(environ, start_response):
         crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))
     # get the work done
+    url = UrlFix(url)
     rss = FeedFetch(url, options)
     if headers['content-type'] == 'text/xml':
@@ -594,12 +614,17 @@ def cgi_get(environ, start_response):
     url, options = cgi_parse_environ(environ)
     # get page
-    req = crawler.adv_get(url=url, timeout=TIMEOUT)
-    if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']:
+    PROTOCOL = ['http', 'https']
+
+    if urlparse(url).scheme not in ['http', 'https']:
+        url = 'http://' + url
+
+    data, con, contenttype, encoding = crawler.adv_get(url=url, timeout=TIMEOUT)
+
+    if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
         if options.get == 'page':
-            html = readabilite.parse(req['data'], encoding=req['encoding'])
-            html.make_links_absolute(req['url'])
+            html = readabilite.parse(data, encoding=encoding)
+            html.make_links_absolute(con.geturl())
             kill_tags = ['script', 'iframe', 'noscript']
@@ -610,13 +635,13 @@ def cgi_get(environ, start_response):
             output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')
         elif options.get == 'article':
-            output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
+            output = readabilite.get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='utf-8', debug=options.debug)
         else:
             raise MorssException('no :get option passed')
     else:
-        output = req['data']
+        output = data
     # return html page
     headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8'}
@@ -673,6 +698,7 @@ def cli_app():
     crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))
+    url = UrlFix(url)
     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
     out = FeedFormat(rss, options, 'unicode')
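The URL normalisation that used to live in `crawler.sanitize_url()` now sits in morss.py as `UrlFix()`, called before `FeedFetch()` in `process()`, `cgi_app()` and `cli_app()`. A standalone sketch of that logic, with plain stand-ins for the project's `MorssException` and `log()` helpers:

```python
# Standalone sketch of the UrlFix() logic added in this diff.
# ValueError and print() stand in for the project's MorssException and log().
from urllib.parse import urlparse

PROTOCOL = ['http', 'https']

def url_fix(url):
    if url is None:
        raise ValueError('No url provided')

    if isinstance(url, bytes):
        url = url.decode()

    # default to http:// when no supported scheme is given
    if urlparse(url).scheme not in PROTOCOL:
        url = 'http://' + url
        print(url)

    # crude space escaping; non-ascii escaping stays in crawler.encode_url()
    return url.replace(' ', '%20')

print(url_fix('morss.it/some page'))  # -> http://morss.it/some%20page
```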

View File

@@ -348,8 +348,8 @@ if __name__ == '__main__':
     import sys
     from . import crawler
-    req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
-    article = get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
+    data, con, contenttype, encoding = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
+    article = get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='unicode')
     if not sys.flags.interactive:
         print(article)
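For completeness, the same tuple-unpacking pattern applied to `readabilite.get_article()`, wrapped in a small helper; the helper name and the default morss.it URL are only for illustration:

```python
# Illustrative wrapper around the call pattern shown above; not part of morss itself.
from morss import crawler, readabilite

def fetch_article(url='https://morss.it'):
    data, con, contenttype, encoding = crawler.adv_get(url)
    # encoding_out='unicode' asks for a str result (cgi_get above uses 'utf-8' for bytes output)
    return readabilite.get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='unicode')

if __name__ == '__main__':
    print(fetch_article())
```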