Compare commits
26 Commits
c6d3a0eb53 ... 598a2591f1

Author | SHA1 | Date
---|---|---
pictuga | 598a2591f1 |
pictuga | e76ab2b631 |
pictuga | aa9143302b |
pictuga | 0d62a7625b |
pictuga | bd0efb1529 |
pictuga | 47a17614ef |
pictuga | 4dfebe78f7 |
pictuga | dcd3e4a675 |
pictuga | e968b2ea7f |
pictuga | 0ac590c798 |
pictuga | fa1b5aef09 |
pictuga | 7f6309f618 |
pictuga | f65fb45030 |
pictuga | 6dd40e5cc4 |
pictuga | 0acfce5a22 |
pictuga | 97ccc15db0 |
pictuga | 7a560181f7 |
pictuga | baccd3b22b |
pictuga | f79938ab11 |
pictuga | 5b8bd47829 |
pictuga | b5b355aa6e |
pictuga | 94097f481a |
pictuga | 8161baa7ae |
pictuga | bd182bcb85 |
pictuga | c7c2c5d749 |
pictuga | c6b52e625f |

Dockerfile

@@ -5,4 +5,4 @@ RUN apk add python3 py3-lxml py3-gunicorn py3-pip git
 ADD . /app
 RUN pip3 install /app
 
-CMD gunicorn --bind 0.0.0.0:8080 -w 4 morss:cgi_standalone_app
+CMD gunicorn --bind 0.0.0.0:8080 -w 4 morss

README.md

@@ -73,35 +73,56 @@ morss accepts some arguments, to lightly alter the output of morss. Arguments
 may need to have a value (usually a string or a number). In the different "Use
 cases" below is detailed how to pass those arguments to morss.
 
-The arguments are:
-
-- Change what morss does
-	- `json`: output as JSON
-	- `html`: outpout as HTML
-	- `csv`: outpout as CSV
-	- `proxy`: doesn't fill the articles
-	- `clip`: stick the full article content under the original feed content (useful for twitter)
-	- `search=STRING`: does a basic case-sensitive search in the feed
-- Advanced
-	- `csv`: export to csv
-	- `indent`: returns indented XML or JSON, takes more place, but human-readable
-	- `nolink`: drop links, but keeps links' inner text
-	- `noref`: drop items' link
-	- `cache`: only take articles from the cache (ie. don't grab new articles' content), so as to save time
-	- `debug`: to have some feedback from the script execution. Useful for debugging
-	- `force`: force refetch the rss feed and articles
-	- `silent`: don't output the final RSS (useless on its own, but can be nice when debugging)
-	- `newest`: return the feed items in chronological order (morss ohterwise shows the items by appearing order)
-- http server only
-	- `callback=NAME`: for JSONP calls
-	- `cors`: allow Cross-origin resource sharing (allows XHR calls from other servers)
-	- `txt`: changes the http content-type to txt (for faster "`view-source:`")
-- Custom feeds: you can turn any HTML page into a RSS feed using morss, using xpath rules. The article content will be fetched as usual (with readabilite). Please note that you will have to **replace** any `/` in your rule with a `|` when using morss as a webserver
-	- `items`: (**mandatory** to activate the custom feeds function) xpath rule to match all the RSS entries
-	- `item_link`: xpath rule relative to `items` to point to the entry's link
-	- `item_title`: entry's title
-	- `item_content`: entry's description
-	- `item_time`: entry's date & time (accepts a wide range of time formats)
+The list of arguments can be obtained by running `morss --help`
+
+```
+usage: morss [-h] [--format {rss,json,html,csv}] [--search STRING] [--clip] [--indent] [--cache] [--force] [--proxy] [--newest] [--firstlink] [--items XPATH] [--item_link XPATH]
+             [--item_title XPATH] [--item_content XPATH] [--item_time XPATH] [--nolink] [--noref] [--silent]
+             url
+
+Get full-text RSS feeds
+
+positional arguments:
+  url                   feed url
+
+optional arguments:
+  -h, --help            show this help message and exit
+
+output:
+  --format {rss,json,html,csv}
+                        output format
+  --search STRING       does a basic case-sensitive search in the feed
+  --clip                stick the full article content under the original feed content (useful for twitter)
+  --indent              returns indented XML or JSON, takes more space, but human-readable
+
+action:
+  --cache               only take articles from the cache (ie. don't grab new articles' content), so as to save time
+  --force               force refetch the rss feed and articles
+  --proxy               doesn't fill the articles
+  --newest              return the feed items in chronological order (morss otherwise shows the items by appearing order)
+  --firstlink           pull the first article mentioned in the description instead of the default link
+
+custom feeds:
+  --items XPATH         (mandatory to activate the custom feeds function) xpath rule to match all the RSS entries
+  --item_link XPATH     xpath rule relative to items to point to the entry's link
+  --item_title XPATH    entry's title
+  --item_content XPATH  entry's content
+  --item_time XPATH     entry's date & time (accepts a wide range of time formats)
+
+misc:
+  --nolink              drop links, but keeps links' inner text
+  --noref               drop items' link
+  --silent              don't output the final RSS (useless on its own, but can be nice when debugging)
+
+GNU AGPLv3 code
+```
+
+Further options:
+- Environment variable `DEBUG=`: to have some feedback from the script execution. Useful for debugging. On Apache, can be set via the `SetEnv` instruction (see sample `.htaccess` provided).
+- `callback=NAME`: for JSONP calls
+- `cors`: allow Cross-origin resource sharing (allows XHR calls from other servers)
+- `txt`: changes the http content-type to txt (for faster "`view-source:`")
 
 ## Use cases

@@ -150,7 +171,7 @@ uwsgi --http :8080 --plugin python --wsgi-file main.py
 #### Using Gunicorn
 
 ```shell
-gunicorn morss:cgi_standalone_app
+gunicorn morss
 ```
 
 #### Using docker

@@ -162,12 +183,6 @@ docker build https://git.pictuga.com/pictuga/morss.git -t morss
 docker run -p 8080:8080 morss
 ```
 
-In one line
-
-```shell
-docker run -p 8080:8080 $(docker build -q https://git.pictuga.com/pictuga/morss.git)
-```
-
 With docker-compose:
 
 ```yml

@@ -193,7 +208,7 @@ without any argument, on port 8080.
 morss
 ```
 
-You can change the port like this `morss 9000`.
+You can change the port using environment variables like this `PORT=9000 morss`.
 
 #### Passing arguments
 
@@ -213,9 +228,9 @@ Works like a charm with [Tiny Tiny RSS](http://tt-rss.org/redmine/projects/tt-rs
 
 Run:
 ```
-morss [argwithoutvalue] [argwithvalue=value] [...] FEEDURL
+morss [--argwithoutvalue] [--argwithvalue=value] [...] FEEDURL
 ```
-For example: `morss debug http://feeds.bbci.co.uk/news/rss.xml`
+For example: `morss --debug http://feeds.bbci.co.uk/news/rss.xml`
 
 *(Brackets indicate optional text)*

@@ -275,13 +290,15 @@ output = morss.FeedFormat(rss, options, 'unicode') # formats final feed
 
 ## Cache information
 
-morss uses caching to make loading faster. There are 3 possible cache backends
-(visible in `morss/crawler.py`):
+morss uses caching to make loading faster. There are 3 possible cache backends,
+which can be picked via environment variables:
 
-- `{}`: a simple python in-memory dict() object
-- `SQLiteCache`: sqlite3 cache. Default file location is in-memory (i.e. it will
-be cleared every time the program is run
-- `MySQLCacheHandler`
+- `(nothing/default)`: a simple python in-memory dict() object.
+- `CACHE=sqlite`: sqlite3 cache. Default file location is in-memory (i.e. it
+will be cleared every time the program is run). Path can be defined with
+`SQLITE_PATH`.
+- `CACHE=mysql`: MySQL cache. Connection can be defined with the following
+environment variables: `MYSQL_USER`, `MYSQL_PWD`, `MYSQL_DB`, `MYSQL_HOST`
 
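Note: these backends are wired up at import time in `morss/crawler.py` (see its hunk further down). A minimal sketch of selecting them from the shell; the sqlite path, MySQL credentials, and feed URL are placeholders:

```shell
# persist the cache in a sqlite file instead of the default in-memory dict
CACHE=sqlite SQLITE_PATH=~/.cache/morss-cache.db morss http://feeds.bbci.co.uk/news/rss.xml

# or store it in MySQL
CACHE=mysql MYSQL_USER=morss MYSQL_PWD=secret MYSQL_DB=morss MYSQL_HOST=localhost morss http://feeds.bbci.co.uk/news/rss.xml
```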
 ## Configuration
 ### Length limitation

@@ -289,7 +306,7 @@ be cleared every time the program is run
 When parsing long feeds, with a lot of items (100+), morss might take a lot of
 time to parse it, or might even run into a memory overflow on some shared
 hosting plans (limits around 10Mb), in which case you might want to adjust the
-different values at the top of the script.
+below settings via environment variables.
 
 - `MAX_TIME` sets the maximum amount of time spent *fetching* articles, more time might be spent taking older articles from cache. `-1` for unlimited.
 - `MAX_ITEM` sets the maximum number of articles to fetch. `-1` for unlimited. More articles will be taken from cache following the next settings.
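A hedged sketch of overriding these limits for a single run, using the environment variables this changeset introduces (the values and feed URL are illustrative):

```shell
# allow up to 50 articles and 10 seconds of fetching
MAX_ITEM=50 MAX_TIME=10 morss http://feeds.bbci.co.uk/news/rss.xml

# or lift both limits entirely
MAX_ITEM=-1 MAX_TIME=-1 morss http://feeds.bbci.co.uk/news/rss.xml
```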

main.py

@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 
-from morss import main, cgi_standalone_app as application
+from morss.__main__ import main
+from morss.wsgi import application
 
 if __name__ == '__main__':
     main()
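Since `main.py` now re-exports `application` from `morss.wsgi`, the WSGI entry points line up with the README change above; a sketch of the equivalent ways to serve the app (gunicorn resolves a bare module name to its `application` attribute):

```shell
gunicorn morss                                          # loads the `application` re-exported by the package
uwsgi --http :8080 --plugin python --wsgi-file main.py  # same app, served through main.py
```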

morss/__init__.py

@@ -1,2 +1,3 @@
 # ran on `import morss`
 from .morss import *
+from .wsgi import application

morss/__main__.py

@@ -1,5 +1,54 @@
 # ran on `python -m morss`
-from .morss import main
+
+import os
+import sys
+
+from . import wsgi
+from . import cli
+
+from .morss import MorssException
+
+import wsgiref.simple_server
+import wsgiref.handlers
+
+
+PORT = int(os.getenv('PORT', 8080))
+
+
+def main():
+    if 'REQUEST_URI' in os.environ:
+        # mod_cgi (w/o file handler)
+
+        app = wsgi.cgi_app
+        app = wsgi.cgi_dispatcher(app)
+        app = wsgi.cgi_error_handler(app)
+        app = wsgi.cgi_encode(app)
+
+        wsgiref.handlers.CGIHandler().run(app)
+
+    elif len(sys.argv) <= 1:
+        # start internal (basic) http server (w/ file handler)
+
+        app = wsgi.cgi_app
+        app = wsgi.cgi_file_handler(app)
+        app = wsgi.cgi_dispatcher(app)
+        app = wsgi.cgi_error_handler(app)
+        app = wsgi.cgi_encode(app)
+
+        print('Serving http://localhost:%s/' % PORT)
+        httpd = wsgiref.simple_server.make_server('', PORT, app)
+        httpd.serve_forever()
+
+    else:
+        # as a CLI app
+        try:
+            cli.cli_app()
+
+        except (KeyboardInterrupt, SystemExit):
+            raise
+
+        except Exception as e:
+            print('ERROR: %s' % e)
+
 if __name__ == '__main__':
     main()
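`main()` above selects one of three modes from its environment; a sketch of triggering each (the feed URL is a placeholder):

```shell
python -m morss                              # no argument: built-in web server on PORT (default 8080)
PORT=9000 python -m morss                    # same server on another port
python -m morss http://example.com/feed.xml  # any argument: one-shot CLI mode
# under Apache mod_cgi, REQUEST_URI is present in the environment, so the CGI branch runs instead
```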

morss/cli.py

@@ -0,0 +1,51 @@
+import sys
+import os.path
+import argparse
+
+from .morss import FeedFetch, FeedGather, FeedFormat
+from .morss import Options
+
+
+def cli_app():
+    parser = argparse.ArgumentParser(
+        prog='morss',
+        description='Get full-text RSS feeds',
+        epilog='GNU AGPLv3 code'
+        )
+
+    parser.add_argument('url', help='feed url')
+
+    group = parser.add_argument_group('output')
+    group.add_argument('--format', default='rss', choices=('rss', 'json', 'html', 'csv'), help='output format')
+    group.add_argument('--search', action='store', type=str, metavar='STRING', help='does a basic case-sensitive search in the feed')
+    group.add_argument('--clip', action='store_true', help='stick the full article content under the original feed content (useful for twitter)')
+    group.add_argument('--indent', action='store_true', help='returns indented XML or JSON, takes more space, but human-readable')
+
+    group = parser.add_argument_group('action')
+    group.add_argument('--cache', action='store_true', help='only take articles from the cache (ie. don\'t grab new articles\' content), so as to save time')
+    group.add_argument('--force', action='store_true', help='force refetch the rss feed and articles')
+    group.add_argument('--proxy', action='store_true', help='doesn\'t fill the articles')
+    group.add_argument('--newest', action='store_true', help='return the feed items in chronological order (morss otherwise shows the items by appearing order)')
+    group.add_argument('--firstlink', action='store_true', help='pull the first article mentioned in the description instead of the default link')
+
+    group = parser.add_argument_group('custom feeds')
+    group.add_argument('--items', action='store', type=str, metavar='XPATH', help='(mandatory to activate the custom feeds function) xpath rule to match all the RSS entries')
+    group.add_argument('--item_link', action='store', type=str, metavar='XPATH', help='xpath rule relative to items to point to the entry\'s link')
+    group.add_argument('--item_title', action='store', type=str, metavar='XPATH', help='entry\'s title')
+    group.add_argument('--item_content', action='store', type=str, metavar='XPATH', help='entry\'s content')
+    group.add_argument('--item_time', action='store', type=str, metavar='XPATH', help='entry\'s date & time (accepts a wide range of time formats)')
+
+    group = parser.add_argument_group('misc')
+    group.add_argument('--nolink', action='store_true', help='drop links, but keeps links\' inner text')
+    group.add_argument('--noref', action='store_true', help='drop items\' link')
+    group.add_argument('--silent', action='store_true', help='don\'t output the final RSS (useless on its own, but can be nice when debugging)')
+
+    options = Options(vars(parser.parse_args()))
+    url = options.url
+
+    url, rss = FeedFetch(url, options)
+    rss = FeedGather(rss, url, options)
+    out = FeedFormat(rss, options, 'unicode')
+
+    if not options.silent:
+        print(out)
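A quick sketch of the resulting argparse interface, combining a few of the flags defined above (the feed URL and search string are placeholders):

```shell
morss --format=html --indent --newest --search=python http://example.com/feed.xml
```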

morss/crawler.py

@@ -1,3 +1,4 @@
+import os
 import sys
 
 import zlib

@@ -388,9 +389,6 @@ class HTTPRefreshHandler(BaseHandler):
     https_response = http_response
 
 
-default_cache = {}
-
-
 class CacheHandler(BaseHandler):
     " Cache based on etags/last-modified "
 

@@ -659,6 +657,22 @@ class MySQLCacheHandler(BaseCache):
             (url,) + value + value)
 
 
+if 'CACHE' in os.environ:
+    if os.environ['CACHE'] == 'mysql':
+        default_cache = MySQLCacheHandler(
+            user = os.getenv('MYSQL_USER'),
+            password = os.getenv('MYSQL_PWD'),
+            database = os.getenv('MYSQL_DB'),
+            host = os.getenv('MYSQL_HOST')
+        )
+
+    elif os.environ['CACHE'] == 'sqlite':
+        default_cache = SQLiteCache(os.getenv('SQLITE_PATH', ':memory:'))
+
+else:
+    default_cache = {}
+
+
 if __name__ == '__main__':
     req = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')

morss/morss.py

@@ -1,6 +1,4 @@
-import sys
 import os
-import os.path
 
 import time
 from datetime import datetime

@@ -16,56 +14,39 @@ from . import feeds
 from . import crawler
 from . import readabilite
 
-import wsgiref.simple_server
-import wsgiref.handlers
-import cgitb
-
-
 try:
     # python 2
     from httplib import HTTPException
     from urllib import unquote
     from urlparse import urlparse, urljoin, parse_qs
 except ImportError:
     # python 3
     from http.client import HTTPException
     from urllib.parse import unquote
     from urllib.parse import urlparse, urljoin, parse_qs
 
-MAX_ITEM = 5 # cache-only beyond
-MAX_TIME = 2 # cache-only after (in sec)
-
-LIM_ITEM = 10 # deletes what's beyond
-LIM_TIME = 2.5 # deletes what's after
-
-DELAY = 10 * 60 # xml cache & ETag cache (in sec)
-TIMEOUT = 4 # http timeout (in sec)
-
-DEBUG = False
-PORT = 8080
-
-
-def filterOptions(options):
-    return options
-
-    # example of filtering code below
-
-    #allowed = ['proxy', 'clip', 'cache', 'force', 'silent', 'pro', 'debug']
-    #filtered = dict([(key,value) for (key,value) in options.items() if key in allowed])
-
-    #return filtered
+MAX_ITEM = int(os.getenv('MAX_ITEM', 5)) # cache-only beyond
+MAX_TIME = int(os.getenv('MAX_TIME', 2)) # cache-only after (in sec)
+
+LIM_ITEM = int(os.getenv('LIM_ITEM', 10)) # deletes what's beyond
+LIM_TIME = int(os.getenv('LIM_TIME', 2.5)) # deletes what's after
+
+DELAY = int(os.getenv('DELAY', 10 * 60)) # xml cache & ETag cache (in sec)
+TIMEOUT = int(os.getenv('TIMEOUT', 4)) # http timeout (in sec)
 
 
 class MorssException(Exception):
     pass
 
 
-def log(txt, force=False):
-    if DEBUG or force:
+def log(txt):
+    if 'DEBUG' in os.environ:
         if 'REQUEST_URI' in os.environ:
+            # when running on Apache
             open('morss.log', 'a').write("%s\n" % repr(txt))
 
         else:
+            # when using internal server or cli
             print(repr(txt))

@@ -107,29 +88,6 @@ class Options:
         return key in self.options
 
 
-def parseOptions(options):
-    """ Turns ['md=True'] into {'md':True} """
-    out = {}
-
-    for option in options:
-        split = option.split('=', 1)
-
-        if len(split) > 1:
-            if split[0].lower() == 'true':
-                out[split[0]] = True
-
-            elif split[0].lower() == 'false':
-                out[split[0]] = False
-
-            else:
-                out[split[0]] = split[1]
-
-        else:
-            out[split[0]] = True
-
-    return out
-
-
 def ItemFix(item, options, feedurl='/'):
     """ Improves feed items (absolute links, resolve feedburner links, etc) """
 
|
@ -399,24 +357,24 @@ def FeedFormat(rss, options, encoding='utf-8'):
|
|||
else:
|
||||
raise MorssException('Invalid callback var name')
|
||||
|
||||
elif options.json:
|
||||
elif options.format == 'json':
|
||||
if options.indent:
|
||||
return rss.tojson(encoding=encoding, indent=4)
|
||||
|
||||
else:
|
||||
return rss.tojson(encoding=encoding)
|
||||
|
||||
elif options.csv:
|
||||
elif options.format == 'csv':
|
||||
return rss.tocsv(encoding=encoding)
|
||||
|
||||
elif options.html:
|
||||
elif options.format == 'html':
|
||||
if options.indent:
|
||||
return rss.tohtml(encoding=encoding, pretty_print=True)
|
||||
|
||||
else:
|
||||
return rss.tohtml(encoding=encoding)
|
||||
|
||||
else:
|
||||
else: # i.e. format == 'rss'
|
||||
if options.indent:
|
||||
return rss.torss(xml_declaration=(not encoding == 'unicode'), encoding=encoding, pretty_print=True)
|
||||
|
||||
|

@@ -437,299 +395,3 @@ def process(url, cache=None, options=None):
     rss = FeedGather(rss, url, options)
 
     return FeedFormat(rss, options, 'unicode')
-
-
-def cgi_parse_environ(environ):
-    # get options
-
-    if 'REQUEST_URI' in environ:
-        url = environ['REQUEST_URI'][1:]
-    else:
-        url = environ['PATH_INFO'][1:]
-
-    if environ['QUERY_STRING']:
-        url += '?' + environ['QUERY_STRING']
-
-    url = re.sub(r'^/?(cgi/)?(morss.py|main.py)/', '', url)
-
-    if url.startswith(':'):
-        split = url.split('/', 1)
-
-        raw_options = unquote(split[0]).replace('|', '/').replace('\\\'', '\'').split(':')[1:]
-
-        if len(split) > 1:
-            url = split[1]
-        else:
-            url = ''
-
-    else:
-        raw_options = []
-
-    # init
-    options = Options(filterOptions(parseOptions(raw_options)))
-
-    global DEBUG
-    DEBUG = options.debug
-
-    return (url, options)
-
-
-def cgi_app(environ, start_response):
-    url, options = cgi_parse_environ(environ)
-
-    headers = {}
-
-    # headers
-    headers['status'] = '200 OK'
-    headers['cache-control'] = 'max-age=%s' % DELAY
-    headers['x-content-type-options'] = 'nosniff' # safari work around
-
-    if options.cors:
-        headers['access-control-allow-origin'] = '*'
-
-    if options.html:
-        headers['content-type'] = 'text/html'
-    elif options.txt or options.silent:
-        headers['content-type'] = 'text/plain'
-    elif options.json:
-        headers['content-type'] = 'application/json'
-    elif options.callback:
-        headers['content-type'] = 'application/javascript'
-    elif options.csv:
-        headers['content-type'] = 'text/csv'
-        headers['content-disposition'] = 'attachment; filename="feed.csv"'
-    else:
-        headers['content-type'] = 'text/xml'
-
-    headers['content-type'] += '; charset=utf-8'
-
-    crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))
-
-    # get the work done
-    url, rss = FeedFetch(url, options)
-
-    start_response(headers['status'], list(headers.items()))
-
-    rss = FeedGather(rss, url, options)
-    out = FeedFormat(rss, options)
-
-    if options.silent:
-        return ['']
-
-    else:
-        return [out]
-
-
-def middleware(func):
-    " Decorator to turn a function into a wsgi middleware "
-    # This is called when parsing the "@middleware" code
-
-    def app_builder(app):
-        # This is called when doing app = cgi_wrapper(app)
-
-        def app_wrap(environ, start_response):
-            # This is called when a http request is being processed
-
-            return func(environ, start_response, app)
-
-        return app_wrap
-
-    return app_builder
-
-
-@middleware
-def cgi_file_handler(environ, start_response, app):
-    " Simple HTTP server to serve static files (.html, .css, etc.) "
-
-    files = {
-        '': 'text/html',
-        'index.html': 'text/html',
-        'sheet.xsl': 'text/xsl'}
-
-    if 'REQUEST_URI' in environ:
-        url = environ['REQUEST_URI'][1:]
-
-    else:
-        url = environ['PATH_INFO'][1:]
-
-    if url in files:
-        headers = {}
-
-        if url == '':
-            url = 'index.html'
-
-        paths = [os.path.join(sys.prefix, 'share/morss/www', url),
-                 os.path.join(os.path.dirname(__file__), '../www', url)]
-
-        for path in paths:
-            try:
-                body = open(path, 'rb').read()
-
-                headers['status'] = '200 OK'
-                headers['content-type'] = files[url]
-                start_response(headers['status'], list(headers.items()))
-                return [body]
-
-            except IOError:
-                continue
-
-        else:
-            # the for loop did not return, so here we are, i.e. no file found
-            headers['status'] = '404 Not found'
-            start_response(headers['status'], list(headers.items()))
-            return ['Error %s' % headers['status']]
-
-    else:
-        return app(environ, start_response)
-
-
-def cgi_get(environ, start_response):
-    url, options = cgi_parse_environ(environ)
-
-    # get page
-    req = crawler.adv_get(url=url, timeout=TIMEOUT)
-
-    if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']:
-        if options.get == 'page':
-            html = readabilite.parse(req['data'], encoding=req['encoding'])
-            html.make_links_absolute(req['url'])
-
-            kill_tags = ['script', 'iframe', 'noscript']
-
-            for tag in kill_tags:
-                for elem in html.xpath('//'+tag):
-                    elem.getparent().remove(elem)
-
-            output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')
-
-        elif options.get == 'article':
-            output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
-
-        else:
-            raise MorssException('no :get option passed')
-
-    else:
-        output = req['data']
-
-    # return html page
-    headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8', 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse
-    start_response(headers['status'], list(headers.items()))
-    return [output]
-
-
-dispatch_table = {
-    'get': cgi_get,
-    }
-
-
-@middleware
-def cgi_dispatcher(environ, start_response, app):
-    url, options = cgi_parse_environ(environ)
-
-    for key in dispatch_table.keys():
-        if key in options:
-            return dispatch_table[key](environ, start_response)
-
-    return app(environ, start_response)
-
-
-@middleware
-def cgi_error_handler(environ, start_response, app):
-    try:
-        return app(environ, start_response)
-
-    except (KeyboardInterrupt, SystemExit):
-        raise
-
-    except Exception as e:
-        headers = {'status': '500 Oops', 'content-type': 'text/html'}
-        start_response(headers['status'], list(headers.items()), sys.exc_info())
-        log('ERROR: %s' % repr(e), force=True)
-        return [cgitb.html(sys.exc_info())]
-
-
-@middleware
-def cgi_encode(environ, start_response, app):
-    out = app(environ, start_response)
-    return [x if isinstance(x, bytes) else str(x).encode('utf-8') for x in out]
-
-
-cgi_standalone_app = cgi_encode(cgi_error_handler(cgi_dispatcher(cgi_file_handler(cgi_app))))
-
-
-def cli_app():
-    options = Options(filterOptions(parseOptions(sys.argv[1:-1])))
-    url = sys.argv[-1]
-
-    global DEBUG
-    DEBUG = options.debug
-
-    crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))
-
-    url, rss = FeedFetch(url, options)
-    rss = FeedGather(rss, url, options)
-    out = FeedFormat(rss, options, 'unicode')
-
-    if not options.silent:
-        print(out)
-
-    log('done')
-
-
-def isInt(string):
-    try:
-        int(string)
-        return True
-
-    except ValueError:
-        return False
-
-
-def main():
-    if 'REQUEST_URI' in os.environ:
-        # mod_cgi
-
-        app = cgi_app
-        app = cgi_dispatcher(app)
-        app = cgi_error_handler(app)
-        app = cgi_encode(app)
-
-        wsgiref.handlers.CGIHandler().run(app)
-
-    elif len(sys.argv) <= 1 or isInt(sys.argv[1]):
-        # start internal (basic) http server
-
-        if len(sys.argv) > 1 and isInt(sys.argv[1]):
-            argPort = int(sys.argv[1])
-            if argPort > 0:
-                port = argPort
-
-            else:
-                raise MorssException('Port must be positive integer')
-
-        else:
-            port = PORT
-
-        app = cgi_app
-        app = cgi_file_handler(app)
-        app = cgi_dispatcher(app)
-        app = cgi_error_handler(app)
-        app = cgi_encode(app)
-
-        print('Serving http://localhost:%s/' % port)
-        httpd = wsgiref.simple_server.make_server('', port, app)
-        httpd.serve_forever()
-
-    else:
-        # as a CLI app
-        try:
-            cli_app()
-
-        except (KeyboardInterrupt, SystemExit):
-            raise
-
-        except Exception as e:
-            print('ERROR: %s' % e.message)
-
-if __name__ == '__main__':
-    main()

morss/readabilite.py

@@ -125,7 +125,7 @@ def score_node(node):
 
     if wc != 0:
         wca = count_words(' '.join([x.text_content() for x in node.findall('.//a')]))
-        score = score * ( 1 - float(wca)/wc )
+        score = score * ( 1 - 2 * float(wca)/wc )
 
     return score
 
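The doubled coefficient makes link-heavy nodes score much worse: with half of a node's words inside links (wca/wc = 0.5), the old factor was 1 - 0.5 = 0.5, while the new one is 1 - 2 * 0.5 = 0, and anything more link-dense now scores negative.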

morss/wsgi.py

@@ -0,0 +1,257 @@
+import sys
+import os.path
+import re
+import lxml.etree
+
+import cgitb
+
+try:
+    # python 2
+    from urllib import unquote
+except ImportError:
+    # python 3
+    from urllib.parse import unquote
+
+from . import crawler
+from . import readabilite
+from .morss import FeedFetch, FeedGather, FeedFormat
+from .morss import Options, log, TIMEOUT, DELAY, MorssException
+
+from . import cred
+
+
+def parse_options(options):
+    """ Turns ['md=True'] into {'md':True} """
+    out = {}
+
+    for option in options:
+        split = option.split('=', 1)
+
+        if len(split) > 1:
+            out[split[0]] = split[1]
+
+        else:
+            out[split[0]] = True
+
+    return out
+
+
+def cgi_parse_environ(environ):
+    # get options
+
+    if 'REQUEST_URI' in environ:
+        # when running on Apache
+        url = environ['REQUEST_URI'][1:]
+
+    else:
+        # when using internal server
+        url = environ['PATH_INFO'][1:]
+
+    if environ['QUERY_STRING']:
+        url += '?' + environ['QUERY_STRING']
+
+    url = re.sub(r'^/?(cgi/)?(morss.py|main.py)/', '', url)
+
+    if url.startswith(':'):
+        split = url.split('/', 1)
+
+        raw_options = unquote(split[0]).replace('|', '/').replace('\\\'', '\'').split(':')[1:]
+
+        if len(split) > 1:
+            url = split[1]
+
+        else:
+            url = ''
+
+    else:
+        raw_options = []
+
+    # init
+    options = Options(parse_options(raw_options))
+
+    return (url, options)
+
+
+def cgi_app(environ, start_response):
+    url, options = cgi_parse_environ(environ)
+
+    headers = {}
+
+    # headers
+    headers['status'] = '200 OK'
+    headers['cache-control'] = 'max-age=%s' % DELAY
+    headers['x-content-type-options'] = 'nosniff' # safari work around
+
+    if options.cors:
+        headers['access-control-allow-origin'] = '*'
+
+    if options.format == 'html':
+        headers['content-type'] = 'text/html'
+    elif options.txt or options.silent:
+        headers['content-type'] = 'text/plain'
+    elif options.format == 'json':
+        headers['content-type'] = 'application/json'
+    elif options.callback:
+        headers['content-type'] = 'application/javascript'
+    elif options.format == 'csv':
+        headers['content-type'] = 'text/csv'
+        headers['content-disposition'] = 'attachment; filename="feed.csv"'
+    else:
+        headers['content-type'] = 'text/xml'
+
+    headers['content-type'] += '; charset=utf-8'
+
+    # get the work done
+    url, rss = FeedFetch(url, options)
+
+    start_response(headers['status'], list(headers.items()))
+
+    rss = FeedGather(rss, url, options)
+    out = FeedFormat(rss, options)
+
+    if options.silent:
+        return ['']
+
+    else:
+        return [out]
+
+
+def middleware(func):
+    " Decorator to turn a function into a wsgi middleware "
+    # This is called when parsing the "@middleware" code
+
+    def app_builder(app):
+        # This is called when doing app = cgi_wrapper(app)
+
+        def app_wrap(environ, start_response):
+            # This is called when a http request is being processed
+
+            return func(environ, start_response, app)
+
+        return app_wrap
+
+    return app_builder
+
+
+@middleware
+def cgi_file_handler(environ, start_response, app):
+    " Simple HTTP server to serve static files (.html, .css, etc.) "
+
+    files = {
+        '': 'text/html',
+        'index.html': 'text/html',
+        'sheet.xsl': 'text/xsl'}
+
+    if 'REQUEST_URI' in environ:
+        url = environ['REQUEST_URI'][1:]
+
+    else:
+        url = environ['PATH_INFO'][1:]
+
+    if url in files:
+        headers = {}
+
+        if url == '':
+            url = 'index.html'
+
+        paths = [os.path.join(sys.prefix, 'share/morss/www', url),
+                 os.path.join(os.path.dirname(__file__), '../www', url)]
+
+        for path in paths:
+            try:
+                body = open(path, 'rb').read()
+
+                headers['status'] = '200 OK'
+                headers['content-type'] = files[url]
+                start_response(headers['status'], list(headers.items()))
+                return [body]
+
+            except IOError:
+                continue
+
+        else:
+            # the for loop did not return, so here we are, i.e. no file found
+            headers['status'] = '404 Not found'
+            start_response(headers['status'], list(headers.items()))
+            return ['Error %s' % headers['status']]
+
+    else:
+        return app(environ, start_response)
+
+
+def cgi_get(environ, start_response):
+    url, options = cgi_parse_environ(environ)
+
+    # get page
+    req = crawler.adv_get(url=url, timeout=TIMEOUT)
+
+    if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']:
+        if options.get == 'page':
+            html = readabilite.parse(req['data'], encoding=req['encoding'])
+            html.make_links_absolute(req['url'])
+
+            kill_tags = ['script', 'iframe', 'noscript']
+
+            for tag in kill_tags:
+                for elem in html.xpath('//'+tag):
+                    elem.getparent().remove(elem)
+
+            output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')
+
+        elif options.get == 'article':
+            output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
+
+        else:
+            raise MorssException('no :get option passed')
+
+    else:
+        output = req['data']
+
+    # return html page
+    headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8', 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse
+    start_response(headers['status'], list(headers.items()))
+    return [output]
+
+
+dispatch_table = {
+    'get': cgi_get,
+    }
+
+
+@middleware
+def cgi_dispatcher(environ, start_response, app):
+    url, options = cgi_parse_environ(environ)
+
+    for key in dispatch_table.keys():
+        if key in options:
+            return dispatch_table[key](environ, start_response)
+
+    return app(environ, start_response)
+
+
+@middleware
+def cgi_error_handler(environ, start_response, app):
+    try:
+        return app(environ, start_response)
+
+    except (KeyboardInterrupt, SystemExit):
+        raise
+
+    except Exception as e:
+        headers = {'status': '500 Oops', 'content-type': 'text/html'}
+        start_response(headers['status'], list(headers.items()), sys.exc_info())
+        log('ERROR: %s' % repr(e))
+        return [cgitb.html(sys.exc_info())]
+
+
+@middleware
+def cgi_encode(environ, start_response, app):
+    out = app(environ, start_response)
+    return [x if isinstance(x, bytes) else str(x).encode('utf-8') for x in out]
+
+
+application = cgi_app
+application = cgi_file_handler(application)
+application = cgi_dispatcher(application)
+application = cgi_error_handler(application)
+application = cgi_encode(application)
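The chain built above means an incoming request passes through `cgi_encode`, then `cgi_error_handler`, then `cgi_dispatcher` (which can short-circuit to `cgi_get` when a `:get` option is present), then `cgi_file_handler` (which can serve a static file), and only then the feed-processing `cgi_app`; the response travels back out through the same layers, so every body ends up byte-encoded and every exception is rendered by the error handler.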

.htaccess

@@ -4,6 +4,12 @@ ErrorDocument 403 "Access forbidden"
 ErrorDocument 404 /cgi/main.py
 ErrorDocument 500 "A very nasty bug found his way onto this very server"
 
+# Uncomment below line to turn debug on for all requests
+#SetEnv DEBUG 1
+
+# Uncomment below line to turn debug on for requests with :debug in the url
+#SetEnvIf Request_URI :debug DEBUG 1
+
 <Files ~ "\.(py|pyc|db|log)$">
 	deny from all
 </Files>

www/sheet.xsl

@@ -18,12 +18,18 @@
 <meta name="robots" content="noindex" />
 
 <style type="text/css">
 	body * {
 		box-sizing: border-box;
 	}
 
 	body {
+		overflow-wrap: anywhere;
+		word-wrap: anywhere;
+		word-break: break-word;
+
 		font-family: sans-serif;
+
+		-webkit-tap-highlight-color: transparent; /* safari work around */
 	}
 
 	input, select {

@@ -133,6 +139,10 @@
 		padding: 1%;
 	}
 
+	.item > *:empty {
+		display: none;
+	}
+
 	.item > :not(:last-child) {
 		border-bottom: 1px solid silver;
 	}

@@ -221,7 +231,7 @@
 <div id="content">
 	<xsl:for-each select="rdf:RDF/rssfake:channel/rssfake:item|rss/channel/item|atom:feed/atom:entry|atom03:feed/atom03:entry">
 		<div class="item" dir="auto">
-			<a href="/" target="_blank"><xsl:attribute name="href"><xsl:value-of select="rssfake:link|link|atom:link/@href|atom03:link/@href"/></xsl:attribute>
+			<a target="_blank"><xsl:attribute name="href"><xsl:value-of select="rssfake:link|link|atom:link/@href|atom03:link/@href"/></xsl:attribute>
 				<xsl:value-of select="rssfake:title|title|atom:title|atom03:title"/>
 			</a>
 

@@ -242,7 +252,7 @@
 
 	if (!/:html/.test(window.location.href))
 		for (var content of document.querySelectorAll(".desc,.content"))
-			content.innerHTML = (content.innerText.match(/>/g) || []).length > 10 ? content.innerText : content.innerHTML
+			content.innerHTML = (content.innerText.match(/>/g) || []).length > 3 ? content.innerText : content.innerHTML
 
 	var options = parse_location()[0]
 