Compare commits

...

5 Commits

Author SHA1 Message Date
pictuga 41a63900c2 README: improve docker instructions 2020-04-19 13:01:08 +02:00
pictuga ec8edb02f1 Various small bug fixes 2020-04-19 12:54:02 +02:00
pictuga d01b943597 Remove leftover threading var 2020-04-19 12:51:11 +02:00
pictuga b361aa2867 Add timeout to :get 2020-04-19 12:50:26 +02:00
pictuga 4ce3c7cb32 Small code clean ups 2020-04-19 12:50:05 +02:00
6 changed files with 15 additions and 16 deletions

View File

@ -108,7 +108,6 @@ morss will auto-detect what "mode" to use.
For this, you'll want to slightly change the layout of the files, for example
into something like this:
```
/
├── cgi
@ -151,20 +150,19 @@ gunicorn morss:cgi_standalone_app
#### Using docker
Build
Build & run
```shell
docker build https://git.pictuga.com/pictuga/morss.git
docker build https://git.pictuga.com/pictuga/morss.git -t morss
docker run -p 8080:8080 morss
```
Run & Build in one go
In one line
```shell
docker run -p 8080:8080 $(docker build -q https://git.pictuga.com/pictuga/morss.git)
```
It will run on port 8080 by default.
#### Using morss' internal HTTP server
Morss can run its own HTTP server. The latter should start when you run morss
@ -256,9 +254,10 @@ output = morss.Format(rss, options) # formats final feed
## Cache information
morss uses caching to make loading faster. There are 2 possible cache backends
morss uses caching to make loading faster. There are 3 possible cache backends
(visible in `morss/crawler.py`):
- `{}`: a simple python in-memory dict() object
- `SQLiteCache`: sqlite3 cache. Default file location is in-memory (i.e. it will
be cleared every time the program is run)
- `MySQLCacheHandler`

View File

@ -72,7 +72,6 @@ def custom_handler(follow=None, delay=None, encoding=None):
handlers.append(HTTPRefreshHandler())
handlers.append(UAHandler(DEFAULT_UA))
handlers.append(BrowserlyHeaderHandler())
handlers.append(EncodingFixHandler(encoding))
if follow:
@ -466,6 +465,8 @@ class CacheHandler(BaseHandler):
class BaseCache:
""" Subclasses must behave like a dict """
def __contains__(self, url):
try:
self[url]

View File

@ -102,7 +102,7 @@ item_link = ./a/@href
item_desc = ./div[class=desc]
item_content = ./div[class=content]
base = file:www/sheet.xsl
base = file:sheet.xsl
[twitter]
mode = html

View File

@ -85,7 +85,7 @@ def parse(data, url=None, mimetype=None, encoding=None):
for path in ruleset['path']:
if fnmatch(url, path):
parser = [x for x in parsers if x.mode == ruleset['mode']][0]
return parser(data, ruleset, encoding=encoding)
return parser(data, ruleset, encoding=encoding)
# 2) Try each and every parser

View File

@ -40,7 +40,6 @@ LIM_TIME = 2.5 # deletes what's after
DELAY = 10 * 60 # xml cache & ETag cache (in sec)
TIMEOUT = 4 # http timeout (in sec)
THREADS = MAX_ITEM # number of threads (1 for single-threaded)
DEBUG = False
PORT = 8080
@ -137,7 +136,7 @@ def ItemFix(item, feedurl='/'):
""" Improves feed items (absolute links, resolve feedburner links, etc) """
# check unwanted uppercase title
if len(item.title) > 20 and item.title.isupper():
if item.title is not None and len(item.title) > 20 and item.title.isupper():
item.title = item.title.title()
# check if it includes link
@ -200,7 +199,7 @@ def ItemFix(item, feedurl='/'):
# reddit
if urlparse(feedurl).netloc == 'www.reddit.com':
match = lxml.html.fromstring(item.desc).xpath('//a[text()="[link]"]/@href')
match = lxml.html.fromstring(item.content).xpath('//a[text()="[link]"]/@href')
if len(match):
item.link = match[0]
log(item.link)
@ -550,7 +549,7 @@ def cgi_app(environ, start_response):
def middleware(func):
" Decorator to turn a function into a wsgi middleware "
# This is called when parsing the code
# This is called when parsing the "@middleware" code
def app_builder(app):
# This is called when doing app = cgi_wrapper(app)
@ -620,7 +619,7 @@ def cgi_get(environ, start_response):
if urlparse(url).scheme not in ['http', 'https']:
url = 'http://' + url
data, con, contenttype, encoding = crawler.adv_get(url=url)
data, con, contenttype, encoding = crawler.adv_get(url=url, timeout=TIMEOUT)
if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
if options.get == 'page':

View File

@ -137,7 +137,7 @@ def score_all(node):
for child in node:
score = score_node(child)
child.attrib['seen'] = 'yes, ' + str(int(score))
child.attrib['morss_own_score'] = str(float(score))
if score > 0 or len(list(child.iterancestors())) <= 2:
spread_score(child, score)