Compare commits


No commits in common. "41a63900c286140303c9863f7060ed42e21049f3" and "7e45b2611dcfa0d88ec02278346c7290306849c8" have entirely different histories.

6 changed files with 16 additions and 15 deletions

View File

@@ -108,6 +108,7 @@ morss will auto-detect what "mode" to use.
 For this, you'll want to change a bit the architecture of the files, for example
 into something like this.
 ```
 /
 ├── cgi
@@ -150,19 +151,20 @@ gunicorn morss:cgi_standalone_app
 #### Using docker
-Build & run
+Build
 ```shell
-docker build https://git.pictuga.com/pictuga/morss.git -t morss
-docker run -p 8080:8080 morss
+docker build https://git.pictuga.com/pictuga/morss.git
 ```
-In one line
+Run & Build in one go
 ```shell
 docker run -p 8080:8080 $(docker build -q https://git.pictuga.com/pictuga/morss.git)
 ```
+It will run on port 8080 by default
 #### Using morss' internal HTTP server
 Morss can run its own HTTP server. The later should start when you run morss
@@ -254,10 +256,9 @@ output = morss.Format(rss, options) # formats final feed
 ## Cache information
-morss uses caching to make loading faster. There are 3 possible cache backends
+morss uses caching to make loading faster. There are 2 possible cache backends
 (visible in `morss/crawler.py`):
-- `{}`: a simple python in-memory dict() object
 - `SQLiteCache`: sqlite3 cache. Default file location is in-memory (i.e. it will
 be cleared every time the program is run
 - `MySQLCacheHandler`
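An aside on the "default file location is in-memory" remark above: it is the standard sqlite3 behaviour. The sketch below uses the sqlite3 standard library directly rather than morss's own `SQLiteCache` class (whose constructor arguments are not shown in this diff), and the `cache.db` file name is made up for the example.

```python
# Illustration only: an ':memory:' database vanishes when the process exits,
# whereas a path-backed one persists between runs.
import sqlite3

volatile = sqlite3.connect(':memory:')    # cleared every time the program is run
persistent = sqlite3.connect('cache.db')  # hypothetical file name, survives restarts

for con in (volatile, persistent):
    con.execute('CREATE TABLE IF NOT EXISTS cache (url TEXT PRIMARY KEY, data BLOB)')
    con.commit()
```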

View File

@@ -72,6 +72,7 @@ def custom_handler(follow=None, delay=None, encoding=None):
     handlers.append(HTTPRefreshHandler())
     handlers.append(UAHandler(DEFAULT_UA))
+    handlers.append(BrowserlyHeaderHandler())
     handlers.append(EncodingFixHandler(encoding))
     if follow:
@@ -465,8 +466,6 @@ class CacheHandler(BaseHandler):
 class BaseCache:
-    """ Subclasses must behave like a dict """
     def __contains__(self, url):
         try:
             self[url]
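The docstring and the `__contains__` shown above spell out the cache contract: a backend only needs to behave like a dict. A minimal sketch of a subclass satisfying that contract, assuming nothing about `BaseCache` beyond the lines visible here; the `except`/`else` branches of `__contains__` are guessed (only the `try: self[url]` part is in the diff), and `DictCache` is a hypothetical name.

```python
class BaseCache:
    """ Subclasses must behave like a dict """

    def __contains__(self, url):
        try:
            self[url]
        except KeyError:
            return False
        else:
            return True


class DictCache(BaseCache):
    """ Hypothetical in-memory backend: a plain dict does the job """

    def __init__(self):
        self.data = {}

    def __getitem__(self, url):
        return self.data[url]       # KeyError on a miss makes `url in cache` False

    def __setitem__(self, url, value):
        self.data[url] = value


cache = DictCache()
cache['http://example.com/feed'] = b'<rss/>'
assert 'http://example.com/feed' in cache
assert 'http://example.com/other' not in cache
```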

View File

@@ -102,7 +102,7 @@ item_link = ./a/@href
 item_desc = ./div[class=desc]
 item_content = ./div[class=content]
-base = file:sheet.xsl
+base = file:www/sheet.xsl
 [twitter]
 mode = html
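These rulesets are plain INI sections. A minimal sketch of reading one with the standard configparser module; the section name and key are copied from the hunk above, the rules are read from a string only to keep the snippet runnable, and this is not necessarily how morss itself loads them.

```python
import configparser

rules = """
[twitter]
mode = html
"""

config = configparser.RawConfigParser()
config.read_string(rules)

print(config['twitter']['mode'])   # html
```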

View File

@@ -85,7 +85,7 @@ def parse(data, url=None, mimetype=None, encoding=None):
         for path in ruleset['path']:
             if fnmatch(url, path):
                 parser = [x for x in parsers if x.mode == ruleset['mode']][0]
                 return parser(data, ruleset, encoding=encoding)
     # 2) Try each and every parser
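The two middle lines of this hunk are the heart of the ruleset lookup: the feed URL is matched against each ruleset's path globs with `fnmatch`, and the first hit picks the parser whose `mode` matches. A self-contained sketch of that selection logic; `FeedHTML`, `FeedXML`, the rulesets and `pick_parser` are dummy stand-ins, not morss's real classes.

```python
from fnmatch import fnmatch

class FeedHTML:
    mode = 'html'
    def __init__(self, data, ruleset, encoding=None):
        self.data, self.ruleset, self.encoding = data, ruleset, encoding

class FeedXML:
    mode = 'xml'
    def __init__(self, data, ruleset, encoding=None):
        self.data, self.ruleset, self.encoding = data, ruleset, encoding

parsers = [FeedHTML, FeedXML]

rulesets = [
    {'mode': 'html', 'path': ['http://twitter.com/*', 'https://twitter.com/*']},
    {'mode': 'xml',  'path': ['*://*/feed.xml']},
]

def pick_parser(data, url, encoding=None):
    # 1) Look for a ruleset whose path glob matches the url (as in parse() above)
    for ruleset in rulesets:
        for path in ruleset['path']:
            if fnmatch(url, path):
                parser = [x for x in parsers if x.mode == ruleset['mode']][0]
                return parser(data, ruleset, encoding=encoding)
    return None

print(type(pick_parser('<html/>', 'https://twitter.com/someone')))  # FeedHTML
```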

View File

@@ -40,6 +40,7 @@ LIM_TIME = 2.5 # deletes what's after
 DELAY = 10 * 60 # xml cache & ETag cache (in sec)
 TIMEOUT = 4 # http timeout (in sec)
+THREADS = MAX_ITEM # number of threads (1 for single-threaded)
 DEBUG = False
 PORT = 8080
@@ -136,7 +137,7 @@ def ItemFix(item, feedurl='/'):
     """ Improves feed items (absolute links, resolve feedburner links, etc) """
     # check unwanted uppercase title
-    if item.title is not None and len(item.title) > 20 and item.title.isupper():
+    if len(item.title) > 20 and item.title.isupper():
         item.title = item.title.title()
     # check if it includes link
@@ -199,7 +200,7 @@ def ItemFix(item, feedurl='/'):
     # reddit
     if urlparse(feedurl).netloc == 'www.reddit.com':
-        match = lxml.html.fromstring(item.content).xpath('//a[text()="[link]"]/@href')
+        match = lxml.html.fromstring(item.desc).xpath('//a[text()="[link]"]/@href')
         if len(match):
             item.link = match[0]
             log(item.link)
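The reddit fix above pulls the real target out of the entry's HTML by looking for the anchor whose text is literally "[link]". A standalone illustration of that XPath, assuming lxml is available (it already is for morss); the markup is made up, the XPath is the one used in ItemFix.

```python
import lxml.html

# Made-up snippet shaped like a reddit entry body
content = '''
<div>
  <a href="https://example.com/article">[link]</a>
  <a href="https://www.reddit.com/r/sub/comments/abc/">[comments]</a>
</div>
'''

match = lxml.html.fromstring(content).xpath('//a[text()="[link]"]/@href')

if len(match):
    print(match[0])  # https://example.com/article
```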
@@ -549,7 +550,7 @@ def cgi_app(environ, start_response):
 def middleware(func):
     " Decorator to turn a function into a wsgi middleware "
-    # This is called when parsing the "@middleware" code
+    # This is called when parsing the code
     def app_builder(app):
         # This is called when doing app = cgi_wrapper(app)
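The comments in this hunk describe a three-stage decorator: `middleware` runs when the decorated function is defined, `app_builder` runs when the wrapped app is built, and the innermost function runs per request. A generic, runnable sketch of that WSGI pattern, not morss's actual implementation (only the two comments above are taken from it); `log_requests`, `hello_app` and the `func(environ, start_response, app)` call order are assumptions for the example.

```python
def middleware(func):
    " Decorator to turn a function into a wsgi middleware "
    # This is called when parsing the "@middleware" code

    def app_builder(app):
        # This is called when doing app = some_middleware(app)

        def app_wrap(environ, start_response):
            # This is called on every request
            return func(environ, start_response, app)

        return app_wrap

    return app_builder


@middleware
def log_requests(environ, start_response, app):
    # Hypothetical middleware: print the path, then defer to the wrapped app
    print(environ.get('PATH_INFO', '/'))
    return app(environ, start_response)


def hello_app(environ, start_response):
    start_response('200 OK', [('Content-Type', 'text/plain')])
    return [b'hello']


application = log_requests(hello_app)
```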
@@ -619,7 +620,7 @@ def cgi_get(environ, start_response):
     if urlparse(url).scheme not in ['http', 'https']:
         url = 'http://' + url
-    data, con, contenttype, encoding = crawler.adv_get(url=url, timeout=TIMEOUT)
+    data, con, contenttype, encoding = crawler.adv_get(url=url)
     if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
         if options.get == 'page':
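Both sides of the hunk agree on what `crawler.adv_get` hands back: a `(data, con, contenttype, encoding)` tuple. A hedged usage sketch based solely on the call shown here; it assumes morss is importable and the URL reachable, and it omits the `timeout` keyword since only the left-hand side of the hunk passes it.

```python
from morss import crawler

# Keyword argument as it appears in the diff above; other parameters of
# adv_get are not shown there and are not assumed here.
data, con, contenttype, encoding = crawler.adv_get(url='http://example.com/')

if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
    print(encoding, len(data))
```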

View File

@@ -137,7 +137,7 @@ def score_all(node):
     for child in node:
         score = score_node(child)
-        child.attrib['morss_own_score'] = str(float(score))
+        child.attrib['seen'] = 'yes, ' + str(int(score))
         if score > 0 or len(list(child.iterancestors())) <= 2:
             spread_score(child, score)
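Whichever attribute name is used, the value has to be serialised: lxml element attributes only accept strings, which is why both sides of this hunk wrap the score in `str(...)`. A tiny standalone check, assuming lxml is available; the markup is made up, the attribute name is the one from the left-hand side of the hunk.

```python
import lxml.html

node = lxml.html.fromstring('<div><p>some paragraph</p></div>')
child = node[0]

score = 3.5
child.attrib['morss_own_score'] = str(float(score))  # lxml rejects non-string values

print(lxml.html.tostring(child))  # b'<p morss_own_score="3.5">some paragraph</p>'
```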