Compare commits
5 Commits
7e45b2611d
...
41a63900c2
Author | SHA1 | Date |
---|---|---|
pictuga | 41a63900c2 | |
pictuga | ec8edb02f1 | |
pictuga | d01b943597 | |
pictuga | b361aa2867 | |
pictuga | 4ce3c7cb32 |
13
README.md
13
README.md
|
@ -108,7 +108,6 @@ morss will auto-detect what "mode" to use.
|
||||||
For this, you'll want to change the architecture of the files a bit, for example
|
For this, you'll want to change the architecture of the files a bit, for example
|
||||||
into something like this.
|
into something like this.
|
||||||
|
|
||||||
|
|
||||||
```
|
```
|
||||||
/
|
/
|
||||||
├── cgi
|
├── cgi
|
||||||
|
@ -151,20 +150,19 @@ gunicorn morss:cgi_standalone_app
|
||||||
|
|
||||||
#### Using docker
|
#### Using docker
|
||||||
|
|
||||||
Build
|
Build & run
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
docker build https://git.pictuga.com/pictuga/morss.git
|
docker build https://git.pictuga.com/pictuga/morss.git -t morss
|
||||||
|
docker run -p 8080:8080 morss
|
||||||
```
|
```
|
||||||
|
|
||||||
Run & Build in one go
|
In one line
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
docker run -p 8080:8080 $(docker build -q https://git.pictuga.com/pictuga/morss.git)
|
docker run -p 8080:8080 $(docker build -q https://git.pictuga.com/pictuga/morss.git)
|
||||||
```
|
```
|
||||||
|
|
||||||
It will run on port 8080 by default
|
|
||||||
|
|
||||||
#### Using morss' internal HTTP server
|
#### Using morss' internal HTTP server
|
||||||
|
|
||||||
Morss can run its own HTTP server. The latter should start when you run morss
|
Morss can run its own HTTP server. The latter should start when you run morss
|
||||||
|
@ -256,9 +254,10 @@ output = morss.Format(rss, options) # formats final feed
|
||||||
|
|
||||||
## Cache information
|
## Cache information
|
||||||
|
|
||||||
morss uses caching to make loading faster. There are 2 possible cache backends
|
morss uses caching to make loading faster. There are 3 possible cache backends
|
||||||
(visible in `morss/crawler.py`):
|
(visible in `morss/crawler.py`):
|
||||||
|
|
||||||
|
- `{}`: a simple python in-memory dict() object
|
||||||
- `SQLiteCache`: sqlite3 cache. Default file location is in-memory (i.e. it will
|
- `SQLiteCache`: sqlite3 cache. Default file location is in-memory (i.e. it will
|
||||||
be cleared every time the program is run)
|
be cleared every time the program is run)
|
||||||
- `MySQLCacheHandler`
|
- `MySQLCacheHandler`
|
||||||
|
|
|
@ -72,7 +72,6 @@ def custom_handler(follow=None, delay=None, encoding=None):
|
||||||
handlers.append(HTTPRefreshHandler())
|
handlers.append(HTTPRefreshHandler())
|
||||||
handlers.append(UAHandler(DEFAULT_UA))
|
handlers.append(UAHandler(DEFAULT_UA))
|
||||||
handlers.append(BrowserlyHeaderHandler())
|
handlers.append(BrowserlyHeaderHandler())
|
||||||
|
|
||||||
handlers.append(EncodingFixHandler(encoding))
|
handlers.append(EncodingFixHandler(encoding))
|
||||||
|
|
||||||
if follow:
|
if follow:
|
||||||
|
@ -466,6 +465,8 @@ class CacheHandler(BaseHandler):
|
||||||
|
|
||||||
|
|
||||||
class BaseCache:
|
class BaseCache:
|
||||||
|
""" Subclasses must behave like a dict """
|
||||||
|
|
||||||
def __contains__(self, url):
|
def __contains__(self, url):
|
||||||
try:
|
try:
|
||||||
self[url]
|
self[url]
|
||||||
|
|
|
@ -102,7 +102,7 @@ item_link = ./a/@href
|
||||||
item_desc = ./div[class=desc]
|
item_desc = ./div[class=desc]
|
||||||
item_content = ./div[class=content]
|
item_content = ./div[class=content]
|
||||||
|
|
||||||
base = file:www/sheet.xsl
|
base = file:sheet.xsl
|
||||||
|
|
||||||
[twitter]
|
[twitter]
|
||||||
mode = html
|
mode = html
|
||||||
|
|
|
@ -40,7 +40,6 @@ LIM_TIME = 2.5 # deletes what's after
|
||||||
|
|
||||||
DELAY = 10 * 60 # xml cache & ETag cache (in sec)
|
DELAY = 10 * 60 # xml cache & ETag cache (in sec)
|
||||||
TIMEOUT = 4 # http timeout (in sec)
|
TIMEOUT = 4 # http timeout (in sec)
|
||||||
THREADS = MAX_ITEM # number of threads (1 for single-threaded)
|
|
||||||
|
|
||||||
DEBUG = False
|
DEBUG = False
|
||||||
PORT = 8080
|
PORT = 8080
|
||||||
|
@ -137,7 +136,7 @@ def ItemFix(item, feedurl='/'):
|
||||||
""" Improves feed items (absolute links, resolve feedburner links, etc) """
|
""" Improves feed items (absolute links, resolve feedburner links, etc) """
|
||||||
|
|
||||||
# check unwanted uppercase title
|
# check unwanted uppercase title
|
||||||
if len(item.title) > 20 and item.title.isupper():
|
if item.title is not None and len(item.title) > 20 and item.title.isupper():
|
||||||
item.title = item.title.title()
|
item.title = item.title.title()
|
||||||
|
|
||||||
# check if it includes link
|
# check if it includes link
|
||||||
|
@ -200,7 +199,7 @@ def ItemFix(item, feedurl='/'):
|
||||||
|
|
||||||
# reddit
|
# reddit
|
||||||
if urlparse(feedurl).netloc == 'www.reddit.com':
|
if urlparse(feedurl).netloc == 'www.reddit.com':
|
||||||
match = lxml.html.fromstring(item.desc).xpath('//a[text()="[link]"]/@href')
|
match = lxml.html.fromstring(item.content).xpath('//a[text()="[link]"]/@href')
|
||||||
if len(match):
|
if len(match):
|
||||||
item.link = match[0]
|
item.link = match[0]
|
||||||
log(item.link)
|
log(item.link)
|
||||||
|
@ -550,7 +549,7 @@ def cgi_app(environ, start_response):
|
||||||
|
|
||||||
def middleware(func):
|
def middleware(func):
|
||||||
" Decorator to turn a function into a wsgi middleware "
|
" Decorator to turn a function into a wsgi middleware "
|
||||||
# This is called when parsing the code
|
# This is called when parsing the "@middleware" code
|
||||||
|
|
||||||
def app_builder(app):
|
def app_builder(app):
|
||||||
# This is called when doing app = cgi_wrapper(app)
|
# This is called when doing app = cgi_wrapper(app)
|
||||||
|
@ -620,7 +619,7 @@ def cgi_get(environ, start_response):
|
||||||
if urlparse(url).scheme not in ['http', 'https']:
|
if urlparse(url).scheme not in ['http', 'https']:
|
||||||
url = 'http://' + url
|
url = 'http://' + url
|
||||||
|
|
||||||
data, con, contenttype, encoding = crawler.adv_get(url=url)
|
data, con, contenttype, encoding = crawler.adv_get(url=url, timeout=TIMEOUT)
|
||||||
|
|
||||||
if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
|
if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
|
||||||
if options.get == 'page':
|
if options.get == 'page':
|
||||||
|
|
|
@ -137,7 +137,7 @@ def score_all(node):
|
||||||
|
|
||||||
for child in node:
|
for child in node:
|
||||||
score = score_node(child)
|
score = score_node(child)
|
||||||
child.attrib['seen'] = 'yes, ' + str(int(score))
|
child.attrib['morss_own_score'] = str(float(score))
|
||||||
|
|
||||||
if score > 0 or len(list(child.iterancestors())) <= 2:
|
if score > 0 or len(list(child.iterancestors())) <= 2:
|
||||||
spread_score(child, score)
|
spread_score(child, score)
|
||||||
|
|
Loading…
Reference in New Issue