Compare commits
1 Commits
master
...
b888e068c0
| Author | SHA1 | Date | |
|---|---|---|---|
| b888e068c0 |
94
.drone.yml
Normal file
94
.drone.yml
Normal file
@@ -0,0 +1,94 @@
|
|||||||
|
---
|
||||||
|
kind: pipeline
|
||||||
|
name: test
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: lint
|
||||||
|
image: alpine:edge
|
||||||
|
commands:
|
||||||
|
- apk add --no-cache python3 py3-lxml py3-setproctitle py3-pip py3-wheel py3-enchant hunspell-en
|
||||||
|
- pip3 install --no-cache-dir .[full] .[dev]
|
||||||
|
- isort --check-only --diff .
|
||||||
|
- pylint morss --rcfile=.pylintrc --disable=C,R,W --fail-under=8
|
||||||
|
- pytest --cov=morss tests
|
||||||
|
|
||||||
|
---
|
||||||
|
kind: pipeline
|
||||||
|
name: python
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: publish
|
||||||
|
image: plugins/pypi
|
||||||
|
settings:
|
||||||
|
username:
|
||||||
|
from_secret: pypi_user
|
||||||
|
password:
|
||||||
|
from_secret: pypi_pwd
|
||||||
|
commands:
|
||||||
|
- /bin/drone-pypi
|
||||||
|
- cp dist/morss-*.tar.gz dist/morss.tar.gz
|
||||||
|
|
||||||
|
- name: push
|
||||||
|
image: appleboy/drone-scp
|
||||||
|
settings:
|
||||||
|
host:
|
||||||
|
from_secret: ssh_host
|
||||||
|
username:
|
||||||
|
from_secret: ssh_user
|
||||||
|
key:
|
||||||
|
from_secret: ssh_key
|
||||||
|
source:
|
||||||
|
- dist/morss.tar.gz
|
||||||
|
target: /home/ubuntu
|
||||||
|
|
||||||
|
- name: deploy
|
||||||
|
image: appleboy/drone-ssh
|
||||||
|
settings:
|
||||||
|
host:
|
||||||
|
from_secret: ssh_host
|
||||||
|
username:
|
||||||
|
from_secret: ssh_user
|
||||||
|
key:
|
||||||
|
from_secret: ssh_key
|
||||||
|
script_stop: true
|
||||||
|
script:
|
||||||
|
- sudo pip install --upgrade dist/morss.tar.gz[full]
|
||||||
|
- sudo rm -r dist
|
||||||
|
- sudo morss-helper reload
|
||||||
|
|
||||||
|
trigger:
|
||||||
|
branch:
|
||||||
|
- master
|
||||||
|
event:
|
||||||
|
- push
|
||||||
|
|
||||||
|
depends_on:
|
||||||
|
- test
|
||||||
|
|
||||||
|
---
|
||||||
|
kind: pipeline
|
||||||
|
name: docker
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: publish
|
||||||
|
image: thegeeklab/drone-docker-buildx
|
||||||
|
# NB. this requires qemu installed on host
|
||||||
|
privileged: true
|
||||||
|
settings:
|
||||||
|
username:
|
||||||
|
from_secret: docker_user
|
||||||
|
password:
|
||||||
|
from_secret: docker_pwd
|
||||||
|
repo:
|
||||||
|
from_secret: docker_repo
|
||||||
|
tags: latest
|
||||||
|
platforms: linux/amd64,linux/arm64,linux/arm/v7
|
||||||
|
|
||||||
|
trigger:
|
||||||
|
branch:
|
||||||
|
- master
|
||||||
|
event:
|
||||||
|
- push
|
||||||
|
|
||||||
|
depends_on:
|
||||||
|
- test
|
||||||
78
.github/workflows/default.yml
vendored
78
.github/workflows/default.yml
vendored
@@ -1,78 +0,0 @@
|
|||||||
name: default
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches:
|
|
||||||
- master
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
test-lint:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
steps:
|
|
||||||
- name: Checkout
|
|
||||||
uses: actions/checkout@v3
|
|
||||||
with:
|
|
||||||
fetch-depth: 0
|
|
||||||
|
|
||||||
- name: Prepare image
|
|
||||||
run: apt-get -y update && apt-get -y install python3-pip libenchant-2-2 aspell-en
|
|
||||||
|
|
||||||
- name: Install dependencies
|
|
||||||
run: pip3 install .[full] .[dev]
|
|
||||||
- run: isort --check-only --diff .
|
|
||||||
- run: pylint morss --rcfile=.pylintrc --disable=C,R,W --fail-under=8
|
|
||||||
- run: pytest --cov=morss tests
|
|
||||||
|
|
||||||
python-publish:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
steps:
|
|
||||||
- name: Checkout
|
|
||||||
uses: actions/checkout@v3
|
|
||||||
with:
|
|
||||||
fetch-depth: 0
|
|
||||||
|
|
||||||
- name: Prepare image
|
|
||||||
run: apt-get -y update && apt-get -y install python3-pip python3-build
|
|
||||||
|
|
||||||
- name: Build package
|
|
||||||
run: python3 -m build
|
|
||||||
|
|
||||||
- name: Publish package
|
|
||||||
uses: https://github.com/pypa/gh-action-pypi-publish@release/v1
|
|
||||||
with:
|
|
||||||
password: ${{ secrets.pypi_api_token }}
|
|
||||||
|
|
||||||
docker-publish-deploy:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
container:
|
|
||||||
image: catthehacker/ubuntu:act-latest
|
|
||||||
steps:
|
|
||||||
- name: Checkout
|
|
||||||
uses: actions/checkout@v3
|
|
||||||
|
|
||||||
- name: Set up QEMU
|
|
||||||
uses: https://github.com/docker/setup-qemu-action@v2
|
|
||||||
|
|
||||||
- name: Set up Docker Buildx
|
|
||||||
uses: https://github.com/docker/setup-buildx-action@v2
|
|
||||||
|
|
||||||
- name: Login to Docker Hub
|
|
||||||
uses: https://github.com/docker/login-action@v2
|
|
||||||
with:
|
|
||||||
username: ${{ secrets.docker_user }}
|
|
||||||
password: ${{ secrets.docker_pwd }}
|
|
||||||
|
|
||||||
- name: Build and push
|
|
||||||
uses: https://github.com/docker/build-push-action@v4
|
|
||||||
with:
|
|
||||||
context: .
|
|
||||||
platforms: linux/amd64,linux/arm64,linux/arm/v7
|
|
||||||
push: true
|
|
||||||
tags: ${{ secrets.docker_repo }}
|
|
||||||
|
|
||||||
- name: Deploy on server
|
|
||||||
uses: https://github.com/appleboy/ssh-action@v0.1.10
|
|
||||||
with:
|
|
||||||
host: ${{ secrets.ssh_host }}
|
|
||||||
username: ${{ secrets.ssh_user }}
|
|
||||||
key: ${{ secrets.ssh_key }}
|
|
||||||
script: morss-update
|
|
||||||
28
README.md
28
README.md
@@ -41,7 +41,7 @@ Some features of morss:
|
|||||||
- Follow 301/meta redirects
|
- Follow 301/meta redirects
|
||||||
- Recover xml feeds with corrupt encoding
|
- Recover xml feeds with corrupt encoding
|
||||||
- Supports gzip-compressed http content
|
- Supports gzip-compressed http content
|
||||||
- HTTP caching with different backends (in-memory/redis/diskcache)
|
- HTTP caching with different backends (in-memory/sqlite/mysql/redis/diskcache)
|
||||||
- Works as server/cli tool
|
- Works as server/cli tool
|
||||||
- Deobfuscate various tracking links
|
- Deobfuscate various tracking links
|
||||||
|
|
||||||
@@ -81,9 +81,9 @@ From git
|
|||||||
pip install git+https://git.pictuga.com/pictuga/morss.git#egg=morss[full]
|
pip install git+https://git.pictuga.com/pictuga/morss.git#egg=morss[full]
|
||||||
```
|
```
|
||||||
|
|
||||||
The full install includes all the cache backends. Otherwise, only in-memory
|
The full install includes all the cache backends. Otherwise, only in-memory and
|
||||||
cache is available. The full install also includes gunicorn (for more efficient
|
sqlite3 caches are available. The full install also includes gunicorn (for more
|
||||||
HTTP handling).
|
efficient HTTP handling).
|
||||||
|
|
||||||
The dependency `lxml` is fairly long to install (especially on Raspberry Pi, as
|
The dependency `lxml` is fairly long to install (especially on Raspberry Pi, as
|
||||||
C code needs to be compiled). If possible on your distribution, try installing
|
C code needs to be compiled). If possible on your distribution, try installing
|
||||||
@@ -103,7 +103,7 @@ With cli
|
|||||||
docker pull pictuga/morss
|
docker pull pictuga/morss
|
||||||
```
|
```
|
||||||
|
|
||||||
With docker-compose **(recommended)**
|
With docker-compose
|
||||||
|
|
||||||
```yml
|
```yml
|
||||||
services:
|
services:
|
||||||
@@ -215,7 +215,7 @@ From source
|
|||||||
docker run -p 8000:8000 morss
|
docker run -p 8000:8000 morss
|
||||||
```
|
```
|
||||||
|
|
||||||
With docker-compose **(recommended)**
|
With docker-compose
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
docker-compose up
|
docker-compose up
|
||||||
@@ -353,7 +353,7 @@ Using cache and passing arguments:
|
|||||||
```python
|
```python
|
||||||
>>> import morss
|
>>> import morss
|
||||||
>>> url = 'http://feeds.bbci.co.uk/news/rss.xml'
|
>>> url = 'http://feeds.bbci.co.uk/news/rss.xml'
|
||||||
>>> cache = '/tmp/morss-cache' # diskcache cache location
|
>>> cache = '/tmp/morss-cache.db' # sqlite cache location
|
||||||
>>> options = {'csv':True}
|
>>> options = {'csv':True}
|
||||||
>>> xml_string = morss.process(url, cache, options)
|
>>> xml_string = morss.process(url, cache, options)
|
||||||
>>> xml_string[:50]
|
>>> xml_string[:50]
|
||||||
@@ -367,10 +367,11 @@ under the hood.
|
|||||||
Doing it step-by-step:
|
Doing it step-by-step:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import morss
|
import morss, morss.crawler
|
||||||
|
|
||||||
url = 'http://newspaper.example/feed.xml'
|
url = 'http://newspaper.example/feed.xml'
|
||||||
options = morss.Options(csv=True) # arguments
|
options = morss.Options(csv=True) # arguments
|
||||||
|
morss.crawler.sqlite_default = '/tmp/morss-cache.db' # sqlite cache location
|
||||||
|
|
||||||
url, rss = morss.FeedFetch(url, options) # this only grabs the RSS feed
|
url, rss = morss.FeedFetch(url, options) # this only grabs the RSS feed
|
||||||
rss = morss.FeedGather(rss, url, options) # this fills the feed and cleans it up
|
rss = morss.FeedGather(rss, url, options) # this fills the feed and cleans it up
|
||||||
@@ -394,8 +395,8 @@ usage: morss [-h] [--post STRING] [--xpath XPATH]
|
|||||||
[--indent] [--cache] [--force] [--proxy]
|
[--indent] [--cache] [--force] [--proxy]
|
||||||
[--order {first,last,newest,oldest}] [--firstlink] [--resolve]
|
[--order {first,last,newest,oldest}] [--firstlink] [--resolve]
|
||||||
[--items XPATH] [--item_link XPATH] [--item_title XPATH]
|
[--items XPATH] [--item_link XPATH] [--item_title XPATH]
|
||||||
[--item_content XPATH] [--item_time XPATH]
|
[--item_content XPATH] [--item_time XPATH] [--nolink] [--noref]
|
||||||
[--mode {xml,html,json}] [--nolink] [--noref] [--silent]
|
[--silent]
|
||||||
url
|
url
|
||||||
|
|
||||||
Get full-text RSS feeds
|
Get full-text RSS feeds
|
||||||
@@ -439,8 +440,6 @@ custom feeds:
|
|||||||
--item_content XPATH entry's content
|
--item_content XPATH entry's content
|
||||||
--item_time XPATH entry's date & time (accepts a wide range of time
|
--item_time XPATH entry's date & time (accepts a wide range of time
|
||||||
formats)
|
formats)
|
||||||
--mode {xml,html,json}
|
|
||||||
parser to use for the custom feeds
|
|
||||||
|
|
||||||
misc:
|
misc:
|
||||||
--nolink drop links, but keeps links' inner text
|
--nolink drop links, but keeps links' inner text
|
||||||
@@ -502,6 +501,11 @@ be dropped from the feed, even if they're cached. `-1` for unlimited.
|
|||||||
morss uses caching to make loading faster. There are 3 possible cache backends:
|
morss uses caching to make loading faster. There are 5 possible cache backends:
|
||||||
|
|
||||||
- `(nothing/default)`: a simple python in-memory dict-like object.
|
- `(nothing/default)`: a simple python in-memory dict-like object.
|
||||||
|
- `CACHE=sqlite`: sqlite3 cache. Default file location is in-memory (i.e. it
|
||||||
|
will be cleared every time the program is run). Path can be defined with
|
||||||
|
`SQLITE_PATH`.
|
||||||
|
- `CACHE=mysql`: MySQL cache. Connection can be defined with the following
|
||||||
|
environment variables: `MYSQL_USER`, `MYSQL_PWD`, `MYSQL_DB`, `MYSQL_HOST`
|
||||||
- `CACHE=redis`: Redis cache. Connection can be defined with the following
|
- `CACHE=redis`: Redis cache. Connection can be defined with the following
|
||||||
environment variables: `REDIS_HOST`, `REDIS_PORT`, `REDIS_DB`, `REDIS_PWD`
|
environment variables: `REDIS_HOST`, `REDIS_PORT`, `REDIS_DB`, `REDIS_PWD`
|
||||||
- `CACHE=diskcache`: disk-based cache. Target directory can be defined with
|
- `CACHE=diskcache`: disk-based cache. Target directory can be defined with
|
||||||
|
|||||||
@@ -1,13 +0,0 @@
|
|||||||
[Unit]
|
|
||||||
Description=morss server (gunicorn)
|
|
||||||
After=network.target
|
|
||||||
|
|
||||||
[Service]
|
|
||||||
ExecStart=/usr/local/bin/morss-helper run
|
|
||||||
ExecReload=/usr/local/bin/morss-helper reload
|
|
||||||
KillMode=process
|
|
||||||
Restart=always
|
|
||||||
User=http
|
|
||||||
|
|
||||||
[Install]
|
|
||||||
WantedBy=multi-user.target
|
|
||||||
@@ -19,7 +19,5 @@
|
|||||||
|
|
||||||
# pylint: disable=unused-import,unused-variable
|
# pylint: disable=unused-import,unused-variable
|
||||||
|
|
||||||
__version__ = ""
|
|
||||||
|
|
||||||
from .morss import *
|
from .morss import *
|
||||||
from .wsgi import application
|
from .wsgi import application
|
||||||
|
|||||||
@@ -16,6 +16,7 @@
|
|||||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import pickle
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
@@ -50,6 +51,83 @@ class BaseCache:
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
import sqlite3 # isort:skip
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class SQLiteCache(BaseCache):
|
||||||
|
def __init__(self, path=':memory:'):
|
||||||
|
self.con = sqlite3.connect(path, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
|
||||||
|
|
||||||
|
with self.con:
|
||||||
|
self.con.execute('CREATE TABLE IF NOT EXISTS data (ky UNICODE PRIMARY KEY, data BLOB, timestamp INT)')
|
||||||
|
self.con.execute('pragma journal_mode=WAL')
|
||||||
|
|
||||||
|
self.trim()
|
||||||
|
|
||||||
|
def __del__(self):
|
||||||
|
self.con.close()
|
||||||
|
|
||||||
|
def trim(self):
|
||||||
|
with self.con:
|
||||||
|
self.con.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET ? ) foo )', (CACHE_SIZE,))
|
||||||
|
|
||||||
|
def __getitem__(self, key):
|
||||||
|
row = self.con.execute('SELECT * FROM data WHERE ky=?', (key,)).fetchone()
|
||||||
|
|
||||||
|
if not row:
|
||||||
|
raise KeyError
|
||||||
|
|
||||||
|
return row[1]
|
||||||
|
|
||||||
|
def __setitem__(self, key, data):
|
||||||
|
with self.con:
|
||||||
|
self.con.execute('INSERT INTO data VALUES (?,?,?) ON CONFLICT(ky) DO UPDATE SET data=?, timestamp=?', (key, data, time.time(), data, time.time()))
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
import pymysql.cursors # isort:skip
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class MySQLCacheHandler(BaseCache):
|
||||||
|
def __init__(self, user, password, database, host='localhost'):
|
||||||
|
self.user = user
|
||||||
|
self.password = password
|
||||||
|
self.database = database
|
||||||
|
self.host = host
|
||||||
|
|
||||||
|
with self.cursor() as cursor:
|
||||||
|
cursor.execute('CREATE TABLE IF NOT EXISTS data (ky VARCHAR(255) NOT NULL PRIMARY KEY, data MEDIUMBLOB, timestamp INT)')
|
||||||
|
|
||||||
|
self.trim()
|
||||||
|
|
||||||
|
def cursor(self):
|
||||||
|
return pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, charset='utf8', autocommit=True).cursor()
|
||||||
|
|
||||||
|
def trim(self):
|
||||||
|
with self.cursor() as cursor:
|
||||||
|
cursor.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET %s ) foo )', (CACHE_SIZE,))
|
||||||
|
|
||||||
|
def __getitem__(self, key):
|
||||||
|
cursor = self.cursor()
|
||||||
|
cursor.execute('SELECT * FROM data WHERE ky=%s', (key,))
|
||||||
|
row = cursor.fetchone()
|
||||||
|
|
||||||
|
if not row:
|
||||||
|
raise KeyError
|
||||||
|
|
||||||
|
return row[1]
|
||||||
|
|
||||||
|
def __setitem__(self, key, data):
|
||||||
|
with self.cursor() as cursor:
|
||||||
|
cursor.execute('INSERT INTO data VALUES (%s,%s,%s) ON DUPLICATE KEY UPDATE data=%s, timestamp=%s',
|
||||||
|
(key, data, time.time(), data, time.time()))
|
||||||
|
|
||||||
|
|
||||||
class CappedDict(OrderedDict, BaseCache):
|
class CappedDict(OrderedDict, BaseCache):
|
||||||
def trim(self):
|
def trim(self):
|
||||||
if CACHE_SIZE >= 0:
|
if CACHE_SIZE >= 0:
|
||||||
@@ -104,7 +182,20 @@ class DiskCacheHandler(BaseCache):
|
|||||||
|
|
||||||
|
|
||||||
if 'CACHE' in os.environ:
|
if 'CACHE' in os.environ:
|
||||||
if os.environ['CACHE'] == 'redis':
|
if os.environ['CACHE'] == 'mysql':
|
||||||
|
default_cache = MySQLCacheHandler(
|
||||||
|
user = os.getenv('MYSQL_USER'),
|
||||||
|
password = os.getenv('MYSQL_PWD'),
|
||||||
|
database = os.getenv('MYSQL_DB'),
|
||||||
|
host = os.getenv('MYSQL_HOST', 'localhost')
|
||||||
|
)
|
||||||
|
|
||||||
|
elif os.environ['CACHE'] == 'sqlite':
|
||||||
|
default_cache = SQLiteCache(
|
||||||
|
os.getenv('SQLITE_PATH', ':memory:')
|
||||||
|
)
|
||||||
|
|
||||||
|
elif os.environ['CACHE'] == 'redis':
|
||||||
default_cache = RedisCacheHandler(
|
default_cache = RedisCacheHandler(
|
||||||
host = os.getenv('REDIS_HOST', 'localhost'),
|
host = os.getenv('REDIS_HOST', 'localhost'),
|
||||||
port = int(os.getenv('REDIS_PORT', 6379)),
|
port = int(os.getenv('REDIS_PORT', 6379)),
|
||||||
|
|||||||
@@ -54,7 +54,6 @@ def cli_app():
|
|||||||
group.add_argument('--item_title', action='store', type=str, metavar='XPATH', help='entry\'s title')
|
group.add_argument('--item_title', action='store', type=str, metavar='XPATH', help='entry\'s title')
|
||||||
group.add_argument('--item_content', action='store', type=str, metavar='XPATH', help='entry\'s content')
|
group.add_argument('--item_content', action='store', type=str, metavar='XPATH', help='entry\'s content')
|
||||||
group.add_argument('--item_time', action='store', type=str, metavar='XPATH', help='entry\'s date & time (accepts a wide range of time formats)')
|
group.add_argument('--item_time', action='store', type=str, metavar='XPATH', help='entry\'s date & time (accepts a wide range of time formats)')
|
||||||
group.add_argument('--mode', default=None, choices=('xml', 'html', 'json'), help='parser to use for the custom feeds')
|
|
||||||
|
|
||||||
group = parser.add_argument_group('misc')
|
group = parser.add_argument_group('misc')
|
||||||
group.add_argument('--nolink', action='store_true', help='drop links, but keeps links\' inner text')
|
group.add_argument('--nolink', action='store_true', help='drop links, but keeps links\' inner text')
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ from .caching import default_cache
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
# python 2
|
# python 2
|
||||||
from urllib import quote
|
from urllib import quote, unquote
|
||||||
|
|
||||||
from httplib import HTTPMessage
|
from httplib import HTTPMessage
|
||||||
from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
|
from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
|
||||||
@@ -43,7 +43,7 @@ except ImportError:
|
|||||||
# python 3
|
# python 3
|
||||||
from email import message_from_string
|
from email import message_from_string
|
||||||
from http.client import HTTPMessage
|
from http.client import HTTPMessage
|
||||||
from urllib.parse import quote, urlsplit
|
from urllib.parse import quote, unquote, urlsplit
|
||||||
from urllib.request import (BaseHandler, HTTPCookieProcessor,
|
from urllib.request import (BaseHandler, HTTPCookieProcessor,
|
||||||
HTTPRedirectHandler, Request, addinfourl,
|
HTTPRedirectHandler, Request, addinfourl,
|
||||||
build_opener, parse_http_list, parse_keqv_list)
|
build_opener, parse_http_list, parse_keqv_list)
|
||||||
@@ -151,28 +151,6 @@ def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
|
|||||||
return build_opener(*handlers)
|
return build_opener(*handlers)
|
||||||
|
|
||||||
|
|
||||||
def is_ascii(string):
|
|
||||||
# there's a native function in py3, but home-made fix for backward compatibility
|
|
||||||
try:
|
|
||||||
string.encode('ascii')
|
|
||||||
|
|
||||||
except UnicodeError:
|
|
||||||
return False
|
|
||||||
|
|
||||||
else:
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
def soft_quote(string):
|
|
||||||
" url-quote only when not a valid ascii string "
|
|
||||||
|
|
||||||
if is_ascii(string):
|
|
||||||
return string
|
|
||||||
|
|
||||||
else:
|
|
||||||
return quote(string.encode('utf-8'))
|
|
||||||
|
|
||||||
|
|
||||||
def sanitize_url(url):
|
def sanitize_url(url):
|
||||||
# make sure the url is unicode, i.e. not bytes
|
# make sure the url is unicode, i.e. not bytes
|
||||||
if isinstance(url, bytes):
|
if isinstance(url, bytes):
|
||||||
@@ -185,10 +163,7 @@ def sanitize_url(url):
|
|||||||
# turns out some websites have really badly formatted urls (fix http:/badurl)
|
# turns out some websites have really badly formatted urls (fix http:/badurl)
|
||||||
url = re.sub('^(https?):/([^/])', r'\1://\2', url)
|
url = re.sub('^(https?):/([^/])', r'\1://\2', url)
|
||||||
|
|
||||||
# escape spaces
|
# escape non-ascii unicode characters (also encode spaces as %20)
|
||||||
url = url.replace(' ', '%20')
|
|
||||||
|
|
||||||
# escape non-ascii unicode characters
|
|
||||||
parts = urlsplit(url)
|
parts = urlsplit(url)
|
||||||
|
|
||||||
parts = parts._replace(
|
parts = parts._replace(
|
||||||
@@ -196,9 +171,9 @@ def sanitize_url(url):
|
|||||||
parts.hostname,
|
parts.hostname,
|
||||||
parts.hostname.encode('idna').decode('ascii')
|
parts.hostname.encode('idna').decode('ascii')
|
||||||
),
|
),
|
||||||
path=soft_quote(parts.path),
|
path=quote(unquote(parts.path).encode('utf-8')),
|
||||||
query=soft_quote(parts.query),
|
query=quote(unquote(parts.query).encode('utf-8')),
|
||||||
fragment=soft_quote(parts.fragment),
|
fragment=quote(unquote(parts.fragment).encode('utf-8')),
|
||||||
)
|
)
|
||||||
|
|
||||||
return parts.geturl()
|
return parts.geturl()
|
||||||
@@ -368,7 +343,7 @@ class BrowserlyHeaderHandler(BaseHandler):
|
|||||||
def iter_html_tag(html_str, tag_name):
|
def iter_html_tag(html_str, tag_name):
|
||||||
" To avoid parsing whole pages when looking for a simple tag "
|
" To avoid parsing whole pages when looking for a simple tag "
|
||||||
|
|
||||||
re_tag = r'<%s\s+[^>]+>' % tag_name
|
re_tag = r'<%s(\s*[^>])*>' % tag_name
|
||||||
re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'
|
re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'
|
||||||
|
|
||||||
for tag_match in re.finditer(re_tag, html_str):
|
for tag_match in re.finditer(re_tag, html_str):
|
||||||
@@ -425,7 +400,7 @@ class HTTPRefreshHandler(BaseHandler):
|
|||||||
def http_response(self, req, resp):
|
def http_response(self, req, resp):
|
||||||
if 200 <= resp.code < 300:
|
if 200 <= resp.code < 300:
|
||||||
if resp.headers.get('refresh'):
|
if resp.headers.get('refresh'):
|
||||||
regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url\s*=\s*(["\']?)(?P<url>.+)\2$'
|
regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url=(["\']?)(?P<url>.+)\2$'
|
||||||
match = re.search(regex, resp.headers.get('refresh'))
|
match = re.search(regex, resp.headers.get('refresh'))
|
||||||
|
|
||||||
if match:
|
if match:
|
||||||
|
|||||||
@@ -90,6 +90,9 @@ item_updated = updated
|
|||||||
[html]
|
[html]
|
||||||
mode = html
|
mode = html
|
||||||
|
|
||||||
|
path =
|
||||||
|
http://localhost/
|
||||||
|
|
||||||
title = //div[@id='header']/h1
|
title = //div[@id='header']/h1
|
||||||
desc = //div[@id='header']/p
|
desc = //div[@id='header']/p
|
||||||
items = //div[@id='content']/div
|
items = //div[@id='content']/div
|
||||||
|
|||||||
@@ -699,7 +699,7 @@ class Feed(object):
|
|||||||
try:
|
try:
|
||||||
setattr(item, attr, new[attr])
|
setattr(item, attr, new[attr])
|
||||||
|
|
||||||
except (KeyError, IndexError, TypeError):
|
except (IndexError, TypeError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
return item
|
return item
|
||||||
|
|||||||
@@ -287,9 +287,6 @@ def FeedFetch(url, options):
|
|||||||
|
|
||||||
ruleset['items'] = options.items
|
ruleset['items'] = options.items
|
||||||
|
|
||||||
if options.mode:
|
|
||||||
ruleset['mode'] = options.mode
|
|
||||||
|
|
||||||
ruleset['title'] = options.get('title', '//head/title')
|
ruleset['title'] = options.get('title', '//head/title')
|
||||||
ruleset['desc'] = options.get('desc', '//head/meta[@name="description"]/@content')
|
ruleset['desc'] = options.get('desc', '//head/meta[@name="description"]/@content')
|
||||||
|
|
||||||
@@ -428,7 +425,7 @@ def process(url, cache=None, options=None):
|
|||||||
options = Options(options)
|
options = Options(options)
|
||||||
|
|
||||||
if cache:
|
if cache:
|
||||||
caching.default_cache = caching.DiskCacheHandler(cache)
|
caching.default_cache = caching.SQLiteCache(cache)
|
||||||
|
|
||||||
url, rss = FeedFetch(url, options)
|
url, rss = FeedFetch(url, options)
|
||||||
rss = FeedGather(rss, url, options)
|
rss = FeedGather(rss, url, options)
|
||||||
|
|||||||
28
setup.py
28
setup.py
@@ -3,33 +3,11 @@ from glob import glob
|
|||||||
|
|
||||||
from setuptools import setup
|
from setuptools import setup
|
||||||
|
|
||||||
|
|
||||||
def get_version():
|
|
||||||
with open('morss/__init__.py', 'r+') as file:
|
|
||||||
lines = file.readlines()
|
|
||||||
|
|
||||||
# look for hard coded version number
|
|
||||||
for i in range(len(lines)):
|
|
||||||
if lines[i].startswith('__version__'):
|
|
||||||
version = lines[i].split('"')[1]
|
|
||||||
break
|
|
||||||
|
|
||||||
# create (& save) one if none found
|
|
||||||
if version == '':
|
|
||||||
version = datetime.now().strftime('%Y%m%d.%H%M')
|
|
||||||
lines[i] = '__version__ = "' + version + '"\n'
|
|
||||||
|
|
||||||
file.seek(0)
|
|
||||||
file.writelines(lines)
|
|
||||||
|
|
||||||
# return version number
|
|
||||||
return version
|
|
||||||
|
|
||||||
package_name = 'morss'
|
package_name = 'morss'
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name = package_name,
|
name = package_name,
|
||||||
version = get_version(),
|
version = datetime.now().strftime('%Y%m%d.%H%M'),
|
||||||
description = 'Get full-text RSS feeds',
|
description = 'Get full-text RSS feeds',
|
||||||
long_description = open('README.md').read(),
|
long_description = open('README.md').read(),
|
||||||
long_description_content_type = 'text/markdown',
|
long_description_content_type = 'text/markdown',
|
||||||
@@ -44,8 +22,8 @@ setup(
|
|||||||
packages = [package_name],
|
packages = [package_name],
|
||||||
install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet'],
|
install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet'],
|
||||||
extras_require = {
|
extras_require = {
|
||||||
'full': ['redis', 'diskcache', 'gunicorn', 'setproctitle'],
|
'full': ['pymysql', 'redis', 'diskcache', 'gunicorn', 'setproctitle'],
|
||||||
'dev': ['pylint', 'pyenchant', 'pytest', 'pytest-cov'],
|
'dev': ['pylint', 'pytest'],
|
||||||
},
|
},
|
||||||
python_requires = '>=2.7',
|
python_requires = '>=2.7',
|
||||||
package_data = {package_name: ['feedify.ini']},
|
package_data = {package_name: ['feedify.ini']},
|
||||||
|
|||||||
@@ -1,4 +0,0 @@
|
|||||||
HTTP/1.1 200 OK
|
|
||||||
content-type: text/plain
|
|
||||||
|
|
||||||
success
|
|
||||||
@@ -1,3 +0,0 @@
|
|||||||
HTTP/1.1 301 Moved Permanently
|
|
||||||
location: /200-ok.txt
|
|
||||||
|
|
||||||
@@ -1,3 +0,0 @@
|
|||||||
HTTP/1.1 301 Moved Permanently
|
|
||||||
location: ./200-ok.txt
|
|
||||||
|
|
||||||
@@ -1,3 +0,0 @@
|
|||||||
HTTP/1.1 301 Moved Permanently
|
|
||||||
location: http://localhost:8888/200-ok.txt
|
|
||||||
|
|
||||||
@@ -1,4 +0,0 @@
|
|||||||
HTTP/1.1 308 Permanent Redirect
|
|
||||||
location: /200-ok.txt
|
|
||||||
|
|
||||||
/200-ok.txt
|
|
||||||
@@ -1,8 +0,0 @@
|
|||||||
HTTP/1.1 200 OK
|
|
||||||
content-type: text/html; charset=UTF-8
|
|
||||||
|
|
||||||
<!DOCTYPE html>
|
|
||||||
<html>
|
|
||||||
<head><link rel="alternate" type="application/rss+xml" href="/200-ok.txt" /></head>
|
|
||||||
<body>meta redirect</body>
|
|
||||||
</html>
|
|
||||||
@@ -1,4 +0,0 @@
|
|||||||
HTTP/1.1 200 OK
|
|
||||||
content-type: text/plain; charset=gb2312
|
|
||||||
|
|
||||||
<EFBFBD>ɹ<EFBFBD>
|
|
||||||
@@ -1,10 +0,0 @@
|
|||||||
HTTP/1.1 200 OK
|
|
||||||
content-type: text/html
|
|
||||||
|
|
||||||
|
|
||||||
<!DOCTYPE html>
|
|
||||||
<html>
|
|
||||||
<head><meta charset="gb2312"/></head>
|
|
||||||
<body>
|
|
||||||
<EFBFBD>ɹ<EFBFBD>
|
|
||||||
</body></html>
|
|
||||||
@@ -1,4 +0,0 @@
|
|||||||
HTTP/1.1 200 OK
|
|
||||||
content-type: text/plain; charset=iso-8859-1
|
|
||||||
|
|
||||||
succ<EFBFBD>s
|
|
||||||
@@ -1,4 +0,0 @@
|
|||||||
HTTP/1.1 200 OK
|
|
||||||
content-type: text/plain
|
|
||||||
|
|
||||||
succ<EFBFBD>s
|
|
||||||
@@ -1,4 +0,0 @@
|
|||||||
HTTP/1.1 200 OK
|
|
||||||
content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
succès
|
|
||||||
Binary file not shown.
@@ -1,3 +0,0 @@
|
|||||||
HTTP/1.1 200 OK
|
|
||||||
refresh: 0;url=/200-ok.txt
|
|
||||||
|
|
||||||
@@ -1,8 +0,0 @@
|
|||||||
HTTP/1.1 200 OK
|
|
||||||
content-type: text/html; charset=UTF-8
|
|
||||||
|
|
||||||
<!DOCTYPE html>
|
|
||||||
<html>
|
|
||||||
<head><meta http-equiv="refresh" content="2; url = /200-ok.txt" /></head>
|
|
||||||
<body>meta redirect</body>
|
|
||||||
</html>
|
|
||||||
@@ -1,8 +0,0 @@
|
|||||||
HTTP/1.1 200 OK
|
|
||||||
content-type: text/html; charset=UTF-8
|
|
||||||
|
|
||||||
<!DOCTYPE html>
|
|
||||||
<html>
|
|
||||||
<head><meta http-equiv="refresh" content="2; url = ./200-ok.txt" /></head>
|
|
||||||
<body>meta redirect</body>
|
|
||||||
</html>
|
|
||||||
@@ -1,8 +0,0 @@
|
|||||||
HTTP/1.1 200 OK
|
|
||||||
content-type: text/html; charset=UTF-8
|
|
||||||
|
|
||||||
<!DOCTYPE html>
|
|
||||||
<html>
|
|
||||||
<head><meta http-equiv="refresh" content="2; url = http://localhost:8888/200-ok.txt" /></head>
|
|
||||||
<body>meta redirect</body>
|
|
||||||
</html>
|
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -1,62 +0,0 @@
|
|||||||
import pytest
|
|
||||||
|
|
||||||
from morss.crawler import *
|
|
||||||
|
|
||||||
|
|
||||||
def test_get(replay_server):
|
|
||||||
assert get('http://localhost:8888/200-ok.txt') == b'success\r\n'
|
|
||||||
|
|
||||||
def test_adv_get(replay_server):
|
|
||||||
assert adv_get('http://localhost:8888/200-ok.txt')['data'] == b'success\r\n'
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('before,after', [
|
|
||||||
(b'http://localhost:8888/', 'http://localhost:8888/'),
|
|
||||||
('localhost:8888/', 'http://localhost:8888/'),
|
|
||||||
('http:/localhost:8888/', 'http://localhost:8888/'),
|
|
||||||
('http://localhost:8888/&/', 'http://localhost:8888/&/'),
|
|
||||||
('http://localhost:8888/ /', 'http://localhost:8888/%20/'),
|
|
||||||
('http://localhost-€/€/', 'http://xn--localhost--077e/%E2%82%AC/'),
|
|
||||||
('http://localhost-€:8888/€/', 'http://xn--localhost--077e:8888/%E2%82%AC/'),
|
|
||||||
])
|
|
||||||
def test_sanitize_url(before, after):
|
|
||||||
assert sanitize_url(before) == after
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(SizeLimitHandler(500*1024))])
|
|
||||||
def test_size_limit_handler(replay_server, opener):
|
|
||||||
assert len(opener.open('http://localhost:8888/size-1MiB.txt').read()) == 500*1024
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(GZIPHandler())])
|
|
||||||
def test_gzip_handler(replay_server, opener):
|
|
||||||
assert opener.open('http://localhost:8888/gzip.txt').read() == b'success\n'
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(EncodingFixHandler())])
|
|
||||||
@pytest.mark.parametrize('url', [
|
|
||||||
'enc-gb2312-header.txt', 'enc-gb2312-meta.txt', #'enc-gb2312-missing.txt',
|
|
||||||
'enc-iso-8859-1-header.txt', 'enc-iso-8859-1-missing.txt',
|
|
||||||
'enc-utf-8-header.txt',
|
|
||||||
])
|
|
||||||
def test_encoding_fix_handler(replay_server, opener, url):
|
|
||||||
out = adv_get('http://localhost:8888/%s' % url)
|
|
||||||
out = out['data'].decode(out['encoding'])
|
|
||||||
assert 'succes' in out or 'succès' in out or '成功' in out
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('opener', [custom_opener(follow='rss'), build_opener(AlternateHandler(MIMETYPE['rss']))])
|
|
||||||
def test_alternate_handler(replay_server, opener):
|
|
||||||
assert opener.open('http://localhost:8888/alternate-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPEquivHandler(), HTTPRefreshHandler())])
|
|
||||||
def test_http_equiv_handler(replay_server, opener):
|
|
||||||
assert opener.open('http://localhost:8888/meta-redirect-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
|
||||||
assert opener.open('http://localhost:8888/meta-redirect-rel.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
|
||||||
assert opener.open('http://localhost:8888/meta-redirect-url.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPAllRedirectHandler())])
|
|
||||||
def test_http_all_redirect_handler(replay_server, opener):
|
|
||||||
assert opener.open('http://localhost:8888/308-redirect.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
|
||||||
assert opener.open('http://localhost:8888/301-redirect-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
|
||||||
assert opener.open('http://localhost:8888/301-redirect-rel.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
|
||||||
assert opener.open('http://localhost:8888/301-redirect-url.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPRefreshHandler())])
|
|
||||||
def test_http_refresh_handler(replay_server, opener):
|
|
||||||
assert opener.open('http://localhost:8888/header-refresh.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
|
||||||
@@ -1,9 +1,7 @@
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from morss.crawler import adv_get
|
from morss.crawler import adv_get
|
||||||
from morss.feeds import *
|
from morss.feeds import *
|
||||||
|
|
||||||
|
|
||||||
def get_feed(url):
|
def get_feed(url):
|
||||||
url = 'http://localhost:8888/%s' % url
|
url = 'http://localhost:8888/%s' % url
|
||||||
out = adv_get(url)
|
out = adv_get(url)
|
||||||
|
|||||||
Reference in New Issue
Block a user