Compare commits
23 Commits
ef6efd981c
...
0365232a73
Author | SHA1 | Date |
---|---|---|
pictuga | 0365232a73 | |
pictuga | a523518ae8 | |
pictuga | 52c48b899f | |
pictuga | 9649cabb1b | |
pictuga | 0c29102788 | |
pictuga | 10535a17c5 | |
pictuga | 7d86972e58 | |
pictuga | 62e04549ac | |
pictuga | 5da7121a77 | |
pictuga | bb82902ad1 | |
pictuga | 04afa28fe7 | |
pictuga | 75bb69f0fd | |
pictuga | 97d9dda547 | |
pictuga | 0c31d9f6db | |
pictuga | 49e29208ef | |
pictuga | d8d608a4de | |
pictuga | 5437e40a15 | |
pictuga | 6c1f8da692 | |
pictuga | a1a26d8209 | |
pictuga | edbb580f33 | |
pictuga | 4fd730b983 | |
pictuga | 198353d6b9 | |
pictuga | 0b3e6d7749 |
|
@ -0,0 +1,15 @@
|
||||||
|
kind: pipeline
|
||||||
|
name: default
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: isort
|
||||||
|
image: python:alpine
|
||||||
|
commands:
|
||||||
|
- pip install isort
|
||||||
|
- isort --check-only --diff .
|
||||||
|
- name: pylint
|
||||||
|
image: alpine
|
||||||
|
commands:
|
||||||
|
- apk add --no-cache python3 py3-lxml py3-pip py3-wheel py3-pylint py3-enchant hunspell-en
|
||||||
|
- pip3 install --no-cache-dir .[full]
|
||||||
|
- pylint morss --rcfile=.pylintrc --disable=C,R,W --fail-under=8
|
|
@ -0,0 +1,50 @@
|
||||||
|
[MASTER]
|
||||||
|
ignore=CVS
|
||||||
|
suggestion-mode=yes
|
||||||
|
extension-pkg-allow-list=lxml.etree
|
||||||
|
|
||||||
|
[MESSAGES CONTROL]
|
||||||
|
disable=missing-function-docstring,
|
||||||
|
missing-class-docstring,
|
||||||
|
missing-module-docstring,
|
||||||
|
wrong-spelling-in-comment,
|
||||||
|
|
||||||
|
[REPORTS]
|
||||||
|
reports=yes
|
||||||
|
score=yes
|
||||||
|
|
||||||
|
[SPELLING]
|
||||||
|
spelling-dict=en_GB
|
||||||
|
spelling-ignore-words=morss
|
||||||
|
|
||||||
|
[STRING]
|
||||||
|
check-quote-consistency=yes
|
||||||
|
check-str-concat-over-line-jumps=yes
|
||||||
|
|
||||||
|
[VARIABLES]
|
||||||
|
allow-global-unused-variables=no
|
||||||
|
init-import=no
|
||||||
|
|
||||||
|
[FORMAT]
|
||||||
|
expected-line-ending-format=LF
|
||||||
|
indent-string=' '
|
||||||
|
max-line-length=120
|
||||||
|
max-module-lines=1000
|
||||||
|
|
||||||
|
[BASIC]
|
||||||
|
argument-naming-style=snake_case
|
||||||
|
attr-naming-style=snake_case
|
||||||
|
class-attribute-naming-style=snake_case
|
||||||
|
class-const-naming-style=UPPER_CASE
|
||||||
|
class-naming-style=PascalCase
|
||||||
|
const-naming-style=UPPER_CASE
|
||||||
|
function-naming-style=snake_case
|
||||||
|
inlinevar-naming-style=snake_case
|
||||||
|
method-naming-style=snake_case
|
||||||
|
module-naming-style=snake_case
|
||||||
|
variable-naming-style=snake_case
|
||||||
|
|
||||||
|
include-naming-hint=yes
|
||||||
|
|
||||||
|
bad-names=foo, bar
|
||||||
|
good-names=i, j, k
|
|
@ -3,6 +3,6 @@ FROM alpine:latest
|
||||||
RUN apk add --no-cache python3 py3-lxml py3-pip py3-wheel git
|
RUN apk add --no-cache python3 py3-lxml py3-pip py3-wheel git
|
||||||
|
|
||||||
ADD . /app
|
ADD . /app
|
||||||
RUN pip3 install --no-cache-dir /app gunicorn
|
RUN pip3 install --no-cache-dir /app[full] gunicorn
|
||||||
|
|
||||||
CMD gunicorn --bind 0.0.0.0:8080 -w 4 --preload --access-logfile - morss
|
CMD gunicorn --bind 0.0.0.0:8080 -w 4 --preload --access-logfile - morss
|
||||||
|
|
41
README.md
41
README.md
|
@ -1,5 +1,7 @@
|
||||||
# Morss - Get full-text RSS feeds
|
# Morss - Get full-text RSS feeds
|
||||||
|
|
||||||
|
[![Build Status](https://ci.pictuga.com/api/badges/pictuga/morss/status.svg)](https://ci.pictuga.com/pictuga/morss)
|
||||||
|
|
||||||
_GNU AGPLv3 code_
|
_GNU AGPLv3 code_
|
||||||
_Provided logo is CC BY-NC-SA 4.0_
|
_Provided logo is CC BY-NC-SA 4.0_
|
||||||
|
|
||||||
|
@ -46,28 +48,25 @@ Some features of morss:
|
||||||
|
|
||||||
### Python package
|
### Python package
|
||||||
|
|
||||||
|
Simple install (without optional dependencies)
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
pip install git+https://git.pictuga.com/pictuga/morss.git
|
pip install git+https://git.pictuga.com/pictuga/morss.git
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Full installation (including optional dependencies)
|
||||||
|
|
||||||
|
```shell
|
||||||
|
pip install "morss[full] @ git+https://git.pictuga.com/pictuga/morss.git"
|
||||||
|
```
|
||||||
|
|
||||||
|
The full install includes mysql and redis (possible cache backends). Otherwise,
|
||||||
|
only in-memory and sqlite3 caches are available.
|
||||||
|
|
||||||
The dependency `lxml` is fairly long to install (especially on Raspberry Pi, as
|
The dependency `lxml` is fairly long to install (especially on Raspberry Pi, as
|
||||||
C code needs to be compiled). If possible on your distribution, try installing
|
C code needs to be compiled). If possible on your distribution, try installing
|
||||||
it with the system package manager.
|
it with the system package manager.
|
||||||
|
|
||||||
Dependencies:
|
|
||||||
|
|
||||||
- [python](http://www.python.org/) >= 2.6 (python 3 is supported)
|
|
||||||
- [lxml](http://lxml.de/) for xml parsing
|
|
||||||
- [bs4](https://pypi.org/project/bs4/) for badly-formatted html pages
|
|
||||||
- [dateutil](http://labix.org/python-dateutil) to parse feed dates
|
|
||||||
- [chardet](https://pypi.python.org/pypi/chardet)
|
|
||||||
- [six](https://pypi.python.org/pypi/six), a dependency of chardet
|
|
||||||
- pymysql
|
|
||||||
|
|
||||||
You may also need:
|
|
||||||
- Apache, with python-cgi support, to run on a server
|
|
||||||
- a fast internet connection
|
|
||||||
|
|
||||||
### Docker
|
### Docker
|
||||||
|
|
||||||
Build & run
|
Build & run
|
||||||
|
@ -263,11 +262,12 @@ arguments to morss is explained in Run above.
|
||||||
The list of arguments can be obtained by running `morss --help`
|
The list of arguments can be obtained by running `morss --help`
|
||||||
|
|
||||||
```
|
```
|
||||||
usage: morss [-h] [--post STRING] [--format {rss,json,html,csv}]
|
usage: morss [-h] [--post STRING] [--xpath XPATH]
|
||||||
[--search STRING] [--clip] [--indent] [--cache] [--force]
|
[--format {rss,json,html,csv}] [--search STRING] [--clip]
|
||||||
[--proxy] [--newest] [--firstlink] [--resolve] [--items XPATH]
|
[--indent] [--cache] [--force] [--proxy] [--newest] [--firstlink]
|
||||||
[--item_link XPATH] [--item_title XPATH] [--item_content XPATH]
|
[--resolve] [--items XPATH] [--item_link XPATH]
|
||||||
[--item_time XPATH] [--nolink] [--noref] [--silent]
|
[--item_title XPATH] [--item_content XPATH] [--item_time XPATH]
|
||||||
|
[--nolink] [--noref] [--silent]
|
||||||
url
|
url
|
||||||
|
|
||||||
Get full-text RSS feeds
|
Get full-text RSS feeds
|
||||||
|
@ -278,6 +278,7 @@ positional arguments:
|
||||||
optional arguments:
|
optional arguments:
|
||||||
-h, --help show this help message and exit
|
-h, --help show this help message and exit
|
||||||
--post STRING POST request
|
--post STRING POST request
|
||||||
|
--xpath XPATH xpath rule to manually detect the article
|
||||||
|
|
||||||
output:
|
output:
|
||||||
--format {rss,json,html,csv}
|
--format {rss,json,html,csv}
|
||||||
|
@ -373,6 +374,8 @@ will be cleared every time the program is run). Path can be defined with
|
||||||
`SQLITE_PATH`.
|
`SQLITE_PATH`.
|
||||||
- `CACHE=mysql`: MySQL cache. Connection can be defined with the following
|
- `CACHE=mysql`: MySQL cache. Connection can be defined with the following
|
||||||
environment variables: `MYSQL_USER`, `MYSQL_PWD`, `MYSQL_DB`, `MYSQL_HOST`
|
environment variables: `MYSQL_USER`, `MYSQL_PWD`, `MYSQL_DB`, `MYSQL_HOST`
|
||||||
|
- `CACHE=redis`: Redis cache. Connection can be defined with the following
|
||||||
|
environment variables: `REDIS_HOST`, `REDIS_PORT`, `REDIS_DB`, `REDIS_PWD`
|
||||||
|
|
||||||
To limit the size of the cache:
|
To limit the size of the cache:
|
||||||
|
|
||||||
|
|
|
@ -16,5 +16,8 @@
|
||||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
# ran on `import morss`
|
# ran on `import morss`
|
||||||
|
|
||||||
|
# pylint: disable=unused-import,unused-variable
|
||||||
|
|
||||||
from .morss import *
|
from .morss import *
|
||||||
from .wsgi import application
|
from .wsgi import application
|
||||||
|
|
|
@ -20,9 +20,7 @@
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from . import wsgi
|
from . import cli, wsgi
|
||||||
from . import cli
|
|
||||||
|
|
||||||
from .morss import MorssException
|
from .morss import MorssException
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,188 @@
|
||||||
|
# This file is part of morss
|
||||||
|
#
|
||||||
|
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
|
||||||
|
#
|
||||||
|
# This program is free software: you can redistribute it and/or modify it under
|
||||||
|
# the terms of the GNU Affero General Public License as published by the Free
|
||||||
|
# Software Foundation, either version 3 of the License, or (at your option) any
|
||||||
|
# later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||||
|
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||||
|
# details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Affero General Public License along
|
||||||
|
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
import os
|
||||||
|
import pickle
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from collections import OrderedDict
|
||||||
|
|
||||||
|
CACHE_SIZE = int(os.getenv('CACHE_SIZE', 1000)) # max number of items in cache (default: 1k items)
|
||||||
|
CACHE_LIFESPAN = int(os.getenv('CACHE_LIFESPAN', 60)) # how often to auto-clear the cache (default: 1min)
|
||||||
|
|
||||||
|
|
||||||
|
class BaseCache:
    """ Common base of the cache backends. Subclasses must behave like a dict """

    def trim(self):
        """ Evict surplus entries. No-op here, meant to be overridden """
        pass

    def autotrim(self, delay=CACHE_LIFESPAN):
        """ Trim the cache now, then again every `delay` seconds

        Each run re-arms a daemon `threading.Timer` for the next one.
        """

        self.trim()

        # hand `delay` over to the next run, otherwise every run after the
        # first one silently falls back to the default CACHE_LIFESPAN
        t = threading.Timer(delay, self.autotrim, kwargs={'delay': delay})
        t.daemon = True
        t.start()

    def __contains__(self, url):
        """ Membership test built on `__getitem__`: a KeyError means "absent" """

        try:
            self[url]

        except KeyError:
            return False

        else:
            return True
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
import sqlite3 # isort:skip
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class SQLiteCache(BaseCache):
    """ Cache backend keeping entries in a SQLite database

    Rows are `(ky, data, timestamp)`; `filename` defaults to an in-memory
    database.
    """

    def __init__(self, filename=':memory:'):
        self.con = sqlite3.connect(filename, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)

        with self.con:
            self.con.execute('CREATE TABLE IF NOT EXISTS data (ky UNICODE PRIMARY KEY, data BLOB, timestamp INT)')
            self.con.execute('pragma journal_mode=WAL')

        self.trim()

    def __del__(self):
        # `con` does not exist if `__init__` failed before connecting
        con = getattr(self, 'con', None)

        if con is not None:
            con.close()

    def trim(self):
        """ Delete the oldest rows so that about CACHE_SIZE rows remain """

        if CACHE_SIZE < 0:
            # negative size means "no limit" (as in CappedDict.trim). It must
            # not reach the query: SQLite treats a negative OFFSET as 0, which
            # would delete every row
            return

        with self.con:
            self.con.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET ? ) foo )', (CACHE_SIZE,))

    def __getitem__(self, key):
        """ Return the stored `data` column for `key`, or raise KeyError """

        row = self.con.execute('SELECT * FROM data WHERE ky=?', (key,)).fetchone()

        if not row:
            raise KeyError(key)

        return row[1]

    def __setitem__(self, key, data):
        """ Insert `key`, or overwrite its data and timestamp if already there """

        # single clock read, so that both branches of the upsert agree
        now = time.time()

        with self.con:
            self.con.execute('INSERT INTO data VALUES (?,?,?) ON CONFLICT(ky) DO UPDATE SET data=?, timestamp=?', (key, data, now, data, now))
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
import pymysql.cursors # isort:skip
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class MySQLCacheHandler(BaseCache):
    """ Cache backend keeping entries in a MySQL database, via `pymysql`

    Rows are `(ky, data, timestamp)`.
    """

    def __init__(self, user, password, database, host='localhost'):
        self.user = user
        self.password = password
        self.database = database
        self.host = host

        with self.cursor() as cursor:
            cursor.execute('CREATE TABLE IF NOT EXISTS data (ky VARCHAR(255) NOT NULL PRIMARY KEY, data MEDIUMBLOB, timestamp INT)')

        self.trim()

    def cursor(self):
        """ Return a cursor on a freshly opened, autocommit connection """

        # NOTE(review): a new connection is opened on each call and nothing
        # here closes it explicitly -- confirm this is acceptable under load
        return pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, charset='utf8', autocommit=True).cursor()

    def trim(self):
        """ Delete the oldest rows so that about CACHE_SIZE rows remain """

        if CACHE_SIZE < 0:
            # negative size means "no limit" (as in CappedDict.trim), it must
            # not end up in the OFFSET clause
            return

        with self.cursor() as cursor:
            cursor.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET %s ) foo )', (CACHE_SIZE,))

    def __getitem__(self, key):
        """ Return the stored `data` column for `key`, or raise KeyError """

        # use the cursor as a context manager, like the other methods, so that
        # it gets closed instead of leaking
        with self.cursor() as cursor:
            cursor.execute('SELECT * FROM data WHERE ky=%s', (key,))
            row = cursor.fetchone()

        if not row:
            raise KeyError(key)

        return row[1]

    def __setitem__(self, key, data):
        """ Insert `key`, or overwrite its data and timestamp if already there """

        # single clock read, so that both branches of the upsert agree
        now = time.time()

        with self.cursor() as cursor:
            cursor.execute('INSERT INTO data VALUES (%s,%s,%s) ON DUPLICATE KEY UPDATE data=%s, timestamp=%s',
                (key, data, now, data, now))
|
||||||
|
|
||||||
|
|
||||||
|
class CappedDict(OrderedDict, BaseCache):
    """ In-memory cache: an ordered dict trimmed from its oldest end """

    def trim(self):
        """ Pop the oldest entries until at most CACHE_SIZE are left """

        if CACHE_SIZE < 0:
            # negative size: nothing is ever evicted
            return

        surplus = len(self) - CACHE_SIZE

        while surplus > 0:
            self.popitem(last=False)
            surplus -= 1

    def __setitem__(self, key, data):
        # https://docs.python.org/2/library/collections.html#ordereddict-examples-and-recipes
        # drop the key first, so that re-setting it moves it to the newest end
        if key in self:
            del self[key]

        OrderedDict.__setitem__(self, key, data)
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
import redis # isort:skip
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class RedisCacheHandler(BaseCache):
    """ Cache backend keeping entries in a Redis server, via `redis` """

    def __init__(self, host='localhost', port=6379, db=0, password=None):
        self.r = redis.Redis(host=host, port=port, db=db, password=password)

    def __getitem__(self, key):
        """ Return the value stored under `key`, or raise KeyError """

        data = self.r.get(key)

        if data is None:
            # redis returns None for a missing key, whereas dict-like lookups
            # (and BaseCache.__contains__) expect a KeyError
            raise KeyError(key)

        return data

    def __setitem__(self, key, data):
        self.r.set(key, data)
|
||||||
|
|
||||||
|
|
||||||
|
# Pick the cache backend from the `CACHE` environment variable.
# `default_cache` is always defined: when `CACHE` is unset or holds an
# unrecognised value, the in-memory CappedDict is used.
cache_backend = os.getenv('CACHE')

if cache_backend == 'mysql':
    default_cache = MySQLCacheHandler(
        user = os.getenv('MYSQL_USER'),
        password = os.getenv('MYSQL_PWD'),
        database = os.getenv('MYSQL_DB'),
        host = os.getenv('MYSQL_HOST', 'localhost')
    )

elif cache_backend == 'sqlite':
    # without SQLITE_PATH, the database lives in memory
    path = os.getenv('SQLITE_PATH', ':memory:')

    default_cache = SQLiteCache(path)

elif cache_backend == 'redis':
    default_cache = RedisCacheHandler(
        host = os.getenv('REDIS_HOST', 'localhost'),
        port = int(os.getenv('REDIS_PORT', 6379)),
        db = int(os.getenv('REDIS_DB', 0)),
        password = os.getenv('REDIS_PWD', None)
    )

else:
    default_cache = CappedDict()
|
|
@ -15,12 +15,11 @@
|
||||||
# You should have received a copy of the GNU Affero General Public License along
|
# You should have received a copy of the GNU Affero General Public License along
|
||||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
import sys
|
|
||||||
import os.path
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import os.path
|
||||||
|
import sys
|
||||||
|
|
||||||
from .morss import FeedFetch, FeedGather, FeedFormat
|
from .morss import FeedFetch, FeedFormat, FeedGather, Options
|
||||||
from .morss import Options
|
|
||||||
|
|
||||||
|
|
||||||
def cli_app():
|
def cli_app():
|
||||||
|
@ -33,6 +32,7 @@ def cli_app():
|
||||||
parser.add_argument('url', help='feed url')
|
parser.add_argument('url', help='feed url')
|
||||||
|
|
||||||
parser.add_argument('--post', action='store', type=str, metavar='STRING', help='POST request')
|
parser.add_argument('--post', action='store', type=str, metavar='STRING', help='POST request')
|
||||||
|
parser.add_argument('--xpath', action='store', type=str, metavar='XPATH', help='xpath rule to manually detect the article')
|
||||||
|
|
||||||
group = parser.add_argument_group('output')
|
group = parser.add_argument_group('output')
|
||||||
group.add_argument('--format', default='rss', choices=('rss', 'json', 'html', 'csv'), help='output format')
|
group.add_argument('--format', default='rss', choices=('rss', 'json', 'html', 'csv'), help='output format')
|
||||||
|
|
250
morss/crawler.py
250
morss/crawler.py
|
@ -16,30 +16,36 @@
|
||||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import pickle
|
||||||
|
|
||||||
import zlib
|
|
||||||
from io import BytesIO, StringIO
|
|
||||||
import re
|
|
||||||
import chardet
|
|
||||||
from cgi import parse_header
|
|
||||||
import time
|
|
||||||
import threading
|
|
||||||
import random
|
import random
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import zlib
|
||||||
|
from cgi import parse_header
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
from io import BytesIO, StringIO
|
||||||
|
|
||||||
|
import chardet
|
||||||
|
|
||||||
|
from .caching import default_cache
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# python 2
|
# python 2
|
||||||
from urllib2 import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
|
|
||||||
from urllib import quote
|
from urllib import quote
|
||||||
from urlparse import urlparse, urlunparse
|
|
||||||
import mimetools
|
import mimetools
|
||||||
|
from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
|
||||||
|
Request, addinfourl, build_opener, parse_http_list,
|
||||||
|
parse_keqv_list)
|
||||||
|
from urlparse import urlparse, urlunparse
|
||||||
except ImportError:
|
except ImportError:
|
||||||
# python 3
|
# python 3
|
||||||
from urllib.request import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
|
|
||||||
from urllib.parse import quote
|
|
||||||
from urllib.parse import urlparse, urlunparse
|
|
||||||
import email
|
import email
|
||||||
|
from urllib.parse import quote, urlparse, urlunparse
|
||||||
|
from urllib.request import (BaseHandler, HTTPCookieProcessor,
|
||||||
|
HTTPRedirectHandler, Request, addinfourl,
|
||||||
|
build_opener, parse_http_list, parse_keqv_list)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# python 2
|
# python 2
|
||||||
|
@ -49,10 +55,6 @@ except NameError:
|
||||||
basestring = unicode = str
|
basestring = unicode = str
|
||||||
|
|
||||||
|
|
||||||
CACHE_SIZE = int(os.getenv('CACHE_SIZE', 1000)) # max number of items in cache (default: 1k items)
|
|
||||||
CACHE_LIFESPAN = int(os.getenv('CACHE_LIFESPAN', 60)) # how often to auto-clear the cache (default: 1min)
|
|
||||||
|
|
||||||
|
|
||||||
MIMETYPE = {
|
MIMETYPE = {
|
||||||
'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
|
'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
|
||||||
'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
|
'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
|
||||||
|
@ -131,6 +133,7 @@ def custom_opener(follow=None, delay=None):
|
||||||
handlers.append(SizeLimitHandler(500*1024)) # 500KiB
|
handlers.append(SizeLimitHandler(500*1024)) # 500KiB
|
||||||
handlers.append(HTTPCookieProcessor())
|
handlers.append(HTTPCookieProcessor())
|
||||||
handlers.append(GZIPHandler())
|
handlers.append(GZIPHandler())
|
||||||
|
handlers.append(HTTPAllRedirectHandler())
|
||||||
handlers.append(HTTPEquivHandler())
|
handlers.append(HTTPEquivHandler())
|
||||||
handlers.append(HTTPRefreshHandler())
|
handlers.append(HTTPRefreshHandler())
|
||||||
handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
|
handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
|
||||||
|
@ -397,6 +400,11 @@ class HTTPEquivHandler(RespStrHandler):
|
||||||
resp.headers[meta.get('http-equiv').lower()] = meta.get('content')
|
resp.headers[meta.get('http-equiv').lower()] = meta.get('content')
|
||||||
|
|
||||||
|
|
||||||
|
class HTTPAllRedirectHandler(HTTPRedirectHandler):
    """ Redirect handler which also follows "308 Permanent Redirect" """

    def http_error_308(self, req, fp, code, msg, headers):
        # delegate to the 301 logic, reporting the response as a plain 301
        # (the received `code` is deliberately not forwarded)
        handle_301 = self.http_error_301

        return handle_301(req, fp, 301, msg, headers)
|
||||||
|
|
||||||
|
|
||||||
class HTTPRefreshHandler(BaseHandler):
|
class HTTPRefreshHandler(BaseHandler):
|
||||||
handler_order = 700 # HTTPErrorProcessor has a handler_order of 1000
|
handler_order = 700 # HTTPErrorProcessor has a handler_order of 1000
|
||||||
|
|
||||||
|
@ -447,37 +455,46 @@ class CacheHandler(BaseHandler):
|
||||||
|
|
||||||
def load(self, url):
|
def load(self, url):
|
||||||
try:
|
try:
|
||||||
out = list(self.cache[url])
|
data = pickle.loads(self.cache[url])
|
||||||
|
|
||||||
except KeyError:
|
except KeyError:
|
||||||
out = [None, None, unicode(), bytes(), 0]
|
data = None
|
||||||
|
|
||||||
if sys.version_info[0] >= 3:
|
|
||||||
out[2] = email.message_from_string(out[2] or unicode()) # headers
|
|
||||||
else:
|
else:
|
||||||
out[2] = mimetools.Message(StringIO(out[2] or unicode()))
|
if sys.version_info[0] >= 3:
|
||||||
|
data['headers'] = email.message_from_string(data['headers'] or unicode()) # headers
|
||||||
|
else:
|
||||||
|
data['headers'] = mimetools.Message(StringIO(data['headers'] or unicode()))
|
||||||
|
|
||||||
return out
|
return data
|
||||||
|
|
||||||
def save(self, url, code, msg, headers, data, timestamp):
|
def save(self, key, data):
|
||||||
self.cache[url] = (code, msg, unicode(headers), data, timestamp)
|
data['headers'] = unicode(data['headers'])
|
||||||
|
self.cache[key] = pickle.dumps(data, 0)
|
||||||
|
|
||||||
def is_cached(self, url):
|
def is_cached(self, key):
|
||||||
return self.load(url)[0] is not None
|
return self.load(key) is not None
|
||||||
|
|
||||||
def cached_response(self, req):
|
def cached_response(self, req):
|
||||||
# this does NOT check whether it's already cached, use with care
|
# this does NOT check whether it's already cached, use with care
|
||||||
(code, msg, headers, data, timestamp) = self.load(req.get_full_url())
|
data = self.load(req.get_full_url())
|
||||||
|
|
||||||
# return the cache as a response
|
# return the cache as a response
|
||||||
resp = addinfourl(BytesIO(data), headers, req.get_full_url(), code)
|
resp = addinfourl(BytesIO(data['data']), data['headers'], req.get_full_url(), data['code'])
|
||||||
resp.msg = msg
|
resp.msg = data['msg']
|
||||||
|
|
||||||
return resp
|
return resp
|
||||||
|
|
||||||
def save_response(self, req, resp):
|
def save_response(self, req, resp):
|
||||||
data = resp.read()
|
data = resp.read()
|
||||||
|
|
||||||
self.save(req.get_full_url(), resp.code, resp.msg, resp.headers, data, time.time())
|
self.save(req.get_full_url(), {
|
||||||
|
'code': resp.code,
|
||||||
|
'msg': resp.msg,
|
||||||
|
'headers': resp.headers,
|
||||||
|
'data': data,
|
||||||
|
'timestamp': time.time()
|
||||||
|
})
|
||||||
|
|
||||||
fp = BytesIO(data)
|
fp = BytesIO(data)
|
||||||
old_resp = resp
|
old_resp = resp
|
||||||
|
@ -487,13 +504,14 @@ class CacheHandler(BaseHandler):
|
||||||
return resp
|
return resp
|
||||||
|
|
||||||
def http_request(self, req):
|
def http_request(self, req):
|
||||||
(code, msg, headers, data, timestamp) = self.load(req.get_full_url())
|
data = self.load(req.get_full_url())
|
||||||
|
|
||||||
if 'etag' in headers:
|
if data is not None:
|
||||||
req.add_unredirected_header('If-None-Match', headers['etag'])
|
if 'etag' in data['headers']:
|
||||||
|
req.add_unredirected_header('If-None-Match', data['headers']['etag'])
|
||||||
|
|
||||||
if 'last-modified' in headers:
|
if 'last-modified' in data['headers']:
|
||||||
req.add_unredirected_header('If-Modified-Since', headers.get('last-modified'))
|
req.add_unredirected_header('If-Modified-Since', data['headers']['last-modified'])
|
||||||
|
|
||||||
return req
|
return req
|
||||||
|
|
||||||
|
@ -502,33 +520,33 @@ class CacheHandler(BaseHandler):
|
||||||
# If 'None' is returned, try your chance with the next-available handler
|
# If 'None' is returned, try your chance with the next-available handler
|
||||||
# If a 'resp' is returned, stop there, and proceed with 'http_response'
|
# If a 'resp' is returned, stop there, and proceed with 'http_response'
|
||||||
|
|
||||||
(code, msg, headers, data, timestamp) = self.load(req.get_full_url())
|
data = self.load(req.get_full_url())
|
||||||
|
|
||||||
|
if data is None:
|
||||||
|
# cache empty, refresh
|
||||||
|
return None
|
||||||
|
|
||||||
# some info needed to process everything
|
# some info needed to process everything
|
||||||
cache_control = parse_http_list(headers.get('cache-control', ()))
|
cache_control = parse_http_list(data['headers'].get('cache-control', ()))
|
||||||
cache_control += parse_http_list(headers.get('pragma', ()))
|
cache_control += parse_http_list(data['headers'].get('pragma', ()))
|
||||||
|
|
||||||
cc_list = [x for x in cache_control if '=' not in x]
|
cc_list = [x for x in cache_control if '=' not in x]
|
||||||
cc_values = parse_keqv_list([x for x in cache_control if '=' in x])
|
cc_values = parse_keqv_list([x for x in cache_control if '=' in x])
|
||||||
|
|
||||||
cache_age = time.time() - timestamp
|
cache_age = time.time() - data['timestamp']
|
||||||
|
|
||||||
# list in a simple way what to do when
|
# list in a simple way what to do when
|
||||||
if self.force_min == -2:
|
if self.force_min == -2:
|
||||||
if code is not None:
|
if data['code'] is not None:
|
||||||
# already in cache, perfect, use cache
|
# already in cache, perfect, use cache
|
||||||
return self.cached_response(req)
|
return self.cached_response(req)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# raise an error, via urllib handlers
|
# raise an error, via urllib handlers
|
||||||
resp = addinfourl(BytesIO(), headers, req.get_full_url(), 409)
|
resp = addinfourl(BytesIO(), data['headers'], req.get_full_url(), 409)
|
||||||
resp.msg = 'Conflict'
|
resp.msg = 'Conflict'
|
||||||
return resp
|
return resp
|
||||||
|
|
||||||
elif code is None:
|
|
||||||
# cache empty, refresh
|
|
||||||
return None
|
|
||||||
|
|
||||||
elif self.force_min == -1:
|
elif self.force_min == -1:
|
||||||
# force use cache
|
# force use cache
|
||||||
return self.cached_response(req)
|
return self.cached_response(req)
|
||||||
|
@ -537,7 +555,7 @@ class CacheHandler(BaseHandler):
|
||||||
# force refresh
|
# force refresh
|
||||||
return None
|
return None
|
||||||
|
|
||||||
elif code == 301 and cache_age < 7*24*3600:
|
elif data['code'] == 301 and cache_age < 7*24*3600:
|
||||||
# "301 Moved Permanently" has to be cached...as long as we want
|
# "301 Moved Permanently" has to be cached...as long as we want
|
||||||
# (awesome HTTP specs), let's say a week (why not?). Use force_min=0
|
# (awesome HTTP specs), let's say a week (why not?). Use force_min=0
|
||||||
# if you want to bypass this (needed for a proper refresh)
|
# if you want to bypass this (needed for a proper refresh)
|
||||||
|
@ -594,142 +612,6 @@ class CacheHandler(BaseHandler):
|
||||||
https_response = http_response
|
https_response = http_response
|
||||||
|
|
||||||
|
|
||||||
class BaseCache:
|
|
||||||
""" Subclasses must behave like a dict """
|
|
||||||
|
|
||||||
def trim(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def autotrim(self, delay=CACHE_LIFESPAN):
|
|
||||||
# trim the cache every so often
|
|
||||||
|
|
||||||
self.trim()
|
|
||||||
|
|
||||||
t = threading.Timer(delay, self.autotrim)
|
|
||||||
t.daemon = True
|
|
||||||
t.start()
|
|
||||||
|
|
||||||
def __contains__(self, url):
|
|
||||||
try:
|
|
||||||
self[url]
|
|
||||||
|
|
||||||
except KeyError:
|
|
||||||
return False
|
|
||||||
|
|
||||||
else:
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
import sqlite3
|
|
||||||
|
|
||||||
|
|
||||||
class SQLiteCache(BaseCache):
|
|
||||||
def __init__(self, filename=':memory:'):
|
|
||||||
self.con = sqlite3.connect(filename, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
|
|
||||||
|
|
||||||
with self.con:
|
|
||||||
self.con.execute('CREATE TABLE IF NOT EXISTS data (url UNICODE PRIMARY KEY, code INT, msg UNICODE, headers UNICODE, data BLOB, timestamp INT)')
|
|
||||||
self.con.execute('pragma journal_mode=WAL')
|
|
||||||
|
|
||||||
self.trim()
|
|
||||||
|
|
||||||
def __del__(self):
|
|
||||||
self.con.close()
|
|
||||||
|
|
||||||
def trim(self):
|
|
||||||
with self.con:
|
|
||||||
self.con.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET ? ) foo )', (CACHE_SIZE,))
|
|
||||||
|
|
||||||
def __getitem__(self, url):
|
|
||||||
row = self.con.execute('SELECT * FROM data WHERE url=?', (url,)).fetchone()
|
|
||||||
|
|
||||||
if not row:
|
|
||||||
raise KeyError
|
|
||||||
|
|
||||||
return row[1:]
|
|
||||||
|
|
||||||
def __setitem__(self, url, value): # value = (code, msg, headers, data, timestamp)
|
|
||||||
value = list(value)
|
|
||||||
value[3] = sqlite3.Binary(value[3]) # data
|
|
||||||
value = tuple(value)
|
|
||||||
|
|
||||||
with self.con:
|
|
||||||
self.con.execute('INSERT INTO data VALUES (?,?,?,?,?,?) ON CONFLICT(url) DO UPDATE SET code=?, msg=?, headers=?, data=?, timestamp=?', (url,) + value + value)
|
|
||||||
|
|
||||||
|
|
||||||
import pymysql.cursors
|
|
||||||
|
|
||||||
|
|
||||||
class MySQLCacheHandler(BaseCache):
|
|
||||||
def __init__(self, user, password, database, host='localhost'):
|
|
||||||
self.user = user
|
|
||||||
self.password = password
|
|
||||||
self.database = database
|
|
||||||
self.host = host
|
|
||||||
|
|
||||||
with self.cursor() as cursor:
|
|
||||||
cursor.execute('CREATE TABLE IF NOT EXISTS data (url VARCHAR(255) NOT NULL PRIMARY KEY, code INT, msg TEXT, headers TEXT, data BLOB, timestamp INT)')
|
|
||||||
|
|
||||||
self.trim()
|
|
||||||
|
|
||||||
def cursor(self):
|
|
||||||
return pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, charset='utf8', autocommit=True).cursor()
|
|
||||||
|
|
||||||
def trim(self):
|
|
||||||
with self.cursor() as cursor:
|
|
||||||
cursor.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET %s ) foo )', (CACHE_SIZE,))
|
|
||||||
|
|
||||||
def __getitem__(self, url):
|
|
||||||
cursor = self.cursor()
|
|
||||||
cursor.execute('SELECT * FROM data WHERE url=%s', (url,))
|
|
||||||
row = cursor.fetchone()
|
|
||||||
|
|
||||||
if not row:
|
|
||||||
raise KeyError
|
|
||||||
|
|
||||||
return row[1:]
|
|
||||||
|
|
||||||
def __setitem__(self, url, value): # (code, msg, headers, data, timestamp)
|
|
||||||
with self.cursor() as cursor:
|
|
||||||
cursor.execute('INSERT INTO data VALUES (%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE code=%s, msg=%s, headers=%s, data=%s, timestamp=%s',
|
|
||||||
(url,) + value + value)
|
|
||||||
|
|
||||||
|
|
||||||
class CappedDict(OrderedDict, BaseCache):
    """In-memory cache: an ordered dict whose oldest entries can be trimmed."""

    def trim(self):
        """Pop entries from the front until at most CACHE_SIZE remain.

        Does nothing when CACHE_SIZE is negative.
        """
        if CACHE_SIZE < 0:
            return

        excess = len(self) - CACHE_SIZE

        while excess > 0:
            self.popitem(last=False)
            excess -= 1

    def __setitem__(self, key, value):
        """Store ``value`` under ``key``, moving the key to the end if present."""
        # https://docs.python.org/2/library/collections.html#ordereddict-examples-and-recipes
        try:
            # remove first, so that re-inserting places the key last
            del self[key]

        except KeyError:
            pass

        OrderedDict.__setitem__(self, key, value)
|
|
||||||
|
|
||||||
|
|
||||||
# Pick the module-wide cache backend from the CACHE environment variable.
# Anything other than 'mysql' or 'sqlite' (including an unset variable) falls
# back to the in-memory CappedDict, so default_cache is always defined.
cache_backend = os.getenv('CACHE')

if cache_backend == 'mysql':
    default_cache = MySQLCacheHandler(
        user=os.getenv('MYSQL_USER'),
        password=os.getenv('MYSQL_PWD'),
        database=os.getenv('MYSQL_DB'),
        host=os.getenv('MYSQL_HOST', 'localhost'),
    )

elif cache_backend == 'sqlite':
    # without SQLITE_PATH, use a non-persistent in-memory database
    default_cache = SQLiteCache(os.getenv('SQLITE_PATH', ':memory:'))

else:
    default_cache = CappedDict()
|
|
||||||
|
|
||||||
|
|
||||||
if 'IGNORE_SSL' in os.environ:
|
if 'IGNORE_SSL' in os.environ:
|
||||||
import ssl
|
import ssl
|
||||||
ssl._create_default_https_context = ssl._create_unverified_context
|
ssl._create_default_https_context = ssl._create_unverified_context
|
||||||
|
|
|
@ -15,35 +15,33 @@
|
||||||
# You should have received a copy of the GNU Affero General Public License along
|
# You should have received a copy of the GNU Affero General Public License along
|
||||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
import sys
|
|
||||||
import os.path
|
import os.path
|
||||||
|
import sys
|
||||||
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
import re
|
|
||||||
import json
|
|
||||||
import csv
|
import csv
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from copy import deepcopy
|
||||||
|
from datetime import datetime
|
||||||
from fnmatch import fnmatch
|
from fnmatch import fnmatch
|
||||||
|
|
||||||
from lxml import etree
|
|
||||||
from dateutil import tz
|
|
||||||
import dateutil.parser
|
import dateutil.parser
|
||||||
from copy import deepcopy
|
|
||||||
|
|
||||||
import lxml.html
|
import lxml.html
|
||||||
|
from dateutil import tz
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
from .readabilite import parse as html_parse
|
from .readabilite import parse as html_parse
|
||||||
|
|
||||||
json.encoder.c_make_encoder = None
|
json.encoder.c_make_encoder = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# python 2
|
# python 2
|
||||||
from StringIO import StringIO
|
|
||||||
from ConfigParser import RawConfigParser
|
from ConfigParser import RawConfigParser
|
||||||
|
from StringIO import StringIO
|
||||||
except ImportError:
|
except ImportError:
|
||||||
# python 3
|
# python 3
|
||||||
from io import StringIO
|
|
||||||
from configparser import RawConfigParser
|
from configparser import RawConfigParser
|
||||||
|
from io import StringIO
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# python 2
|
# python 2
|
||||||
|
|
|
@ -16,30 +16,25 @@
|
||||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import time
|
import time
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from dateutil import tz
|
|
||||||
|
|
||||||
from fnmatch import fnmatch
|
from fnmatch import fnmatch
|
||||||
import re
|
|
||||||
|
|
||||||
import lxml.etree
|
import lxml.etree
|
||||||
import lxml.html
|
import lxml.html
|
||||||
|
from dateutil import tz
|
||||||
|
|
||||||
from . import feeds
|
from . import caching, crawler, feeds, readabilite
|
||||||
from . import crawler
|
|
||||||
from . import readabilite
|
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# python 2
|
# python 2
|
||||||
from httplib import HTTPException
|
from httplib import HTTPException
|
||||||
from urlparse import urlparse, urljoin, parse_qs
|
from urlparse import parse_qs, urljoin, urlparse
|
||||||
except ImportError:
|
except ImportError:
|
||||||
# python 3
|
# python 3
|
||||||
from http.client import HTTPException
|
from http.client import HTTPException
|
||||||
from urllib.parse import urlparse, urljoin, parse_qs
|
from urllib.parse import parse_qs, urljoin, urlparse
|
||||||
|
|
||||||
|
|
||||||
MAX_ITEM = int(os.getenv('MAX_ITEM', 5)) # cache-only beyond
|
MAX_ITEM = int(os.getenv('MAX_ITEM', 5)) # cache-only beyond
|
||||||
|
@ -91,12 +86,12 @@ class Options:
|
||||||
else:
|
else:
|
||||||
self.options = options or {}
|
self.options = options or {}
|
||||||
|
|
||||||
def __getattr__(self, key):
|
def __getattr__(self, key, default=None):
|
||||||
if key in self.options:
|
if key in self.options:
|
||||||
return self.options[key]
|
return self.options[key]
|
||||||
|
|
||||||
else:
|
else:
|
||||||
return None
|
return default
|
||||||
|
|
||||||
def __setitem__(self, key, value):
|
def __setitem__(self, key, value):
|
||||||
self.options[key] = value
|
self.options[key] = value
|
||||||
|
@ -104,12 +99,7 @@ class Options:
|
||||||
def __contains__(self, key):
|
def __contains__(self, key):
|
||||||
return key in self.options
|
return key in self.options
|
||||||
|
|
||||||
def get(self, key, default=None):
|
get = __getitem__ = __getattr__
|
||||||
if key in self.options:
|
|
||||||
return self.options[key]
|
|
||||||
|
|
||||||
else:
|
|
||||||
return default
|
|
||||||
|
|
||||||
|
|
||||||
def ItemFix(item, options, feedurl='/'):
|
def ItemFix(item, options, feedurl='/'):
|
||||||
|
@ -228,7 +218,11 @@ def ItemFill(item, options, feedurl='/', fast=False):
|
||||||
log('non-text page')
|
log('non-text page')
|
||||||
return True
|
return True
|
||||||
|
|
||||||
out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
|
if not req['data']:
|
||||||
|
log('empty page')
|
||||||
|
return True
|
||||||
|
|
||||||
|
out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode', xpath=options.xpath)
|
||||||
|
|
||||||
if out is not None:
|
if out is not None:
|
||||||
item.content = out
|
item.content = out
|
||||||
|
@ -417,7 +411,7 @@ def process(url, cache=None, options=None):
|
||||||
options = Options(options)
|
options = Options(options)
|
||||||
|
|
||||||
if cache:
|
if cache:
|
||||||
crawler.default_cache = crawler.SQLiteCache(cache)
|
caching.default_cache = caching.SQLiteCache(cache)
|
||||||
|
|
||||||
url, rss = FeedFetch(url, options)
|
url, rss = FeedFetch(url, options)
|
||||||
rss = FeedGather(rss, url, options)
|
rss = FeedGather(rss, url, options)
|
||||||
|
|
|
@ -15,10 +15,11 @@
|
||||||
# You should have received a copy of the GNU Affero General Public License along
|
# You should have received a copy of the GNU Affero General Public License along
|
||||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
import lxml.etree
|
import lxml.etree
|
||||||
import lxml.html
|
import lxml.html
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import re
|
|
||||||
|
|
||||||
|
|
||||||
def parse(data, encoding=None):
|
def parse(data, encoding=None):
|
||||||
|
@ -210,7 +211,7 @@ def clean_node(node, keep_threshold=None):
|
||||||
return
|
return
|
||||||
|
|
||||||
# high score, so keep
|
# high score, so keep
|
||||||
if keep_threshold is not None and get_score(node) >= keep_threshold:
|
if keep_threshold is not None and keep_threshold > 0 and get_score(node) >= keep_threshold:
|
||||||
return
|
return
|
||||||
|
|
||||||
gdparent = parent.getparent()
|
gdparent = parent.getparent()
|
||||||
|
@ -293,28 +294,26 @@ def clean_node(node, keep_threshold=None):
|
||||||
gdparent.insert(gdparent.index(parent)+1, new_node)
|
gdparent.insert(gdparent.index(parent)+1, new_node)
|
||||||
|
|
||||||
|
|
||||||
def lowest_common_ancestor(nodeA, nodeB, max_depth=None):
|
def lowest_common_ancestor(node_a, node_b, max_depth=None):
|
||||||
ancestorsA = list(nodeA.iterancestors())
|
ancestors_a = list(node_a.iterancestors())
|
||||||
ancestorsB = list(nodeB.iterancestors())
|
ancestors_b = list(node_b.iterancestors())
|
||||||
|
|
||||||
if max_depth is not None:
|
if max_depth is not None:
|
||||||
ancestorsA = ancestorsA[:max_depth]
|
ancestors_a = ancestors_a[:max_depth]
|
||||||
ancestorsB = ancestorsB[:max_depth]
|
ancestors_b = ancestors_b[:max_depth]
|
||||||
|
|
||||||
ancestorsA.insert(0, nodeA)
|
ancestors_a.insert(0, node_a)
|
||||||
ancestorsB.insert(0, nodeB)
|
ancestors_b.insert(0, node_b)
|
||||||
|
|
||||||
for ancestorA in ancestorsA:
|
for ancestor_a in ancestors_a:
|
||||||
if ancestorA in ancestorsB:
|
if ancestor_a in ancestors_b:
|
||||||
return ancestorA
|
return ancestor_a
|
||||||
|
|
||||||
return nodeA # should always find one tho, at least <html/>, but needed for max_depth
|
return node_a # should always find one tho, at least <html/>, but needed for max_depth
|
||||||
|
|
||||||
|
|
||||||
def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5):
|
def get_best_node(html, threshold=5):
|
||||||
" Input a raw html string, returns a raw html string of the article "
|
# score all nodes
|
||||||
|
|
||||||
html = parse(data, encoding_in)
|
|
||||||
score_all(html)
|
score_all(html)
|
||||||
|
|
||||||
# rank all nodes (largest to smallest)
|
# rank all nodes (largest to smallest)
|
||||||
|
@ -331,9 +330,29 @@ def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=
|
||||||
else:
|
else:
|
||||||
best = ranked_nodes[0]
|
best = ranked_nodes[0]
|
||||||
|
|
||||||
|
return best
|
||||||
|
|
||||||
|
|
||||||
|
def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5, xpath=None):
|
||||||
|
" Input a raw html string, returns a raw html string of the article "
|
||||||
|
|
||||||
|
html = parse(data, encoding_in)
|
||||||
|
|
||||||
|
if xpath is not None:
|
||||||
|
xpath_match = html.xpath(xpath)
|
||||||
|
|
||||||
|
if len(xpath_match):
|
||||||
|
best = xpath_match[0]
|
||||||
|
|
||||||
|
else:
|
||||||
|
best = get_best_node(html, threshold)
|
||||||
|
|
||||||
|
else:
|
||||||
|
best = get_best_node(html, threshold)
|
||||||
|
|
||||||
# clean up
|
# clean up
|
||||||
if not debug:
|
if not debug:
|
||||||
keep_threshold = get_score(ranked_nodes[0]) * 3/4
|
keep_threshold = get_score(best) * 3/4
|
||||||
clean_root(best, keep_threshold)
|
clean_root(best, keep_threshold)
|
||||||
|
|
||||||
# check for spammy content (links only)
|
# check for spammy content (links only)
|
||||||
|
@ -352,6 +371,7 @@ def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from . import crawler
|
from . import crawler
|
||||||
|
|
||||||
req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
|
req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
|
||||||
|
|
|
@ -15,16 +15,16 @@
|
||||||
# You should have received a copy of the GNU Affero General Public License along
|
# You should have received a copy of the GNU Affero General Public License along
|
||||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
import sys
|
import cgitb
|
||||||
|
import mimetypes
|
||||||
import os.path
|
import os.path
|
||||||
import re
|
import re
|
||||||
import lxml.etree
|
import sys
|
||||||
|
|
||||||
import cgitb
|
|
||||||
import wsgiref.util
|
|
||||||
import wsgiref.simple_server
|
|
||||||
import wsgiref.handlers
|
import wsgiref.handlers
|
||||||
import mimetypes
|
import wsgiref.simple_server
|
||||||
|
import wsgiref.util
|
||||||
|
|
||||||
|
import lxml.etree
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# python 2
|
# python 2
|
||||||
|
@ -33,11 +33,9 @@ except ImportError:
|
||||||
# python 3
|
# python 3
|
||||||
from urllib.parse import unquote
|
from urllib.parse import unquote
|
||||||
|
|
||||||
from . import crawler
|
from . import caching, crawler, readabilite
|
||||||
from . import readabilite
|
from .morss import (DELAY, TIMEOUT, FeedFetch, FeedFormat, FeedGather,
|
||||||
from .morss import FeedFetch, FeedGather, FeedFormat
|
MorssException, Options, log)
|
||||||
from .morss import Options, log, TIMEOUT, DELAY, MorssException
|
|
||||||
|
|
||||||
|
|
||||||
PORT = int(os.getenv('PORT', 8080))
|
PORT = int(os.getenv('PORT', 8080))
|
||||||
|
|
||||||
|
@ -201,7 +199,7 @@ def cgi_get(environ, start_response):
|
||||||
req = crawler.adv_get(url=url, timeout=TIMEOUT)
|
req = crawler.adv_get(url=url, timeout=TIMEOUT)
|
||||||
|
|
||||||
if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']:
|
if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']:
|
||||||
if options.get == 'page':
|
if options['get'] == 'page':
|
||||||
html = readabilite.parse(req['data'], encoding=req['encoding'])
|
html = readabilite.parse(req['data'], encoding=req['encoding'])
|
||||||
html.make_links_absolute(req['url'])
|
html.make_links_absolute(req['url'])
|
||||||
|
|
||||||
|
@ -213,7 +211,7 @@ def cgi_get(environ, start_response):
|
||||||
|
|
||||||
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')
|
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')
|
||||||
|
|
||||||
elif options.get == 'article':
|
elif options['get'] == 'article':
|
||||||
output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
|
output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
@ -289,7 +287,7 @@ class WSGIRequestHandlerRequestUri(wsgiref.simple_server.WSGIRequestHandler):
|
||||||
|
|
||||||
|
|
||||||
def cgi_start_server():
|
def cgi_start_server():
|
||||||
crawler.default_cache.autotrim()
|
caching.default_cache.autotrim()
|
||||||
|
|
||||||
print('Serving http://localhost:%s/' % PORT)
|
print('Serving http://localhost:%s/' % PORT)
|
||||||
httpd = wsgiref.simple_server.make_server('', PORT, application, handler_class=WSGIRequestHandlerRequestUri)
|
httpd = wsgiref.simple_server.make_server('', PORT, application, handler_class=WSGIRequestHandlerRequestUri)
|
||||||
|
@ -297,4 +295,4 @@ def cgi_start_server():
|
||||||
|
|
||||||
|
|
||||||
if 'gunicorn' in os.getenv('SERVER_SOFTWARE', ''):
|
if 'gunicorn' in os.getenv('SERVER_SOFTWARE', ''):
|
||||||
crawler.default_cache.autotrim()
|
caching.default_cache.autotrim()
|
||||||
|
|
6
setup.py
6
setup.py
|
@ -1,6 +1,7 @@
|
||||||
from setuptools import setup
|
|
||||||
from glob import glob
|
from glob import glob
|
||||||
|
|
||||||
|
from setuptools import setup
|
||||||
|
|
||||||
package_name = 'morss'
|
package_name = 'morss'
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
|
@ -12,7 +13,8 @@ setup(
|
||||||
download_url = 'https://git.pictuga.com/pictuga/morss',
|
download_url = 'https://git.pictuga.com/pictuga/morss',
|
||||||
license = 'AGPL v3',
|
license = 'AGPL v3',
|
||||||
packages = [package_name],
|
packages = [package_name],
|
||||||
install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet', 'pymysql'],
|
install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet'],
|
||||||
|
extras_require = {'full': ['pymysql', 'redis']},
|
||||||
package_data = {package_name: ['feedify.ini']},
|
package_data = {package_name: ['feedify.ini']},
|
||||||
data_files = [
|
data_files = [
|
||||||
('share/' + package_name, ['README.md', 'LICENSE']),
|
('share/' + package_name, ['README.md', 'LICENSE']),
|
||||||
|
|
Loading…
Reference in New Issue