Compare commits
No commits in common. "4a888867679b96fbf04509f9b943531fee933a73" and "7c3091d64c796e20e64e423b1de6899d64866b50" have entirely different histories.
4a88886767...7c3091d64c
@@ -48,7 +48,6 @@ You do need:
 
 - [python](http://www.python.org/) >= 2.6 (python 3 is supported)
 - [lxml](http://lxml.de/) for xml parsing
-- [bs4](https://pypi.org/project/bs4/) for badly-formatted html pages
 - [dateutil](http://labix.org/python-dateutil) to parse feed dates
 - [chardet](https://pypi.python.org/pypi/chardet)
 - [six](https://pypi.python.org/pypi/six), a dependency of chardet
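For reference, the old side's full dependency set installs in one step (the pip package for dateutil is python-dateutil; bs4 is the PyPI name linked above):

    pip install lxml bs4 python-dateutil chardet six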
@@ -10,7 +10,6 @@ import re
 
 import lxml.etree
 import lxml.html
-from bs4 import BeautifulSoup
 
 from . import feeds
 from . import feedify
@@ -26,13 +25,13 @@ try:
     # python 2
     from Queue import Queue
     from httplib import HTTPException
-    from urllib import unquote
+    from urllib import quote_plus
     from urlparse import urlparse, urljoin, parse_qs
 except ImportError:
     # python 3
     from queue import Queue
     from http.client import HTTPException
-    from urllib.parse import unquote
+    from urllib.parse import quote_plus
     from urllib.parse import urlparse, urljoin, parse_qs
 
 LIM_ITEM = 100 # deletes what's beyond
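The swapped import is not cosmetic: the two helpers do opposite jobs. A minimal illustration (python 3 names shown; the try/except above maps the same names on python 2):

    from urllib.parse import unquote, quote_plus

    # unquote decodes percent-escapes; quote_plus encodes for a query string
    print(unquote('https%3A//example.com/a%20b'))  # https://example.com/a b
    print(quote_plus('https://example.com/a b'))   # https%3A%2F%2Fexample.com%2Fa+b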
@@ -171,11 +170,6 @@ def ItemFix(item, feedurl='/'):
         item.link = parse_qs(urlparse(item.link).query)['url'][0]
         log(item.link)
 
-    # pocket
-    if fnmatch(item.link, 'https://getpocket.com/redirect?url=*'):
-        item.link = parse_qs(urlparse(item.link).query)['url'][0]
-        log(item.link)
-
     # facebook
     if fnmatch(item.link, 'https://www.facebook.com/l.php?u=*'):
         item.link = parse_qs(urlparse(item.link).query)['u'][0]
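The removed pocket block and the surviving facebook block share one pattern: pull the real target out of a redirector's query string. A standalone sketch (the helper name and example URL are illustrative, not from the repository):

    from fnmatch import fnmatch
    from urllib.parse import urlparse, parse_qs

    def unwrap_redirect(link, pattern, param):
        # if the link matches the redirector pattern, return the
        # decoded URL carried in the given query parameter
        if fnmatch(link, pattern):
            return parse_qs(urlparse(link).query)[param][0]
        return link

    print(unwrap_redirect(
        'https://www.facebook.com/l.php?u=https%3A%2F%2Fexample.com%2F',
        'https://www.facebook.com/l.php?u=*',
        'u'))  # https://example.com/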
@@ -509,9 +503,8 @@ def process(url, cache=None, options=None):
     return FeedFormat(rss, options)
 
 
-def cgi_parse_environ(environ):
+def cgi_app(environ, start_response):
     # get options
 
     if 'REQUEST_URI' in environ:
         url = environ['REQUEST_URI'][1:]
     else:
@@ -525,7 +518,7 @@ def cgi_parse_environ(environ):
     if url.startswith(':'):
         split = url.split('/', 1)
 
-        raw_options = unquote(split[0]).replace('|', '/').replace('\\\'', '\'').split(':')[1:]
+        raw_options = split[0].replace('|', '/').replace('\\\'', '\'').split(':')[1:]
 
         if len(split) > 1:
             url = split[1]
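What the changed raw_options line does: a leading :opt1:opt2/ prefix on the request path is split off and turned into an option list ('|' evidently stands in for '/' inside option values, since '/' ends the prefix). A sketch of the old side's behaviour with an illustrative input:

    from urllib.parse import unquote

    url = ':format=json:debug/https%3A//example.com/feed'
    split = url.split('/', 1)

    raw_options = unquote(split[0]).replace('|', '/').replace('\\\'', '\'').split(':')[1:]
    print(raw_options)  # ['format=json', 'debug']

    if len(split) > 1:
        url = split[1]
    print(url)  # https%3A//example.com/feed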
@@ -537,18 +530,11 @@ def cgi_parse_environ(environ):
 
     # init
     options = Options(filterOptions(parseOptions(raw_options)))
+    headers = {}
 
     global DEBUG
     DEBUG = options.debug
 
-    return (url, options)
-
-
-def cgi_app(environ, start_response):
-    url, options = cgi_parse_environ(environ)
-
-    headers = {}
-
     # headers
     headers['status'] = '200 OK'
     headers['cache-control'] = 'max-age=%s' % DELAY
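Both versions of cgi_app follow the standard WSGI calling convention seen throughout these hunks. For readers unfamiliar with it, a minimal app (a generic sketch, not code from this repository):

    def minimal_app(environ, start_response):
        # environ is the request dict (e.g. environ['REQUEST_URI']);
        # start_response takes a status string and a list of header pairs
        start_response('200 OK', [('content-type', 'text/plain'),
                                  ('cache-control', 'max-age=1800')])
        return [b'hello']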
@@ -652,57 +638,6 @@ def cgi_file_handler(environ, start_response, app):
     return app(environ, start_response)
 
 
-def cgi_page(environ, start_response):
-    url, options = cgi_parse_environ(environ)
-
-    # get page
-    PROTOCOL = ['http', 'https']
-
-    if urlparse(url).scheme not in ['http', 'https']:
-        url = 'http://' + url
-
-    con = crawler.custom_handler().open(url)
-    data = con.read()
-
-    contenttype = con.info().get('Content-Type', '').split(';')[0]
-
-    if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
-        html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())
-        html.make_links_absolute(url)
-
-        kill_tags = ['script', 'iframe', 'noscript']
-
-        for tag in kill_tags:
-            for elem in html.xpath('//'+tag):
-                elem.getparent().remove(elem)
-
-        output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')
-
-    else:
-        output = None
-
-    # return html page
-    headers = {'status': '200 OK', 'content-type': 'text/html'}
-    start_response(headers['status'], list(headers.items()))
-    return [output]
-
-
-dispatch_table = {
-    'getpage': cgi_page
-}
-
-
-@middleware
-def cgi_dispatcher(environ, start_response, app):
-    url, options = cgi_parse_environ(environ)
-
-    for key in dispatch_table.keys():
-        if key in options:
-            return dispatch_table[key](environ, start_response)
-
-    return app(environ, start_response)
-
-
 @middleware
 def cgi_error_handler(environ, start_response, app):
     try:
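The @middleware decorator used by the removed cgi_dispatcher (and by the surviving cgi_error_handler) pairs with the app = cgi_x(app) chaining in main() below. Its definition sits outside this diff; a plausible sketch of the pattern, assuming a three-argument handler signature:

    from functools import wraps

    def middleware(func):
        # turn a handler taking (environ, start_response, app) into a
        # factory: give it the downstream app, get back a WSGI app
        @wraps(func)
        def factory(app):
            def wsgi_app(environ, start_response):
                return func(environ, start_response, app)
            return wsgi_app
        return factory

With that shape, app = cgi_dispatcher(app) wraps the pipeline so that a request carrying the getpage option short-circuits to cgi_page, and everything else falls through to the wrapped app.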
@@ -758,7 +693,6 @@ def main():
         # mod_cgi
 
         app = cgi_app
-        app = cgi_dispatcher(app)
         app = cgi_error_handler(app)
         app = cgi_encode(app)
 
@@ -780,7 +714,6 @@ def main():
 
         app = cgi_app
         app = cgi_file_handler(app)
-        app = cgi_dispatcher(app)
         app = cgi_error_handler(app)
         app = cgi_encode(app)
 
@@ -1,5 +1,4 @@
 lxml
-bs4
 python-dateutil <= 1.5
 chardet
 pymysql
@@ -13,7 +13,6 @@
 body {
     overflow-wrap: anywhere;
     word-wrap: anywhere;
-    font-family: sans;
 }
 
 #url {