morss: get_page to act as a basic proxy (for iframes)
parent
1653394cf7
commit
4a88886767
|
@ -48,6 +48,7 @@ You do need:
|
||||||
|
|
||||||
- [python](http://www.python.org/) >= 2.6 (python 3 is supported)
- [lxml](http://lxml.de/) for xml parsing
- [bs4](https://pypi.org/project/bs4/) for badly-formatted html pages
- [dateutil](http://labix.org/python-dateutil) to parse feed dates
- [chardet](https://pypi.python.org/pypi/chardet)
- [six](https://pypi.python.org/pypi/six), a dependency of chardet
|
|
|
@ -10,6 +10,7 @@ import re
|
||||||
|
|
||||||
import lxml.etree
|
import lxml.etree
|
||||||
import lxml.html
|
import lxml.html
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from . import feeds
|
from . import feeds
|
||||||
from . import feedify
|
from . import feedify
|
||||||
|
@ -651,7 +652,43 @@ def cgi_file_handler(environ, start_response, app):
|
||||||
return app(environ, start_response)
|
return app(environ, start_response)
|
||||||
|
|
||||||
|
|
||||||
|
def cgi_page(environ, start_response):
    """Basic proxy endpoint: fetch *url* and return it as an HTML page.

    Meant for serving remote pages inside iframes: links are made
    absolute and active content (scripts, nested iframes, noscript)
    is stripped so the page renders stand-alone.

    environ/start_response follow the WSGI (PEP 3333) convention;
    the return value is a one-element list of bytes.
    """
    url, options = cgi_parse_environ(environ)

    # get page
    PROTOCOL = ['http', 'https']

    # Default to plain http when the url comes without a usable scheme.
    # (Original compared against a second hard-coded list instead of
    # the PROTOCOL constant defined just above.)
    if urlparse(url).scheme not in PROTOCOL:
        url = 'http://' + url

    con = crawler.custom_handler().open(url)
    data = con.read()

    # Keep only the media type, dropping any "; charset=..." suffix.
    contenttype = con.info().get('Content-Type', '').split(';')[0]

    if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
        # Round-trip through BeautifulSoup first so badly-formed
        # markup is repaired before lxml parses it.
        html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())
        html.make_links_absolute(url)

        # Remove tags that could execute code or nest further frames.
        kill_tags = ['script', 'iframe', 'noscript']

        for tag in kill_tags:
            for elem in html.xpath('//' + tag):
                elem.getparent().remove(elem)

        output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')

    else:
        # Non-(x)html content: pass the raw body through unchanged.
        # The original set output = None here, and [None] is not a
        # valid WSGI body (iterables must yield bytestrings) — it
        # would raise a TypeError in the server.
        output = data

    # return html page
    headers = {'status': '200 OK', 'content-type': 'text/html'}
    start_response(headers['status'], list(headers.items()))
    return [output]
|
||||||
# Maps the cgi action name (taken from the request) to its handler.
dispatch_table = {
    'getpage': cgi_page,
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
lxml
bs4
python-dateutil <= 1.5
chardet
pymysql
|
||||||
|
|
Loading…
Reference in New Issue