morss: get_page to act as a basic proxy (for iframes)
This commit is contained in:
		@@ -48,6 +48,7 @@ You do need:
 | 
			
		||||
 | 
			
		||||
- [python](http://www.python.org/) >= 2.6 (python 3 is supported)
 | 
			
		||||
- [lxml](http://lxml.de/) for xml parsing
 | 
			
		||||
- [bs4](https://pypi.org/project/bs4/) for badly-formatted html pages
 | 
			
		||||
- [dateutil](http://labix.org/python-dateutil) to parse feed dates
 | 
			
		||||
- [chardet](https://pypi.python.org/pypi/chardet) for character-encoding detection
 | 
			
		||||
- [six](https://pypi.python.org/pypi/six), a dependency of chardet
 | 
			
		||||
 
 | 
			
		||||
@@ -10,6 +10,7 @@ import re
 | 
			
		||||
 | 
			
		||||
import lxml.etree
 | 
			
		||||
import lxml.html
 | 
			
		||||
from bs4 import BeautifulSoup
 | 
			
		||||
 | 
			
		||||
from . import feeds
 | 
			
		||||
from . import feedify
 | 
			
		||||
@@ -651,7 +652,43 @@ def cgi_file_handler(environ, start_response, app):
 | 
			
		||||
        return app(environ, start_response)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def cgi_page(environ, start_response):
    """WSGI handler acting as a basic proxy (e.g. for iframes).

    Fetches the URL taken from the request, strips tags that could run
    code or embed other pages from (x)html content, and returns the
    cleaned page as a 'text/html' response.

    Parameters:
        environ: WSGI environ dict; the target url is parsed out of it.
        start_response: WSGI start_response callable.

    Returns:
        A one-element list holding the response body as bytes.
    """
    url, options = cgi_parse_environ(environ)

    # get page; only plain http(s) is proxied
    PROTOCOL = ['http', 'https']

    # use the constant instead of repeating the literal list (it was
    # previously defined but unused)
    if urlparse(url).scheme not in PROTOCOL:
        url = 'http://' + url

    con = crawler.custom_handler().open(url)

    try:
        data = con.read()
        contenttype = con.info().get('Content-Type', '').split(';')[0]
    finally:
        # close the connection even if reading fails (was leaked before)
        con.close()

    if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
        # parse via bs4 first to survive badly-formed html, then hand the
        # prettified markup to lxml for manipulation
        html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())
        html.make_links_absolute(url)

        # drop tags that could execute code or nest further frames
        kill_tags = ['script', 'iframe', 'noscript']

        for tag in kill_tags:
            for elem in html.xpath('//' + tag):
                elem.getparent().remove(elem)

        output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')

    else:
        # non-(x)html content: return an empty body — yielding None from a
        # WSGI iterable is invalid (body items must be bytes)
        output = b''

    # return html page
    headers = {'status': '200 OK', 'content-type': 'text/html'}
    start_response(headers['status'], list(headers.items()))
    return [output]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# maps the action name extracted from the request onto its WSGI handler
dispatch_table = dict(
    getpage=cgi_page,
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1,4 +1,5 @@
 | 
			
		||||
lxml
 | 
			
		||||
bs4
 | 
			
		||||
python-dateutil <= 1.5
 | 
			
		||||
chardet
 | 
			
		||||
pymysql
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user