Detect encoding everytime
parent 7691df5257
commit f3d1f92b39
@@ -88,7 +88,6 @@ The arguments are:
 - `mono`: disable multithreading while fetching, makes debugging easier
 - `theforce`: force download the rss feed and ignore cached http errors
 - `silent`: don't output the final RSS (useless on its own, but can be nice when debugging)
-- `encoding=ENCODING`: overrides the encoding auto-detection of the crawler. Some web developers did not quite understand the importance of setting charset/encoding tags correctly...
 - http server only
 - `callback=NAME`: for JSONP calls
 - `cors`: allow Cross-origin resource sharing (allows XHR calls from other servers)
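The `encoding=ENCODING` override is dropped here, in line with the commit title: the crawler now detects the encoding on every fetch. As a rough illustration of what automatic charset detection can look like (this is only a sketch using BeautifulSoup's UnicodeDammit, not morss's own crawler.detect_encoding code; the helper name and arguments are hypothetical):

    # illustration only: a simple charset guess from raw bytes plus an optional
    # HTTP header hint; morss's real detection lives in crawler.detect_encoding
    from bs4 import UnicodeDammit

    def guess_encoding(raw_bytes, http_charset=None):
        hints = [http_charset] if http_charset else []
        return UnicodeDammit(raw_bytes, hints).original_encoding or 'utf-8'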
@@ -15,7 +15,7 @@ import dateutil.parser
 from copy import deepcopy
 
 import lxml.html
-from bs4 import BeautifulSoup
+from .readabilite import parse as html_parse
 
 json.encoder.c_make_encoder = None
 
@@ -53,7 +53,7 @@ def parse_rules(filename=None):
     return rules
 
 
-def parse(data, url=None, mimetype=None):
+def parse(data, url=None, mimetype=None, encoding=None):
     " Determine which ruleset to use "
 
     rulesets = parse_rules()
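The new `encoding` keyword added to `parse()` here is forwarded to whichever parser gets picked. A minimal sketch of how it ends up being called, mirroring the morss.py changes later in this same diff (the feed URL is a placeholder, and the `from morss import ...` paths assume the installed package layout):

    # sketch based on the call sites changed later in this commit
    from morss import crawler, feeds

    url = 'https://example.com/feed.xml'  # placeholder
    data, con, contenttype, encoding = crawler.adv_get(url=url)

    # the encoding detected while fetching is handed straight to the feed parser
    rss = feeds.parse(data, url, contenttype, encoding=encoding)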
@@ -67,7 +67,7 @@ def parse(data, url=None, mimetype=None):
                 for path in ruleset['path']:
                     if fnmatch(url, path):
                         parser = [x for x in parsers if x.mode == ruleset['mode']][0]
-                        return parser(data, ruleset)
+                        return parser(data, ruleset, encoding=encoding)
 
     # 2) Look for a parser based on mimetype
 
@@ -86,7 +86,7 @@ def parse(data, url=None, mimetype=None):
         # 'path' as they should have been caught beforehand
 
         try:
-            feed = parser(data)
+            feed = parser(data, encoding=encoding)
 
         except (ValueError):
             # parsing did not work
@@ -113,7 +113,7 @@ def parse(data, url=None, mimetype=None):
 
 
 class ParserBase(object):
-    def __init__(self, data=None, rules=None, parent=None):
+    def __init__(self, data=None, rules=None, parent=None, encoding=None):
         if rules is None:
             rules = parse_rules()[self.default_ruleset]
 
@@ -122,9 +122,10 @@ class ParserBase(object):
         if data is None:
             data = rules['base']
 
-        self.root = self.parse(data)
         self.parent = parent
+        self.encoding = encoding
 
+        self.root = self.parse(data)
 
     def parse(self, raw):
         pass
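Note the reordering in this hunk: `self.encoding` is now assigned before `self.root = self.parse(data)` runs, so subclass `parse()` implementations (such as the HTML parser in the next hunk, which passes `encoding=self.encoding` to readabilite) can rely on it being set.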
@@ -442,8 +443,7 @@ class ParserHTML(ParserXML):
     mimetype = ['text/html', 'application/xhtml+xml']
 
     def parse(self, raw):
-        parser = etree.HTMLParser(remove_blank_text=True) # remove_blank_text needed for pretty_print
-        return etree.fromstring(BeautifulSoup(raw, 'lxml').prettify('utf-8'), parser)
+        return html_parse(raw, encoding=self.encoding)
 
     def tostring(self, encoding='unicode', **k):
         return lxml.html.tostring(self.root, encoding=encoding, **k)
@@ -10,7 +10,6 @@ import re
 
 import lxml.etree
 import lxml.html
-from bs4 import BeautifulSoup
 
 from . import feeds
 from . import crawler
@@ -261,7 +260,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
        log('non-text page')
        return True
 
    out = readabilite.get_article(data, link, options.encoding or crawler.detect_encoding(data, con))
+    out = readabilite.get_article(data, url=con.geturl(), encoding=encoding)
 
    if out is not None:
        item.content = out
@@ -329,7 +328,7 @@ def FeedFetch(url, options):
 
     if options.items:
         # using custom rules
-        rss = feeds.FeedHTML(xml)
+        rss = feeds.FeedHTML(xml, encoding=encoding)
 
         rss.rules['title'] = options.title if options.title else '//head/title'
         rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content'
@@ -349,7 +348,7 @@ def FeedFetch(url, options):
 
     else:
         try:
-            rss = feeds.parse(xml, url, contenttype)
+            rss = feeds.parse(xml, url, contenttype, encoding=encoding)
             rss = rss.convert(feeds.FeedXML)
             # contains all fields, otherwise much-needed data can be lost
 
@@ -649,7 +648,7 @@ def cgi_page(environ, start_response):
         data, con, contenttype, encoding = crawler.adv_get(url=url)
 
         if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
-            html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())
+            html = readabilite.parse(data, encoding=encoding)
             html.make_links_absolute(con.geturl())
 
             kill_tags = ['script', 'iframe', 'noscript']
@@ -6,11 +6,14 @@ import re
 
 def parse(data, encoding=None):
     if encoding:
-        parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding=encoding)
-    else:
-        parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)
+        data = BeautifulSoup(data, 'lxml', from_encoding=encoding).prettify('utf-8')
 
-    return lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify('utf-8'), parser=parser)
+    else:
+        data = BeautifulSoup(data, 'lxml').prettify('utf-8')
+
+    parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding='utf-8')
+
+    return lxml.html.fromstring(data, parser=parser)
 
 
 def count_words(string):
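With this change `parse()` lets BeautifulSoup do the decoding, honouring a caller-supplied encoding via `from_encoding`, and always hands UTF-8 output to an lxml parser pinned to `encoding='utf-8'`. A minimal usage sketch (the file name and the forced encoding are placeholders; the import path assumes the installed morss package):

    # sketch: parsing a saved page whose declared charset is wrong or missing
    from morss import readabilite

    with open('page.html', 'rb') as f:   # hypothetical local copy
        raw = f.read()

    doc = readabilite.parse(raw, encoding='iso-8859-1')  # force the real encoding
    print(doc.findtext('.//title'))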