Compare commits

...

11 Commits

Author SHA1 Message Date
pictuga e5a82ff1f4 crawler: drop auto-referer
Was solving some issues. But creating even more issues.
2020-04-07 10:39:21 +02:00
pictuga f3d1f92b39 Detect encoding everytime 2020-04-07 10:38:36 +02:00
pictuga 7691df5257 Use wrapper for http calls 2020-04-07 10:30:17 +02:00
pictuga 0ae0dbc175 README: mention csv output 2020-04-07 09:24:32 +02:00
pictuga f1d0431e68 morss: drop :html, replaced with :reader
README updated accordingly
2020-04-07 09:23:29 +02:00
pictuga a09831415f feeds: fix bug when mimetype matches nothing 2020-04-06 18:53:07 +02:00
pictuga bfad6b7a4a readabilite: clean before counting
To remove links which are not kept anyway
2020-04-06 16:55:39 +02:00
pictuga 6b8c3e51e7 readabilite: fix threshold feature
Awkward typo...
2020-04-06 16:52:06 +02:00
pictuga dc9e425247 readabilite: don't clean-out the top 10% nodes
Loosen up the code once again to limit over-kill
2020-04-06 14:26:28 +02:00
pictuga 2f48e18bb1 readabilite: put scores directly in html node
Probably slower but makes code somewhat cleaner...
2020-04-06 14:21:41 +02:00
pictuga 31cac921c7 README: remove ref to iTunes 2020-04-05 22:20:33 +02:00
5 changed files with 97 additions and 72 deletions

View File

@@ -24,15 +24,13 @@ hand-written rules (ie. there's no automatic detection of links to build feeds).
Please mind that feeds based on html files may stop working unexpectedly, due to
html structure changes on the target website.
Additionally morss can grab the source xml feed of iTunes podcast, and detect
rss feeds in html pages' `<meta>`.
Additionally morss can detect rss feeds in html pages' `<meta>`.
You can use this program online for free at **[morss.it](https://morss.it/)**.
Some features of morss:
- Read RSS/Atom feeds
- Create RSS feeds from json/html pages
- Convert iTunes podcast links into xml links
- Export feeds as RSS/JSON/CSV/HTML
- Fetch full-text content of feed items
- Follow 301/meta redirects
@@ -75,6 +73,8 @@ The arguments are:
- Change what morss does
- `json`: output as JSON
- `html`: output as HTML
- `csv`: output as CSV
- `proxy`: doesn't fill the articles
- `clip`: stick the full article content under the original feed content (useful for twitter)
- `search=STRING`: does a basic case-sensitive search in the feed
@@ -88,11 +88,9 @@ The arguments are:
- `mono`: disable multithreading while fetching, makes debugging easier
- `theforce`: force download the rss feed and ignore cached http errors
- `silent`: don't output the final RSS (useless on its own, but can be nice when debugging)
- `encoding=ENCODING`: overrides the encoding auto-detection of the crawler. Some web developers did not quite understand the importance of setting charset/encoding tags correctly...
- http server only
- `callback=NAME`: for JSONP calls
- `cors`: allow Cross-origin resource sharing (allows XHR calls from other servers)
- `html`: changes the http content-type to html, so that python cgi errors (written in html) are readable in a web browser
- `txt`: changes the http content-type to txt (for faster "`view-source:`")
- Custom feeds: you can turn any HTML page into a RSS feed using morss, using xpath rules. The article content will be fetched as usual (with readabilite). Please note that you will have to **replace** any `/` in your rule with a `|` when using morss as a webserver
- `items`: (**mandatory** to activate the custom feeds function) xpath rule to match all the RSS entries
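
To make the `/` to `|` rewriting for custom feeds concrete, here is a tiny sketch; the xpath rule below is a made-up example rather than one taken from this repository:

# hypothetical 'items' rule for a custom feed, rewritten for use in a morss URL
xpath_rule = '//div[@class="post"]/h2/a'
url_safe_rule = xpath_rule.replace('/', '|')  # '||div[@class="post"]|h2|a'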

View File

@@ -34,6 +34,25 @@ MIMETYPE = {
DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
def get(*args, **kwargs):
return adv_get(*args, **kwargs)[0]
def adv_get(url, timeout=None, *args, **kwargs):
if timeout is None:
con = custom_handler(*args, **kwargs).open(url)
else:
con = custom_handler(*args, **kwargs).open(url, timeout=timeout)
data = con.read()
contenttype = con.info().get('Content-Type', '').split(';')[0]
encoding = detect_encoding(data, con)
return data, con, contenttype, encoding
def custom_handler(follow=None, delay=None, encoding=None):
handlers = []
@@ -199,7 +218,6 @@ class BrowserlyHeaderHandler(BaseHandler):
""" Add more headers to look less suspicious """
def http_request(self, req):
req.add_unredirected_header('Referer', '%s://%s' % (req.type, req.host))
req.add_unredirected_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
req.add_unredirected_header('Accept-Language', 'en-US,en;q=0.5')
return req
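
The net effect of the new wrapper added above is that callers get the body, the connection object, the bare mimetype and the detected encoding from a single call. A minimal usage sketch, assuming the package layout implied by the relative imports further down; the URL and timeout value are placeholders:

from morss import crawler

# adv_get() opens the URL through custom_handler(), reads the body, strips the
# Content-Type parameters and runs detect_encoding() on the result
data, con, contenttype, encoding = crawler.adv_get('http://example.com/feed.xml', timeout=4)

# get() is the shortcut that keeps only the body
body = crawler.get('http://example.com/feed.xml')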

View File

@@ -15,7 +15,7 @@ import dateutil.parser
from copy import deepcopy
import lxml.html
from bs4 import BeautifulSoup
from .readabilite import parse as html_parse
json.encoder.c_make_encoder = None
@@ -53,7 +53,7 @@ def parse_rules(filename=None):
return rules
def parse(data, url=None, mimetype=None):
def parse(data, url=None, mimetype=None, encoding=None):
" Determine which ruleset to use "
rulesets = parse_rules()
@@ -67,14 +67,14 @@ def parse(data, url=None, mimetype=None):
for path in ruleset['path']:
if fnmatch(url, path):
parser = [x for x in parsers if x.mode == ruleset['mode']][0]
return parser(data, ruleset)
return parser(data, ruleset, encoding=encoding)
# 2) Look for a parser based on mimetype
if mimetype is not None:
parser_candidates = [x for x in parsers if mimetype in x.mimetype]
if mimetype is None or parser_candidates is None:
if mimetype is None or len(parser_candidates) == 0:
parser_candidates = parsers
# 3) Look for working ruleset for given parser
@@ -86,7 +86,7 @@ def parse(data, url=None, mimetype=None):
# 'path' as they should have been caught beforehand
try:
feed = parser(data)
feed = parser(data, encoding=encoding)
except (ValueError):
# parsing did not work
@@ -113,7 +113,7 @@ def parse(data, url=None, mimetype=None):
class ParserBase(object):
def __init__(self, data=None, rules=None, parent=None):
def __init__(self, data=None, rules=None, parent=None, encoding=None):
if rules is None:
rules = parse_rules()[self.default_ruleset]
@@ -122,9 +122,10 @@ class ParserBase(object):
if data is None:
data = rules['base']
self.root = self.parse(data)
self.parent = parent
self.encoding = encoding
self.root = self.parse(data)
def parse(self, raw):
pass
@@ -442,8 +443,7 @@ class ParserHTML(ParserXML):
mimetype = ['text/html', 'application/xhtml+xml']
def parse(self, raw):
parser = etree.HTMLParser(remove_blank_text=True) # remove_blank_text needed for pretty_print
return etree.fromstring(BeautifulSoup(raw, 'lxml').prettify('utf-8'), parser)
return html_parse(raw, encoding=self.encoding)
def tostring(self, encoding='unicode', **k):
return lxml.html.tostring(self.root, encoding=encoding, **k)
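
With the parser classes now accepting an `encoding` argument, the encoding detected by the crawler can be handed straight to `feeds.parse()`. A short sketch mirroring the updated `FeedFetch()` shown in the next file; the URL is a placeholder:

from morss import crawler, feeds

data, con, contenttype, encoding = crawler.adv_get('http://example.com/feed', follow='rss', timeout=4)

# parse() picks a parser by URL rules or mimetype and passes the encoding down;
# ParserHTML in turn hands it to readabilite's html parser
rss = feeds.parse(data, url=con.geturl(), mimetype=contenttype, encoding=encoding)
rss = rss.convert(feeds.FeedXML)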

View File

@@ -10,7 +10,6 @@ import re
import lxml.etree
import lxml.html
from bs4 import BeautifulSoup
from . import feeds
from . import crawler
@@ -251,19 +250,17 @@ def ItemFill(item, options, feedurl='/', fast=False):
delay = -2
try:
con = crawler.custom_handler(delay=delay, encoding=options.encoding).open(link, timeout=TIMEOUT)
data = con.read()
data, con, contenttype, encoding = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)
except (IOError, HTTPException) as e:
log('http error')
return False # let's just delete errors stuff when in cache mode
contenttype = con.info().get('Content-Type', '').split(';')[0]
if contenttype not in crawler.MIMETYPE['html'] and contenttype != 'text/plain':
log('non-text page')
return True
out = readabilite.get_article(data, link, options.encoding or crawler.detect_encoding(data, con))
out = readabilite.get_article(data, url=con.geturl(), encoding=encoding)
if out is not None:
item.content = out
@@ -324,18 +321,14 @@ def FeedFetch(url, options):
delay = 0
try:
con = crawler.custom_handler(follow='rss', delay=delay, encoding=options.encoding) \
.open(url, timeout=TIMEOUT * 2)
xml = con.read()
xml, con, contenttype, encoding = crawler.adv_get(url=url, follow='rss', delay=delay, timeout=TIMEOUT * 2)
except (IOError, HTTPException):
raise MorssException('Error downloading feed')
contenttype = con.info().get('Content-Type', '').split(';')[0]
if options.items:
# using custom rules
rss = feeds.FeedHTML(xml)
rss = feeds.FeedHTML(xml, encoding=encoding)
rss.rules['title'] = options.title if options.title else '//head/title'
rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content'
@@ -355,7 +348,7 @@ def FeedFetch(url, options):
else:
try:
rss = feeds.parse(xml, url, contenttype)
rss = feeds.parse(xml, url, contenttype, encoding=encoding)
rss = rss.convert(feeds.FeedXML)
# contains all fields, otherwise much-needed data can be lost
@@ -469,7 +462,7 @@ def FeedFormat(rss, options, encoding='utf-8'):
elif options.csv:
return rss.tocsv(encoding=encoding)
elif options.reader:
elif options.html:
if options.indent:
return rss.tohtml(encoding=encoding, pretty_print=True)
@@ -547,7 +540,7 @@ def cgi_app(environ, start_response):
if options.cors:
headers['access-control-allow-origin'] = '*'
if options.html or options.reader:
if options.html:
headers['content-type'] = 'text/html'
elif options.txt or options.silent:
headers['content-type'] = 'text/plain'
@@ -652,13 +645,10 @@ def cgi_page(environ, start_response):
if urlparse(url).scheme not in ['http', 'https']:
url = 'http://' + url
con = crawler.custom_handler().open(url)
data = con.read()
contenttype = con.info().get('Content-Type', '').split(';')[0]
data, con, contenttype, encoding = crawler.adv_get(url=url)
if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())
html = readabilite.parse(data, encoding=encoding)
html.make_links_absolute(con.geturl())
kill_tags = ['script', 'iframe', 'noscript']
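
Condensed, the new fetch path in `ItemFill()` above boils down to the following sketch; `link`, `item` and `TIMEOUT` stand for the variables and module constant used in the surrounding code, `delay=-2` is just one of the values ItemFill() may pass, and the mimetype check is flattened here instead of returning early:

data, con, contenttype, encoding = crawler.adv_get(url=link, delay=-2, timeout=TIMEOUT)

if contenttype in crawler.MIMETYPE['html'] or contenttype == 'text/plain':
    out = readabilite.get_article(data, url=con.geturl(), encoding=encoding)
    if out is not None:
        item.content = out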

View File

@@ -6,11 +6,14 @@ import re
def parse(data, encoding=None):
if encoding:
parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding=encoding)
else:
parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)
data = BeautifulSoup(data, 'lxml', from_encoding=encoding).prettify('utf-8')
return lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify('utf-8'), parser=parser)
else:
data = BeautifulSoup(data, 'lxml').prettify('utf-8')
parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding='utf-8')
return lxml.html.fromstring(data, parser=parser)
def count_words(string):
@@ -44,6 +47,12 @@ def count_content(node):
return count_words(node.text_content()) + len(node.findall('.//img'))
def percentile(N, P):
# https://stackoverflow.com/a/7464107
n = max(int(round(P * len(N) + 0.5)), 2)
return N[n-2]
class_bad = ['comment', 'community', 'extra', 'foot',
'sponsor', 'pagination', 'pager', 'tweet', 'twitter', 'com-', 'masthead',
'media', 'meta', 'related', 'shopping', 'tags', 'tool', 'author', 'about',
@@ -123,33 +132,42 @@ def score_node(node):
return score
def score_all(node, grades=None):
def score_all(node):
" Fairly dumb loop to score all worthwhile nodes. Tries to be fast "
if grades is None:
grades = {}
for child in node:
score = score_node(child)
child.attrib['seen'] = 'yes, ' + str(int(score))
if score > 0 or not len(grades):
spread_score(child, score, grades)
score_all(child, grades)
return grades
if score > 0 or len(list(child.iterancestors())) <= 2:
spread_score(child, score)
score_all(child)
def spread_score(node, score, grades):
def set_score(node, value):
node.attrib['morss_score'] = str(float(value))
def get_score(node):
return float(node.attrib.get('morss_score', 0))
def incr_score(node, delta):
set_score(node, get_score(node) + delta)
def get_all_scores(node):
return {x:get_score(x) for x in list(node.iter()) if get_score(x) != 0}
def spread_score(node, score):
" Spread the node's score to its parents, on a linear way "
delta = score / 2
for ancestor in [node,] + list(node.iterancestors()):
if score >= 1 or ancestor is node:
try:
grades[ancestor] += score
except KeyError:
grades[ancestor] = score
incr_score(ancestor, score)
score -= delta
@@ -157,26 +175,24 @@ def spread_score(node, score, grades):
break
def write_score_all(root, grades):
" Useful for debugging "
for node in root.iter():
node.attrib['score'] = str(int(grades.get(node, 0)))
def clean_root(root):
def clean_root(root, keep_threshold=None):
for node in list(root):
clean_root(node)
clean_node(node)
# bottom-up approach, i.e. starting with children before cleaning current node
clean_root(node, keep_threshold)
clean_node(node, keep_threshold)
def clean_node(node):
def clean_node(node, keep_threshold=None):
parent = node.getparent()
if parent is None:
# this is <html/> (or a removed element waiting for GC)
return
if keep_threshold is not None and get_score(node) >= keep_threshold:
# high score, so keep
return
gdparent = parent.getparent()
# remove shitty tags
@@ -275,18 +291,18 @@ def lowest_common_ancestor(nodeA, nodeB, max_depth=None):
return nodeA # should always find one tho, at least <html/>, but needed for max_depth
def rank_nodes(grades):
def rank_grades(grades):
# largest score to smallest
return sorted(grades.items(), key=lambda x: x[1], reverse=True)
def get_best_node(grades):
def get_best_node(ranked_grades):
" To pick the best (raw) node. Another function will clean it "
if len(grades) == 1:
return grades[0]
if len(ranked_grades) == 1:
return ranked_grades[0]
top = rank_nodes(grades)
lowest = lowest_common_ancestor(top[0][0], top[1][0], 3)
lowest = lowest_common_ancestor(ranked_grades[0][0], ranked_grades[1][0], 3)
return lowest
@@ -295,12 +311,17 @@ def get_article(data, url=None, encoding=None):
" Input a raw html string, returns a raw html string of the article "
html = parse(data, encoding)
scores = score_all(html)
score_all(html)
scores = rank_grades(get_all_scores(html))
if not len(scores):
return None
best = get_best_node(scores)
keep_threshold = percentile([x[1] for x in scores], 0.1)
clean_root(best, keep_threshold)
wc = count_words(best.text_content())
wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))
@@ -310,6 +331,4 @@ def get_article(data, url=None, encoding=None):
if url:
best.make_links_absolute(url)
clean_root(best)
return lxml.etree.tostring(best, pretty_print=True)
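
The scoring rework above replaces the external `grades` dict with a `morss_score` attribute stored on each lxml node, and `clean_root()` now spares any subtree whose score clears the top-10% threshold computed by `percentile()`. A small sketch of the helpers involved; the HTML snippet and the score values are made up:

from morss import readabilite
import lxml.html

node = lxml.html.fromstring('<div><p>placeholder</p></div>')

readabilite.set_score(node, 3)       # stored as node.attrib['morss_score'] == '3.0'
readabilite.incr_score(node, 1.5)    # now '4.5'
readabilite.get_score(node)          # 4.5
readabilite.get_all_scores(node)     # {<div node>: 4.5}; zero-score children are skipped

# percentile() then turns the ranked (descending) scores into the keep-threshold
# passed to clean_root(), e.g. with five made-up scores:
readabilite.percentile([12.0, 9.5, 7.0, 3.0, 1.0], 0.1)  # 12.0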