Compare commits

...

3 Commits

Author SHA1 Message Date
pictuga d90756b337 morss: drop 'keep' option
Because the Firefox behaviour it is working around is no longer in use
2020-04-05 16:37:27 +02:00
pictuga 40c69f17d2 feeds: parse html with BS
More robust & to make it consistent with :getpage
2020-04-05 16:12:41 +02:00
pictuga 99461ea185 crawler: fix var name issues (private_cache) 2020-04-05 16:11:36 +02:00
4 changed files with 5 additions and 8 deletions

View File

@ -77,7 +77,6 @@ The arguments are:
- `json`: output as JSON
- `proxy`: doesn't fill the articles
- `clip`: stick the full article content under the original feed content (useful for twitter)
- `keep`: by default, morss does drop feed description whenever the full-content is found (so as not to mislead users who use Firefox, since the latter only shows the description in the feed preview, so they might believe morss doesn't work), but with this argument, the description is kept
- `search=STRING`: does a basic case-sensitive search in the feed
- Advanced
- `csv`: export to csv

View File

@ -367,7 +367,7 @@ class CacheHandler(BaseHandler):
elif self.force_min is None and ('no-cache' in cc_list
or 'no-store' in cc_list
or ('private' in cc_list and not self.private)):
or ('private' in cc_list and not self.private_cache)):
# kindly follow web servers indications, refresh
return None
@ -402,7 +402,7 @@ class CacheHandler(BaseHandler):
cc_list = [x for x in cache_control if '=' not in x]
if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private):
if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private_cache):
# kindly follow web servers indications
return resp

View File

@ -15,6 +15,7 @@ import dateutil.parser
from copy import deepcopy
import lxml.html
from bs4 import BeautifulSoup
json.encoder.c_make_encoder = None
@ -441,7 +442,7 @@ class ParserHTML(ParserXML):
def parse(self, raw):
parser = etree.HTMLParser(remove_blank_text=True) # remove_blank_text needed for pretty_print
return etree.fromstring(raw, parser)
return etree.fromstring(BeautifulSoup(raw, 'lxml').prettify(), parser)
def tostring(self, encoding='unicode', **k):
return lxml.html.tostring(self.root, encoding=encoding, **k)

View File

@ -54,7 +54,7 @@ def filterOptions(options):
# example of filtering code below
#allowed = ['proxy', 'clip', 'keep', 'cache', 'force', 'silent', 'pro', 'debug']
#allowed = ['proxy', 'clip', 'cache', 'force', 'silent', 'pro', 'debug']
#filtered = dict([(key,value) for (key,value) in options.items() if key in allowed])
#return filtered
@ -288,9 +288,6 @@ def ItemAfter(item, options):
item.content = item.desc + "<br/><br/><center>* * *</center><br/><br/>" + item.content
del item.desc
if not options.keep and not options.proxy:
del item.desc
if options.nolink and item.content:
content = lxml.html.fromstring(item.content)
for link in content.xpath('//a'):