Compare commits

..

3 Commits

Author SHA1 Message Date
pictuga d90756b337 morss: drop 'keep' option
Because the Firefox behaviour it is working around is no longer in use
2020-04-05 16:37:27 +02:00
pictuga 40c69f17d2 feeds: parse html with BS
More robust & to make it consistent with :getpage
2020-04-05 16:12:41 +02:00
pictuga 99461ea185 crawler: fix var name issues (private_cache) 2020-04-05 16:11:36 +02:00
4 changed files with 5 additions and 8 deletions

View File

@ -77,7 +77,6 @@ The arguments are:
- `json`: output as JSON - `json`: output as JSON
- `proxy`: doesn't fill the articles - `proxy`: doesn't fill the articles
- `clip`: stick the full article content under the original feed content (useful for twitter) - `clip`: stick the full article content under the original feed content (useful for twitter)
- `keep`: by default, morss drops the feed description whenever the full content is found (so as not to mislead Firefox users: Firefox only shows the description in the feed preview, so they might believe morss doesn't work), but with this argument, the description is kept
- `search=STRING`: does a basic case-sensitive search in the feed - `search=STRING`: does a basic case-sensitive search in the feed
- Advanced - Advanced
- `csv`: export to csv - `csv`: export to csv

View File

@ -367,7 +367,7 @@ class CacheHandler(BaseHandler):
elif self.force_min is None and ('no-cache' in cc_list elif self.force_min is None and ('no-cache' in cc_list
or 'no-store' in cc_list or 'no-store' in cc_list
or ('private' in cc_list and not self.private)): or ('private' in cc_list and not self.private_cache)):
# kindly follow web servers indications, refresh # kindly follow web servers indications, refresh
return None return None
@ -402,7 +402,7 @@ class CacheHandler(BaseHandler):
cc_list = [x for x in cache_control if '=' not in x] cc_list = [x for x in cache_control if '=' not in x]
if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private): if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private_cache):
# kindly follow web servers indications # kindly follow web servers indications
return resp return resp

View File

@ -15,6 +15,7 @@ import dateutil.parser
from copy import deepcopy from copy import deepcopy
import lxml.html import lxml.html
from bs4 import BeautifulSoup
json.encoder.c_make_encoder = None json.encoder.c_make_encoder = None
@ -441,7 +442,7 @@ class ParserHTML(ParserXML):
def parse(self, raw): def parse(self, raw):
parser = etree.HTMLParser(remove_blank_text=True) # remove_blank_text needed for pretty_print parser = etree.HTMLParser(remove_blank_text=True) # remove_blank_text needed for pretty_print
return etree.fromstring(raw, parser) return etree.fromstring(BeautifulSoup(raw, 'lxml').prettify(), parser)
def tostring(self, encoding='unicode', **k): def tostring(self, encoding='unicode', **k):
return lxml.html.tostring(self.root, encoding=encoding, **k) return lxml.html.tostring(self.root, encoding=encoding, **k)

View File

@ -54,7 +54,7 @@ def filterOptions(options):
# example of filtering code below # example of filtering code below
#allowed = ['proxy', 'clip', 'keep', 'cache', 'force', 'silent', 'pro', 'debug'] #allowed = ['proxy', 'clip', 'cache', 'force', 'silent', 'pro', 'debug']
#filtered = dict([(key,value) for (key,value) in options.items() if key in allowed]) #filtered = dict([(key,value) for (key,value) in options.items() if key in allowed])
#return filtered #return filtered
@ -288,9 +288,6 @@ def ItemAfter(item, options):
item.content = item.desc + "<br/><br/><center>* * *</center><br/><br/>" + item.content item.content = item.desc + "<br/><br/><center>* * *</center><br/><br/>" + item.content
del item.desc del item.desc
if not options.keep and not options.proxy:
del item.desc
if options.nolink and item.content: if options.nolink and item.content:
content = lxml.html.fromstring(item.content) content = lxml.html.fromstring(item.content)
for link in content.xpath('//a'): for link in content.xpath('//a'):