morss/morss/feeds.py

# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import csv
import json
import re
from copy import deepcopy
from datetime import datetime
from fnmatch import fnmatch

import dateutil.parser
import lxml.html
from dateutil import tz
from lxml import etree

from .readabilite import parse as html_parse
from .util import *

json.encoder.c_make_encoder = None # force use of the pure-python JSON encoder

try:
    # python 2
    from ConfigParser import RawConfigParser
    from StringIO import StringIO

except ImportError:
    # python 3
    from configparser import RawConfigParser
    from io import StringIO


try:
    # python 2
    basestring

except NameError:
    # python 3
    basestring = unicode = str


def parse_rules(filename=None):
    if not filename:
        filename = pkg_path('feedify.ini')

    config = RawConfigParser()
    config.read(filename)

    rules = dict([(x, dict(config.items(x))) for x in config.sections()])

    for section in rules.keys():
        # for each ruleset

        for arg in rules[section].keys():
            # for each rule

            if rules[section][arg].startswith('file:'):
                # 'file:' values point to files shipped in the package's www/ data dir
                path = data_path('www', rules[section][arg][5:])
                file_raw = open(path).read()
                file_clean = re.sub('<[/?]?(xsl|xml)[^>]+?>', '', file_raw)
                rules[section][arg] = file_clean

            elif '\n' in rules[section][arg]:
                # multi-line values become lists (the first, empty line is dropped)
                rules[section][arg] = rules[section][arg].split('\n')[1:]

    return rules
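
# A hypothetical `feedify.ini` section, for illustration (names made up):
#
#   [example]
#   mode = html
#   path =
#       http://www.example.com/*
#   title = //head/title
#   items = //div[class=post]
#
# parses into {'example': {'mode': 'html', 'path': [...], 'title': ..., 'items': ...}},
# with multi-line values such as `path` turned into lists by the code above.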


def parse(data, url=None, encoding=None, ruleset=None):
    " Determine which ruleset to use "

    if ruleset is not None:
        rulesets = [ruleset]

    else:
        rulesets = parse_rules().values()

    parsers = [FeedXML, FeedHTML, FeedJSON]

    # 1) Look for a ruleset based on path

    if url is not None:
        for ruleset in rulesets:
            if 'path' in ruleset:
                for path in ruleset['path']:
                    if fnmatch(url, path):
                        parser = [x for x in parsers if x.mode == ruleset.get('mode')][0] # FIXME what if no mode specified?
                        return parser(data, ruleset, encoding=encoding)

    # 2) Try each and every parser

    # 3) Look for a working ruleset for the given parser
    # 3a) See if parsing works
    # 3b) See if .items matches anything

    for parser in parsers:
        try:
            feed = parser(data, encoding=encoding)

        except (ValueError, SyntaxError):
            # parsing did not work
            pass

        else:
            # parsing worked, now we try the rulesets

            # 'path'-based rulesets are excluded, as they should have been caught
            # beforehand; rulesets with no 'mode' are still tried with every parser
            ruleset_candidates = [x for x in rulesets if x.get('mode') in (parser.mode, None) and 'path' not in x]

            for ruleset in ruleset_candidates:
                feed.rules = ruleset

                try:
                    feed.items[0]

                except (AttributeError, IndexError, TypeError):
                    # parsing and/or item-picking did not work out
                    pass

                else:
                    # it worked!
                    return feed

    raise TypeError('no way to handle this feed')
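
# A minimal usage sketch (mirroring the __main__ block at the bottom of this file):
#
#   feed = parse(raw_data, url='http://example.com/feed.xml')
#
#   for item in feed.items:
#       print(item.title, item.link)
#
# `url` is optional; it is only used to match path-based rulesets.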


class ParserBase(object):
    def __init__(self, data=None, rules=None, parent=None, encoding=None):
        if rules is None:
            rules = parse_rules()[self.default_ruleset]

        self.rules = rules

        if data is None:
            data = rules['base']

        self.parent = parent
        self.encoding = encoding

        self.root = self.parse(data)

    def parse(self, raw):
        pass

    def remove(self):
        # delete oneself
        pass

    def tostring(self, **k):
        # output in its input format
        # to output in sth fancy (json, csv, html), change class type with .convert first
        pass

    def torss(self, **k):
        return self.convert(FeedXML).tostring(**k)

    def tojson(self, **k):
        return self.convert(FeedJSON).tostring(**k)

    def tocsv(self, encoding='unicode'):
        out = StringIO()
        c = csv.writer(out, dialect=csv.excel)

        for item in self.items:
            c.writerow([getattr(item, x) for x in item.dic])

        out.seek(0)
        out = out.read()

        if encoding != 'unicode':
            out = out.encode(encoding)

        return out

    def tohtml(self, **k):
        return self.convert(FeedHTML).tostring(**k)

    def convert(self, TargetParser):
        target = TargetParser()

        if type(self) == TargetParser and self.rules == target.rules:
            # check both type *AND* rules (e.g. when going from freeform xml to rss)
            return self

        for attr in target.dic:
            if attr == 'items':
                for item in self.items:
                    target.append(item)

            else:
                setattr(target, attr, getattr(self, attr))

        return target
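
    # e.g. `FeedHTML(...).torss()` goes through `convert(FeedXML)`: a fresh FeedXML
    # is built from its default ruleset, then title/desc/items are copied one by one.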

    # RULE-BASED FUNCTIONS

    def rule_search(self, rule):
        # xpath, return the first match only
        try:
            return self.rule_search_all(rule)[0]

        except IndexError:
            return None

    def rule_search_all(self, rule):
        # xpath, return all raw matches (useful to find feed items)
        pass

    def rule_search_last(self, rule):
        # xpath, return only the last raw match
        try:
            return self.rule_search_all(rule)[-1]

        except IndexError:
            return None

    def rule_create(self, rule):
        # create node based on rule
        # (duplicate, copy existing (or template) or create from scratch, if possible)
        # --> might want to create node_duplicate helper fns
        pass

    def rule_remove(self, rule):
        # remove node from its parent. Returns nothing
        pass

    def rule_set(self, rule, value):
        # set the value. Returns nothing
        pass

    def rule_str(self, rule):
        # get the (pure) text from inside the matched node
        pass

    # PARSERS

    def time_prs(self, x):
        # parse
        try:
            return parse_time(x)

        except ValueError:
            return None

    def time_fmt(self, x):
        # format
        try:
            time = parse_time(x)
            return time.strftime(self.rules.get('timeformat', self.default_timeformat))

        except ValueError:
            pass

    default_timeformat = "%D"

    # HELPERS

    def get_raw(self, rule_name):
        # get the raw output, e.g. self.get_raw('items')
        if rule_name not in self.rules:
            return []

        return self.rule_search_all(self.rules[rule_name])

    def get(self, rule_name):
        # simple function to get nice text from the rule name
        # for use in @property, ie. self.get('title')
        if rule_name not in self.rules:
            return None

        return self.rule_str(self.rules[rule_name]) or None

    def set(self, rule_name, value):
        # simple function to set nice text from the rule name. Returns nothing
        if rule_name not in self.rules:
            return

        if value is None:
            self.rmv(rule_name)
            return

        try:
            self.rule_set(self.rules[rule_name], value)

        except AttributeError:
            # node does not exist yet, have to create it first
            try:
                self.rule_create(self.rules[rule_name])

            except AttributeError:
                # no way to create it, give up
                pass

            else:
                self.rule_set(self.rules[rule_name], value)

    def rmv(self, rule_name):
        # easy deleter
        if rule_name not in self.rules:
            return

        self.rule_remove(self.rules[rule_name])


class ParserXML(ParserBase):
    default_ruleset = 'rss-channel'
    mode = 'xml'

    mimetype = ['text/xml', 'application/xml', 'application/rss+xml',
                'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml']

    NSMAP = {'atom': 'http://www.w3.org/2005/Atom',
             'atom03': 'http://purl.org/atom/ns#',
             'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
             'content': 'http://purl.org/rss/1.0/modules/content/',
             'rssfake': 'http://purl.org/rss/1.0/'}

    def parse(self, raw):
        parser = etree.XMLParser(recover=True, remove_blank_text=True, remove_pis=True) # remove_blank_text needed for pretty_print
        return etree.fromstring(raw, parser)

    def remove(self):
        return self.root.getparent().remove(self.root)

    def tostring(self, encoding='unicode', **k):
        return etree.tostring(self.root, encoding=encoding, method='xml', **k)

    def _rule_parse(self, rule):
        test = re.search(r'^(.*)/@([a-z]+)$', rule) # to match //div/a/@href
        return test.groups() if test else (rule, None)

    def _resolve_ns(self, rule):
        # shortname to full name
        match = re.search(r'^([^:]+):([^:]+)$', rule) # to match rssfake:content

        if match:
            match = match.groups()
            if match[0] in self.NSMAP:
                return "{%s}%s" % (self.NSMAP[match[0]], match[1].lower())

        return rule

    @staticmethod
    def _inner_html(xml):
        return (xml.text or '') + ''.join([etree.tostring(child, encoding='unicode') for child in xml])

    @staticmethod
    def _clean_node(xml):
        if xml is not None:
            if len(xml):
                for child in list(xml):
                    # iterate over a copy, as removal mutates the children list
                    xml.remove(child)

            xml.text = None

    def rule_search_all(self, rule):
        try:
            match = self.root.xpath(rule, namespaces=self.NSMAP)

            if isinstance(match, str):
                # some xpath rules return a single string instead of an array (e.g. concat())
                return [match,]

            else:
                return match

        except etree.XPathEvalError:
            return []

    def rule_create(self, rule):
        # duplicate, copy from template or create from scratch
        rrule, key = self._rule_parse(rule)

        # try recreating based on the rule (for really basic rules, ie. plain RSS) `/feed/item`
        if re.search(r'^[a-zA-Z0-9/:]+$', rrule):
            chain = rrule.strip('/').split('/')
            current = self.root

            if rrule[0] == '/':
                # we skip the first chain-element, as we _start_ from the first/root one
                # i.e. for "/rss/channel/title" we only keep "channel/title"
                chain = chain[1:]

            for (i, node) in enumerate(chain):
                test = current.find(self._resolve_ns(node))

                if test is not None and i < len(chain) - 1:
                    # yay, go on
                    current = test

                else:
                    # oops, needs to be created
                    element = etree.Element(self._resolve_ns(node))
                    current.append(element)
                    current = element

            return current

        # try duplicating from an existing node (works well with mangled structures)
        match = self.rule_search_last(rrule)

        if match:
            element = deepcopy(match)
            match.getparent().append(element)
            return element

        return None

    def rule_remove(self, rule):
        rrule, key = self._rule_parse(rule)

        match = self.rule_search(rrule)

        if match is None:
            return

        elif key is not None:
            if key in match.attrib:
                del match.attrib[key]

        else:
            match.getparent().remove(match)

    def rule_set(self, rule, value):
        rrule, key = self._rule_parse(rule)

        match = self.rule_search(rrule)

        html_rich = ('atom' in rule or self.rules.get('mode') == 'html') \
                and rule in [self.rules.get('item_desc'), self.rules.get('item_content')]

        if key is not None:
            match.attrib[key] = value

        else:
            if html_rich:
                self._clean_node(match)
                match.append(lxml.html.fragment_fromstring(value, create_parent='div'))

                if self.rules.get('mode') == 'html':
                    match.find('div').drop_tag() # not supported by lxml.etree

                else: # i.e. if atom
                    match.attrib['type'] = 'xhtml'

            else:
                if match is not None and len(match):
                    # clean out leftover children before setting plain text
                    self._clean_node(match)
                    match.attrib['type'] = 'html'

                match.text = value

    def rule_str(self, rule):
        match = self.rule_search(rule)

        html_rich = ('atom' in rule or self.mode == 'html') \
                and rule in [self.rules.get('item_desc'), self.rules.get('item_content')]

        if isinstance(match, etree._Element):
            if html_rich:
                # atom stuff
                return self._inner_html(match)

            else:
                return etree.tostring(match, method='text', encoding='unicode').strip()

        else:
            return match # might be None if no match


class ParserHTML(ParserXML):
    default_ruleset = 'html'
    mode = 'html'
    mimetype = ['text/html', 'application/xhtml+xml']

    def parse(self, raw):
        return html_parse(raw, encoding=self.encoding)

    def tostring(self, encoding='unicode', **k):
        return lxml.html.tostring(self.root, encoding=encoding, method='html', **k)

    def rule_search_all(self, rule):
        try:
            # do proper "class" matching (too "heavy" to type as-is in rules)
            # e.g. [class=item] becomes a whitespace-aware @class test
            pattern = r'\[class=([^\]]+)\]'
            repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
            rule = re.sub(pattern, repl, rule)

            match = self.root.xpath(rule)

            if isinstance(match, str):
                # for some xpath rules, see XML parser
                return [match,]

            else:
                return match

        except etree.XPathEvalError:
            return []

    def rule_create(self, rule):
        # try duplicating from an existing node (works well with mangled structures)
        rrule, key = self._rule_parse(rule)

        match = self.rule_search_last(rule)

        if match is not None:
            element = deepcopy(match)
            match.getparent().append(element)

        else:
            raise AttributeError('no way to create item')


def parse_time(value):
    # parsing per se
    if value is None or value == 0:
        time = None

    elif isinstance(value, basestring):
        if re.match(r'^[0-9]+$', value):
            time = datetime.fromtimestamp(int(value))

        else:
            time = dateutil.parser.parse(value)

    elif isinstance(value, int):
        time = datetime.fromtimestamp(value)

    elif isinstance(value, datetime):
        time = value

    else:
        time = None

    # add default time zone if none set
    if time is not None and time.tzinfo is None:
        time = time.replace(tzinfo=tz.tzutc())

    return time
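
# Examples: parse_time('1577836800') and parse_time(1577836800) both go through
# datetime.fromtimestamp(); parse_time('2020-01-01T00:00:00Z') goes through
# dateutil's parser. Naive results get UTC attached as the default time zone.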


class ParserJSON(ParserBase):
    default_ruleset = 'json'
    mode = 'json'
    mimetype = ['application/json', 'application/javascript', 'text/javascript']

    def parse(self, raw):
        return json.loads(raw)

    def remove(self):
        # impossible to "delete" oneself per se, but can clear all of its items
        for attr in list(self.root):
            # iterate over a copy of the keys, as deletion mutates the dict
            del self.root[attr]

    def tostring(self, encoding='unicode', **k):
        dump = json.dumps(self.root, ensure_ascii=False, **k) # ensure_ascii=False to get proper (unicode) strings, not \u00xx escapes

        if encoding != 'unicode':
            return dump.encode(encoding)

        else:
            return dump

    def _rule_parse(self, rule):
        return rule.split(".")
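
    # Rules are dotted key paths, e.g. a hypothetical rule 'feed.items.[]' walks
    # root['feed']['items']; the '[]' component marks the list holding the items.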

    def rule_search_all(self, rule):
        try:
            rrule = self._rule_parse(rule)
            cur = self.root

            for node in rrule:
                if node == '[]':
                    break

                else:
                    cur = cur[node]

            return cur if isinstance(cur, list) else [cur,]

        except (AttributeError, KeyError):
            return []

    def rule_create(self, rule):
        # create from scratch
        rrule = self._rule_parse(rule)
        cur = self.root

        for (i, node) in enumerate(rrule):
            if i + 1 < len(rrule) and rrule[i+1] == '[]':
                # next component is the list marker: append a fresh item dict
                if node in cur and isinstance(cur[node], list):
                    cur[node].append({})

                else:
                    cur[node] = [{}]

                return

            else:
                if node in cur:
                    # yay, go on
                    cur = cur[node]

                else:
                    # oops, need to create it
                    cur[node] = {}

    def rule_remove(self, rule):
        if '[]' in rule:
            raise ValueError('not supported') # FIXME

        rrule = self._rule_parse(rule)
        cur = self.root

        try:
            for node in rrule[:-1]:
                cur = cur[node]

            del cur[rrule[-1]]

        except KeyError:
            # nothing to delete
            pass

    def rule_set(self, rule, value):
        if '[]' in rule:
            raise ValueError('not supported') # FIXME

        rrule = self._rule_parse(rule)
        cur = self.root

        for node in rrule[:-1]:
            cur = cur[node]

        cur[rrule[-1]] = value

    def rule_str(self, rule):
        out = self.rule_search(rule)
        return out.replace('\n', '<br/>') if out else out


def wrap_uniq(wrapper_fn_name):
    " Wraps the output of the function with the specified function "
    # This is called when parsing "wrap_uniq('wrap_item')"

    def decorator(func):
        # This is called when parsing "@wrap_uniq('wrap_item')"

        def wrapped_func(self, *args, **kwargs):
            # This is called when the wrapped function is called
            output = func(self, *args, **kwargs)
            output_id = id(output)

            try:
                return self._map[output_id]

            except (KeyError, AttributeError):
                if not hasattr(self, '_map'):
                    self._map = {}

                wrapper_fn = getattr(self, wrapper_fn_name)
                obj = wrapper_fn(output)
                self._map[output_id] = obj

                return obj

        return wrapped_func

    return decorator
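
# Net effect of @wrap_uniq('wrap_item'): looking up the same raw node twice
# (e.g. `feed[0] is feed[0]`) returns the same cached wrapper object instead of
# re-wrapping it on every access.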


class Feed(object):
    itemsClass = property(lambda x: Item) # because Item is defined below, i.e. afterwards
    dic = ('title', 'desc', 'items')

    title = property(
        lambda f: f.get('title'),
        lambda f,x: f.set('title', x),
        lambda f: f.rmv('title') )

    description = desc = property(
        lambda f: f.get('desc'),
        lambda f,x: f.set('desc', x),
        lambda f: f.rmv('desc') )

    items = property(
        lambda f: f )

    def append(self, new=None):
        self.rule_create(self.rules['items'])
        item = self.items[-1]

        for attr in self.itemsClass.dic:
            try:
                setattr(item, attr, getattr(new, attr))

            except AttributeError:
                try:
                    setattr(item, attr, new[attr])

                except (KeyError, IndexError, TypeError):
                    pass

        return item
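
    # A minimal sketch (hypothetical values): assuming an empty FeedXML built from
    # the default 'rss-channel' ruleset,
    #
    #   feed = FeedXML()
    #   feed.append({'title': 'Hello', 'link': 'http://example.com/'})
    #
    # creates a fresh item node; `new` may expose values as attributes or as keys.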

    def wrap_item(self, item):
        return self.itemsClass(item, self.rules, self)

    @wrap_uniq('wrap_item')
    def __getitem__(self, key):
        return self.get_raw('items')[key]

    def __delitem__(self, key):
        self[key].remove()

    def __len__(self):
        return len(self.get_raw('items'))


class Item(object):
    dic = ('title', 'link', 'desc', 'content', 'time', 'updated')

    def __init__(self, xml=None, rules=None, parent=None):
        self._id = self._gen_id(xml)
        self.root = xml
        self.rules = rules
        self.parent = parent

    @staticmethod
    def _gen_id(xml=None, *args, **kwargs):
        return id(xml)

    title = property(
        lambda f: f.get('item_title'),
        lambda f,x: f.set('item_title', x),
        lambda f: f.rmv('item_title') )

    link = property(
        lambda f: f.get('item_link'),
        lambda f,x: f.set('item_link', x),
        lambda f: f.rmv('item_link') )

    description = desc = property(
        lambda f: f.get('item_desc'),
        lambda f,x: f.set('item_desc', x),
        lambda f: f.rmv('item_desc') )

    content = property(
        lambda f: f.get('item_content'),
        lambda f,x: f.set('item_content', x),
        lambda f: f.rmv('item_content') )

    time = property(
        lambda f: f.time_prs(f.get('item_time')),
        lambda f,x: f.set('item_time', f.time_fmt(x)),
        lambda f: f.rmv('item_time') )

    updated = property(
        lambda f: f.time_prs(f.get('item_updated')),
        lambda f,x: f.set('item_updated', f.time_fmt(x)),
        lambda f: f.rmv('item_updated') )


class ItemXML(Item, ParserXML):
    pass


class FeedXML(Feed, ParserXML):
    itemsClass = ItemXML

    def root_siblings(self):
        out = []
        current = self.root.getprevious()

        while current is not None:
            out.append(current)
            current = current.getprevious()

        return out

    def tostring(self, encoding='unicode', **k):
        # override needed due to "getroottree" inclusion
        # and to add stylesheet

        stylesheets = [x for x in self.root_siblings() if isinstance(x, etree.PIBase) and x.target == 'xml-stylesheet']

        for stylesheet in stylesheets:
            # remove all stylesheets present (be that ours or others')
            self.root.append(stylesheet) # needed as we can't delete root siblings https://stackoverflow.com/a/60232366
            self.root.remove(stylesheet)

        self.root.addprevious(etree.PI('xml-stylesheet', 'type="text/xsl" href="/sheet.xsl"'))

        return etree.tostring(self.root.getroottree(), encoding=encoding, method='xml', **k)


class ItemHTML(Item, ParserHTML):
    pass


class FeedHTML(Feed, ParserHTML):
    itemsClass = ItemHTML


class ItemJSON(Item, ParserJSON):
    def remove(self):
        rrule = self._rule_parse(self.rules['items'])
        cur = self.parent.root

        for node in rrule:
            if node == '[]':
                cur.remove(self.root)
                return

            cur = cur[node]


class FeedJSON(Feed, ParserJSON):
    itemsClass = ItemJSON


if __name__ == '__main__':
    import sys
    from . import crawler

    req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://www.nytimes.com/', follow='rss')
    feed = parse(req['data'], url=req['url'], encoding=req['encoding'])

    if sys.flags.interactive:
        print('>>> Interactive shell: try using `feed`')

    else:
        for item in feed.items:
            print(item.title, item.link)