diff --git a/morss/feedify.py b/morss/feedify.py
index e14fe1b..b5cf8b6 100644
--- a/morss/feedify.py
+++ b/morss/feedify.py
@@ -5,97 +5,14 @@
 import os.path
 import re
 import json
-from fnmatch import fnmatch
-import lxml.html
-
-from . import feeds
 from . import crawler
 
-try:
-    from ConfigParser import ConfigParser
-    from urlparse import urljoin
-    from httplib import HTTPException
-except ImportError:
-    from configparser import ConfigParser
-    from urllib.parse import urljoin
-    from http.client import HTTPException
-
 try:
     basestring
 except NameError:
     basestring = str
 
 
-def to_class(query):
-    pattern = r'\[class=([^\]]+)\]'
-    repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
-    return re.sub(pattern, repl, query)
-
-
-def get_rule(link):
-    config = ConfigParser()
-    config.read(os.path.join(os.path.dirname(__file__), 'feedify.ini'))
-
-    for section in config.sections():
-        values = dict(config.items(section))
-        values['path'] = values['path'].split('\n')[1:]
-
-        for path in values['path']:
-            if fnmatch(link, path):
-                return values
-
-    return False
-
-
-def supported(link):
-    return get_rule(link) is not False
-
-
-def format_string(string, getter, error=False):
-    out = ""
-    char = string[0]
-
-    follow = string[1:]
-
-    if char == '"':
-        match = follow.partition('"')
-        out = match[0]
-        if len(match) >= 2:
-            next_match = match[2]
-        else:
-            next_match = None
-    elif char == '{':
-        match = follow.partition('}')
-        try:
-            test = format_string(match[0], getter, True)
-        except (ValueError, KeyError):
-            pass
-        else:
-            out = test
-
-        next_match = match[2]
-    elif char == ' ':
-        next_match = follow
-    elif re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string):
-        match = re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string).groups()
-        raw_value = getter(match[0])
-        if not isinstance(raw_value, basestring):
-            if match[1] is not None:
-                out = match[1].join(raw_value)
-            else:
-                out = ''.join(raw_value)
-        if not out and error:
-            raise ValueError
-        next_match = match[2]
-    else:
-        raise ValueError('bogus string')
-
-    if next_match is not None and len(next_match):
-        return out + format_string(next_match, getter, error)
-    else:
-        return out
-
-
 def pre_worker(url):
     if url.startswith('http://itunes.apple.com/') or url.startswith('https://itunes.apple.com/'):
         match = re.search('/id([0-9]+)(\?.*)?$', url)
@@ -113,115 +30,3 @@ def pre_worker(url):
             return json.loads(data.decode('utf-8', 'replace'))['results'][0]['feedUrl']
 
     return None
-
-
-class Builder(object):
-    def __init__(self, link, data, rule=None):
-        # data must be a unicode string
-
-        self.link = link
-        self.data = data
-        self.rule = rule
-
-        self.encoding = crawler.detect_encoding(self.data)
-
-        if isinstance(self.data, bytes):
-            self.data = self.data.decode(crawler.detect_encoding(self.data), 'replace')
-
-        if self.rule is None:
-            self.rule = get_rule(link)
-
-        if self.rule['mode'] == 'xpath':
-            self.doc = lxml.html.fromstring(self.data)
-
-        elif self.rule['mode'] == 'json':
-            self.doc = json.loads(self.data)
-
-        self.feed = feeds.FeedXML()
-
-    def raw(self, html, expr):
-        " Returns selected items, thru a stupid query "
-
-        if self.rule['mode'] == 'xpath':
-            return html.xpath(to_class(expr))
-
-        elif self.rule['mode'] == 'json':
-            a = [html]
-            b = []
-            for x in expr.strip(".").split("."):
-                match = re.search('^([^\[]+)(?:\[([0-9]+)\])?$', x).groups()
-                for elem in a:
-                    if isinstance(elem, dict):
-                        kids = elem.get(match[0])
-                        if kids is None:
-                            pass
-                        elif isinstance(kids, list):
-                            b += kids
-                        elif isinstance(kids, basestring):
-                            b.append(kids.replace('\n', '<br/>'))
-                        else:
-                            b.append(kids)
-
-                if match[1] is None:
-                    a = b
-                else:
-                    if len(b) - 1 >= int(match[1]):
-                        a = [b[int(match[1])]]
-                    else:
-                        a = []
-                b = []
-            return a
-
-    def strings(self, html, expr):
-        " Turns the results of raw() into a nice array of strings (ie. sth useful) "
-
-        if self.rule['mode'] == 'xpath':
-            out = []
-            for match in self.raw(html, expr):
-                if isinstance(match, basestring):
-                    out.append(match)
-                elif isinstance(match, lxml.html.HtmlElement):
-                    out.append(lxml.html.tostring(match))
-
-        elif self.rule['mode'] == 'json':
-            out = self.raw(html, expr)
-
-        out = [x.decode(self.encoding) if isinstance(x, bytes) else x for x in out]
-        return out
-
-    def string(self, html, expr):
-        " Makes a formatted string, using our custom template format, out of the getter and rule "
-
-        getter = lambda x: self.strings(html, x)
-        return format_string(self.rule[expr], getter)
-
-    def build(self):
-        " Builds the actual rss feed "
-
-        if 'title' in self.rule:
-            self.feed.title = self.string(self.doc, 'title')
-
-        if 'items' in self.rule:
-            matches = self.raw(self.doc, self.rule['items'])
-            if matches and len(matches):
-                for item in matches:
-                    feed_item = {}
-
-                    if 'item_title' in self.rule:
-                        feed_item['title'] = self.string(item, 'item_title')
-                    if 'item_link' in self.rule:
-                        url = self.string(item, 'item_link')
-                        if url:
-                            url = urljoin(self.link, url)
-                            feed_item['link'] = url
-                    if 'item_desc' in self.rule:
-                        feed_item['desc'] = self.string(item, 'item_desc')
-                    if 'item_content' in self.rule:
-                        feed_item['content'] = self.string(item, 'item_content')
-                    if 'item_time' in self.rule:
-                        feed_item['updated'] = self.string(item, 'item_time')
-                    if 'item_id' in self.rule:
-                        feed_item['id'] = self.string(item, 'item_id')
-                        feed_item['is_permalink'] = False
-
-                    self.feed.items.append(feed_item)
diff --git a/morss/morss.py b/morss/morss.py
index 02389e0..8ac301c 100644
--- a/morss/morss.py
+++ b/morss/morss.py
@@ -320,9 +320,9 @@ def FeedFetch(url, options):
         delay = 0
 
     try:
-        con = crawler.custom_handler('xml', True, delay, options.encoding,
-            not feedify.supported(url) or not options.items).open(url, timeout=TIMEOUT * 2)
-        # feedify.supported(url) to use full crawler if using feedify
+        con = crawler.custom_handler(accept='xml', strict=True, delay=delay,
+                                     encoding=options.encoding, basic=not options.items) \
+              .open(url, timeout=TIMEOUT * 2)
         xml = con.read()
 
     except (IOError, HTTPException):
@@ -330,37 +330,29 @@ def FeedFetch(url, options):
 
     contenttype = con.info().get('Content-Type', '').split(';')[0]
 
-    if feedify.supported(url):
-        # using config file-based feedify
-        feed = feedify.Builder(url, xml)
-        feed.build()
-        rss = feed.feed
+    if options.items:
+        # using custom rules
+        rss = feeds.FeedHTML(xml, url, contenttype)
 
-    elif re.match(b'\s*<\?xml', xml) is not None or contenttype in crawler.MIMETYPE['xml']:
-        rss = feeds.FeedXML(xml)
-
-    elif options.items:
-        # using argument-based feedify
-        rule = {'items': options.items}
-        rule['mode'] = 'xpath'
+        rss.rules['items'] = options.items
 
         if options.item_title:
-            rule['item_title'] = options.item_title
+            rss.rules['item_title'] = options.item_title
         if options.item_link:
-            rule['item_link'] = options.item_link
+            rss.rules['item_link'] = options.item_link
         if options.item_content:
-            rule['item_content'] = options.item_content
+            rss.rules['item_content'] = options.item_content
         if options.item_time:
-            rule['item_time'] = options.item_time
-
-        feed = feedify.Builder(url, xml, rule)
-        feed.build()
-        rss = feed.feed
+            rss.rules['item_time'] = options.item_time
 
     else:
-        log('random page')
-        log(contenttype)
-        raise MorssException('Link provided is not a valid feed')
+        try:
+            rss = feeds.parse(xml, url, contenttype)
+
+        except TypeError:
+            log('random page')
+            log(contenttype)
+            raise MorssException('Link provided is not a valid feed')
 
     return rss
 
@@ -542,7 +534,7 @@ def cgi_app(environ, start_response):
 
     rss = FeedFetch(url, options)
 
     if headers['content-type'] == 'text/xml':
-        headers['content-type'] = rss.rules['mimetype'][0]
+        headers['content-type'] = rss.mimetype[0]
 
     start_response(headers['status'], list(headers.items()))
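
Sketch (not part of the patch): after this change, FeedFetch boils down to two paths. User-supplied XPath rules go through feeds.FeedHTML with its rules dict overridden; everything else goes through feeds.parse(), which raises TypeError for pages it cannot recognise as a feed. The outline below only reuses names visible in the diff (feeds.FeedHTML, feeds.parse, rss.rules); the import path, the build_feed name, and the ValueError stand-in for MorssException are assumptions made to keep the snippet self-contained.

from morss import feeds  # import path assumed from the package layout


def build_feed(xml, url, contenttype, options):
    # options is any object carrying the CLI attributes used below
    if options.items:
        # custom scraping rules: treat the page as HTML and override its rules
        rss = feeds.FeedHTML(xml, url, contenttype)
        rss.rules['items'] = options.items

        if options.item_title:
            rss.rules['item_title'] = options.item_title
        # item_link / item_content / item_time are overridden the same way

    else:
        try:
            # let feeds.parse() pick a parser from the data and mimetype
            rss = feeds.parse(xml, url, contenttype)
        except TypeError:
            # morss raises MorssException here; ValueError keeps the sketch standalone
            raise ValueError('Link provided is not a valid feed')

    return rss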