diff --git a/morss/feedify.py b/morss/feedify.py
index 6271817..b7dd3d7 100644
--- a/morss/feedify.py
+++ b/morss/feedify.py
@@ -1,23 +1,25 @@
 #!/usr/bin/env python
+import re
+import json
+import urlparse
+import urllib2
+
 from ConfigParser import ConfigParser
 from fnmatch import fnmatch
+
+import lxml.html
+
 import feeds
 import morss
 
-import re
-
-import urllib2
-import lxml.html
-import json
-import urlparse
 
-def toclass(query):
+def to_class(query):
     pattern = r'\[class=([^\]]+)\]'
     repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
     return re.sub(pattern, repl, query)
 
-def getRule(link):
+
+def get_rule(link):
     config = ConfigParser()
     config.read('feedify.ini')
@@ -29,10 +31,12 @@ def getRule(link):
             return values
     return False
 
-def supported(link):
-    return getRule(link) is not False
 
-def formatString(string, getter, error=False):
+def supported(link):
+    return get_rule(link) is not False
+
+
+def format_string(string, getter, error=False):
     out = ""
     char = string[0]
 
@@ -42,41 +46,42 @@ def formatString(string, getter, error=False):
         match = follow.partition('"')
         out = match[0]
         if len(match) >= 2:
-            next = match[2]
+            next_match = match[2]
         else:
-            next = None
+            next_match = None
     elif char == '{':
         match = follow.partition('}')
         try:
-            test = formatString(match[0], getter, True)
-        except ValueError, KeyError:
+            test = format_string(match[0], getter, True)
+        except (ValueError, KeyError):
             pass
         else:
             out = test
-        next = match[2]
+        next_match = match[2]
     elif char == ' ':
-        next = follow
+        next_match = follow
     elif re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string):
         match = re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string).groups()
-        rawValue = getter(match[0])
-        if not isinstance(rawValue, basestring):
+        raw_value = getter(match[0])
+        if not isinstance(raw_value, basestring):
             if match[1] is not None:
-                out = match[1].join(rawValue)
+                out = match[1].join(raw_value)
             else:
-                out = ''.join(rawValue)
+                out = ''.join(raw_value)
         if not out and error:
             raise ValueError
-        next = match[2]
+        next_match = match[2]
     else:
         raise ValueError('bogus string')
 
-    if next is not None and len(next):
-        return out + formatString(next, getter, error)
+    if next_match is not None and len(next_match):
+        return out + format_string(next_match, getter, error)
     else:
         return out
 
-def PreWorker(url, cache):
+
+def pre_worker(url, cache):
     if urlparse.urlparse(url).netloc == 'itunes.apple.com':
         match = re.search('/id([0-9]+)(\?.*)?$', url)
         if match:
@@ -84,6 +89,7 @@ def PreWorker(url, cache):
             redirect = 'https://itunes.apple.com/lookup?id={id}'.format(id=iid)
             cache.set('redirect', redirect)
 
+
 class Builder(object):
     def __init__(self, link, data=None, cache=False):
         self.link = link
@@ -93,11 +99,11 @@ class Builder(object):
             data = urllib2.urlopen(link).read()
         self.data = data
 
-        self.rule = getRule(link)
+        self.rule = get_rule(link)
 
         if self.rule['mode'] == 'xpath':
             if not isinstance(self.data, unicode):
-                self.data = self.data.decode(morss.detEncoding(self.data), 'replace')
+                self.data = self.data.decode(morss.detect_encoding(self.data), 'replace')
             self.doc = lxml.html.fromstring(self.data)
         elif self.rule['mode'] == 'json':
             self.doc = json.loads(data)
@@ -106,7 +112,7 @@
 
     def raw(self, html, expr):
         if self.rule['mode'] == 'xpath':
-            return html.xpath(toclass(expr))
+            return html.xpath(to_class(expr))
 
         elif self.rule['mode'] == 'json':
             a = [html]
@@ -119,7 +125,7 @@ class Builder(object):
                 if kids is None:
                     pass
                 elif isinstance(kids, list):
-                    [b.append(i) for i in kids]
+                    b += kids
                 elif isinstance(kids, basestring):
                     b.append(kids.replace('\n', '<br/>'))
                 else:
@@ -128,7 +134,7 @@ class Builder(object):
             if match[1] is None:
                 a = b
             else:
-                if len(b)-1 >= int(match[1]):
+                if len(b) - 1 >= int(match[1]):
                     a = [b[int(match[1])]]
                 else:
                     a = []
@@ -150,7 +156,7 @@ class Builder(object):
 
     def string(self, html, expr):
         getter = lambda x: self.strings(html, x)
-        return formatString(self.rule[expr], getter)
+        return format_string(self.rule[expr], getter)
 
     def build(self):
         if 'title' in self.rule:
@@ -160,23 +166,22 @@
             matches = self.raw(self.doc, self.rule['items'])
             if matches and len(matches):
                 for item in matches:
-                    feedItem = {}
+                    feed_item = {}
 
                     if 'item_title' in self.rule:
-                        feedItem['title'] = self.string(item, 'item_title')
+                        feed_item['title'] = self.string(item, 'item_title')
                     if 'item_link' in self.rule:
                         url = self.string(item, 'item_link')
                         url = urlparse.urljoin(self.link, url)
-                        feedItem['link'] = url
+                        feed_item['link'] = url
                     if 'item_desc' in self.rule:
-                        feedItem['desc'] = self.string(item, 'item_desc')
+                        feed_item['desc'] = self.string(item, 'item_desc')
                     if 'item_content' in self.rule:
-                        feedItem['content'] = self.string(item, 'item_content')
+                        feed_item['content'] = self.string(item, 'item_content')
                     if 'item_time' in self.rule:
-                        feedItem['updated'] = self.string(item, 'item_time')
+                        feed_item['updated'] = self.string(item, 'item_time')
                     if 'item_id' in self.rule:
-                        feedItem['id'] = self.string(item, 'item_id')
-                        feedItem['isPermaLink'] = False
-
-                    self.feed.items.append(feedItem)
+                        feed_item['id'] = self.string(item, 'item_id')
+                        feed_item['is_permalink'] = False
+                    self.feed.items.append(feed_item)
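Reviewer note: a quick sketch of what the renamed to_class() helper produces; the selector string below is a made-up example, not one taken from feedify.ini (Python 2, run from the morss/ source directory so feedify imports as a flat module):

    import feedify

    # '[class=post]' is rewritten into an XPath predicate that matches the
    # "post" class even among several space-separated class names
    print feedify.to_class('//div[class=post]')
    # -> //div[@class and contains(concat(" ", normalize-space(@class), " "), " post ")]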
diff --git a/morss/feeds.py b/morss/feeds.py
index f18232a..093b5ed 100644
--- a/morss/feeds.py
+++ b/morss/feeds.py
@@ -1,14 +1,16 @@
 #!/usr/bin/env python
-from lxml import etree
 from datetime import datetime
-import dateutil.parser
-from dateutil import tz
-import re
-
 from StringIO import StringIO
+
+import re
 import json
 import csv
+import urllib2
+
+from lxml import etree
+from dateutil import tz
+import dateutil.parser
 
 try:
     from wheezy.template.engine import Engine
@@ -26,21 +28,22 @@ except ImportError:
     Element = etree.Element
 
-NSMAP = {'atom': 'http://www.w3.org/2005/Atom',
-    'atom03': 'http://purl.org/atom/ns#',
-    'media': 'http://search.yahoo.com/mrss/',
-    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
-    'slash': 'http://purl.org/rss/1.0/modules/slash/',
-    'dc': 'http://purl.org/dc/elements/1.1/',
-    'content': 'http://purl.org/rss/1.0/modules/content/',
-    'rssfake': 'http://purl.org/rss/1.0/'}
+NSMAP = {'atom': 'http://www.w3.org/2005/Atom',
+         'atom03': 'http://purl.org/atom/ns#',
+         'media': 'http://search.yahoo.com/mrss/',
+         'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
+         'slash': 'http://purl.org/rss/1.0/modules/slash/',
+         'dc': 'http://purl.org/dc/elements/1.1/',
+         'content': 'http://purl.org/rss/1.0/modules/content/',
+         'rssfake': 'http://purl.org/rss/1.0/'}
+
 
 def load(url):
-    import urllib2
     d = urllib2.urlopen(url).read()
     return parse(d)
 
-def tagNS(tag, nsmap=NSMAP):
+
+def tag_NS(tag, nsmap=NSMAP):
     match = re.search(r'^\{([^\}]+)\}(.*)$', tag)
     if match:
         match = match.groups()
@@ -55,15 +58,19 @@ def tagNS(tag, nsmap=NSMAP):
             return "{%s}%s" % (nsmap[match[0]], match[1].lower())
     return tag
 
-def innerHTML(xml):
+
+def inner_html(xml):
     return (xml.text or '') + ''.join([etree.tostring(child) for child in xml.iterchildren()])
 
-def cleanNode(xml):
+
+def clean_node(xml):
     [xml.remove(child) for child in xml.iterchildren()]
 
+
 class FeedException(Exception):
     pass
 
+
 def parse(data):
     # encoding
     match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
@@ -80,15 +87,16 @@ def parse(data):
     # rss
     match = doc.xpath("//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF", namespaces=NSMAP)
     if len(match):
-        mtable = { 'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS,
-                'atom03:feed': FeedParserAtom, 'atom:feed': FeedParserAtom }
+        m_table = {'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS,
+                   'atom03:feed': FeedParserAtom, 'atom:feed': FeedParserAtom}
         match = match[0]
-        tag = tagNS(match.tag)
-        if tag in mtable:
-            return mtable[tag](doc, tag)
+        tag = tag_NS(match.tag)
+        if tag in m_table:
+            return m_table[tag](doc, tag)
     raise FeedException('unknown feed type')
 
+
 class FeedBase(object):
     """
     Base for xml-related classes, which provides simple wrappers around xpath
@@ -135,7 +143,7 @@ class FeedBase(object):
         else:
             return ""
 
-    def xgetCreate(self, table):
+    def xget_create(self, table):
         """ Returns an element, and creates it when not present """
         value = table[self.tag]
         if not isinstance(value, tuple):
@@ -145,7 +153,7 @@
         if match is not None:
             return match
         else:
-            element = etree.Element(tagNS(new))
+            element = etree.Element(tag_NS(new))
             self.root.append(element)
             return element
 
@@ -158,58 +166,62 @@ class FeedBase(object):
         """ Returns string using lxml. Arguments passed to tostring """
         return etree.tostring(self.xml, pretty_print=True, **k)
 
+
 class FeedDescriptor(object):
     """
-    Descriptor which gives off elements based on "self.getName" and
-    "self.setName" as getter/setters. Looks far better, and avoids duplicates
+    Descriptor which gives off elements based on "self.get_name" and
+    "self.set_name" as getter/setters. Looks far better, and avoids duplicates
     """
+
     def __init__(self, name):
         self.name = name
-        self.nname = name[0].upper() + name[1:]
 
     def __get__(self, instance, owner):
-        getter = getattr(instance, 'get%s' % self.nname)
+        getter = getattr(instance, 'get_%s' % self.name)
         return getter()
 
     def __set__(self, instance, value):
-        setter = getattr(instance, 'set%s' % self.nname)
+        setter = getattr(instance, 'set_%s' % self.name)
         return setter(value)
 
     def __delete__(self, instance):
-        deleter = getattr(instance, 'del%s' % self.nname)
+        deleter = getattr(instance, 'del_%s' % self.name)
         return deleter()
 
+
 class FeedTime(FeedDescriptor):
     def __get__(self, instance, owner):
-        getter = getattr(instance, 'get%s' % self.nname)
+        getter = getattr(instance, 'get_%s' % self.name)
         raw = getter()
         try:
-            time = parseTime(raw)
+            time = parse_time(raw)
             return time
         except ValueError:
             return None
 
     def __set__(self, instance, value):
         try:
-            time = parseTime(value)
+            time = parse_time(value)
             raw = time.strftime(instance.timeFormat)
-            setter = getattr(instance, 'set%s' % self.nname)
+            setter = getattr(instance, 'set_%s' % self.name)
             return setter(raw)
         except ValueError:
             pass
 
+
 class FeedBool(FeedDescriptor):
     def __get__(self, instance, owner):
-        getter = getattr(instance, 'get%s' % self.nname)
+        getter = getattr(instance, 'get_%s' % self.name)
         raw = getter()
         return (raw or '').lower() != 'false'
 
     def __set__(self, instance, value):
         raw = 'true' if value else 'false'
-        setter = getattr(instance, 'set%s' % self.nname)
+        setter = getattr(instance, 'set_%s' % self.name)
         return setter(raw)
 
-def parseTime(value):
+
+def parse_time(value):
     if isinstance(value, basestring):
         if re.match(r'^[0-9]+$', value):
             return datetime.fromtimestamp(int(value), tz.tzutc())
@@ -222,6 +234,7 @@ def parseTime(value):
     else:
         return False
 
+
 class FeedList(object):
     """
     Class to map a list of xml elements against a list of matching objects,
    Comes with its very own descriptor.
    """
-    def __init__(self, parent, getter, tag, childClass):
+
+    def __init__(self, parent, getter, tag, child_class):
         self.parent = parent
         self.getter = getter
-        self.childClass = childClass
+        self.childClass = child_class
         self.tag = tag
-        self._children = {} # id(xml) => FeedItem
+        self._children = {}  # id(xml) => FeedItem
 
-    def getChildren(self):
+    def get_children(self):
         children = self.getter()
         out = []
         for child in children:
@@ -269,7 +283,7 @@ class FeedList(object):
         return new
 
     def __getitem__(self, key):
-        return self.getChildren()[key]
+        return self.get_children()[key]
 
     def __delitem__(self, key):
         child = self.getter()[key]
@@ -282,22 +296,24 @@ class FeedList(object):
     def __len__(self):
         return len(self.getter())
 
+
 class FeedListDescriptor(object):
     """
     Descriptor for FeedList
     """
+
     def __init__(self, name):
         self.name = name
-        self.items = {} # id(instance) => FeedList
+        self.items = {}  # id(instance) => FeedList
 
     def __get__(self, instance, owner=None):
         key = id(instance)
         if key in self.items:
             return self.items[key]
         else:
-            getter = getattr(instance, 'get%s' % self.name.title())
-            className = globals()[getattr(instance, '%sClass' % self.name)]
-            self.items[key] = FeedList(instance, getter, instance.tag, className)
+            getter = getattr(instance, 'get_%s' % self.name)
+            class_name = globals()[getattr(instance, '%sClass' % self.name)]
+            self.items[key] = FeedList(instance, getter, instance.tag, class_name)
             return self.items[key]
 
     def __set__(self, instance, value):
@@ -305,6 +321,7 @@ class FeedListDescriptor(object):
         [x.remove() for x in [x for x in f.items]]
         [feedlist.append(x) for x in value]
 
+
 class FeedParser(FeedBase):
     itemsClass = 'FeedItem'
     mimetype = 'application/xml'
@@ -318,27 +335,25 @@ class FeedParser(FeedBase):
             self.root = self.xml.xpath("//atom03:feed|//atom:feed|//channel|//rssfake:channel", namespaces=NSMAP)[0]
         self.tag = tag
 
-    def getTitle(self):
+    def get_title(self):
         return ""
 
-    def setTitle(self, value):
+    def set_title(self, value):
         pass
 
-    def delTitle(self):
+    def del_title(self):
         self.title = ""
 
-
-    def getDesc(self):
+    def get_desc(self):
         pass
 
-    def setDesc(self, value):
+    def set_desc(self, value):
         pass
 
-    def delDesc(self):
+    def del_desc(self):
         self.desc = ""
 
-
-    def getItems(self):
+    def get_items(self):
         return []
 
     title = FeedDescriptor('title')
@@ -355,7 +370,8 @@ class FeedParser(FeedBase):
         out = StringIO()
         c = csv.writer(out, dialect=csv.excel)
         for item in self.items:
-            row = [x[1].encode('utf-8') if isinstance(x[1], unicode) else x[1] for x in item if isinstance(x[1], basestring)]
+            row = [x[1].encode('utf-8') if isinstance(x[1], unicode) else x[1] for x in item if
+                   isinstance(x[1], basestring)]
             c.writerow(row)
         out.seek(0)
         return out.read()
@@ -367,7 +383,8 @@ class FeedParser(FeedBase):
         loader = DictLoader({'reader': open('reader.html.template').read()})
         engine = Engine(loader=loader, extensions=[CoreExtension()])
         template = engine.get_template('reader')
-        return template.render({'feed':self}).encode('utf-8')
+        return template.render({'feed': self}).encode('utf-8')
+
 
 class FeedParserRSS(FeedParser):
     """
@@ -375,161 +392,153 @@ class FeedParserRSS(FeedParser):
     """
     itemsClass = 'FeedItemRSS'
     mimetype = 'application/rss+xml'
-    base = { 'rdf:rdf': '',
-        'channel': ''}
+    base = {
+        'rdf:rdf': '',
+        'channel': ''}
 
-    def getTitle(self):
+    def get_title(self):
         return self.xval('rssfake:title|title')
 
-    def setTitle(self, value):
+    def set_title(self, value):
         if not value:
             return self.xdel('rssfake:title|title')
 
-        table = { 'rdf:rdf': 'rssfake:title',
-                'channel': 'title'}
-        element = self.xgetCreate(table)
+        table = {'rdf:rdf': 'rssfake:title',
+                 'channel': 'title'}
+        element = self.xget_create(table)
         element.text = value
 
-
-    def getDesc(self):
+    def get_desc(self):
         return self.xval('rssfake:description|description')
 
-    def setDesc(self, value):
+    def set_desc(self, value):
         if not value:
             return self.xdel('rssfake:description|description')
 
-        table = { 'rdf:rdf': 'rssfake:description',
-                'channel': 'description'}
-        element = self.xgetCreate(table)
+        table = {'rdf:rdf': 'rssfake:description',
+                 'channel': 'description'}
+        element = self.xget_create(table)
         element.text = value
 
-
-    def getItems(self):
+    def get_items(self):
         return self.xpath('rssfake:item|item')
 
+
 class FeedParserAtom(FeedParser):
     """
     Atom Parser
     """
     itemsClass = 'FeedItemAtom'
     mimetype = 'application/atom+xml'
-    base = { 'atom:feed': '',
-        'atom03:feed': ''}
+    base = {'atom:feed': '',
+            'atom03:feed': ''}
 
-    def getTitle(self):
+    def get_title(self):
         return self.xval('atom:title|atom03:title')
 
-    def setTitle(self, value):
+    def set_title(self, value):
         if not value:
-            return self.xval('atom:title|atom03:title')
+            return self.xdel('atom:title|atom03:title')
 
-        table = { 'atom:feed': 'atom:title',
-                'atom03:feed': 'atom03:title'}
-        element = self.xgetCreate(table)
+        table = {'atom:feed': 'atom:title',
+                 'atom03:feed': 'atom03:title'}
+        element = self.xget_create(table)
         element.text = value
 
-
-    def getDesc(self):
+    def get_desc(self):
         return self.xval('atom:subtitle|atom03:subtitle')
 
-    def setDesc(self, value):
+    def set_desc(self, value):
         if not value:
             return self.xdel('atom:subtitle|atom03:subtitle')
 
-        table = { 'atom:feed': 'atom:subtitle',
-                'atom03:feed': 'atom03:subtitle'}
-        element = self.xgetCreate(table)
+        table = {'atom:feed': 'atom:subtitle',
+                 'atom03:feed': 'atom03:subtitle'}
+        element = self.xget_create(table)
         element.text = value
 
-
-    def getItems(self):
+    def get_items(self):
         return self.xpath('atom:entry|atom03:entry')
 
+
 class FeedItem(FeedBase):
     timeFormat = ''
-    dic = ('title', 'link', 'desc', 'content', 'id', 'isPermaLink', 'time', 'updated')
+    dic = ('title', 'link', 'desc', 'content', 'id', 'is_permalink', 'time', 'updated')
 
     def __init__(self, xml=None, tag='atom:feed'):
         if xml is None:
-            xml = Element(tagNS(self.base[tag]))
+            xml = Element(tag_NS(self.base[tag]))
 
         self.root = self.xml = xml
         self.tag = tag
 
-    def getTitle(self):
+    def get_title(self):
         return ""
 
-    def setTitle(self):
+    def set_title(self, value):
         pass
 
-    def delTitle(self):
+    def del_title(self):
         self.title = ""
 
-
-    def getLink(self):
+    def get_link(self):
         return ""
 
-    def setLink(self, value):
+    def set_link(self, value):
         pass
 
-    def delLink(self):
+    def del_link(self):
         self.link = ""
 
-
-    def getIsPermaLink(self):
+    def get_is_permalink(self):
         return ""
 
-    def setIsPermaLink(self, value):
+    def set_is_permalink(self, value):
         pass
 
-
-    def getDesc(self):
+    def get_desc(self):
         return ""
 
-    def setDesc(self, value):
+    def set_desc(self, value):
         pass
 
-    def delDesc(self):
+    def del_desc(self):
         self.desc = ""
 
-
-    def getContent(self):
+    def get_content(self):
         return ""
 
-    def setContent(self, value):
+    def set_content(self, value):
         pass
 
-    def delContent(self):
+    def del_content(self):
         self.content = ""
 
-
-    def getId(self):
+    def get_id(self):
         return ""
 
-    def setId(self, value):
+    def set_id(self, value):
         pass
 
-    def delId(self):
+    def del_id(self):
         self.id = ""
 
-
-    def getTime(self):
+    def get_time(self):
         return None
 
-    def setTime(self, value):
+    def set_time(self, value):
         pass
 
-    def delTime(self):
+    def del_time(self):
         self.time = None
 
-
-    def getUpdated(self):
+    def get_updated(self):
         return None
 
-    def setUpdated(self, value):
+    def set_updated(self, value):
         pass
 
-    def delUpdated(self):
+    def del_updated(self):
         self.updated = None
 
     title = FeedDescriptor('title')
@@ -537,11 +546,11 @@ class FeedItem(FeedBase):
     description = desc = FeedDescriptor('desc')
     content = FeedDescriptor('content')
     id = FeedDescriptor('id')
-    isPermaLink = FeedBool('isPermaLink')
+    is_permalink = FeedBool('is_permalink')
     time = FeedTime('time')
     updated = FeedTime('updated')
 
-    def pushContent(self, value):
+    def push_content(self, value):
         if not self.desc and self.content:
             self.desc = self.content
 
@@ -550,201 +559,192 @@ class FeedItem(FeedBase):
     def remove(self):
         self.xml.getparent().remove(self.xml)
 
+
 class FeedItemRSS(FeedItem):
     timeFormat = '%a, %d %b %Y %H:%M:%S %Z'
-    base = { 'rdf:rdf': 'rssfake:item',
-        'channel': 'item'}
+    base = {'rdf:rdf': 'rssfake:item',
+            'channel': 'item'}
 
-    def getTitle(self):
+    def get_title(self):
         return self.xval('rssfake:title|title')
 
-    def setTitle(self, value):
+    def set_title(self, value):
         if not value:
             return self.xdel('rssfake:title|title')
 
-        table = { 'rdf:rdf': 'rssfake:title',
-                'channel': 'title'}
-        element = self.xgetCreate(table)
+        table = {'rdf:rdf': 'rssfake:title',
+                 'channel': 'title'}
+        element = self.xget_create(table)
         element.text = value
 
-
-    def getLink(self):
+    def get_link(self):
         return self.xval('rssfake:link|link')
 
-    def setLink(self, value):
-        if self.isPermaLink and self.id == self.link != value:
-            self.isPermaLink = False
+    def set_link(self, value):
+        if self.is_permalink and self.id == self.link != value:
+            self.is_permalink = False
 
-        table = { 'rdf:rdf': 'rssfake:link',
-                'channel': 'link'}
-        element = self.xgetCreate(table)
+        table = {'rdf:rdf': 'rssfake:link',
+                 'channel': 'link'}
+        element = self.xget_create(table)
         element.text = value
 
-    def getDesc(self):
+    def get_desc(self):
         return self.xval('rssfake:description|description')
 
-    def setDesc(self, value):
+    def set_desc(self, value):
         if not value:
             return self.xdel('rssfake:description|description')
 
-        table = { 'rdf:rdf': 'rssfake:description',
-                'channel': 'description'}
-        element = self.xgetCreate(table)
+        table = {'rdf:rdf': 'rssfake:description',
+                 'channel': 'description'}
+        element = self.xget_create(table)
         element.text = value
 
-
-    def getContent(self):
+    def get_content(self):
         return self.xval('content:encoded')
 
-    def setContent(self, value):
+    def set_content(self, value):
         if not value:
             return self.xdel('content:encoded')
 
-        table = { 'rdf:rdf': 'content:encoded',
-                'channel': 'content:encoded'}
-        element = self.xgetCreate(table)
+        table = {'rdf:rdf': 'content:encoded',
+                 'channel': 'content:encoded'}
+        element = self.xget_create(table)
         element.text = value
 
-
-    def getId(self):
+    def get_id(self):
         return self.xval('rssfake:guid|guid')
 
-    def setId(self, value):
+    def set_id(self, value):
         if not value:
             return self.xdel('rssfake:guid|guid')
 
-        table = { 'rdf:rdf': 'rssfake:guid',
-                'channel': 'guid'}
-        element = self.xgetCreate(table)
+        table = {'rdf:rdf': 'rssfake:guid',
+                 'channel': 'guid'}
+        element = self.xget_create(table)
         element.text = value
 
-
-    def getIsPermaLink(self):
+    def get_is_permalink(self):
         return self.xget('rssfake:guid/@isPermaLink|guid/@isPermaLink')
 
-    def setIsPermaLink(self, value):
-        table = { 'rdf:rdf': 'rssfake:guid',
-                'channel': 'guid'}
-        element = self.xgetCreate(table)
+    def set_is_permalink(self, value):
+        table = {'rdf:rdf': 'rssfake:guid',
+                 'channel': 'guid'}
+        element = self.xget_create(table)
         element.attrib['isPermaLink'] = value
 
-
-    def getTime(self):
+    def get_time(self):
         return self.xval('rssfake:pubDate|pubDate')
 
-    def setTime(self, value):
+    def set_time(self, value):
         if not value:
             return self.xdel('rssfake:pubDate|pubDate')
 
-        table = { 'rdf:rdf': 'rssfake:pubDate',
-                'channel': 'pubDate'}
-        element = self.xgetCreate(table)
+        table = {'rdf:rdf': 'rssfake:pubDate',
+                 'channel': 'pubDate'}
+        element = self.xget_create(table)
         element.text = value
 
+
 class FeedItemAtom(FeedItem):
     timeFormat = '%Y-%m-%dT%H:%M:%SZ'
-    base = { 'atom:feed': 'atom:entry',
-        'atom03:feed': 'atom03:entry'}
+    base = {'atom:feed': 'atom:entry',
+            'atom03:feed': 'atom03:entry'}
 
-    def getTitle(self):
+    def get_title(self):
         return self.xval('atom:title|atom03:title')
 
-    def setTitle(self, value):
+    def set_title(self, value):
         if not value:
             return self.xdel('atom:title|atom03:title')
 
-        table = { 'atom:feed': 'atom:title',
-                'atom03:feed': 'atom03:title'}
-        element = self.xgetCreate(table)
+        table = {'atom:feed': 'atom:title',
+                 'atom03:feed': 'atom03:title'}
+        element = self.xget_create(table)
         element.text = value
 
-
-    def getLink(self):
+    def get_link(self):
         return self.xget('(atom:link|atom03:link)[@rel="alternate" or not(@rel)]/@href')
 
-    def setLink(self, value):
-        table = { 'atom:feed': ('atom:link', 'atom:link[@rel="alternate" or not(@rel)]'),
-                'atom03:feed': ('atom03:link', 'atom03:link[@rel="alternate" or not(@rel)]')}
-        element = self.xgetCreate(table)
+    def set_link(self, value):
+        table = {'atom:feed': ('atom:link', 'atom:link[@rel="alternate" or not(@rel)]'),
+                 'atom03:feed': ('atom03:link', 'atom03:link[@rel="alternate" or not(@rel)]')}
+        element = self.xget_create(table)
         element.attrib['href'] = value
 
-
-    def getDesc(self):
+    def get_desc(self):
         # default "type" is "text"
         element = self.xget('atom:summary|atom03:summary')
         if element is not None:
-            return innerHTML(element)
+            return inner_html(element)
         else:
             return ""
 
-    def setDesc(self, value):
+    def set_desc(self, value):
         if not value:
             return self.xdel('atom:summary|atom03:summary')
 
-        table = { 'atom:feed': 'atom:summary',
-                'atom03:feed': 'atom03:summary'}
-        element = self.xgetCreate(table)
+        table = {'atom:feed': 'atom:summary',
+                 'atom03:feed': 'atom03:summary'}
+        element = self.xget_create(table)
         if element.attrib.get('type', '') == 'xhtml':
-            cleanNode(element)
+            clean_node(element)
         element.attrib['type'] = 'html'
         element.text = value
 
-
-    def getContent(self):
+    def get_content(self):
         element = self.xget('atom:content|atom03:content')
         if element is not None:
-            return innerHTML(element)
+            return inner_html(element)
         else:
             return ""
 
-    def setContent(self, value):
+    def set_content(self, value):
         if not value:
             return self.xdel('atom:content|atom03:content')
 
-        table = { 'atom:feed': 'atom:content',
-                'atom03:feed': 'atom03:content'}
-        element = self.xgetCreate(table)
+        table = {'atom:feed': 'atom:content',
+                 'atom03:feed': 'atom03:content'}
+        element = self.xget_create(table)
         if element.attrib.get('type', '') == 'xhtml':
-            cleanNode(element)
+            clean_node(element)
         element.attrib['type'] = 'html'
         element.text = value
 
-
-    def getId(self):
+    def get_id(self):
         return self.xval('atom:id|atom03:id')
 
-    def setId(self, value):
+    def set_id(self, value):
         if not value:
             return self.xdel('atom:id|atom03:id')
 
-        table = { 'atom:feed': 'atom:id',
-                'atom03:feed': 'atom03:id'}
-        element = self.xgetCreate(table)
+        table = {'atom:feed': 'atom:id',
+                 'atom03:feed': 'atom03:id'}
+        element = self.xget_create(table)
         element.text = value
 
-
-    def getTime(self):
+    def get_time(self):
         return self.xval('atom:published|atom03:published')
 
-    def setTime(self, value):
+    def set_time(self, value):
         if not value:
             return self.xdel('atom:published|atom03:published')
 
-        table = { 'atom:feed': 'atom:published',
-                'atom03:feed': 'atom03:published'}
-        element = self.xgetCreate(table)
+        table = {'atom:feed': 'atom:published',
+                 'atom03:feed': 'atom03:published'}
+        element = self.xget_create(table)
         element.text = value
 
-
-    def getUpdated(self):
+    def get_updated(self):
         return self.xval('atom:updated|atom03:updated')
 
-    def setUpdated(self, value):
+    def set_updated(self, value):
         if not value:
             return self.xdel('atom:updated|atom03:updated')
 
-        table = { 'atom:feed': 'atom:updated',
-                'atom03:feed': 'atom03:updated'}
-        element = self.xgetCreate(table)
+        table = {'atom:feed': 'atom:updated',
+                 'atom03:feed': 'atom03:updated'}
+        element = self.xget_create(table)
         element.text = value
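Reviewer note: the get_/set_/del_ renames above have to stay in sync with FeedDescriptor, which assembles method names from the attribute name at runtime. A minimal standalone model of that pattern, with illustrative names that are not part of the patch (Python 2):

    class Descriptor(object):
        # dispatches attribute access to get_<name>/set_<name>,
        # the way FeedDescriptor now does with its snake_case lookups
        def __init__(self, name):
            self.name = name

        def __get__(self, instance, owner):
            return getattr(instance, 'get_%s' % self.name)()

        def __set__(self, instance, value):
            getattr(instance, 'set_%s' % self.name)(value)


    class Item(object):
        def get_title(self):
            return self._title

        def set_title(self, value):
            self._title = value

        title = Descriptor('title')


    item = Item()
    item.title = 'hello'  # routed to set_title()
    print item.title      # routed to get_title(), prints 'hello'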
diff --git a/morss/morss.py b/morss/morss.py
index 2b9884c..0299fec 100644
--- a/morss/morss.py
+++ b/morss/morss.py
@@ -31,21 +31,22 @@ from StringIO import StringIO
 from readability import readability
 from html2text import HTML2Text
 
-LIM_ITEM = 100 # deletes what's beyond
-LIM_TIME = 7 # deletes what's after
-MAX_ITEM = 50 # cache-only beyond
-MAX_TIME = 7 # cache-only after (in sec)
-DELAY = 10*60 # xml cache & ETag cache (in sec)
-TIMEOUT = 2 # http timeout (in sec)
-THREADS = 10 # number of threads (1 for single-threaded)
+LIM_ITEM = 100  # deletes what's beyond
+LIM_TIME = 7  # deletes what's after
+MAX_ITEM = 50  # cache-only beyond
+MAX_TIME = 7  # cache-only after (in sec)
+DELAY = 10 * 60  # xml cache & ETag cache (in sec)
+TIMEOUT = 2  # http timeout (in sec)
+THREADS = 10  # number of threads (1 for single-threaded)
 
 DEBUG = False
 
 UA_RSS = 'Liferea/1.8.12 (Linux; fr_FR.utf8; http://liferea.sf.net/)'
 UA_HTML = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
 
-MIMETYPE = { 'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
-    'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
+MIMETYPE = {
+    'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
+    'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
 
 FBAPPID = ""
 FBSECRET = ""
@@ -57,11 +58,14 @@ if 'SCRIPT_NAME' in os.environ:
     httplib.HTTPConnection.debuglevel = 1
 
     import cgitb
+
     cgitb.enable()
 
+
 class MorssException(Exception):
     pass
 
+
 def log(txt, force=False):
     if DEBUG or force:
         if 'REQUEST_URI' in os.environ:
@@ -70,17 +74,18 @@ def log(txt, force=False):
             print repr(txt)
 
 
-def lenHTML(txt):
+def len_html(txt):
     if len(txt):
         return len(lxml.html.fromstring(txt).text_content())
     else:
         return 0
 
-def countWord(txt):
+
+def count_words(txt):
     if len(txt):
         return len(lxml.html.fromstring(txt).text_content().split())
-    else:
-        return 0
+    return 0
+
 
 class Options:
     def __init__(self, options=None):
@@ -95,9 +100,11 @@ class Options:
     def __contains__(self, key):
         return key in self.options
 
+
 class Cache:
     """
     Light, error-prone caching system.
     """
-    def __init__(self, folder=None, key='cache', lifespan=10*24*3600):
+
+    def __init__(self, folder=None, key='cache', lifespan=10 * 24 * 3600):
         self._key = key
         self._dir = folder
         self._lifespan = lifespan
@@ -108,7 +115,7 @@ class Cache:
             self._hash = "NO CACHE"
             return
 
-        maxsize = os.statvfs('./').f_namemax - len(self._dir) - 1 - 4 # ".tmp"
+        maxsize = os.statvfs('./').f_namemax - len(self._dir) - 1 - 4  # ".tmp"
         self._hash = urllib.quote_plus(self._key)[:maxsize]
 
         self._file = self._dir + '/' + self._hash
@@ -178,13 +185,16 @@ class Cache:
         else:
             return self
 
+
 class SimpleDownload(urllib2.HTTPCookieProcessor):
     """
     Custom urllib2 handler to download a page, using etag/last-modified headers,
    to save bandwidth. The given headers are added back into the header on error
    304 for easier use.
    """
-    def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=True, cookiejar=None, accept=None, strict=False):
+
+    def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=True, cookiejar=None,
+                 accept=None, strict=False):
         urllib2.HTTPCookieProcessor.__init__(self, cookiejar)
         self.cache = cache
         self.etag = etag
@@ -214,7 +224,7 @@ class SimpleDownload(urllib2.HTTPCookieProcessor):
             out = {}
             rank = 1.1
             for group in self.accept:
-                rank = rank - 0.1
+                rank -= 0.1
 
                 if isinstance(group, basestring):
                     if group in MIMETYPE:
@@ -228,9 +238,9 @@ class SimpleDownload(urllib2.HTTPCookieProcessor):
                         out[mime] = rank
 
             if not self.strict:
-                out['*/*'] = rank-0.1
+                out['*/*'] = rank - 0.1
 
-            string = ','.join([x+';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])
+            string = ','.join([x + ';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])
             req.add_unredirected_header('Accept', string)
 
         return req
@@ -259,20 +269,20 @@ class SimpleDownload(urllib2.HTTPCookieProcessor):
         if resp.info().type in MIMETYPE['html']:
             match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
             if match:
-                newurl = match.groups()[0]
-                log('redirect: %s' % newurl)
+                new_url = match.groups()[0]
+                log('redirect: %s' % new_url)
 
-                newheaders = dict((k,v) for k,v in req.headers.items()
-                    if k.lower() not in ('content-length', 'content-type'))
-                new = urllib2.Request(newurl,
-                    headers=newheaders,
-                    origin_req_host=req.get_origin_req_host(),
-                    unverifiable=True)
+                new_headers = dict((k, v) for k, v in req.headers.items()
+                                   if k.lower() not in ('content-length', 'content-type'))
+                new = urllib2.Request(new_url,
+                                      headers=new_headers,
+                                      origin_req_host=req.get_origin_req_host(),
+                                      unverifiable=True)
 
                 return self.parent.open(new, timeout=req.timeout)
 
         # encoding
-        enc = detEncoding(data, resp)
+        enc = detect_encoding(data, resp)
 
         if enc:
             data = data.decode(enc, 'replace')
@@ -290,7 +300,8 @@ class SimpleDownload(urllib2.HTTPCookieProcessor):
     https_response = http_response
     https_request = http_request
 
-def detEncoding(data, con=None):
+
+def detect_encoding(data, con=None):
     if con is not None and con.headers.getparam('charset'):
         log('header')
         return con.headers.getparam('charset')
@@ -306,6 +317,7 @@ def detEncoding(data, con=None):
 
     return None
 
+
 def Fix(item, feedurl='/'):
     """ Improves feed items (absolute links, resolve feedburner links, etc) """
 
@@ -358,7 +370,8 @@ def Fix(item, feedurl='/'):
     match = re.search('/([0-9a-zA-Z]{20,})/story01.htm$', item.link)
     if match:
         url = match.groups()[0].split('0')
-        t = {'A':'0', 'B':'.', 'C':'/', 'D':'?', 'E':'-', 'H':',', 'I':'_', 'L':'http://', 'S':'www.', 'N':'.com', 'O':'.co.uk'}
+        t = {'A': '0', 'B': '.', 'C': '/', 'D': '?', 'E': '-', 'H': ',', 'I': '_', 'L': 'http://', 'S': 'www.',
+             'N': '.com', 'O': '.co.uk'}
         item.link = ''.join([(t[s[0]] if s[0] in t else '=') + s[1:] for s in url[1:]])
         log(item.link)
 
@@ -371,6 +384,7 @@ def Fix(item, feedurl='/'):
 
     return item
 
+
 def Fill(item, cache, feedurl='/', fast=False):
     """ Returns True when it has done its best """
 
@@ -381,8 +395,8 @@ def Fill(item, cache, feedurl='/', fast=False):
     log(item.link)
 
     # content already provided?
-    count_content = countWord(item.content)
-    count_desc = countWord(item.desc)
+    count_content = count_words(item.content)
+    count_desc = count_words(item.desc)
 
     if max(count_content, count_desc) > 500:
         if count_desc > count_content:
@@ -392,7 +406,7 @@ def Fill(item, cache, feedurl='/', fast=False):
         log('long enough')
         return True
 
-    if count_content > 5*count_desc > 0 and count_content > 50:
+    if count_content > 5 * count_desc > 0 and count_content > 50:
         log('content bigger enough')
         return True
 
@@ -432,7 +446,7 @@ def Fill(item, cache, feedurl='/', fast=False):
             log('old error')
         else:
             log('cached')
-            item.pushContent(cache.get(link))
+            item.push_content(cache.get(link))
         return True
 
     # super-fast mode
@@ -457,8 +471,8 @@ def Fill(item, cache, feedurl='/', fast=False):
 
     out = readability.Document(data, url=con.url).summary(True)
 
-    if countWord(out) > max(count_content, count_desc) > 0:
-        item.pushContent(out)
+    if count_words(out) > max(count_content, count_desc) > 0:
+        item.push_content(out)
         cache.set(link, out)
     else:
         log('not bigger enough')
@@ -467,7 +481,8 @@ def Fill(item, cache, feedurl='/', fast=False):
 
     return True
 
-def Init(url, cachePath, options):
+
+def Init(url, cache_path, options):
     # url clean up
     log(url)
@@ -481,14 +496,15 @@ def Init(url, cachePath, options):
     url = url.replace(' ', '%20')
 
     # cache
-    cache = Cache(cachePath, url)
+    cache = Cache(cache_path, url)
     log(cache._hash)
 
     return (url, cache)
 
+
 def Fetch(url, cache, options):
     # do some useful facebook work
-    feedify.PreWorker(url, cache)
+    feedify.pre_worker(url, cache)
 
     if 'redirect' in cache:
         url = cache.get('redirect')
@@ -502,8 +518,9 @@ def Fetch(url, cache, options):
         style = cache.get('style')
     else:
         try:
-            opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'), accept=('xml','html'))
-            con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT*2)
+            opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'),
+                                    accept=('xml', 'html'))
+            con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT * 2)
             xml = con.read()
         except (IOError, httplib.HTTPException):
             raise MorssException('Error downloading feed')
@@ -540,7 +557,8 @@ def Fetch(url, cache, options):
         feed.build()
         rss = feed.feed
     elif style == 'html':
-        match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
+        match = lxml.html.fromstring(xml).xpath(
+            "//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
         if len(match):
             link = urlparse.urljoin(url, match[0])
             log('rss redirect: %s' % link)
@@ -552,13 +570,13 @@ def Fetch(url, cache, options):
         log('random page')
         raise MorssException('Link provided is not a valid feed')
 
-
     cache.save()
     return rss
 
+
 def Gather(rss, url, cache, options):
     size = len(rss.items)
-    startTime = time.time()
+    start_time = time.time()
 
     # custom settings
     lim_item = LIM_ITEM
@@ -580,14 +598,14 @@ def Gather(rss, url, cache, options):
             queue.task_done()
 
     def worker(i, item):
-        if time.time() - startTime > lim_time >= 0 or i+1 > lim_item >= 0:
+        if time.time() - start_time > lim_time >= 0 or i + 1 > lim_item >= 0:
             log('dropped')
             item.remove()
             return
 
         item = Fix(item, url)
 
-        if time.time() - startTime > max_time >= 0 or i+1 > max_item >= 0:
+        if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0:
             if not options.proxy:
                 if Fill(item, cache, url, True) is False:
                     item.remove()
@@ -617,10 +635,11 @@ def Gather(rss, url, cache, options):
         new.time = "5 Oct 2013 22:42"
 
     log(len(rss.items))
-    log(time.time() - startTime)
+    log(time.time() - start_time)
 
     return rss
 
+
 def After(rss, options):
     for i, item in enumerate(rss.items):
 
@@ -662,8 +681,9 @@ def After(rss, options):
     else:
         return rss.tostring(xml_declaration=True, encoding='UTF-8')
 
+
 def process(url, cache=None, options=None):
-    if options == None:
+    if not options:
         options = []
 
     options = Options(options)
@@ -673,6 +693,7 @@ def process(url, cache=None, options=None):
 
     return After(rss, options)
 
+
 def cgi_app(environ, start_response):
     # get options
     if 'REQUEST_URI' in environ:
@@ -696,7 +717,8 @@ def cgi_app(environ, start_response):
     DEBUG = options.debug
 
     if 'HTTP_IF_NONE_MATCH' in environ:
-        if not options.force and not options.facebook and time.time() - int(environ['HTTP_IF_NONE_MATCH'][1:-1]) < DELAY:
+        if not options.force and not options.facebook and time.time() - int(
+                environ['HTTP_IF_NONE_MATCH'][1:-1]) < DELAY:
             headers['status'] = '304 Not Modified'
             start_response(headers['status'], headers.items())
             log(url)
@@ -722,30 +744,31 @@ def cgi_app(environ, start_response):
     url, cache = Init(url, os.getcwd() + '/cache', options)
 
     if options.facebook:
-        doFacebook(url, environ, headers, options, cache)
+        do_facebook(url, environ, headers, options, cache)
         start_response(headers['status'], headers.items())
         return
 
     # get the work done
-    RSS = Fetch(url, cache, options)
+    rss = Fetch(url, cache, options)
 
     if headers['content-type'] == 'text/xml':
-        headers['content-type'] = RSS.mimetype
+        headers['content-type'] = rss.mimetype
 
     start_response(headers['status'], headers.items())
 
-    RSS = Gather(RSS, url, cache, options)
+    rss = Gather(rss, url, cache, options)
 
     if not DEBUG and not options.silent:
-        return After(RSS, options)
+        return After(rss, options)
 
     log('done')
 
+
 def cgi_wrapper(environ, start_response):
     # simple http server for html and css
     files = {
-        '': 'text/html',
-        'index.html': 'text/html'}
+        '': 'text/html',
+        'index.html': 'text/html'}
 
     if 'REQUEST_URI' in environ:
         url = environ['REQUEST_URI'][1:]
@@ -774,13 +797,12 @@ def cgi_wrapper(environ, start_response):
     except (KeyboardInterrupt, SystemExit):
         raise
     except Exception as e:
-        headers = {}
-        headers['status'] = '500 Oops'
-        headers['content-type'] = 'text/plain'
+        headers = {'status': '500 Oops', 'content-type': 'text/plain'}
         start_response(headers['status'], headers.items(), sys.exc_info())
         log('ERROR: %s' % e.message, force=True)
         return 'An error happened'
 
+
 def cli_app():
     options = Options(sys.argv[1:-1])
     url = sys.argv[-1]
@@ -789,15 +811,16 @@ def cli_app():
     DEBUG = options.debug
 
     url, cache = Init(url, os.path.expanduser('~/.cache/morss'), options)
-    RSS = Fetch(url, cache, options)
-    RSS = Gather(RSS, url, cache, options)
+    rss = Fetch(url, cache, options)
+    rss = Gather(rss, url, cache, options)
 
     if not DEBUG and not options.silent:
-        print After(RSS, options)
+        print After(rss, options)
 
     log('done')
 
-def doFacebook(url, environ, headers, options, cache):
+
+def do_facebook(url, environ, headers, options, cache):
     log('fb stuff')
 
     query = urlparse.urlparse(url).query
@@ -805,11 +828,13 @@ def doFacebook(url, environ, headers, options, cache):
     if 'code' in query:
         # get real token from code
         code = urlparse.parse_qs(query)['code'][0]
-        eurl = "https://graph.facebook.com/oauth/access_token?client_id={app_id}&redirect_uri={redirect_uri}&client_secret={app_secret}&code={code_parameter}".format(app_id=FBAPPID, app_secret=FBSECRET, code_parameter=code, redirect_uri=environ['SCRIPT_URI'])
+        eurl = "https://graph.facebook.com/oauth/access_token?client_id={app_id}&redirect_uri={redirect_uri}&client_secret={app_secret}&code={code_parameter}".format(
+            app_id=FBAPPID, app_secret=FBSECRET, code_parameter=code, redirect_uri=environ['SCRIPT_URI'])
         token = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())['access_token'][0]
 
         # get long-lived access token
-        eurl = "https://graph.facebook.com/oauth/access_token?grant_type=fb_exchange_token&client_id={app_id}&client_secret={app_secret}&fb_exchange_token={short_lived_token}".format(app_id=FBAPPID, app_secret=FBSECRET, short_lived_token=token)
+        eurl = "https://graph.facebook.com/oauth/access_token?grant_type=fb_exchange_token&client_id={app_id}&client_secret={app_secret}&fb_exchange_token={short_lived_token}".format(
+            app_id=FBAPPID, app_secret=FBSECRET, short_lived_token=token)
         values = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())
 
         ltoken = values['access_token'][0]
@@ -824,6 +849,7 @@ def doFacebook(url, environ, headers, options, cache):
     log('fb done')
     return
 
+
 def main():
     if 'REQUEST_URI' in os.environ:
         wsgiref.handlers.CGIHandler().run(cgi_wrapper)
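Reviewer note: the Accept-header loop in SimpleDownload.http_request steps the q-value down by 0.1 per accept group and ranks */* below the last group unless strict is set. A self-contained rerun of that logic with a trimmed-down MIMETYPE table (illustrative values, not the full table from morss.py):

    MIMETYPE = {'xml': ['text/xml', 'application/xml'],
                'html': ['text/html']}

    def accept_string(accept, strict=False):
        # first group gets q=1.0, the next 0.9, and so on;
        # '*/*' trails the last group by another 0.1 when not strict
        out = {}
        rank = 1.1
        for group in accept:
            rank -= 0.1
            for mime in MIMETYPE.get(group, [group]):
                out[mime] = rank
        if not strict:
            out['*/*'] = rank - 0.1
        # q=1.0 entries are emitted bare, the rest carry ';q=...'
        return ','.join([x + ';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])

    print accept_string(('xml', 'html'))
    # e.g. text/xml,application/xml,text/html;q=0.9,*/*;q=0.8 (dict order may vary)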
diff --git a/setup.py b/setup.py
index 7715db9..8a75267 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,8 @@
 from setuptools import setup, find_packages
 
 package_name = 'morss'
 
-setup( name=package_name,
+setup(
+    name=package_name,
     description='Get full-text RSS feeds',
     author='pictuga',
     author_email='contact at author name dot com',
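A quick smoke test for the renames across the three modules; assumes a Python 2 interpreter started in the morss/ source directory with the project's dependencies (lxml, dateutil, readability, html2text) installed:

    import feeds
    import feedify
    import morss

    # the snake_case names introduced by this patch
    assert callable(feeds.parse_time)
    assert callable(feeds.tag_NS)
    assert callable(feedify.get_rule)
    assert callable(morss.detect_encoding)
    assert callable(morss.count_words)
    print 'renames OK'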