Make most of the code pep8-compliant

Thanks a lot to github.com/SamuelMarks for his nice work
Branch: master
pictuga 2014-06-22 01:59:01 +02:00
parent da0a8feadd
commit f01efb7334
4 changed files with 348 additions and 316 deletions
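Nearly all of the diff below is a mechanical rename from camelCase to the snake_case that PEP 8 recommends for functions and variables (getRule becomes get_rule, feedItem becomes feed_item, countWord becomes count_words), plus operator spacing and wrapping of over-long lines; class names stay in CapWords, so Builder, FeedParser and friends are untouched. As a rough sketch of the rule being applied by hand here (this converter is an illustration, not part of the commit):

    import re

    def camel_to_snake(name):
        # insert '_' before an upper-case letter that follows a letter or digit
        return re.sub(r'(?<=[a-z0-9])([A-Z])', r'_\1', name).lower()

    # names taken from the diff below
    for old in ('getRule', 'formatString', 'feedItem', 'xgetCreate'):
        print camel_to_snake(old)  # get_rule, format_string, feed_item, xget_create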

feedify.py

@@ -1,23 +1,25 @@
 #!/usr/bin/env python
+import re
+import json
+import urlparse
+import urllib2
 from ConfigParser import ConfigParser
 from fnmatch import fnmatch
+import lxml.html
 import feeds
 import morss
-import re
-import urllib2
-import lxml.html
-import json
-import urlparse

-def toclass(query):
+
+def to_class(query):
     pattern = r'\[class=([^\]]+)\]'
     repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
     return re.sub(pattern, repl, query)

-def getRule(link):
+
+def get_rule(link):
     config = ConfigParser()
     config.read('feedify.ini')
@@ -29,10 +31,12 @@ def getRule(link):
         return values
     return False

-def supported(link):
-    return getRule(link) is not False
+
+def supported(link):
+    return get_rule(link) is not False

-def formatString(string, getter, error=False):
+
+def format_string(string, getter, error=False):
     out = ""
     char = string[0]
@@ -42,41 +46,42 @@ def formatString(string, getter, error=False):
         match = follow.partition('"')
         out = match[0]
         if len(match) >= 2:
-            next = match[2]
+            next_match = match[2]
         else:
-            next = None
+            next_match = None
     elif char == '{':
         match = follow.partition('}')
         try:
-            test = formatString(match[0], getter, True)
-        except ValueError, KeyError:
+            test = format_string(match[0], getter, True)
+        except (ValueError, KeyError):
             pass
         else:
             out = test
-            next = match[2]
+            next_match = match[2]
     elif char == ' ':
-        next = follow
+        next_match = follow
     elif re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string):
         match = re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string).groups()
-        rawValue = getter(match[0])
-        if not isinstance(rawValue, basestring):
+        raw_value = getter(match[0])
+        if not isinstance(raw_value, basestring):
             if match[1] is not None:
-                out = match[1].join(rawValue)
+                out = match[1].join(raw_value)
             else:
-                out = ''.join(rawValue)
+                out = ''.join(raw_value)
         if not out and error:
             raise ValueError
-        next = match[2]
+        next_match = match[2]
     else:
         raise ValueError('bogus string')
-    if next is not None and len(next):
-        return out + formatString(next, getter, error)
+    if next_match is not None and len(next_match):
+        return out + format_string(next_match, getter, error)
     else:
         return out

-def PreWorker(url, cache):
+
+def pre_worker(url, cache):
     if urlparse.urlparse(url).netloc == 'itunes.apple.com':
         match = re.search('/id([0-9]+)(\?.*)?$', url)
         if match:
@@ -84,6 +89,7 @@ def PreWorker(url, cache):
             redirect = 'https://itunes.apple.com/lookup?id={id}'.format(id=iid)
             cache.set('redirect', redirect)

+
 class Builder(object):
     def __init__(self, link, data=None, cache=False):
         self.link = link
@@ -93,11 +99,11 @@ class Builder(object):
             data = urllib2.urlopen(link).read()
         self.data = data

-        self.rule = getRule(link)
+        self.rule = get_rule(link)

         if self.rule['mode'] == 'xpath':
             if not isinstance(self.data, unicode):
-                self.data = self.data.decode(morss.detEncoding(self.data), 'replace')
+                self.data = self.data.decode(morss.detect_encoding(self.data), 'replace')
             self.doc = lxml.html.fromstring(self.data)
         elif self.rule['mode'] == 'json':
             self.doc = json.loads(data)
@@ -106,7 +112,7 @@ class Builder(object):
     def raw(self, html, expr):
         if self.rule['mode'] == 'xpath':
-            return html.xpath(toclass(expr))
+            return html.xpath(to_class(expr))
         elif self.rule['mode'] == 'json':
             a = [html]
@@ -119,7 +125,7 @@ class Builder(object):
                 if kids is None:
                     pass
                 elif isinstance(kids, list):
-                    [b.append(i) for i in kids]
+                    b += kids
                 elif isinstance(kids, basestring):
                     b.append(kids.replace('\n', '<br/>'))
                 else:
@@ -128,7 +134,7 @@ class Builder(object):
             if match[1] is None:
                 a = b
             else:
-                if len(b)-1 >= int(match[1]):
+                if len(b) - 1 >= int(match[1]):
                     a = [b[int(match[1])]]
                 else:
                     a = []
@@ -150,7 +156,7 @@ class Builder(object):
     def string(self, html, expr):
         getter = lambda x: self.strings(html, x)
-        return formatString(self.rule[expr], getter)
+        return format_string(self.rule[expr], getter)

     def build(self):
         if 'title' in self.rule:
@@ -160,23 +166,22 @@ class Builder(object):
         matches = self.raw(self.doc, self.rule['items'])
         if matches and len(matches):
             for item in matches:
-                feedItem = {}
+                feed_item = {}
                 if 'item_title' in self.rule:
-                    feedItem['title'] = self.string(item, 'item_title')
+                    feed_item['title'] = self.string(item, 'item_title')
                 if 'item_link' in self.rule:
                     url = self.string(item, 'item_link')
                     url = urlparse.urljoin(self.link, url)
-                    feedItem['link'] = url
+                    feed_item['link'] = url
                 if 'item_desc' in self.rule:
-                    feedItem['desc'] = self.string(item, 'item_desc')
+                    feed_item['desc'] = self.string(item, 'item_desc')
                 if 'item_content' in self.rule:
-                    feedItem['content'] = self.string(item, 'item_content')
+                    feed_item['content'] = self.string(item, 'item_content')
                 if 'item_time' in self.rule:
-                    feedItem['updated'] = self.string(item, 'item_time')
+                    feed_item['updated'] = self.string(item, 'item_time')
                 if 'item_id' in self.rule:
-                    feedItem['id'] = self.string(item, 'item_id')
-                    feedItem['isPermaLink'] = False
-                self.feed.items.append(feedItem)
+                    feed_item['id'] = self.string(item, 'item_id')
+                    feed_item['isPermaLink'] = False
+                self.feed.items.append(feed_item)
feeds.py

@@ -1,14 +1,16 @@
 #!/usr/bin/env python
-from lxml import etree
 from datetime import datetime
-import dateutil.parser
-from dateutil import tz
-import re
 from StringIO import StringIO
+import re
 import json
 import csv
+import urllib2
+from lxml import etree
+from dateutil import tz
+import dateutil.parser

 try:
     from wheezy.template.engine import Engine
@@ -26,21 +28,22 @@ except ImportError:
     Element = etree.Element

 NSMAP = {'atom': 'http://www.w3.org/2005/Atom',
          'atom03': 'http://purl.org/atom/ns#',
          'media': 'http://search.yahoo.com/mrss/',
          'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
          'slash': 'http://purl.org/rss/1.0/modules/slash/',
          'dc': 'http://purl.org/dc/elements/1.1/',
          'content': 'http://purl.org/rss/1.0/modules/content/',
          'rssfake': 'http://purl.org/rss/1.0/'}

+
 def load(url):
-    import urllib2
     d = urllib2.urlopen(url).read()
     return parse(d)

-def tagNS(tag, nsmap=NSMAP):
+
+def tag_NS(tag, nsmap=NSMAP):
     match = re.search(r'^\{([^\}]+)\}(.*)$', tag)
     if match:
         match = match.groups()
@@ -55,15 +58,19 @@ def tagNS(tag, nsmap=NSMAP):
         return "{%s}%s" % (nsmap[match[0]], match[1].lower())
     return tag

-def innerHTML(xml):
+
+def inner_html(xml):
     return (xml.text or '') + ''.join([etree.tostring(child) for child in xml.iterchildren()])

-def cleanNode(xml):
+
+def clean_node(xml):
     [xml.remove(child) for child in xml.iterchildren()]

+
 class FeedException(Exception):
     pass

+
 def parse(data):
     # encoding
     match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
@@ -80,15 +87,16 @@ def parse(data):
     # rss
     match = doc.xpath("//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF", namespaces=NSMAP)
     if len(match):
-        mtable = { 'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS,
-                'atom03:feed': FeedParserAtom, 'atom:feed': FeedParserAtom }
+        m_table = {'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS,
+                   'atom03:feed': FeedParserAtom, 'atom:feed': FeedParserAtom}
         match = match[0]
-        tag = tagNS(match.tag)
-        if tag in mtable:
-            return mtable[tag](doc, tag)
+        tag = tag_NS(match.tag)
+        if tag in m_table:
+            return m_table[tag](doc, tag)
     raise FeedException('unknown feed type')

+
 class FeedBase(object):
     """
     Base for xml-related classes, which provides simple wrappers around xpath
@@ -135,7 +143,7 @@ class FeedBase(object):
         else:
             return ""

-    def xgetCreate(self, table):
+    def xget_create(self, table):
         """ Returns an element, and creates it when not present """
         value = table[self.tag]
         if not isinstance(value, tuple):
@@ -145,7 +153,7 @@ class FeedBase(object):
         if match is not None:
             return match
         else:
-            element = etree.Element(tagNS(new))
+            element = etree.Element(tag_NS(new))
             self.root.append(element)
             return element
@@ -158,58 +166,62 @@ class FeedBase(object):
         """ Returns string using lxml. Arguments passed to tostring """
         return etree.tostring(self.xml, pretty_print=True, **k)

+
 class FeedDescriptor(object):
     """
     Descriptor which gives off elements based on "self.getName" and
     "self.setName" as getter/setters. Looks far better, and avoids duplicates
     """

     def __init__(self, name):
         self.name = name
-        self.nname = name[0].upper() + name[1:]

     def __get__(self, instance, owner):
-        getter = getattr(instance, 'get%s' % self.nname)
+        getter = getattr(instance, 'get_%s' % self.name)
         return getter()

     def __set__(self, instance, value):
-        setter = getattr(instance, 'set%s' % self.nname)
+        setter = getattr(instance, 'set_%s' % self.name)
         return setter(value)

     def __delete__(self, instance):
-        deleter = getattr(instance, 'del%s' % self.nname)
+        deleter = getattr(instance, 'del_%s' % self.name)
         return deleter()

+
 class FeedTime(FeedDescriptor):
     def __get__(self, instance, owner):
-        getter = getattr(instance, 'get%s' % self.nname)
+        getter = getattr(instance, 'get_%s' % self.name)
         raw = getter()
         try:
-            time = parseTime(raw)
+            time = parse_time(raw)
             return time
         except ValueError:
             return None

     def __set__(self, instance, value):
         try:
-            time = parseTime(value)
+            time = parse_time(value)
             raw = time.strftime(instance.timeFormat)
-            setter = getattr(instance, 'set%s' % self.nname)
+            setter = getattr(instance, 'set_%s' % self.name)
             return setter(raw)
         except ValueError:
             pass

+
 class FeedBool(FeedDescriptor):
     def __get__(self, instance, owner):
-        getter = getattr(instance, 'get%s' % self.nname)
+        getter = getattr(instance, 'get_%s' % self.name)
         raw = getter()
         return (raw or '').lower() != 'false'

     def __set__(self, instance, value):
         raw = 'true' if value else 'false'
-        setter = getattr(instance, 'set%s' % self.nname)
+        setter = getattr(instance, 'set_%s' % self.name)
         return setter(raw)

-def parseTime(value):
+
+def parse_time(value):
     if isinstance(value, basestring):
         if re.match(r'^[0-9]+$', value):
             return datetime.fromtimestamp(int(value), tz.tzutc())
@@ -222,6 +234,7 @@ def parseTime(value):
     else:
         return False

+
 class FeedList(object):
     """
     Class to map a list of xml elements against a list of matching objects,
@@ -231,14 +244,15 @@ class FeedList(object):
     Comes with its very own descriptor.
     """

-    def __init__(self, parent, getter, tag, childClass):
+    def __init__(self, parent, getter, tag, child_class):
         self.parent = parent
         self.getter = getter
-        self.childClass = childClass
+        self.childClass = child_class
         self.tag = tag
         self._children = {}  # id(xml) => FeedItem

-    def getChildren(self):
+    def get_children(self):
         children = self.getter()
         out = []
         for child in children:
@@ -269,7 +283,7 @@ class FeedList(object):
         return new

     def __getitem__(self, key):
-        return self.getChildren()[key]
+        return self.get_children()[key]

     def __delitem__(self, key):
         child = self.getter()[key]
@@ -282,22 +296,24 @@ class FeedList(object):
     def __len__(self):
         return len(self.getter())

+
 class FeedListDescriptor(object):
     """
     Descriptor for FeedList
     """

     def __init__(self, name):
         self.name = name
         self.items = {}  # id(instance) => FeedList

     def __get__(self, instance, owner=None):
         key = id(instance)
         if key in self.items:
             return self.items[key]
         else:
-            getter = getattr(instance, 'get%s' % self.name.title())
-            className = globals()[getattr(instance, '%sClass' % self.name)]
-            self.items[key] = FeedList(instance, getter, instance.tag, className)
+            getter = getattr(instance, 'get_%s' % self.name)
+            class_name = globals()[getattr(instance, '%sClass' % self.name)]
+            self.items[key] = FeedList(instance, getter, instance.tag, class_name)
             return self.items[key]

     def __set__(self, instance, value):
@@ -305,6 +321,7 @@ class FeedListDescriptor(object):
         [x.remove() for x in [x for x in f.items]]
         [feedlist.append(x) for x in value]

+
 class FeedParser(FeedBase):
     itemsClass = 'FeedItem'
     mimetype = 'application/xml'
@@ -318,27 +335,25 @@ class FeedParser(FeedBase):
         self.root = self.xml.xpath("//atom03:feed|//atom:feed|//channel|//rssfake:channel", namespaces=NSMAP)[0]
         self.tag = tag

-    def getTitle(self):
+    def get_title(self):
         return ""

-    def setTitle(self, value):
+    def set_title(self, value):
         pass

-    def delTitle(self):
+    def del_title(self):
         self.title = ""

-    def getDesc(self):
+    def get_desc(self):
         pass

-    def setDesc(self, value):
+    def set_desc(self, value):
         pass

-    def delDesc(self):
+    def del_desc(self):
         self.desc = ""

-    def getItems(self):
+    def get_items(self):
         return []

     title = FeedDescriptor('title')
@@ -355,7 +370,8 @@ class FeedParser(FeedBase):
         out = StringIO()
         c = csv.writer(out, dialect=csv.excel)
         for item in self.items:
-            row = [x[1].encode('utf-8') if isinstance(x[1], unicode) else x[1] for x in item if isinstance(x[1], basestring)]
+            row = [x[1].encode('utf-8') if isinstance(x[1], unicode) else x[1] for x in item if
+                   isinstance(x[1], basestring)]
             c.writerow(row)
         out.seek(0)
         return out.read()
@@ -367,7 +383,8 @@ class FeedParser(FeedBase):
         loader = DictLoader({'reader': open('reader.html.template').read()})
         engine = Engine(loader=loader, extensions=[CoreExtension()])
         template = engine.get_template('reader')
-        return template.render({'feed':self}).encode('utf-8')
+        return template.render({'feed': self}).encode('utf-8')

+
 class FeedParserRSS(FeedParser):
     """
@@ -375,161 +392,153 @@ class FeedParserRSS(FeedParser):
     """
     itemsClass = 'FeedItemRSS'
     mimetype = 'application/rss+xml'
-    base = { 'rdf:rdf': '<?xml version="1.0" encoding="utf-8"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/"><channel rdf:about="http://example.org/rss.rdf"></channel></rdf:RDF>',
-            'channel': '<?xml version="1.0" encoding="utf-8"?><rss version="2.0"><channel></channel></rss>'}
+    base = {
+        'rdf:rdf': '<?xml version="1.0" encoding="utf-8"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/"><channel rdf:about="http://example.org/rss.rdf"></channel></rdf:RDF>',
        'channel': '<?xml version="1.0" encoding="utf-8"?><rss version="2.0"><channel></channel></rss>'}

-    def getTitle(self):
+    def get_title(self):
         return self.xval('rssfake:title|title')

-    def setTitle(self, value):
+    def set_title(self, value):
         if not value:
             return self.xdel('rssfake:title|title')

-        table = { 'rdf:rdf': 'rssfake:title',
+        table = {'rdf:rdf': 'rssfake:title',
                  'channel': 'title'}
-        element = self.xgetCreate(table)
+        element = self.xget_create(table)
         element.text = value

-    def getDesc(self):
+    def get_desc(self):
         return self.xval('rssfake:description|description')

-    def setDesc(self, value):
+    def set_desc(self, value):
         if not value:
             return self.xdel('rssfake:description|description')

-        table = { 'rdf:rdf': 'rssfake:description',
+        table = {'rdf:rdf': 'rssfake:description',
                  'channel': 'description'}
-        element = self.xgetCreate(table)
+        element = self.xget_create(table)
         element.text = value

-    def getItems(self):
+    def get_items(self):
         return self.xpath('rssfake:item|item')


 class FeedParserAtom(FeedParser):
     """
     Atom Parser
     """
     itemsClass = 'FeedItemAtom'
     mimetype = 'application/atom+xml'
-    base = { 'atom:feed': '<?xml version="1.0" encoding="utf-8"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>',
+    base = {'atom:feed': '<?xml version="1.0" encoding="utf-8"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>',
             'atom03:feed': '<?xml version="1.0" encoding="utf-8"?><feed version="0.3" xmlns="http://purl.org/atom/ns#"></feed>'}

-    def getTitle(self):
+    def get_title(self):
         return self.xval('atom:title|atom03:title')

-    def setTitle(self, value):
+    def set_title(self, value):
         if not value:
             return self.xval('atom:title|atom03:title')

-        table = { 'atom:feed': 'atom:title',
+        table = {'atom:feed': 'atom:title',
                  'atom03:feed': 'atom03:title'}
-        element = self.xgetCreate(table)
+        element = self.xget_create(table)
         element.text = value

-    def getDesc(self):
+    def get_desc(self):
         return self.xval('atom:subtitle|atom03:subtitle')

-    def setDesc(self, value):
+    def set_desc(self, value):
         if not value:
             return self.xdel('atom:subtitle|atom03:subtitle')

-        table = { 'atom:feed': 'atom:subtitle',
+        table = {'atom:feed': 'atom:subtitle',
                  'atom03:feed': 'atom03:subtitle'}
-        element = self.xgetCreate(table)
+        element = self.xget_create(table)
         element.text = value

-    def getItems(self):
+    def get_items(self):
         return self.xpath('atom:entry|atom03:entry')


 class FeedItem(FeedBase):
     timeFormat = ''
-    dic = ('title', 'link', 'desc', 'content', 'id', 'isPermaLink', 'time', 'updated')
+    dic = ('title', 'link', 'desc', 'content', 'id', 'is_permalink', 'time', 'updated')

     def __init__(self, xml=None, tag='atom:feed'):
         if xml is None:
-            xml = Element(tagNS(self.base[tag]))
+            xml = Element(tag_NS(self.base[tag]))

         self.root = self.xml = xml
         self.tag = tag

-    def getTitle(self):
+    def get_title(self):
         return ""

-    def setTitle(self):
+    def set_title(self, value):
         pass

-    def delTitle(self):
+    def del_title(self):
         self.title = ""

-    def getLink(self):
+    def get_link(self):
         return ""

-    def setLink(self, value):
+    def set_link(self, value):
         pass

-    def delLink(self):
+    def del_link(self):
         self.link = ""

-    def getIsPermaLink(self):
+    def get_is_permalink(self):
         return ""

-    def setIsPermaLink(self, value):
+    def set_is_permalink(self, value):
         pass

-    def getDesc(self):
+    def get_desc(self):
         return ""

-    def setDesc(self, value):
+    def set_desc(self, value):
         pass

-    def delDesc(self):
+    def del_desc(self):
         self.desc = ""

-    def getContent(self):
+    def get_content(self):
         return ""

-    def setContent(self, value):
+    def set_content(self, value):
         pass

-    def delContent(self):
+    def del_content(self):
         self.content = ""

-    def getId(self):
+    def get_id(self):
         return ""

-    def setId(self, value):
+    def set_id(self, value):
         pass

-    def delId(self):
+    def del_id(self):
         self.id = ""

-    def getTime(self):
+    def get_time(self):
         return None

-    def setTime(self, value):
+    def set_time(self, value):
         pass

     def delTime(self):
         self.time = None

-    def getUpdated(self):
+    def get_updated(self):
         return None

-    def setUpdated(self, value):
+    def set_updated(self, value):
         pass

-    def delUpdated(self):
+    def del_updated(self):
         self.updated = None

     title = FeedDescriptor('title')
@@ -537,11 +546,11 @@ class FeedItem(FeedBase):
     description = desc = FeedDescriptor('desc')
     content = FeedDescriptor('content')
     id = FeedDescriptor('id')
-    isPermaLink = FeedBool('isPermaLink')
+    is_permalink = FeedBool('is_permalink')
     time = FeedTime('time')
     updated = FeedTime('updated')

-    def pushContent(self, value):
+    def push_content(self, value):
         if not self.desc and self.content:
             self.desc = self.content
@@ -550,201 +559,192 @@ class FeedItem(FeedBase):
     def remove(self):
         self.xml.getparent().remove(self.xml)


 class FeedItemRSS(FeedItem):
     timeFormat = '%a, %d %b %Y %H:%M:%S %Z'
-    base = { 'rdf:rdf': 'rssfake:item',
+    base = {'rdf:rdf': 'rssfake:item',
             'channel': 'item'}

-    def getTitle(self):
+    def get_title(self):
         return self.xval('rssfake:title|title')

-    def setTitle(self, value):
+    def set_title(self, value):
         if not value:
             return self.xdel('rssfake:title|title')

-        table = { 'rdf:rdf': 'rssfake:title',
+        table = {'rdf:rdf': 'rssfake:title',
                  'channel': 'title'}
-        element = self.xgetCreate(table)
+        element = self.xget_create(table)
         element.text = value

-    def getLink(self):
+    def get_link(self):
         return self.xval('rssfake:link|link')

-    def setLink(self, value):
-        if self.isPermaLink and self.id == self.link != value:
-            self.isPermaLink = False
+    def set_link(self, value):
+        if self.is_permalink and self.id == self.link != value:
+            self.is_permalink = False

-        table = { 'rdf:rdf': 'rssfake:link',
+        table = {'rdf:rdf': 'rssfake:link',
                  'channel': 'link'}
-        element = self.xgetCreate(table)
+        element = self.xget_create(table)
         element.text = value

-    def getDesc(self):
+    def get_desc(self):
         return self.xval('rssfake:description|description')

-    def setDesc(self, value):
+    def set_desc(self, value):
         if not value:
             return self.xdel('rssfake:description|description')

-        table = { 'rdf:rdf': 'rssfake:description',
+        table = {'rdf:rdf': 'rssfake:description',
                  'channel': 'description'}
-        element = self.xgetCreate(table)
+        element = self.xget_create(table)
         element.text = value

-    def getContent(self):
+    def get_content(self):
         return self.xval('content:encoded')

-    def setContent(self, value):
+    def set_content(self, value):
         if not value:
             return self.xdel('content:encoded')

-        table = { 'rdf:rdf': 'content:encoded',
+        table = {'rdf:rdf': 'content:encoded',
                  'channel': 'content:encoded'}
-        element = self.xgetCreate(table)
+        element = self.xget_create(table)
         element.text = value

-    def getId(self):
+    def get_id(self):
         return self.xval('rssfake:guid|guid')

-    def setId(self, value):
+    def set_id(self, value):
         if not value:
             return self.xdel('rssfake:guid|guid')

-        table = { 'rdf:rdf': 'rssfake:guid',
+        table = {'rdf:rdf': 'rssfake:guid',
                  'channel': 'guid'}
-        element = self.xgetCreate(table)
+        element = self.xget_create(table)
         element.text = value

-    def getIsPermaLink(self):
+    def get_is_permalink(self):
         return self.xget('rssfake:guid/@isPermaLink|guid/@isPermaLink')

-    def setIsPermaLink(self, value):
-        table = { 'rdf:rdf': 'rssfake:guid',
+    def set_is_permalink(self, value):
+        table = {'rdf:rdf': 'rssfake:guid',
                  'channel': 'guid'}
-        element = self.xgetCreate(table)
+        element = self.xget_create(table)
         element.attrib['isPermaLink'] = value

-    def getTime(self):
+    def get_time(self):
         return self.xval('rssfake:pubDate|pubDate')

-    def setTime(self, value):
+    def set_time(self, value):
         if not value:
             return self.xdel('rssfake:pubDate|pubDate')

-        table = { 'rdf:rdf': 'rssfake:pubDate',
+        table = {'rdf:rdf': 'rssfake:pubDate',
                  'channel': 'pubDate'}
-        element = self.xgetCreate(table)
+        element = self.xget_create(table)
         element.text = value


 class FeedItemAtom(FeedItem):
     timeFormat = '%Y-%m-%dT%H:%M:%SZ'
-    base = { 'atom:feed': 'atom:entry',
+    base = {'atom:feed': 'atom:entry',
             'atom03:feed': 'atom03:entry'}

-    def getTitle(self):
+    def get_title(self):
         return self.xval('atom:title|atom03:title')

-    def setTitle(self, value):
+    def set_title(self, value):
         if not value:
             return self.xdel('atom:title|atom03:title')

-        table = { 'atom:feed': 'atom:title',
+        table = {'atom:feed': 'atom:title',
                  'atom03:feed': 'atom03:title'}
-        element = self.xgetCreate(table)
+        element = self.xget_create(table)
         element.text = value

-    def getLink(self):
+    def get_link(self):
         return self.xget('(atom:link|atom03:link)[@rel="alternate" or not(@rel)]/@href')

-    def setLink(self, value):
-        table = { 'atom:feed': ('atom:link', 'atom:link[@rel="alternate" or not(@rel)]'),
+    def set_link(self, value):
+        table = {'atom:feed': ('atom:link', 'atom:link[@rel="alternate" or not(@rel)]'),
                  'atom03:feed': ('atom03:link', 'atom03:link[@rel="alternate" or not(@rel)]')}
-        element = self.xgetCreate(table)
+        element = self.xget_create(table)
         element.attrib['href'] = value

-    def getDesc(self):
+    def get_desc(self):
         # default "type" is "text"
         element = self.xget('atom:summary|atom03:summary')
         if element is not None:
-            return innerHTML(element)
+            return inner_html(element)
         else:
             return ""

-    def setDesc(self, value):
+    def set_desc(self, value):
         if not value:
             return self.xdel('atom:summary|atom03:summary')

-        table = { 'atom:feed': 'atom:summary',
+        table = {'atom:feed': 'atom:summary',
                  'atom03:feed': 'atom03:summary'}
-        element = self.xgetCreate(table)
+        element = self.xget_create(table)
         if element.attrib.get('type', '') == 'xhtml':
-            cleanNode(element)
+            clean_node(element)
         element.attrib['type'] = 'html'
         element.text = value

-    def getContent(self):
+    def get_content(self):
         element = self.xget('atom:content|atom03:content')
         if element is not None:
-            return innerHTML(element)
+            return inner_html(element)
         else:
             return ""

-    def setContent(self, value):
+    def set_content(self, value):
         if not value:
             return self.xdel('atom:content|atom03:content')

-        table = { 'atom:feed': 'atom:content',
+        table = {'atom:feed': 'atom:content',
                  'atom03:feed': 'atom03:content'}
-        element = self.xgetCreate(table)
+        element = self.xget_create(table)
         if element.attrib.get('type', '') == 'xhtml':
-            cleanNode(element)
+            clean_node(element)
         element.attrib['type'] = 'html'
         element.text = value

-    def getId(self):
+    def get_id(self):
         return self.xval('atom:id|atom03:id')

-    def setId(self, value):
+    def set_id(self, value):
         if not value:
             return self.xdel('atom:id|atom03:id')

-        table = { 'atom:feed': 'atom:id',
+        table = {'atom:feed': 'atom:id',
                  'atom03:feed': 'atom03:id'}
-        element = self.xgetCreate(table)
+        element = self.xget_create(table)
         element.text = value

-    def getTime(self):
+    def get_time(self):
         return self.xval('atom:published|atom03:published')

-    def setTime(self, value):
+    def set_time(self, value):
         if not value:
             return self.xdel('atom:published|atom03:published')

-        table = { 'atom:feed': 'atom:published',
+        table = {'atom:feed': 'atom:published',
                  'atom03:feed': 'atom03:published'}
-        element = self.xgetCreate(table)
+        element = self.xget_create(table)
         element.text = value

-    def getUpdated(self):
+    def get_updated(self):
         return self.xval('atom:updated|atom03:updated')

-    def setUpdated(self, value):
+    def set_updated(self, value):
         if not value:
             return self.xdel('atom:updated|atom03:updated')

-        table = { 'atom:feed': 'atom:updated',
+        table = {'atom:feed': 'atom:updated',
                  'atom03:feed': 'atom03:updated'}
-        element = self.xgetCreate(table)
+        element = self.xget_create(table)
         element.text = value
morss.py

@@ -31,21 +31,22 @@ from StringIO import StringIO
 from readability import readability
 from html2text import HTML2Text

 LIM_ITEM = 100  # deletes what's beyond
 LIM_TIME = 7  # deletes what's after
 MAX_ITEM = 50  # cache-only beyond
 MAX_TIME = 7  # cache-only after (in sec)
-DELAY = 10*60  # xml cache & ETag cache (in sec)
+DELAY = 10 * 60  # xml cache & ETag cache (in sec)
 TIMEOUT = 2  # http timeout (in sec)
 THREADS = 10  # number of threads (1 for single-threaded)

 DEBUG = False

 UA_RSS = 'Liferea/1.8.12 (Linux; fr_FR.utf8; http://liferea.sf.net/)'
 UA_HTML = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'

-MIMETYPE = { 'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
-            'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
+MIMETYPE = {
+    'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
+    'html': ['text/html', 'application/xhtml+xml', 'application/xml']}

 FBAPPID = "<insert yours>"
 FBSECRET = "<insert yours>"
@@ -57,11 +58,14 @@ if 'SCRIPT_NAME' in os.environ:
     httplib.HTTPConnection.debuglevel = 1

     import cgitb
     cgitb.enable()

+
 class MorssException(Exception):
     pass

+
 def log(txt, force=False):
     if DEBUG or force:
         if 'REQUEST_URI' in os.environ:
@@ -70,17 +74,18 @@ def log(txt, force=False):
             print repr(txt)

-def lenHTML(txt):
+
+def len_html(txt):
     if len(txt):
         return len(lxml.html.fromstring(txt).text_content())
     else:
         return 0

-def countWord(txt):
+
+def count_words(txt):
     if len(txt):
         return len(lxml.html.fromstring(txt).text_content().split())
-    else: return 0
+    else:
+        return 0

 class Options:
     def __init__(self, options=None):
@@ -95,9 +100,11 @@ class Options:
     def __contains__(self, key):
         return key in self.options

+
 class Cache:
     """ Light, error-prone caching system. """

-    def __init__(self, folder=None, key='cache', lifespan=10*24*3600):
+    def __init__(self, folder=None, key='cache', lifespan=10 * 24 * 3600):
         self._key = key
         self._dir = folder
         self._lifespan = lifespan
@@ -108,7 +115,7 @@ class Cache:
             self._hash = "NO CACHE"
             return

-        maxsize = os.statvfs('./').f_namemax - len(self._dir) - 1 - 4 # ".tmp"
+        maxsize = os.statvfs('./').f_namemax - len(self._dir) - 1 - 4  # ".tmp"
         self._hash = urllib.quote_plus(self._key)[:maxsize]
         self._file = self._dir + '/' + self._hash
@@ -178,13 +185,16 @@ class Cache:
         else:
             return self

+
 class SimpleDownload(urllib2.HTTPCookieProcessor):
     """
     Custom urllib2 handler to download a page, using etag/last-modified headers,
     to save bandwidth. The given headers are added back into the header on error
     304 for easier use.
     """

-    def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=True, cookiejar=None, accept=None, strict=False):
+    def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=True, cookiejar=None,
+                 accept=None, strict=False):
         urllib2.HTTPCookieProcessor.__init__(self, cookiejar)
         self.cache = cache
         self.etag = etag
@@ -214,7 +224,7 @@ class SimpleDownload(urllib2.HTTPCookieProcessor):
             out = {}
             rank = 1.1
             for group in self.accept:
-                rank = rank - 0.1
+                rank -= 0.1
                 if isinstance(group, basestring):
                     if group in MIMETYPE:
@@ -228,9 +238,9 @@ class SimpleDownload(urllib2.HTTPCookieProcessor):
                         out[mime] = rank

             if not self.strict:
-                out['*/*'] = rank-0.1
+                out['*/*'] = rank - 0.1

-            string = ','.join([x+';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])
+            string = ','.join([x + ';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])
             req.add_unredirected_header('Accept', string)

         return req
@@ -259,20 +269,20 @@ class SimpleDownload(urllib2.HTTPCookieProcessor):
             if resp.info().type in MIMETYPE['html']:
                 match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
                 if match:
-                    newurl = match.groups()[0]
-                    log('redirect: %s' % newurl)
+                    new_url = match.groups()[0]
+                    log('redirect: %s' % new_url)

-                    newheaders = dict((k,v) for k,v in req.headers.items()
+                    new_headers = dict((k, v) for k, v in req.headers.items()
                                       if k.lower() not in ('content-length', 'content-type'))
-                    new = urllib2.Request(newurl,
-                                          headers=newheaders,
+                    new = urllib2.Request(new_url,
+                                          headers=new_headers,
                                           origin_req_host=req.get_origin_req_host(),
                                           unverifiable=True)
                     return self.parent.open(new, timeout=req.timeout)

             # encoding
-            enc = detEncoding(data, resp)
+            enc = detect_encoding(data, resp)
             if enc:
                 data = data.decode(enc, 'replace')
@@ -290,7 +300,8 @@ class SimpleDownload(urllib2.HTTPCookieProcessor):
     https_response = http_response
     https_request = http_request

-def detEncoding(data, con=None):
+
+def detect_encoding(data, con=None):
     if con is not None and con.headers.getparam('charset'):
         log('header')
         return con.headers.getparam('charset')
@@ -306,6 +317,7 @@ def detEncoding(data, con=None):
     return None

+
 def Fix(item, feedurl='/'):
     """ Improves feed items (absolute links, resolve feedburner links, etc) """
@@ -358,7 +370,8 @@ def Fix(item, feedurl='/'):
     match = re.search('/([0-9a-zA-Z]{20,})/story01.htm$', item.link)
     if match:
         url = match.groups()[0].split('0')
-        t = {'A':'0', 'B':'.', 'C':'/', 'D':'?', 'E':'-', 'H':',', 'I':'_', 'L':'http://', 'S':'www.', 'N':'.com', 'O':'.co.uk'}
+        t = {'A': '0', 'B': '.', 'C': '/', 'D': '?', 'E': '-', 'H': ',', 'I': '_', 'L': 'http://', 'S': 'www.',
+             'N': '.com', 'O': '.co.uk'}
         item.link = ''.join([(t[s[0]] if s[0] in t else '=') + s[1:] for s in url[1:]])
         log(item.link)
@@ -371,6 +384,7 @@ def Fix(item, feedurl='/'):
     return item

+
 def Fill(item, cache, feedurl='/', fast=False):
     """ Returns True when it has done its best """
@@ -381,8 +395,8 @@ def Fill(item, cache, feedurl='/', fast=False):
     log(item.link)

     # content already provided?
-    count_content = countWord(item.content)
-    count_desc = countWord(item.desc)
+    count_content = count_words(item.content)
+    count_desc = count_words(item.desc)

     if max(count_content, count_desc) > 500:
         if count_desc > count_content:
@@ -392,7 +406,7 @@ def Fill(item, cache, feedurl='/', fast=False):
         log('long enough')
         return True

-    if count_content > 5*count_desc > 0 and count_content > 50:
+    if count_content > 5 * count_desc > 0 and count_content > 50:
         log('content bigger enough')
         return True
@@ -432,7 +446,7 @@ def Fill(item, cache, feedurl='/', fast=False):
             log('old error')
         else:
             log('cached')
-            item.pushContent(cache.get(link))
+            item.push_content(cache.get(link))
             return True

     # super-fast mode
@@ -457,8 +471,8 @@ def Fill(item, cache, feedurl='/', fast=False):
     out = readability.Document(data, url=con.url).summary(True)

-    if countWord(out) > max(count_content, count_desc) > 0:
-        item.pushContent(out)
+    if count_words(out) > max(count_content, count_desc) > 0:
+        item.push_content(out)
         cache.set(link, out)
     else:
         log('not bigger enough')
@@ -467,7 +481,8 @@ def Fill(item, cache, feedurl='/', fast=False):
     return True

-def Init(url, cachePath, options):
+
+def Init(url, cache_path, options):
     # url clean up
     log(url)
@@ -481,14 +496,15 @@ def Init(url, cachePath, options):
     url = url.replace(' ', '%20')

     # cache
-    cache = Cache(cachePath, url)
+    cache = Cache(cache_path, url)
     log(cache._hash)

     return (url, cache)

+
 def Fetch(url, cache, options):
     # do some useful facebook work
-    feedify.PreWorker(url, cache)
+    feedify.pre_worker(url, cache)

     if 'redirect' in cache:
         url = cache.get('redirect')
@@ -502,8 +518,9 @@ def Fetch(url, cache, options):
         style = cache.get('style')
     else:
         try:
-            opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'), accept=('xml','html'))
-            con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT*2)
+            opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'),
+                                    accept=('xml', 'html'))
+            con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT * 2)
             xml = con.read()
         except (IOError, httplib.HTTPException):
             raise MorssException('Error downloading feed')
@@ -540,7 +557,8 @@ def Fetch(url, cache, options):
         feed.build()
         rss = feed.feed
     elif style == 'html':
-        match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
+        match = lxml.html.fromstring(xml).xpath(
+            "//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
         if len(match):
             link = urlparse.urljoin(url, match[0])
             log('rss redirect: %s' % link)
@@ -552,13 +570,13 @@ def Fetch(url, cache, options):
             log('random page')
             raise MorssException('Link provided is not a valid feed')

     cache.save()
     return rss

+
 def Gather(rss, url, cache, options):
     size = len(rss.items)
-    startTime = time.time()
+    start_time = time.time()

     # custom settings
     lim_item = LIM_ITEM
@@ -580,14 +598,14 @@ def Gather(rss, url, cache, options):
             queue.task_done()

     def worker(i, item):
-        if time.time() - startTime > lim_time >= 0 or i+1 > lim_item >= 0:
+        if time.time() - start_time > lim_time >= 0 or i + 1 > lim_item >= 0:
             log('dropped')
             item.remove()
             return

         item = Fix(item, url)

-        if time.time() - startTime > max_time >= 0 or i+1 > max_item >= 0:
+        if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0:
             if not options.proxy:
                 if Fill(item, cache, url, True) is False:
                     item.remove()
@@ -617,10 +635,11 @@ def Gather(rss, url, cache, options):
         new.time = "5 Oct 2013 22:42"

     log(len(rss.items))
-    log(time.time() - startTime)
+    log(time.time() - start_time)

     return rss

+
 def After(rss, options):
     for i, item in enumerate(rss.items):
@@ -662,8 +681,9 @@ def After(rss, options):
     else:
         return rss.tostring(xml_declaration=True, encoding='UTF-8')

+
 def process(url, cache=None, options=None):
-    if options == None:
+    if not options:
         options = []
     options = Options(options)
@@ -673,6 +693,7 @@ def process(url, cache=None, options=None):
     return After(rss, options)

+
 def cgi_app(environ, start_response):
     # get options
     if 'REQUEST_URI' in environ:
@@ -696,7 +717,8 @@ def cgi_app(environ, start_response):
     DEBUG = options.debug

     if 'HTTP_IF_NONE_MATCH' in environ:
-        if not options.force and not options.facebook and time.time() - int(environ['HTTP_IF_NONE_MATCH'][1:-1]) < DELAY:
+        if not options.force and not options.facebook and time.time() - int(
+                environ['HTTP_IF_NONE_MATCH'][1:-1]) < DELAY:
             headers['status'] = '304 Not Modified'
             start_response(headers['status'], headers.items())
             log(url)
@@ -722,30 +744,31 @@ def cgi_app(environ, start_response):
     url, cache = Init(url, os.getcwd() + '/cache', options)

     if options.facebook:
-        doFacebook(url, environ, headers, options, cache)
+        do_facebook(url, environ, headers, options, cache)
         start_response(headers['status'], headers.items())
         return

     # get the work done
-    RSS = Fetch(url, cache, options)
+    rss = Fetch(url, cache, options)

     if headers['content-type'] == 'text/xml':
-        headers['content-type'] = RSS.mimetype
+        headers['content-type'] = rss.mimetype

     start_response(headers['status'], headers.items())

-    RSS = Gather(RSS, url, cache, options)
+    rss = Gather(rss, url, cache, options)

     if not DEBUG and not options.silent:
-        return After(RSS, options)
+        return After(rss, options)

     log('done')

+
 def cgi_wrapper(environ, start_response):
     # simple http server for html and css
     files = {
         '': 'text/html',
         'index.html': 'text/html'}

     if 'REQUEST_URI' in environ:
         url = environ['REQUEST_URI'][1:]
@@ -774,13 +797,12 @@ def cgi_wrapper(environ, start_response):
     except (KeyboardInterrupt, SystemExit):
         raise
     except Exception as e:
-        headers = {}
-        headers['status'] = '500 Oops'
-        headers['content-type'] = 'text/plain'
+        headers = {'status': '500 Oops', 'content-type': 'text/plain'}
         start_response(headers['status'], headers.items(), sys.exc_info())
         log('ERROR: %s' % e.message, force=True)
         return 'An error happened'

+
 def cli_app():
     options = Options(sys.argv[1:-1])
     url = sys.argv[-1]
@@ -789,15 +811,16 @@ def cli_app():
     DEBUG = options.debug

     url, cache = Init(url, os.path.expanduser('~/.cache/morss'), options)
-    RSS = Fetch(url, cache, options)
-    RSS = Gather(RSS, url, cache, options)
+    rss = Fetch(url, cache, options)
+    rss = Gather(rss, url, cache, options)

     if not DEBUG and not options.silent:
-        print After(RSS, options)
+        print After(rss, options)

     log('done')

-def doFacebook(url, environ, headers, options, cache):
+
+def do_facebook(url, environ, headers, options, cache):
     log('fb stuff')

     query = urlparse.urlparse(url).query
@@ -805,11 +828,13 @@ def doFacebook(url, environ, headers, options, cache):
     if 'code' in query:
         # get real token from code
         code = urlparse.parse_qs(query)['code'][0]
-        eurl = "https://graph.facebook.com/oauth/access_token?client_id={app_id}&redirect_uri={redirect_uri}&client_secret={app_secret}&code={code_parameter}".format(app_id=FBAPPID, app_secret=FBSECRET, code_parameter=code, redirect_uri=environ['SCRIPT_URI'])
+        eurl = "https://graph.facebook.com/oauth/access_token?client_id={app_id}&redirect_uri={redirect_uri}&client_secret={app_secret}&code={code_parameter}".format(
+            app_id=FBAPPID, app_secret=FBSECRET, code_parameter=code, redirect_uri=environ['SCRIPT_URI'])
         token = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())['access_token'][0]

         # get long-lived access token
-        eurl = "https://graph.facebook.com/oauth/access_token?grant_type=fb_exchange_token&client_id={app_id}&client_secret={app_secret}&fb_exchange_token={short_lived_token}".format(app_id=FBAPPID, app_secret=FBSECRET, short_lived_token=token)
+        eurl = "https://graph.facebook.com/oauth/access_token?grant_type=fb_exchange_token&client_id={app_id}&client_secret={app_secret}&fb_exchange_token={short_lived_token}".format(
+            app_id=FBAPPID, app_secret=FBSECRET, short_lived_token=token)
         values = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())
         ltoken = values['access_token'][0]
@@ -824,6 +849,7 @@ def doFacebook(url, environ, headers, options, cache):
     log('fb done')
     return

+
 def main():
     if 'REQUEST_URI' in os.environ:
         wsgiref.handlers.CGIHandler().run(cgi_wrapper)
setup.py

@@ -1,7 +1,8 @@
 from setuptools import setup, find_packages

 package_name = 'morss'

-setup( name=package_name,
+setup(
+    name=package_name,
     description='Get full-text RSS feeds',
     author='pictuga',
     author_email='contact at author name dot com',