Add feedify, and use it in morss

master
pictuga 2013-09-25 12:36:21 +02:00
parent 9bc4417be3
commit da14242bcf
2 changed files with 74 additions and 0 deletions

feedify.py 100644 (new file, 69 additions)

@@ -0,0 +1,69 @@
#!/usr/bin/env python
from ConfigParser import ConfigParser
from fnmatch import fnmatch
import feeds
import re
import urllib2
import lxml.html
import urlparse

def toclass(query):
	# Expand the "[class=foo]" shorthand into a real XPath class test,
	# so a rule can match elements by CSS class.
	pattern = r'\[class=([^\]]+)\]'
	repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
	return re.sub(pattern, repl, query)

def getRule(link):
	# Return the values of the first feedify.ini section whose "path" glob
	# matches the link, or False if no section matches.
	config = ConfigParser()
	config.read('feedify.ini')
	for section in config.sections():
		values = dict(config.items(section))
		# the multi-line "path" value starts with an empty first element; drop it
		values['path'] = values['path'].split('\n')[1:]
		for path in values['path']:
			if fnmatch(link, path):
				return values
	return False

def supported(link):
	return getRule(link) is not False

def getString(expr, html):
	# Text content of the first match of the expression, or an empty string.
	match = html.xpath(toclass(expr))
	if len(match):
		return match[0].text_content()
	else:
		return ''

def build(link, data=None):
	# Build an Atom-style feed from the page at "link", following the
	# matching feedify.ini rule. Downloads the page if no data is given.
	rule = getRule(link)
	if rule is False:
		return False

	if data is None:
		data = urllib2.urlopen(link).read()

	html = lxml.html.fromstring(data)
	feed = feeds.FeedParserAtom()

	if 'title' in rule:
		feed.title = html.xpath(toclass(rule['title']))[0]

	if 'items' in rule:
		for item in html.xpath(toclass(rule['items'])):
			feedItem = {}

			if 'item_title' in rule:
				feedItem['title'] = item.xpath(toclass(rule['item_title']))[0]
			if 'item_link' in rule:
				url = item.xpath(toclass(rule['item_link']))[0]
				url = urlparse.urljoin(link, url)
				feedItem['link'] = url
			if 'item_desc' in rule:
				feedItem['desc'] = lxml.html.tostring(item.xpath(toclass(rule['item_desc']))[0], encoding='unicode')
			if 'item_content' in rule:
				feedItem['content'] = lxml.html.tostring(item.xpath(toclass(rule['item_content']))[0])

			feed.items.append(feedItem)

	return feed
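
getRule() above expects its rules in a feedify.ini file next to the script. The entry below is purely illustrative (the section name, glob pattern and XPath expressions are made up); it only shows the shape the code consumes: the multi-line "path" value begins on the line after the key, because getRule() discards the first element of split('\n'), and the [class=...] shorthand is expanded into a full XPath class test by toclass().

[example blog]
path =
	http://www.example.com/blog/*
title = //h1[class=site-title]
items = //div[class=post]
item_title = .//h2[class=post-title]
item_link = .//h2[class=post-title]/a/@href
item_desc = .//div[class=excerpt]
item_content = .//div[class=entry-content]

With such a rule in place, supported() returns True for any URL matching the glob, and build() assembles an Atom-style feed from the matched elements.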

morss.py (5 additions)

@@ -13,6 +13,7 @@ import lxml.html.clean
import lxml.builder
import feeds
import feedify
import httplib
import urllib2
@@ -377,6 +378,8 @@ def Gather(url, cachePath, progress=False):
	if xml[:5] == '<?xml' or con.info().type in MIMETYPE['xml']:
		style = 'normal'
	elif feedify.supported(url):
		style = 'feedify'
	elif con.info().type in MIMETYPE['html']:
		style = 'html'
	else:
@@ -389,6 +392,8 @@ def Gather(url, cachePath, progress=False):
	if style == 'normal':
		rss = feeds.parse(xml)
	elif style == 'feedify':
		xml = decodeHTML(xml)
		rss = feedify.build(url, xml)
	elif style == 'html':
		match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
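
For reference, a minimal sketch of exercising the new module directly, outside of Gather(): the URL is hypothetical, a matching feedify.ini rule is assumed, and feed.items is treated as the plain list that build() appends its item dicts to.

#!/usr/bin/env python
# Hypothetical standalone use of feedify; assumes feedify.ini contains a rule
# whose "path" glob matches this URL.
import feedify

url = 'http://www.example.com/blog/'
if feedify.supported(url):
	feed = feedify.build(url)  # no data passed, so build() downloads the page itself
	for item in feed.items:
		print item.get('title'), item.get('link')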