Add feedify, and use it in morss
parent
9bc4417be3
commit
da14242bcf
|
@ -0,0 +1,69 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
from ConfigParser import ConfigParser
|
||||||
|
from fnmatch import fnmatch
|
||||||
|
import feeds
|
||||||
|
import re
|
||||||
|
|
||||||
|
import urllib2
|
||||||
|
import lxml.html
|
||||||
|
import urlparse
|
||||||
|
|
||||||
|
def toclass(query):
    """Expand the pseudo-selector ``[class=NAME]`` found in *query* into a
    real XPath predicate matching NAME as a whitespace-delimited class token.

    Everything else in the query string passes through untouched.
    """
    xpath_predicate = (r'[@class and contains(concat(" ", '
                       r'normalize-space(@class), " "), " \1 ")]')
    return re.sub(r'\[class=([^\]]+)\]', xpath_predicate, query)
|
||||||
|
|
||||||
|
def getRule(link):
    """Return the first feedify.ini section whose 'path' patterns match *link*.

    BUGFIX: the original signature was ``def getRule(link=URL)``, but no name
    ``URL`` exists anywhere in the module, so importing the file raised
    NameError while evaluating the default.  The parameter is now required;
    every caller in this file already passes the link positionally.

    Each section's 'path' option is a multi-line value holding fnmatch glob
    patterns, one per line; because the value starts on the line after the
    key, its first split element is empty and gets dropped.

    Returns the matching section's options as a dict (with 'path' replaced
    by the list of patterns), or False when no pattern matches *link*.
    """
    config = ConfigParser()
    # Missing file is not an error: read() just leaves the parser empty.
    config.read('feedify.ini')

    for section in config.sections():
        values = dict(config.items(section))
        # Drop the empty first element produced by the leading newline.
        values['path'] = values['path'].split('\n')[1:]
        for path in values['path']:
            if fnmatch(link, path):
                return values

    # No section matched this link.
    return False
|
||||||
|
|
||||||
|
def supported(link):
    """Return True when some feedify rule applies to *link*."""
    rule = getRule(link)
    return rule is not False
|
||||||
|
|
||||||
|
def getString(expr, html):
    """Evaluate the class-aware XPath *expr* against the lxml tree *html*.

    Returns the text content of the first matching element, or the empty
    string when nothing matches.
    """
    hits = html.xpath(toclass(expr))
    if not hits:
        return ''
    return hits[0].text_content()
|
||||||
|
|
||||||
|
def build(link, data=None):
    """Build a feeds.FeedParserAtom feed from the HTML page at *link*.

    Looks up the feedify.ini rule matching *link* via getRule(); returns
    False when no rule applies.  *data* may carry the already-downloaded
    page body; when None the page is fetched with urllib2.
    """
    rule = getRule(link)
    if rule is False:
        return False

    if data is None:
        # Caller supplied no body: download the page ourselves.
        data = urllib2.urlopen(link).read()

    html = lxml.html.fromstring(data)
    feed = feeds.FeedParserAtom()

    if 'title' in rule:
        # NOTE(review): xpath() yields the raw first match here (an element
        # or a string, depending on the rule's expression) — confirm
        # feeds.FeedParserAtom.title accepts either.
        feed.title = html.xpath(toclass(rule['title']))[0]

    if 'items' in rule:
        # One feed entry per element matched by the 'items' expression.
        for item in html.xpath(toclass(rule['items'])):
            feedItem = {}

            if 'item_title' in rule:
                feedItem['title'] = item.xpath(toclass(rule['item_title']))[0]
            if 'item_link' in rule:
                url = item.xpath(toclass(rule['item_link']))[0]
                # Resolve relative hrefs against the page URL.
                url = urlparse.urljoin(link, url)
                feedItem['link'] = url
            if 'item_desc' in rule:
                # Serialized as unicode markup, not extracted text.
                feedItem['desc'] = lxml.html.tostring(item.xpath(toclass(rule['item_desc']))[0], encoding='unicode')
            if 'item_content' in rule:
                # NOTE(review): unlike item_desc this omits encoding='unicode',
                # so tostring() returns a byte string here — confirm the
                # asymmetry is intentional.
                feedItem['content'] = lxml.html.tostring(item.xpath(toclass(rule['item_content']))[0])

            feed.items.append(feedItem)
    return feed
|
5
morss.py
5
morss.py
|
@ -13,6 +13,7 @@ import lxml.html.clean
|
||||||
import lxml.builder
|
import lxml.builder
|
||||||
|
|
||||||
import feeds
|
import feeds
|
||||||
|
import feedify
|
||||||
|
|
||||||
import httplib
|
import httplib
|
||||||
import urllib2
|
import urllib2
|
||||||
|
@ -377,6 +378,8 @@ def Gather(url, cachePath, progress=False):
|
||||||
|
|
||||||
if xml[:5] == '<?xml' or con.info().type in MIMETYPE['xml']:
|
if xml[:5] == '<?xml' or con.info().type in MIMETYPE['xml']:
|
||||||
style = 'normal'
|
style = 'normal'
|
||||||
|
elif feedify.supported(url):
|
||||||
|
style = 'feedify'
|
||||||
elif con.info().type in MIMETYPE['html']:
|
elif con.info().type in MIMETYPE['html']:
|
||||||
style = 'html'
|
style = 'html'
|
||||||
else:
|
else:
|
||||||
|
@ -389,6 +392,8 @@ def Gather(url, cachePath, progress=False):
|
||||||
|
|
||||||
if style == 'normal':
|
if style == 'normal':
|
||||||
rss = feeds.parse(xml)
|
rss = feeds.parse(xml)
|
||||||
|
elif style == 'feedify':
|
||||||
|
xml = decodeHTML(xml)
|
||||||
rss = feedify.build(url, xml)
|
rss = feedify.build(url, xml)
|
||||||
elif style == 'html':
|
elif style == 'html':
|
||||||
match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
|
match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
|
||||||
|
|
Loading…
Reference in New Issue