Another huge commit.

Now uses OOP where it fits. Atom feeds are supported, but no real tests were made. Unix globbing is now possible for URLs. Caching is done in a cleaner way. Feedburner links are also replaced. HTML is cleaned in a more efficient way. Code is now much cleaner, using lxml.objectify and a small wrapper to access Atom feeds as if they were RSS feeds (and much faster than feedparser). README has been updated.

branch master
parent a098b7e104
commit af8879049f
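In practice the new entry point is a single call. A minimal usage sketch (not part of the commit; it assumes morss.py from this commit is importable as a module, and the feed URL is only an example):

import os
import morss  # assumption: the morss.py changed below, sitting next to this script

# Gather() fetches the feed, picks the matching xpath rule and fills each item
# with the full article text, returning the rewritten feed as a string.
full_feed = morss.Gather('http://www.franceinfo.fr/rss.xml', os.getcwd() + '/cache')
print full_feed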
@@ -3,6 +3,8 @@
 This tool's goal is to get full-text RSS feeds out of striped RSS feeds, commonly available on internet. Indeed most newspapers only make a small description available to users in their rss feeds, which makes the RSS feed rather useless. So this tool intends to fix that problem.
 This tool opens the links from the rss feed, then downloads the full article from the newspaper website and puts it back in the rss feed.
 
+morss also has experimental support for Atom feeds.
+
 ##(xpath) Rules
 
 To find the article content on the newspaper's website, morss need to know where to look at. The default target is the first `<h1>` element, since it's a common practice, or a `<article>` element, for HTML5 compliant websites.
@@ -19,9 +21,12 @@ Here, xpath rules stored in the `rules` file. (The name of the file can be chang
 
 Fancy name (description)(useless but not optional)
 http://example.com/path/to/the/rss/feed.xml
+http://example.co.uk/other/*/path/with/wildcard/*.xml
 //super/accurate[@xpath='expression']/..
 
-Works like a charm with Tiny TinyRSS (<http://tt-rss.org/redmine/projects/tt-rss/wiki>).
+As shown in the example, multiple urls can be specified for a single rule, so as to be able to match feeds from different locations of the website server (for example with or without "www."). Moreover feeds urls can be *NIX glob-style patterns, so as to match any feed from a website.
+
+Works like a charm with Tiny Tiny RSS (<http://tt-rss.org/redmine/projects/tt-rss/wiki>).
 
 ###As a newsreader hook
 
@@ -51,4 +56,4 @@ Unwanted HTML elements are also stripped from the article. By default, elements
 ---
 
 GPL3 licence.
-Python **2.6** required (not 3).
+Python **2.6**+ required (not 3).

@@ -1 +0,0 @@
-DefaultType text/html
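The glob-style matching added in the README above is plain *NIX filename globbing applied to the feed URL. A minimal sketch using the stdlib fnmatch helper that the new parseRules() in morss.py relies on (the URLs are the README's own examples):

from fnmatch import fnmatch

pattern = 'http://example.co.uk/other/*/path/with/wildcard/*.xml'
print fnmatch('http://example.co.uk/other/2013/path/with/wildcard/feed.xml', pattern)  # True
print fnmatch('http://example.com/path/to/the/rss/feed.xml', pattern)                  # False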
483 morss.py
@@ -1,15 +1,42 @@
 #!/usr/bin/env python
 import sys
 import os
-from os.path import expanduser
-from lxml import etree
+import copy
+from base64 import b64encode, b64decode
+from fnmatch import fnmatch
+import os.path
+import lxml.etree
+import lxml.objectify
+import lxml.html
+import lxml.html.clean
+import lxml.builder
 import re
 import string
 import urllib2
 from cookielib import CookieJar
 import chardet
 
+# DISCLAIMER: feedparser is pure shit if you intend to *edit* the feed.
+
 SERVER = True
+MAX = 70
+TRASH = ['//h1', '//header']
+E = lxml.objectify.E
+
+ITEM_MAP = {
+	'link': (('{http://www.w3.org/2005/Atom}link', 'href'), '{}link'),
+	'desc': ('{http://www.w3.org/2005/Atom}summary', '{}description'),
+	'description': ('{http://www.w3.org/2005/Atom}summary', '{}description'),
+	'summary': ('{http://www.w3.org/2005/Atom}summary', '{}description'),
+	'content': ('{http://www.w3.org/2005/Atom}content', '{http://purl.org/rss/1.0/modules/content/}encoded')
+	}
+RSS_MAP = {
+	'desc': ('{http://www.w3.org/2005/Atom}subtitle', '{}description'),
+	'description': ('{http://www.w3.org/2005/Atom}subtitle', '{}description'),
+	'subtitle': ('{http://www.w3.org/2005/Atom}subtitle', '{}description'),
+	'item': ('{http://www.w3.org/2005/Atom}entry', '{}item'),
+	'entry': ('{http://www.w3.org/2005/Atom}entry', '{}item')
+	}
+
 if SERVER:
 	import httplib
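What ITEM_MAP and RSS_MAP encode is simply that the same logical field lives under different tag names in RSS 2.0 and Atom. A small illustration with lxml.objectify (the two snippets are made-up minimal feed fragments, not part of the commit):

import lxml.objectify

rss_item = lxml.objectify.fromstring(
	"<item><link>http://a.example/1</link><description>short text</description></item>")
atom_entry = lxml.objectify.fromstring(
	"<entry xmlns='http://www.w3.org/2005/Atom'><summary>short text</summary></entry>")

# 'desc' is aliased to <description> on the RSS side and to Atom's <summary>
print rss_item.description
print atom_entry.summary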
@@ -23,215 +50,317 @@ def log(txt):
 	print txt
 	if SERVER:
 		with open('morss.log', 'a') as file:
-			if isinstance(txt, str):
-				file.write(txt.encode('utf-8') + "\n")
+			file.write(str(txt).encode('utf-8') + "\n")
 
-class Info:
-	def __init__(self, item, feed):
-		self.item = item
-		self.feed = feed
-
-		self.data = False
-		self.page = False
-		self.html = False
-		self.con = False
-		self.opener = False
-		self.enc = False
-
-		self.link = self.item.xpath('link')[0]
-		self.desc = self.item.xpath('description')[0]
-
-	def checkURL(self):
-		if self.link.text.startswith("http://rss.feedsportal.com"):
-			log('feedsportal')
-			url = re.search('/([0-9a-zA-Z]+)/[a-zA-Z0-9\.]+$', self.link.text).groups()[0].split('0')
-			t = {'A':'0', 'B':'.', 'C':'/', 'D':'?', 'E':'-', 'L':'ww', 'S':'w.'}
-			self.link.text = 'http://' + "".join([(t[s[0]] if s[0] in t else "=") + s[1:] for s in url[1:]])
-			log(self.link.text)
-
-	def fetch(self):
-		log(self.link.text)
-		self.checkURL()
-		if not self.findCache():
-			self.download()
-			self.chardet()
-			self.fetchDesc()
-			self.save()
-		log(self.enc)
-
-	def parseHTML(self):
-		if self.enc is False:
-			self.page = etree.HTML(self.data)
-		else:
-			try:
-				self.page = etree.HTML(self.data.decode(self.enc, 'ignore'))
-			except ValueError:
-				self.page = etree.HTML(self.data)
-
-	def save(self):
-		self.feed.save()
-
-	def findCache(self):
-		if self.feed.cache is not False:
-			xpath = "//link[text()='" + self.link.text + "']/../description/text()"
-			match = self.feed.cache.xpath(xpath)
-			if len(match):
-				log('cached')
-				self.desc.text = match[0]
-				return True
-		return False
-
-	def fetchDesc(self):
-		self.parseHTML()
-		match = self.page.xpath(self.feed.rule)
-		if len(match):
-			self.html = match[0]
-			self.deleteTags()
-			self.desc.text = etree.tostring(self.html).decode(self.enc, 'ignore')
-			log('ok txt')
-		else:
-			log('no match')
-
-	def download(self):
-		try:
-			cj = CookieJar()
-			self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
-			self.con = self.opener.open(self.link.text.encode('utf-8'))
-			self.data = self.con.read()
-		except (urllib2.HTTPError, urllib2.URLError) as error:
-			log(error)
-			log('http error')
-
-	def chardet(self):
-		if self.con.headers.getparam('charset'):
-			log('header')
-			self.enc = self.con.headers.getparam('charset')
-			return
-
-		page = etree.HTML(self.data)
-		header = page.xpath("//head/meta[@http-equiv='Content-Type']/@content")
-		if len(header) and len(header[0].split("=")):
-			log('meta')
-			self.enc = header[0].split("=")[1]
-			return
-
-		header = page.xpath("//head/meta[@charset]/@charset")
-		if len(header):
-			log('meta2')
-			self.enc = header[0]
-			return
-
-		log('chardet')
-		self.enc = chardet.detect(self.data)['encoding']
-
-	def deleteTags(self):
-		for tag in self.feed.trash:
-			for elem in self.html.xpath(tag):
-				elem.getparent().remove(elem)
-
-class Feed:
-	def __init__(self, impl, data, cachePath):
-		self.rulePath = 'rules'
-		self.rule = '//article|//h1/..'
-
-		self.trash = ['//script', '//iframe', '//object', '//noscript', '//form', '//h1']
-		self.max = 70
-
-		self.cachePath = cachePath
-		self.cacheFile = False
-		self.cache = False
-		self.impl = impl
-
-		self.items = []
-		self.rss = False
-		self.out = False
-
-		if self.impl == 'server':
-			self.url = data
-			self.xml = False
-		else:
-			self.url = False
-			self.xml = data
-
-	def save(self):
-		self.out = etree.tostring(self.rss, xml_declaration=True, pretty_print=True)
-		open(self.cacheFile, 'w').write(self.out)
-
-	def getData(self):
-		if self.impl == 'server':
-			req = urllib2.Request(self.url)
-			req.add_unredirected_header('User-Agent', '')
-			self.xml = urllib2.urlopen(req).read()
-			self.cleanXml()
-
-	def setCache(self):
-		if self.cache is not False:
-			return
-
-		self.parse()
-		key = str(hash(self.rss.xpath('//channel/title/text()')[0]))
-		self.cacheFile = self.cachePath + "/" + key
-		log(self.cacheFile)
-		if not os.path.exists(self.cachePath):
-			os.makedirs(self.cachePath)
-
-		if os.path.exists(self.cacheFile):
-			self.cache = etree.XML(open(self.cacheFile, 'r').read())
-
-	def parse(self):
-		if self.rss is not False:
-			return
-
-		self.rss = etree.XML(self.xml)
-
-	def setItems(self):
-		self.items = [Info(e, self) for e in self.rss.xpath('//item')]
-		if self.max:
-			self.items = self.items[:self.max]
-
-	def fill(self):
-		self.parseRules()
-		log(self.rule)
-		for item in self.items:
-			item.fetch()
-
-	def cleanXml(self):
-		table = string.maketrans('', '')
-		self.xml = self.xml.translate(table, table[:32]).lstrip()
-
-	def parseRules(self):
-		if self.impl == 'server':
-			rules = open(self.rulePath, "r").read().split("\n\n")
-			rules = [r.split('\n') for r in rules]
-			for rule in rules:
-				if rule[1] == self.url:
-					self.rule = rule[2]
-					return
-		else:
-			if len(sys.argv) > 1:
-				self.rule = sys.argv[1]
+def cleanXML(xml):
+	table = string.maketrans('', '')
+	return xml.translate(table, table[:32]).lstrip()
+
+class Cache:
+	"""Light, error-prone caching system."""
+	def __init__(self, folder, key):
+		self._key = key
+		self._dir = folder
+		self._file = self._dir + "/" + str(hash(self._key))
+		self._cached = {} # what *was* cached
+		self._cache = {} # new things to put in cache
+
+		if os.path.exists(self._file):
+			data = open(self._file).read().strip().split("\n")
+			for line in data:
+				key, bdata = line.split("\t")
+				self._cached[key] = bdata
+
+		log(str(hash(self._key)))
+
+	def get(self, key):
+		if key in self._cached:
+			return b64decode(self._cached[key])
+		else:
+			return None
+
+	def save(self, key, content):
+		# Maybe, appending to file when adding new elements could be
+		# a good idea, but that'd require to check a couple of things,
+		# like whether it has aleardy been over-written (ie. whether
+		# it no longer contains self._cached)
+
+		self._cache[key] = b64encode(content)
+
+		txt = ""
+		for (key, bdata) in self._cache.iteritems():
+			txt += "\n" + str(key) + "\t" + bdata
+		txt.strip()
+
+		if not os.path.exists(self._dir):
+			os.makedirs(self._dir)
+
+		open(self._file, 'w').write(txt)
+
+class XMLMap(object):
+	"""
+	Sort of wrapper around lxml.objectify.StringElement (from which this
+	class *DOESN'T* inherit) which makes "links" between different children
+	of an element. For example, this allows cheap, efficient, transparent
+	RSS 2.0/Atom seamless use, which can be way faster than feedparser, and
+	has the advantage to edit the corresponding mapped fields. On top of
+	that, XML output with "classic" lxml API calls (such as
+	lxml.etree.tostring) is still possible. Element attributes are also
+	supported (as in <entry attr='value'/>).
+
+	However, keep in mind that this feature's support is only partial. For
+	example if you want to alias an element to both <el>value</el> and <el
+	href='value'/>, and put them as ('el', ('el', 'value')) in the _map
+	definition, then only 'el' will be whatched, even if ('el', 'value')
+	makes more sens in that specific case, because that would require to
+	also check the others, in case of "better" match, which is not done now.
+
+	Also, this class assumes there's some consistency in the _map
+	definition. Which means that it expects matches to be always found in
+	the same "column" in _map. This is useful when setting values which are
+	not yet in the XML tree. Indeed the class will try to use the alias from
+	the same column. With the RSS/Atom example, the default _map will always
+	create elements for the same kind of feed.
+	"""
+	def __init__(self, obj, alias=ITEM_MAP, string=False):
+		self._xml = obj
+		self._key = None
+		self._map = alias
+		self._str = string
+
+		self._guessKey()
+		self._E = E #lxml.objectify.ElementMaker(annotate=False)
+
+	def _guessKey(self):
+		for tag in self._map:
+			self._key = 0
+			for choice in self._map[tag]:
+				if not isinstance(choice, tuple):
+					choice = (choice, None)
+				el, attr = choice
+				if hasattr(self._xml, el):
+					if attr is None:
+						return
+					else:
+						if attr in self._xml[el].attrib:
+							return
+				self._key+=1
+		self._key = 0
+
+	def _getElement(self, tag):
+		"""Returns a tuple whatsoever."""
+		if tag in self._map:
+			for choice in self._map[tag]:
+				if not isinstance(choice, tuple):
+					choice = (choice, None)
+				el, attr = choice
+				if hasattr(self._xml, el):
+					if attr is None:
+						return (self._xml[el], attr)
+					else:
+						if attr in self._xml[el].attrib:
+							return (self._xml[el], attr)
+			return (None, None)
+		if hasattr(self._xml, tag):
+			return (self._xml[tag], None)
+		return (None, None)
+
+	def __getattr__(self, tag):
+		el, attr = self._getElement(tag)
+		if el is not None:
+			if attr is None:
+				out = el
+			else:
+				out = el.get(attr)
+		else:
+			out = self._xml.__getattr__(tag)
+
+		return unicode(out) if self._str else out
+
+	def __getitem__(self, tag):
+		return self.__getattr__(tag)
+
+	def __setattr__(self, tag, value):
+		if tag.startswith('_'):
+			return object.__setattr__(self, tag, value)
+
+		el, attr = self._getElement(tag)
+		if el is not None:
+			if attr is None:
+				if (isinstance(value, lxml.objectify.StringElement)
+					or isinstance(value, str)
+					or isinstance(value, unicode)):
+					el._setText(value)
+				else:
+					el = value
+				return
+			else:
+				el.set(attr, value)
+				return
+		choice = self._map[tag][self._key]
+		if not isinstance(choice, tuple):
+			child = lxml.objectify.Element(choice)
+			self._xml.append(child)
+			self._xml[choice] = value
+			return
+		else:
+			el, attr = choice
+			child = lxml.objectify.Element(choice, attrib={attr:value})
+			self._xml.append(child)
+			return
+
+	def __contains__(self, tag):
+		el, attr = self._getElement(tag)
+		return el is not None
+
+	def remove(self):
+		self._xml.getparent().remove(self._xml)
+
+	def tostring(self, **k):
+		"""Returns string using lxml. Arguments passed to tostring."""
+		out = self._xml if self._xml.getparent() is None else self._xml.getparent()
+		return lxml.etree.tostring(out, pretty_print=True, **k)
+
+def EncDownload(url):
+	try:
+		cj = CookieJar()
+		opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
+		con = opener.open(url)
+		data = con.read()
+	except (urllib2.HTTPError, urllib2.URLError) as error:
+		log(error)
+		log('http error')
+		return False
+
+	if con.headers.getparam('charset'):
+		log('header')
+		enc = con.headers.getparam('charset')
+	else:
+		match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data).groups()
+		if len(match):
+			log('meta.re')
+			enc = match[0]
+		else:
+			log('chardet')
+			enc = chardet.detect(data)['encoding']
+
+	return (data, enc)
+
+def parseRules(rulePath, url):
+	rules = open(rulePath, "r").read().strip().split("\n\n")
+	rules = [r.split('\n') for r in rules]
+	for rule in rules:
+		for domain in rule[1:-1]:
+			if fnmatch(url, domain):
+				return rule[-1]
+	return '//article|//h1/..'
+
+def Fill(rss, rule, cache):
+	item = XMLMap(rss, ITEM_MAP, True)
+	log(item.link)
+
+	# content already provided?
+	if 'content' in item:
+		if len(item.content) > 4*len(item.desc):
+			return item
+
+	# check link
+	if fnmatch(item.link, "http://*.feedsportal.com/*"):
+		url = re.search('/([0-9a-zA-Z]+)/[^/]+$', item.link).groups()[0].split('0')
+		t = {'A':'0', 'B':'.', 'C':'/', 'D':'?', 'E':'-', 'L':'ww', 'S':'w.', 'O':'.co.uk'}
+		item.link = 'http://' + "".join([(t[s[0]] if s[0] in t else "=") + s[1:] for s in url[1:]])
+	if '{http://rssnamespace.org/feedburner/ext/1.0}origLink' in item:
+		item.link = item['{http://rssnamespace.org/feedburner/ext/1.0}origLink']
+
+	# check cache
+	cached = cache.get(item.link)
+	if cached is not None:
+		log('cached')
+		item.content = cached
+		return item
+
+	# download
+	ddl = EncDownload(item.link)
+
+	if ddl is False:
+		return item
+
+	data, enc = ddl
+	log(enc)
+
+	# parse
+	parser = lxml.html.HTMLParser(encoding=enc)
+	page = lxml.etree.fromstring(data, parser)
+
+	# filter
+	match = page.xpath(rule)
+	if len(match):
+		art = match[0]
+		log('ok txt')
+	else:
+		log('no match')
+		return item
+
+	# clean
+	for tag in TRASH:
+		for elem in art.xpath(tag):
+			elem.getparent().remove(elem)
+
+	art.tag = 'div' # solves crash in lxml.html.clean
+	art = lxml.html.clean.clean_html(art)
+	out = lxml.etree.tostring(art, pretty_print=True).decode(enc, 'ignore')
+	item.content = out
+	cache.save(item.link, out)
+
+def Gather(data, cachePath):
+	# fetch feed
+	if data.startswith("http"):
+		req = urllib2.Request(data)
+		req.add_unredirected_header('User-Agent', '')
+		xml = urllib2.urlopen(req).read()
+	else:
+		xml = data
+
+	xml = cleanXML(xml)
+	rss = lxml.objectify.fromstring(xml)
+	root = rss.channel if hasattr(rss, 'channel') else rss
+	root = XMLMap(root, RSS_MAP)
+
+	cache = Cache(cachePath, unicode(root.title))
+
+	# rules
+	if data.startswith("http"):
+		rule = parseRules('rules', url)
+	else:
+		if len(sys.argv) > 1:
+			rule = sys.argv[1]
+		else:
+			rule = '//article|//h1/..'
+
+	# set
+	log(rule)
+	if MAX:
+		for item in root.item[MAX:]:
+			item.getparent().remove(item)
+	for item in root.item:
+		Fill(item, rule, cache)
+
+	return root.tostring(xml_declaration=True, encoding='UTF-8')
 
 if __name__ == "__main__":
 	if SERVER:
 		print 'Content-Type: text/html\n'
 		url = os.environ['REQUEST_URI'][len(os.environ['SCRIPT_NAME'])+1:]
 		url = 'http://' + url.replace(' ', '%20')
+		cache = os.getcwd() + '/cache'
 		log(url)
-		RSS = Feed('server', url, os.getcwd() + '/cache')
+		RSS = Gather(url, cache)
 	else:
 		xml = sys.stdin.read()
-		cache = expanduser('~') + '/.cache/morss'
-		RSS = Feed('liferea', xml, os.getcwd() + '/cache')
+		cache = os.path.expanduser('~') + '/.cache/morss'
+		RSS = Gather(xml, cache)
 
-	RSS.getData()
-	RSS.parse()
-	RSS.setCache()
-	RSS.setItems()
-	RSS.fill()
-	RSS.save()
-
 	if SERVER or not os.getenv('DEBUG', False):
-		print RSS.out
-	else:
-		print 'done'
+		print RSS
+	log('done')
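For reference, the cache written by the new Cache class is just one base64-encoded entry per line, keyed by the article link and stored in a file named after hash(feed title). A stdlib-only sketch of that layout (the key and payload below are illustrative values, not real cache contents):

from base64 import b64encode, b64decode

link = 'http://example.com/article'       # illustrative cache key
content = '<div>full article text</div>'  # illustrative cached payload

line = link + "\t" + b64encode(content)   # one "<key>\t<base64(content)>" line per entry
print line
print b64decode(line.split("\t")[1])      # round-trips back to the HTML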
30 rules
@@ -1,15 +1,37 @@
 TehranTimes
-http://www.tehrantimes.com/component/ninjarsssyndicator/?feed_id=1&format=raw
+http://www.tehrantimes.com/*
+http://tehrantimes.com/*
 //div[@class='article-indent']
 
 FranceInfo
-http://www.franceinfo.fr/rss.xml
+http://www.franceinfo.fr/rss*
 //h2[@class='chapo']/..
 
+Les Echos
+http://rss.feedsportal.com/c/499/f/413829/index.rss
+http://syndication.lesechos.fr/rss/*
+//h1/../..
+
 Spiegel
-http://www.spiegel.de/schlagzeilen/tops/index.rss
+http://www.spiegel.de/schlagzeilen/*
 //div[@id='spArticleSection']
 
 Le Soir
-http://www.lesoir.be/feed/La%20Une/destination_une_block/
+http://www.lesoir.be/feed/*
 //div[@class='article-content']
+
+Stack Overflow
+http://stackoverflow.com/feeds/*
+//*[@id='question']
+
+Daily Telegraph
+http://www.telegraph.co.uk/*
+//*[@id='mainBodyArea']
+
+Cracked.com
+http://feeds.feedburner.com/CrackedRSS
+//div[@class='content']|//section[@class='body']
+
+TheOnion
+http://feeds.theonion.com/*
+//article
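The rules file above is consumed the way the new parseRules() does it: blocks separated by a blank line, first line a display name, last line the xpath rule, and everything in between a glob pattern for feed URLs. A self-contained sketch of that lookup (rule data copied from the file above; the queried URL is only an example):

from fnmatch import fnmatch

RULES = """Spiegel
http://www.spiegel.de/schlagzeilen/*
//div[@id='spArticleSection']

TheOnion
http://feeds.theonion.com/*
//article"""

def find_rule(url):
	# blocks are separated by blank lines; lines[0] is the name,
	# lines[-1] the xpath rule, lines[1:-1] the URL glob patterns
	for block in RULES.strip().split("\n\n"):
		lines = block.split("\n")
		for pattern in lines[1:-1]:
			if fnmatch(url, pattern):
				return lines[-1]
	return "//article|//h1/.."  # same fallback as morss.py

print find_rule('http://feeds.theonion.com/theonion/articles')  # -> //article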