2to3: morss.py port most default libs
parent 327b8504c4
commit 803d6e37c4
@@ -4,7 +4,6 @@ import os
 import os.path
 import time
 
-import Queue
 import threading
 
 from fnmatch import fnmatch
@@ -18,17 +17,27 @@ from . import feeds
 from . import feedify
 from . import crawler
 
-import httplib
-import urllib
-import urllib2
-import urlparse
-
 import wsgiref.simple_server
 import wsgiref.handlers
 
 from readability import readability
 from html2text import HTML2Text
 
+try:
+	from Queue import Queue
+	from httplib import HTTPConnection, HTTPException
+	from urllib2 import build_opener
+	from urllib2 import HTTPError
+	from urllib import quote_plus
+	from urlparse import urlparse, urljoin, parse_qs
+except ImportError:
+	from queue import Queue
+	from http.client import HTTPConnection, HTTPException
+	from urllib.request import build_opener
+	from urllib.error import HTTPError
+	from urllib.parse import quote_plus
+	from urllib.parse import urlparse, urljoin, parse_qs
+
 LIM_ITEM = 100 # deletes what's beyond
 LIM_TIME = 7 # deletes what's after
 MAX_ITEM = 50 # cache-only beyond
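Note on the hunk above: the try/except block is the heart of the port. The Python 2 module names are imported first, and the Python 3 locations are used when those are missing, so the rest of the file can keep using bare names such as Queue, HTTPConnection, build_opener, HTTPError, quote_plus, urlparse and parse_qs on either interpreter. A minimal standalone sketch of the same pattern, using only standard-library modules and nothing from morss itself:

    try:
        # Python 2 locations
        from urllib import quote_plus
        from urlparse import urlparse, parse_qs
    except ImportError:
        # Python 3 locations
        from urllib.parse import quote_plus, urlparse, parse_qs

    # The same unqualified names resolve on both interpreters.
    print(quote_plus('key with spaces'))
    print(parse_qs(urlparse('http://example.com/?u=http://target').query)['u'][0])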
@@ -49,7 +58,7 @@ MIMETYPE = {
 PROTOCOL = ['http', 'https', 'ftp']
 
 if 'SCRIPT_NAME' in os.environ:
-	httplib.HTTPConnection.debuglevel = 1
+	HTTPConnection.debuglevel = 1
 
 	import cgitb
 
@@ -145,7 +154,7 @@ class Cache:
 			return
 
 		maxsize = os.statvfs('./').f_namemax - len(self._dir) - 1 - 4 # ".tmp"
-		self._hash = urllib.quote_plus(self._key)[:maxsize]
+		self._hash = quote_plus(self._key)[:maxsize]
 
 		self._file = self._dir + '/' + self._hash
 		self._file_tmp = self._file + '.tmp'
@@ -256,26 +265,26 @@ def Fix(item, feedurl='/'):
 	log(item.link)
 
 	# check relative urls
-	item.link = urlparse.urljoin(feedurl, item.link)
+	item.link = urljoin(feedurl, item.link)
 
 	# google translate
 	if fnmatch(item.link, 'http://translate.google.*/translate*u=*'):
-		item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['u'][0]
+		item.link = parse_qs(urlparse(item.link).query)['u'][0]
 		log(item.link)
 
 	# google
 	if fnmatch(item.link, 'http://www.google.*/url?q=*'):
-		item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['q'][0]
+		item.link = parse_qs(urlparse(item.link).query)['q'][0]
 		log(item.link)
 
 	# google news
 	if fnmatch(item.link, 'http://news.google.com/news/url*url=*'):
-		item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['url'][0]
+		item.link = parse_qs(urlparse(item.link).query)['url'][0]
 		log(item.link)
 
 	# facebook
 	if fnmatch(item.link, 'https://www.facebook.com/l.php?u=*'):
-		item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['u'][0]
+		item.link = parse_qs(urlparse(item.link).query)['u'][0]
 		log(item.link)
 
 	# feedburner
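The rewrites in this hunk all share one recipe: the real destination of a Google/Facebook redirect link sits in a query-string parameter ('u', 'q' or 'url'), so parsing the query with parse_qs recovers it under either import set. A small self-contained illustration, with a made-up redirect URL that is not taken from the diff:

    try:
        from urlparse import urlparse, parse_qs      # Python 2
    except ImportError:
        from urllib.parse import urlparse, parse_qs  # Python 3

    link = 'https://www.facebook.com/l.php?u=http%3A%2F%2Fexample.com%2Farticle&h=xyz'
    target = parse_qs(urlparse(link).query)['u'][0]
    print(target)  # http://example.com/article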
@@ -294,7 +303,7 @@ def Fix(item, feedurl='/'):
 		log(item.link)
 
 	# reddit
-	if urlparse.urlparse(feedurl).netloc == 'www.reddit.com':
+	if urlparse(feedurl).netloc == 'www.reddit.com':
 		match = lxml.html.fromstring(item.desc).xpath('//a[text()="[link]"]/@href')
 		if len(match):
 			item.link = match[0]
@@ -331,7 +340,7 @@ def Fill(item, cache, options, feedurl='/', fast=False):
 	link = item.link
 
 	# twitter
-	if urlparse.urlparse(feedurl).netloc == 'twitter.com':
+	if urlparse(feedurl).netloc == 'twitter.com':
 		match = lxml.html.fromstring(item.content).xpath('//a/@data-expanded-url')
 		if len(match):
 			link = match[0]
@@ -340,9 +349,9 @@ def Fill(item, cache, options, feedurl='/', fast=False):
 			link = None
 
 	# facebook
-	if urlparse.urlparse(feedurl).netloc == 'graph.facebook.com':
+	if urlparse(feedurl).netloc == 'graph.facebook.com':
 		match = lxml.html.fromstring(item.content).xpath('//a/@href')
-		if len(match) and urlparse.urlparse(match[0]).netloc != 'www.facebook.com':
+		if len(match) and urlparse(match[0]).netloc != 'www.facebook.com':
 			link = match[0]
 			log(link)
 		else:
@@ -375,9 +384,9 @@ def Fill(item, cache, options, feedurl='/', fast=False):
 	# download
 	try:
 		url = link.encode('utf-8')
-		con = urllib2.build_opener(*accept_handler(('html', 'text/*'), True)).open(url, timeout=TIMEOUT)
+		con = build_opener(*accept_handler(('html', 'text/*'), True)).open(url, timeout=TIMEOUT)
 		data = con.read()
-	except (IOError, httplib.HTTPException) as e:
+	except (IOError, HTTPException) as e:
 		log('http error: %s' % e.message)
 		cache.set(link, 'error-http')
 		return True
@@ -407,7 +416,7 @@ def Init(url, cache_path, options):
 	if url is None:
 		raise MorssException('No url provided')
 
-	if urlparse.urlparse(url).scheme not in PROTOCOL:
+	if urlparse(url).scheme not in PROTOCOL:
 		url = 'http://' + url
 		log(url)
 
@@ -437,13 +446,13 @@ def Fetch(url, cache, options):
 	else:
 		try:
 			opener = etag_handler(('xml', 'html'), False, cache.get(url), cache.get('etag'), cache.get('lastmodified'))
-			con = urllib2.build_opener(*opener).open(url, timeout=TIMEOUT * 2)
+			con = build_opener(*opener).open(url, timeout=TIMEOUT * 2)
 			xml = con.read()
-		except (urllib2.HTTPError) as e:
+		except (HTTPError) as e:
 			raise MorssException('Error downloading feed (HTTP Error %s)' % e.code)
 		except (crawler.InvalidCertificateException) as e:
 			raise MorssException('Error downloading feed (Invalid SSL Certificate)')
-		except (IOError, httplib.HTTPException):
+		except (IOError, HTTPException):
 			raise MorssException('Error downloading feed')
 
 		cache.set('xml', xml)
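Fetch keeps the same download-and-catch shape; only the names now come from the new import block. A hedged sketch of that shape in isolation: the project's etag_handler and MorssException are left out, and the timeout value here is arbitrary, not taken from the diff:

    try:
        from httplib import HTTPException                 # Python 2
        from urllib2 import build_opener, HTTPError
    except ImportError:
        from http.client import HTTPException             # Python 3
        from urllib.request import build_opener
        from urllib.error import HTTPError

    def fetch(url, timeout=4):
        # Open the URL with a default opener and map failures to one error type.
        try:
            con = build_opener().open(url, timeout=timeout)
            return con.read()
        except HTTPError as e:
            raise RuntimeError('Error downloading feed (HTTP Error %s)' % e.code)
        except (IOError, HTTPException):
            raise RuntimeError('Error downloading feed')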
@@ -481,7 +490,7 @@ def Fetch(url, cache, options):
 		match = lxml.html.fromstring(xml).xpath(
 			"//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
 		if len(match):
-			link = urlparse.urljoin(url, match[0])
+			link = urljoin(url, match[0])
 			log('rss redirect: %s' % link)
 			return Fetch(link, cache.new(link), options)
 	else:
@@ -539,7 +548,7 @@ def Gather(rss, url, cache, options):
 		if not options.proxy:
 			Fill(item, cache, options, url)
 
-	queue = Queue.Queue()
+	queue = Queue()
 
 	for i in xrange(threads):
 		t = threading.Thread(target=runner, args=(queue,))
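With Queue imported directly, the worker fan-out at the end of Gather stays identical on both interpreters. A minimal runnable sketch of that queue-plus-daemon-threads pattern; the worker count and the payload are placeholders, and range is used here so the sketch also runs on Python 3:

    import threading

    try:
        from Queue import Queue   # Python 2
    except ImportError:
        from queue import Queue   # Python 3

    def runner(queue):
        # Pull items until the process exits; daemon threads die with it.
        while True:
            item = queue.get()
            print('processing %s' % item)
            queue.task_done()

    queue = Queue()

    for i in range(4):
        t = threading.Thread(target=runner, args=(queue,))
        t.daemon = True
        t.start()

    for item in ('a', 'b', 'c'):
        queue.put(item)

    queue.join()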