crawler: add MySQL backend

With extra dependency
master
pictuga 2017-11-04 14:51:41 +01:00
parent f29a107a09
commit d091e74d56
3 changed files with 35 additions and 0 deletions

View File

@ -36,6 +36,7 @@ You do need:
- [wheezy.template](https://pypi.python.org/pypi/wheezy.template) to generate HTML pages
- [chardet](https://pypi.python.org/pypi/chardet)
- [six](https://pypi.python.org/pypi/six), a dependency of chardet
- [pymysql](https://pypi.python.org/pypi/pymysql), used by the MySQL cache backend
Simplest way to get these:

View File

@ -517,3 +517,36 @@ class SQLiteCache(BaseCache):
self.con.execute('INSERT INTO data VALUES (?,?,?,?,?,?)', (url,) + value)
import pymysql.cursors
class MySQLCacheHandler(BaseCache):
    """Cache backend keeping one row per url in a MySQL table, via pymysql.

    NB. Requires mono-threading, as pymysql isn't thread-safe.

    Every entry is a ``(code, msg, headers, data, timestamp)`` tuple keyed by
    its url. The connection is opened with ``autocommit=True``, so each
    statement is committed as soon as it is executed.
    """

    def __init__(self, user, password, database, host='localhost'):
        """Open the connection and create the ``data`` table if it is missing."""
        self.con = pymysql.connect(host=host, user=user, password=password,
                                   database=database, charset='utf8',
                                   autocommit=True)

        with self.con.cursor() as cursor:
            cursor.execute('CREATE TABLE IF NOT EXISTS data (url VARCHAR(255) NOT NULL PRIMARY KEY, code INT, msg TEXT, headers TEXT, data BLOB, timestamp INT)')

    def __del__(self):
        """Close the connection, if one was ever opened."""
        # `con` does not exist when pymysql.connect() raised in __init__;
        # without this guard the finalizer would fail with AttributeError.
        con = getattr(self, 'con', None)
        if con is not None:
            con.close()

    def __getitem__(self, url):
        """Return the ``(code, msg, headers, data, timestamp)`` row for `url`.

        Raises KeyError (carrying the url) when no row matches.
        """
        # Use the cursor as a context manager so that it is always closed,
        # including when execute() raises.
        with self.con.cursor() as cursor:
            cursor.execute('SELECT * FROM data WHERE url=%s', (url,))
            row = cursor.fetchone()

        if not row:
            raise KeyError(url)

        # First column is the url itself, callers only want the payload
        return row[1:]

    def __setitem__(self, url, value):  # (code, msg, headers, data, timestamp)
        """Store `value` under `url`, overwriting the existing row if any.

        `value` must be a tuple, as it is concatenated with `(url,)` to build
        the query parameters.
        """
        if url in self:
            query = 'UPDATE data SET code=%s, msg=%s, headers=%s, data=%s, timestamp=%s WHERE url=%s'
            params = value + (url,)

        else:
            query = 'INSERT INTO data VALUES (%s,%s,%s,%s,%s,%s)'
            params = (url,) + value

        with self.con.cursor() as cursor:
            cursor.execute(query, params)

View File

@ -4,3 +4,4 @@ html2text
ordereddict
wheezy.template
chardet
pymysql