diff --git a/morss/crawler.py b/morss/crawler.py
index ad8b316..cbf5b92 100644
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -23,7 +23,6 @@ from io import BytesIO, StringIO
 import re
 import chardet
 from cgi import parse_header
-import lxml.html
 import time
 import threading
 import random
@@ -337,71 +336,51 @@ class BrowserlyHeaderHandler(BaseHandler):
     https_request = http_request
 
 
-class AlternateHandler(BaseHandler):
+def iter_html_tag(html_str, tag_name):
+    re_tag = r'<%s(\s*[^>])*>' % tag_name
+    re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'
+
+    for tag_match in re.finditer(re_tag, html_str):
+        attr_match = re.findall(re_attr, tag_match.group(0))
+
+        if attr_match is not None:
+            yield dict(attr_match)
+
+
+class AlternateHandler(RespStrHandler):
     " Follow <link rel='alternate' type='application/rss+xml' href='...' /> "
 
     def __init__(self, follow=None):
         self.follow = follow or []
 
-    def http_response(self, req, resp):
+    def str_response(self, req, resp, data_str):
         contenttype = resp.info().get('Content-Type', '').split(';')[0]
+
         if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow:
             # opps, not what we were looking for, let's see if the html page suggests an alternative page of the right types
 
-            data = resp.read()
-
-            try:
-                links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
-
-                for link in links:
-                    if link.get('type', '') in self.follow:
-                        resp.code = 302
-                        resp.msg = 'Moved Temporarily'
-                        resp.headers['location'] = link.get('href')
-                        break
-
-            except (ValueError, SyntaxError):
-                # catch parsing errors
-                pass
-
-            fp = BytesIO(data)
-            old_resp = resp
-            resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
-            resp.msg = old_resp.msg
-
-        return resp
-
-    https_response = http_response
+            for link in iter_html_tag(data_str[:10000], 'link'):
+                if (link.get('rel') == 'alternate'
+                        and link.get('type') in self.follow
+                        and 'href' in link):
+                    resp.code = 302
+                    resp.msg = 'Moved Temporarily'
+                    resp.headers['location'] = link.get('href')
+                    break
 
 
-class HTTPEquivHandler(BaseHandler):
+class HTTPEquivHandler(RespStrHandler):
     " Handler to support <meta http-equiv='...' content='...' />, since it defines HTTP headers "
 
     handler_order = 600
 
-    def http_response(self, req, resp):
+    def str_response(self, req, resp, data_str):
         contenttype = resp.info().get('Content-Type', '').split(';')[0]
         if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
-            data = resp.read()
 
-            try:
-                headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')
-
-                for header in headers:
-                    resp.headers[header.get('http-equiv').lower()] = header.get('content')
-
-            except (ValueError, SyntaxError):
-                # catch parsing errors
-                pass
-
-            fp = BytesIO(data)
-            old_resp = resp
-            resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
-            resp.msg = old_resp.msg
-
-        return resp
-
-    https_response = http_response
+            for meta in iter_html_tag(data_str[:10000], 'meta'):
+                if 'http-equiv' in meta and 'content' in meta:
+                    resp.headers[meta.get('http-equiv').lower()] = meta.get('content')
 
 
 class HTTPRefreshHandler(BaseHandler):