crawler: improve html iter code
continuous-integration/drone/push Build is passing Details

Ignores tags without attributes. Avoids bug with unclosed tags.
master
pictuga 2022-02-09 15:57:12 +01:00
parent b65272daab
commit e1ed33f320
1 changed file with 1 addition and 1 deletion

View File

@@ -368,7 +368,7 @@ class BrowserlyHeaderHandler(BaseHandler):
def iter_html_tag(html_str, tag_name): def iter_html_tag(html_str, tag_name):
" To avoid parsing whole pages when looking for a simple tag " " To avoid parsing whole pages when looking for a simple tag "
re_tag = r'<%s(\s*[^>])*>' % tag_name re_tag = r'<%s\s+[^>]+>' % tag_name
re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]' re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'
for tag_match in re.finditer(re_tag, html_str): for tag_match in re.finditer(re_tag, html_str):