+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
r"""
htmldocck.py is a custom checker script for Rustdoc HTML outputs.
"""
-from __future__ import print_function
+from __future__ import absolute_import, print_function, unicode_literals
+
+import codecs
+import io
import sys
import os.path
import re
from HTMLParser import HTMLParser
from xml.etree import cElementTree as ET
-# ⇤/⇥ are not in HTML 4 but are in HTML 5
try:
- from html.entities import entitydefs
+ from html.entities import name2codepoint
except ImportError:
- from htmlentitydefs import entitydefs
-entitydefs['larrb'] = u'\u21e4'
-entitydefs['rarrb'] = u'\u21e5'
-entitydefs['nbsp'] = ' '
+ from htmlentitydefs import name2codepoint
# "void elements" (no closing tag) from the HTML Standard section 12.1.2
VOID_ELEMENTS = set(['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
except NameError:
unichr = chr
+
class CustomHTMLParser(HTMLParser):
"""simplified HTML parser.
self.__builder.data(data)
def handle_entityref(self, name):
    """Feed the character for the named entity reference `name` to the builder.

    `name` is looked up in `name2codepoint` and the resulting code point is
    converted to a one-character text string.  A name that is missing from
    that table raises KeyError.
    """
    # The lookup goes through `name2codepoint` (the table this module
    # imports) rather than an `entitydefs` mapping.
    codepoint = name2codepoint[name]
    self.__builder.data(unichr(codepoint))
def handle_charref(self, name):
    """Feed the character for the numeric character reference `name` to the builder.

    A leading ``x`` or ``X`` marks the remaining characters as hexadecimal
    digits; otherwise the whole of `name` is read as a decimal number.
    int() raises ValueError when the digits are malformed.
    """
    if name.startswith(('x', 'X')):
        code = int(name[1:], 16)
    else:
        code = int(name, 10)
    # Hand the builder a text character, exactly as handle_entityref does,
    # instead of UTF-8 encoded bytes.
    self.__builder.data(unichr(code))
def close(self):
    """Finish parsing and return whatever the tree builder's close() returns."""
    # Let the base parser finish first, then close the builder.
    HTMLParser.close(self)
    built = self.__builder.close()
    return built
+
Command = namedtuple('Command', 'negated cmd args lineno context')
+
class FailedCheck(Exception):
    """Raised when a check does not hold, e.g. a referenced file does not exist."""
+
class InvalidCheck(Exception):
    """Raised when a command is malformed, e.g. it has the wrong number of arguments."""
+
def concat_multi_lines(f):
"""returns a generator out of the file object, which
- removes `\\` then `\n` then a shared prefix with the previous line then
optional whitespace;
- keeps a line number (starting from 0) of the first line being
concatenated."""
- lastline = None # set to the last line when the last line has a backslash
+ lastline = None # set to the last line when the last line has a backslash
firstlineno = None
catenated = ''
for lineno, line in enumerate(f):
if lastline is not None:
print_err(lineno, line, 'Trailing backslash at the end of the file')
+
# Locates one checker directive inside a template line.
#   - the lookbehind requires an `@` that is not preceded by a non-whitespace
#     character (so it sits at the start of the line or after whitespace);
#     the match itself begins right after that `@`
#   - `negated`: an optional `!` immediately after the `@`
#   - `cmd`:     ASCII letters, optionally several words joined by single `-`
#   - `args`:    everything that follows, up to the end of the line
# re.X lets the pattern span several lines (whitespace inside it is ignored);
# re.UNICODE makes `\S` Unicode-aware for unicode patterns on Python 2 and is
# already the default for str patterns on Python 3.
LINE_PATTERN = re.compile(r'''
(?<=(?<!\S)@)(?P<negated>!?)
(?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
(?P<args>.*)$
''', re.X | re.UNICODE)
def get_commands(template):
- with open(template, 'rU') as f:
+ with io.open(template, encoding='utf-8') as f:
for lineno, line in concat_multi_lines(f):
m = LINE_PATTERN.search(line)
if not m:
if args and not args[:1].isspace():
print_err(lineno, line, 'Invalid template syntax')
continue
- args = shlex.split(args)
+ try:
+ args = shlex.split(args)
+ except UnicodeEncodeError:
+ args = [arg.decode('utf-8') for arg in shlex.split(args.encode('utf-8'))]
yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno+1, context=line)
def normalize_xpath(path):
if path.startswith('//'):
- return '.' + path # avoid warnings
+ return '.' + path # avoid warnings
elif path.startswith('.//'):
return path
else:
if not(os.path.exists(abspath) and os.path.isfile(abspath)):
raise FailedCheck('File does not exist {!r}'.format(path))
- with open(abspath) as f:
+ with io.open(abspath, encoding='utf-8') as f:
data = f.read()
self.files[path] = data
return data
if not(os.path.exists(abspath) and os.path.isfile(abspath)):
raise FailedCheck('File does not exist {!r}'.format(path))
- with open(abspath) as f:
+ with io.open(abspath, encoding='utf-8') as f:
try:
- tree = ET.parse(f, CustomHTMLParser())
+ tree = ET.fromstringlist(f.readlines(), CustomHTMLParser())
except Exception as e:
raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
self.trees[path] = tree
def check_string(data, pat, regexp):
if not pat:
- return True # special case a presence testing
+ return True # special case a presence testing
elif regexp:
- return re.search(pat, data) is not None
+ return re.search(pat, data, flags=re.UNICODE) is not None
else:
data = ' '.join(data.split())
pat = ' '.join(pat.split())
ret = check_string(value, pat, regexp)
if ret:
break
- except Exception as e:
+ except Exception:
print('Failed to get path "{}"'.format(path))
- raise e
+ raise
return ret
path = normalize_xpath(path)
return len(tree.findall(path))
+
def stderr(*args):
    """Print `args` to standard error, like print(): space-separated, newline-terminated.

    On Python 2 the stream is wrapped in a UTF-8 `codecs` writer before
    printing; on Python 3 `sys.stderr` is used directly.
    """
    if sys.version_info.major < 3:
        # NOTE(review): presumably needed because this module enables
        # `unicode_literals`, so messages are unicode strings on Python 2
        # and may contain non-ASCII characters.
        stream = codecs.getwriter('utf-8')(sys.stderr)
    else:
        stream = sys.stderr

    # `stream` rather than `file`, so the Python 2 builtin is not shadowed.
    print(*args, file=stream)
+
def print_err(lineno, context, err, message=None):
global ERR_COUNT
if context:
stderr("\t{}".format(context))
+
ERR_COUNT = 0
+
def check_command(c, cache):
try:
cerr = ""
- if c.cmd == 'has' or c.cmd == 'matches': # string test
+ if c.cmd == 'has' or c.cmd == 'matches': # string test
regexp = (c.cmd == 'matches')
- if len(c.args) == 1 and not regexp: # @has <path> = file existence
+ if len(c.args) == 1 and not regexp: # @has <path> = file existence
try:
cache.get_file(c.args[0])
ret = True
except FailedCheck as err:
cerr = str(err)
ret = False
- elif len(c.args) == 2: # @has/matches <path> <pat> = string test
+ elif len(c.args) == 2: # @has/matches <path> <pat> = string test
cerr = "`PATTERN` did not match"
ret = check_string(cache.get_file(c.args[0]), c.args[1], regexp)
- elif len(c.args) == 3: # @has/matches <path> <pat> <match> = XML tree test
+ elif len(c.args) == 3: # @has/matches <path> <pat> <match> = XML tree test
cerr = "`XPATH PATTERN` did not match"
tree = cache.get_tree(c.args[0])
pat, sep, attr = c.args[1].partition('/@')
- if sep: # attribute
+ if sep: # attribute
tree = cache.get_tree(c.args[0])
ret = check_tree_attr(tree, pat, attr, c.args[2], regexp)
- else: # normalized text
+ else: # normalized text
pat = c.args[1]
if pat.endswith('/text()'):
pat = pat[:-7]
else:
raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
- elif c.cmd == 'count': # count test
- if len(c.args) == 3: # @count <path> <pat> <count> = count test
+ elif c.cmd == 'count': # count test
+ if len(c.args) == 3: # @count <path> <pat> <count> = count test
expected = int(c.args[2])
found = get_tree_count(cache.get_tree(c.args[0]), c.args[1])
cerr = "Expected {} occurrences but found {}".format(expected, found)
ret = expected == found
else:
raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
- elif c.cmd == 'has-dir': # has-dir test
- if len(c.args) == 1: # @has-dir <path> = has-dir test
+ elif c.cmd == 'has-dir': # has-dir test
+ if len(c.args) == 1: # @has-dir <path> = has-dir test
try:
cache.get_dir(c.args[0])
ret = True
except InvalidCheck as err:
print_err(c.lineno, c.context, str(err))
+
def check(target, commands):
    """Run every command in `commands` through check_command.

    A single CachedFiles instance, built from `target`, is shared by all of
    the commands.
    """
    shared_cache = CachedFiles(target)
    for command in commands:
        check_command(command, shared_cache)
+
if __name__ == '__main__':
if len(sys.argv) != 3:
stderr('Usage: {} <doc dir> <template>'.format(sys.argv[0]))