2 # -*- coding: utf-8 -*-
5 htmldocck.py is a custom checker script for Rustdoc HTML outputs.
9 The principle is simple: This script receives a path to generated HTML
10 documentation and a "template" script, which has a series of check
11 commands like `@has` or `@matches`. Each command is used to check if
12 some pattern is present or not present in the particular file or in
13 a particular node of the HTML tree. In many cases, the template script
14 happens to be the source code given to rustdoc.
16 While it indeed is possible to test in smaller portions, it has been
17 hard to construct tests in this fashion and major rendering errors were
18 discovered much later. This script is designed to make black-box and
19 regression testing of Rustdoc easy. This does not preclude the needs for
20 unit testing, but can be used to complement related tests by quickly
21 showing the expected renderings.
In order to avoid one-off dependencies for this task, this script uses
a reasonably working HTML parser and the existing XPath implementation
from Python's standard library. Hopefully, we won't render
non-well-formed HTML.
30 Commands start with an `@` followed by a command name (letters and
31 hyphens), and zero or more arguments separated by one or more whitespace
32 characters and optionally delimited with single or double quotes. The `@`
33 mark cannot be preceded by a non-whitespace character. Other lines
34 (including every text up to the first `@`) are ignored, but it is
35 recommended to avoid the use of `@` in the template file.
37 There are a number of supported commands:
39 * `@has PATH` checks for the existence of the given file.
41 `PATH` is relative to the output directory. It can be given as `-`
42 which repeats the most recently used `PATH`.
* `@has PATH PATTERN` and `@matches PATH PATTERN` check for
  the occurrence of the given pattern `PATTERN` in the specified file.
  Only one occurrence of the pattern is enough.
48 For `@has`, `PATTERN` is a whitespace-normalized (every consecutive
49 whitespace being replaced by one single space character) string.
50 The entire file is also whitespace-normalized including newlines.
52 For `@matches`, `PATTERN` is a Python-supported regular expression.
53 The file remains intact but the regexp is matched without the `MULTILINE`
54 and `IGNORECASE` options. You can still use a prefix `(?m)` or `(?i)`
55 to override them, and `\A` and `\Z` for definitely matching
56 the beginning and end of the file.
58 (The same distinction goes to other variants of these commands.)
* `@has PATH XPATH PATTERN` and `@matches PATH XPATH PATTERN` check for
  the presence of the given XPath `XPATH` in the specified HTML file,
  and also the occurrence of the given pattern `PATTERN` in the matching
  node or attribute. Only one occurrence of the pattern in the match
  is enough.
66 `PATH` should be a valid and well-formed HTML file. It does *not*
67 accept arbitrary HTML5; it should have matching open and close tags
68 and correct entity references at least.
70 `XPATH` is an XPath expression to match. The XPath is fairly limited:
71 `tag`, `*`, `.`, `//`, `..`, `[@attr]`, `[@attr='value']`, `[tag]`,
72 `[POS]` (element located in given `POS`), `[last()-POS]`, `text()`
73 and `@attr` (both as the last segment) are supported. Some examples:
75 - `//pre` or `.//pre` matches any element with a name `pre`.
76 - `//a[@href]` matches any element with an `href` attribute.
- `//*[@class="impl"]//code` matches any element with a name `code`,
  which is a descendant of some element whose `class` attr is `impl`.
79 - `//h1[@class="fqn"]/span[1]/a[last()]/@class` matches a value of
80 `class` attribute in the last `a` element (can be followed by more
81 elements that are not `a`) inside the first `span` in the `h1` with
82 a class of `fqn`. Note that there cannot be any additional elements
83 between them due to the use of `/` instead of `//`.
85 Do not try to use non-absolute paths, it won't work due to the flawed
86 ElementTree implementation. The script rejects them.
88 For the text matches (i.e. paths not ending with `@attr`), any
89 subelements are flattened into one string; this is handy for ignoring
90 highlights for example. If you want to simply check for the presence of
91 a given node or attribute, use an empty string (`""`) as a `PATTERN`.
* `@count PATH XPATH COUNT` checks for the occurrence of the given XPath
  in the specified file. The number of occurrences must match the given
  `COUNT`.
97 * `@has-dir PATH` checks for the existence of the given directory.
99 All conditions can be negated with `!`. `@!has foo/type.NoSuch.html`
100 checks if the given file does not exist, for example.
from __future__ import absolute_import, print_function, unicode_literals

import codecs
import io
import os.path
import re
import shlex
import sys
from collections import namedtuple

try:
    from html.parser import HTMLParser
except ImportError:
    from HTMLParser import HTMLParser
try:
    from xml.etree import cElementTree as ET
except ImportError:
    from xml.etree import ElementTree as ET
try:
    from html.entities import name2codepoint
except ImportError:
    from htmlentitydefs import name2codepoint
# "Void elements" (elements that never take a closing tag), from the HTML
# Standard section 12.1.2. The parser closes these right after opening them.
VOID_ELEMENTS = {'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
                 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr'}

# Python 2 -> 3 compatibility: Python 3 has no `unichr`, `chr` plays its role.
try:
    unichr
except NameError:
    unichr = chr
class CustomHTMLParser(HTMLParser):
    """Simplified HTML parser that feeds an ElementTree builder.

    This is possible because we are dealing with very regular HTML from
    rustdoc; we only have to deal with i) void elements and ii) empty
    attributes.

    `target` is an object with the `TreeBuilder` interface (`start`, `end`,
    `data`, `close`); a fresh `ET.TreeBuilder()` is used when it is omitted.
    """

    def __init__(self, target=None):
        # Explicit base-class call (not `super()`): the file supports
        # Python 2 as well.
        HTMLParser.__init__(self)
        self.__builder = target or ET.TreeBuilder()

    def handle_starttag(self, tag, attrs):
        # Valueless attributes arrive as None; store them as empty strings.
        attrs = {k: v or '' for k, v in attrs}
        self.__builder.start(tag, attrs)
        if tag in VOID_ELEMENTS:
            # Void elements have no end tag in the input, so close them here.
            self.__builder.end(tag)

    def handle_endtag(self, tag):
        self.__builder.end(tag)

    def handle_startendtag(self, tag, attrs):
        # Self-closing form (`<tag ... />`): open and close in one go.
        attrs = {k: v or '' for k, v in attrs}
        self.__builder.start(tag, attrs)
        self.__builder.end(tag)

    def handle_data(self, data):
        self.__builder.data(data)

    def handle_entityref(self, name):
        # Named reference such as `&amp;`; an unknown name raises KeyError.
        self.__builder.data(unichr(name2codepoint[name]))

    def handle_charref(self, name):
        # Numeric reference: `x`/`X` prefix means hexadecimal, else decimal.
        code = int(name[1:], 16) if name.startswith(('x', 'X')) else int(name, 10)
        self.__builder.data(unichr(code))

    def close(self):
        """Flush the parser and return whatever the builder's `close()` returns."""
        HTMLParser.close(self)
        return self.__builder.close()
# One parsed `@command` from a template: whether it was negated with `!`, the
# command name, its argument list, its line number, and the source line text.
Command = namedtuple('Command', 'negated cmd args lineno context')
class FailedCheck(Exception):
    """Raised when a well-formed check command does not hold."""
class InvalidCheck(Exception):
    """Raised when a check command itself is malformed or unsupported."""
def concat_multi_lines(f):
    r"""Yield `(lineno, text)` pairs with backslash-continued lines joined.

    For a line ending in a backslash, the generator removes the backslash and
    the newline, then strips from the following line the prefix it shares with
    the first line of the group plus any whitespace after it, and glues the
    pieces together.

    `lineno` is the 0-based index of the first physical line of each group.
    A trailing backslash on the last line of the input is reported through
    `print_err` and that unfinished group is not yielded.
    """
    lastline = None  # set to the last line when the last line has a backslash
    firstlineno = None
    catenated = ''
    for lineno, line in enumerate(f):
        line = line.rstrip('\r\n')

        # strip the common prefix from the current line if needed
        if lastline is not None:
            common_prefix = os.path.commonprefix([line, lastline])
            line = line[len(common_prefix):].lstrip()

        # Compare against None: `firstlineno or lineno` would discard a
        # legitimate first line number of 0.
        if firstlineno is None:
            firstlineno = lineno

        if line.endswith('\\'):
            if lastline is None:
                lastline = line[:-1]
            catenated += line[:-1]
        else:
            yield firstlineno, catenated + line
            lastline = None
            firstlineno = None
            catenated = ''

    if lastline is not None:
        print_err(lineno, line, 'Trailing backslash at the end of the file')
# Matches one command inside a template line:
#   - the `@` must not be preceded by a non-whitespace character;
#   - `invalid` captures a misplaced `!` written before the `@`;
#   - `negated` captures the `!` written after the `@`;
#   - `cmd` is letters with optional hyphen-separated parts;
#   - `args` is the rest of the line, still unsplit.
LINE_PATTERN = re.compile(r'''
    (?<=(?<!\S))(?P<invalid>!?)@(?P<negated>!?)
    (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
    (?P<args>.*)$
''', re.X | re.UNICODE)
def get_commands(template):
    """Parse the template file and yield one `Command` per valid command line.

    `template` is the path of a UTF-8 file. Lines without a command are
    skipped. Malformed commands (a `!` before the `@`, or arguments glued
    to the command name) are reported through `print_err` and skipped.
    The yielded `lineno` is 1-based.
    """
    with io.open(template, encoding='utf-8') as f:
        for lineno, line in concat_multi_lines(f):
            m = LINE_PATTERN.search(line)
            if not m:
                continue

            negated = (m.group('negated') == '!')
            cmd = m.group('cmd')
            if m.group('invalid') == '!':
                print_err(
                    lineno,
                    line,
                    'Invalid command: `!@{0}{1}`, (help: try with `@!{1}`)'.format(
                        '!' if negated else '',
                        cmd,
                    ),
                )
                continue

            args = m.group('args')
            # Arguments must be separated from the command name by whitespace.
            if args and not args[:1].isspace():
                print_err(lineno, line, 'Invalid template syntax')
                continue

            try:
                args = shlex.split(args)
            except UnicodeEncodeError:
                # Fallback: split the UTF-8 encoded bytes, then decode each
                # argument back to text.
                args = [arg.decode('utf-8') for arg in shlex.split(args.encode('utf-8'))]
            yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno+1, context=line)
257 def _flatten(node, acc):
259 acc.append(node.text)
def normalize_xpath(path):
    """Return `path` in the `.//...` form passed to `findall`.

    A path starting with `//` gets a leading `.`; a path already starting
    with `.//` is returned unchanged. Anything else raises `InvalidCheck`.
    """
    if path.startswith('//'):
        return '.' + path  # avoid warnings
    elif path.startswith('.//'):
        return path
    else:
        raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')
class CachedFiles(object):
    """Loads files below a root directory on demand and caches the results.

    Raw text and parsed trees are cached separately, keyed by the normalized
    relative path. The path `-` stands for the most recently used path.
    """

    def __init__(self, root):
        self.root = root
        self.files = {}  # relative path -> decoded file contents
        self.trees = {}  # relative path -> parsed tree
        self.last_path = None

    def resolve_path(self, path):
        """Normalize `path` and remember it, or expand `-` to the last path.

        Raises `InvalidCheck` when `-` is used before any real path.
        """
        if path != '-':
            path = os.path.normpath(path)
            self.last_path = path
            return path
        elif self.last_path is None:
            raise InvalidCheck('Tried to use the previous path in the first command')
        else:
            return self.last_path

    def get_file(self, path):
        """Return the UTF-8 decoded contents of `path`.

        Raises `FailedCheck` when the path is not an existing regular file.
        """
        path = self.resolve_path(path)
        if path in self.files:
            return self.files[path]

        abspath = os.path.join(self.root, path)
        # `isfile` is already False for a missing path.
        if not os.path.isfile(abspath):
            raise FailedCheck('File does not exist {!r}'.format(path))

        with io.open(abspath, encoding='utf-8') as f:
            data = f.read()
        self.files[path] = data
        return data

    def get_tree(self, path):
        """Return the tree parsed from `path` with `CustomHTMLParser`.

        Raises `FailedCheck` when the path is not an existing regular file,
        and `RuntimeError` when the file cannot be parsed.
        """
        path = self.resolve_path(path)
        if path in self.trees:
            return self.trees[path]

        abspath = os.path.join(self.root, path)
        if not os.path.isfile(abspath):
            raise FailedCheck('File does not exist {!r}'.format(path))

        with io.open(abspath, encoding='utf-8') as f:
            try:
                tree = ET.fromstringlist(f.readlines(), CustomHTMLParser())
            except Exception as e:
                raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
        self.trees[path] = tree
        return self.trees[path]

    def get_dir(self, path):
        """Raise `FailedCheck` unless `path` is an existing directory."""
        path = self.resolve_path(path)
        abspath = os.path.join(self.root, path)
        if not os.path.isdir(abspath):
            raise FailedCheck('Directory does not exist {!r}'.format(path))
def check_string(data, pat, regexp):
    """Return True when `pat` occurs in `data`.

    - An empty `pat` always succeeds (pure presence test).
    - With `regexp` true, `pat` is a regular expression searched in the
      unmodified `data` with only the `UNICODE` flag set.
    - Otherwise both strings are whitespace-normalized (each run of
      whitespace becomes one space) and a plain substring test is done.
    """
    if not pat:
        return True  # special case a presence testing
    elif regexp:
        return re.search(pat, data, flags=re.UNICODE) is not None
    else:
        data = ' '.join(data.split())
        pat = ' '.join(pat.split())
        return pat in data
def check_tree_attr(tree, path, attr, pat, regexp):
    """Return True when some element matching `path` has attribute `attr`
    whose value satisfies `check_string(value, pat, regexp)`.

    Elements without the attribute are skipped; the first match wins.
    """
    path = normalize_xpath(path)
    ret = False
    for e in tree.findall(path):
        if attr in e.attrib:
            value = e.attrib[attr]
        else:
            continue

        ret = check_string(value, pat, regexp)
        if ret:
            break
    return ret
def check_tree_text(tree, path, pat, regexp):
    """Return True when the flattened text of some element matching `path`
    satisfies `check_string(text, pat, regexp)`.

    The text of an element includes the text of all its subelements. Any
    error during the lookup is announced on stdout and then re-raised.
    """
    path = normalize_xpath(path)
    ret = False
    try:
        for e in tree.findall(path):
            # Collect the element's text together with its subelements' text.
            pieces = []
            _flatten(e, pieces)
            value = ''.join(pieces)

            ret = check_string(value, pat, regexp)
            if ret:
                break
    except Exception:
        print('Failed to get path "{}"'.format(path))
        raise
    return ret
def get_tree_count(tree, path):
    """Return the number of elements in `tree` that match the XPath `path`."""
    path = normalize_xpath(path)
    return len(tree.findall(path))
def stderr(*args):
    """Print `args` to standard error, like `print` does.

    On Python 2 the stream is wrapped in a UTF-8 writer before printing.
    """
    if sys.version_info.major < 3:
        stream = codecs.getwriter('utf-8')(sys.stderr)
    else:
        stream = sys.stderr

    print(*args, file=stream)
# Number of errors reported so far; the script entry point reads it to decide
# whether to report a failure.
ERR_COUNT = 0


def print_err(lineno, context, err, message=None):
    """Report one error on standard error and count it.

    The first line is `<lineno>: <message or err>`. When both `message` and
    `err` are given, `err` follows on its own tab-indented line. A non-empty
    `context` (the offending source line) is printed last, tab-indented too.
    """
    global ERR_COUNT
    ERR_COUNT += 1
    stderr("{}: {}".format(lineno, message or err))
    if message and err:
        stderr("\t{}".format(err))

    if context:
        stderr("\t{}".format(context))
def check_command(c, cache):
    """Run one parsed command `c` against the files served by `cache`.

    Nothing is returned. A check whose outcome equals its negation flag has
    failed and is reported through `print_err`, as is a malformed or
    unsupported command. Neither `FailedCheck` nor `InvalidCheck` escapes
    this function.
    """
    try:
        cerr = ""
        if c.cmd == 'has' or c.cmd == 'matches':  # string test
            regexp = (c.cmd == 'matches')
            if len(c.args) == 1 and not regexp:  # @has <path> = file existence
                try:
                    cache.get_file(c.args[0])
                    ret = True
                except FailedCheck as err:
                    cerr = str(err)
                    ret = False
            elif len(c.args) == 2:  # @has/matches <path> <pat> = string test
                cerr = "`PATTERN` did not match"
                ret = check_string(cache.get_file(c.args[0]), c.args[1], regexp)
            elif len(c.args) == 3:  # @has/matches <path> <pat> <match> = XML tree test
                cerr = "`XPATH PATTERN` did not match"
                tree = cache.get_tree(c.args[0])
                # A trailing `/@attr` segment selects an attribute value.
                pat, sep, attr = c.args[1].partition('/@')
                if sep:  # attribute
                    ret = check_tree_attr(tree, pat, attr, c.args[2], regexp)
                else:  # normalized text
                    pat = c.args[1]
                    if pat.endswith('/text()'):
                        # Drop the trailing `/text()` segment.
                        pat = pat[:-len('/text()')]
                    ret = check_tree_text(tree, pat, c.args[2], regexp)
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))

        elif c.cmd == 'count':  # count test
            if len(c.args) == 3:  # @count <path> <pat> <count> = count test
                expected = int(c.args[2])
                found = get_tree_count(cache.get_tree(c.args[0]), c.args[1])
                cerr = "Expected {} occurrences but found {}".format(expected, found)
                ret = expected == found
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
        elif c.cmd == 'has-dir':  # has-dir test
            if len(c.args) == 1:  # @has-dir <path> = has-dir test
                try:
                    cache.get_dir(c.args[0])
                    ret = True
                except FailedCheck as err:
                    cerr = str(err)
                    ret = False
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
        elif c.cmd == 'valid-html':
            raise InvalidCheck('Unimplemented @valid-html')

        elif c.cmd == 'valid-links':
            raise InvalidCheck('Unimplemented @valid-links')
        else:
            raise InvalidCheck('Unrecognized @{}'.format(c.cmd))

        # A plain check fails when it is false; a negated one when it is true.
        if ret == c.negated:
            raise FailedCheck(cerr)

    except FailedCheck as err:
        message = '@{}{} check failed'.format('!' if c.negated else '', c.cmd)
        print_err(c.lineno, c.context, str(err), message)
    except InvalidCheck as err:
        print_err(c.lineno, c.context, str(err))
def check(target, commands):
    """Run every command in `commands` against the documentation in `target`.

    `target` is the root directory of the generated docs; `commands` is an
    iterable of `Command` tuples. One file cache is shared by all commands.
    """
    cache = CachedFiles(target)
    for c in commands:
        check_command(c, cache)
# Script entry point: `htmldocck.py <doc dir> <template>`.
if __name__ == '__main__':
    if len(sys.argv) != 3:
        stderr('Usage: {} <doc dir> <template>'.format(sys.argv[0]))
        raise SystemExit(1)

    check(sys.argv[1], get_commands(sys.argv[2]))
    # Exit with a failure status when any check reported an error.
    if ERR_COUNT:
        stderr("\nEncountered {} errors".format(ERR_COUNT))
        raise SystemExit(1)