2 # -*- coding: utf-8 -*-
5 htmldocck.py is a custom checker script for Rustdoc HTML outputs.
9 The principle is simple: This script receives a path to generated HTML
10 documentation and a "template" script, which has a series of check
11 commands like `@has` or `@matches`. Each command is used to check if
12 some pattern is present or not present in the particular file or in
13 a particular node of the HTML tree. In many cases, the template script
14 happens to be the source code given to rustdoc.
16 While it indeed is possible to test in smaller portions, it has been
17 hard to construct tests in this fashion and major rendering errors were
18 discovered much later. This script is designed to make black-box and
19 regression testing of Rustdoc easy. This does not preclude the needs for
20 unit testing, but can be used to complement related tests by quickly
21 showing the expected renderings.
23 In order to avoid one-off dependencies for this task, this script uses
24 a reasonably working HTML parser and the existing XPath implementation
25 from Python's standard library. Hopefully, we won't render non-well-formed HTML.
30 Commands start with an `@` followed by a command name (letters and
31 hyphens), and zero or more arguments separated by one or more whitespace
32 characters and optionally delimited with single or double quotes. The `@`
33 mark cannot be preceded by a non-whitespace character. Other lines
34 (including every text up to the first `@`) are ignored, but it is
35 recommended to avoid the use of `@` in the template file.
37 There are a number of supported commands:
39 * `@has PATH` checks for the existence of the given file.
41 `PATH` is relative to the output directory. It can be given as `-`
42 which repeats the most recently used `PATH`.
44 * `@has PATH PATTERN` and `@matches PATH PATTERN` checks for
45 the occurrence of the given pattern `PATTERN` in the specified file.
46 Only one occurrence of the pattern is enough.
48 For `@has`, `PATTERN` is a whitespace-normalized (every consecutive
49 whitespace being replaced by one single space character) string.
50 The entire file is also whitespace-normalized including newlines.
52 For `@matches`, `PATTERN` is a Python-supported regular expression.
53 The file remains intact but the regexp is matched without the `MULTILINE`
54 and `IGNORECASE` options. You can still use a prefix `(?m)` or `(?i)`
55 to override them, and `\A` and `\Z` for definitely matching
56 the beginning and end of the file.
58 (The same distinction goes to other variants of these commands.)
60 * `@has PATH XPATH PATTERN` and `@matches PATH XPATH PATTERN` checks for
61 the presence of the given XPath `XPATH` in the specified HTML file,
62 and also the occurrence of the given pattern `PATTERN` in the matching
63 node or attribute. Only one occurrence of the pattern in the match is enough.
66 `PATH` should be a valid and well-formed HTML file. It does *not*
67 accept arbitrary HTML5; it should have matching open and close tags
68 and correct entity references at least.
70 `XPATH` is an XPath expression to match. The XPath is fairly limited:
71 `tag`, `*`, `.`, `//`, `..`, `[@attr]`, `[@attr='value']`, `[tag]`,
72 `[POS]` (element located in given `POS`), `[last()-POS]`, `text()`
73 and `@attr` (both as the last segment) are supported. Some examples:
75 - `//pre` or `.//pre` matches any element with a name `pre`.
76 - `//a[@href]` matches any element with an `href` attribute.
77 - `//*[@class="impl"]//code` matches any element with a name `code`,
78 which is a descendant of some element whose `class` attr is `impl`.
79 - `//h1[@class="fqn"]/span[1]/a[last()]/@class` matches a value of
80 `class` attribute in the last `a` element (can be followed by more
81 elements that are not `a`) inside the first `span` in the `h1` with
82 a class of `fqn`. Note that there cannot be any additional elements
83 between them due to the use of `/` instead of `//`.
85 Do not try to use non-absolute paths, it won't work due to the flawed
86 ElementTree implementation. The script rejects them.
88 For the text matches (i.e. paths not ending with `@attr`), any
89 subelements are flattened into one string; this is handy for ignoring
90 highlights for example. If you want to simply check for the presence of
91 a given node or attribute, use an empty string (`""`) as a `PATTERN`.
93 * `@count PATH XPATH COUNT` checks for the occurrence of the given XPath
94 in the specified file. The number of occurrences must match the given count.
97 * `@count PATH XPATH TEXT COUNT` checks for the occurrence of the given XPath
98 with the given text in the specified file. The number of occurrences must
99 match the given count.
101 * `@snapshot NAME PATH XPATH` creates a snapshot test named NAME.
102 A snapshot test captures a subtree of the DOM, at the location
103 determined by the XPath, and compares it to a pre-recorded value
104 in a file. The file's name is the test's name with the `.rs` extension
105 replaced with `.NAME.html`, where NAME is the snapshot's name.
107 htmldocck supports the `--bless` option to accept the current subtree
108 as expected, saving it to the file determined by the snapshot's name.
109 compiletest's `--bless` flag is forwarded to htmldocck.
111 * `@has-dir PATH` checks for the existence of the given directory.
113 All conditions can be negated with `!`. `@!has foo/type.NoSuch.html`
114 checks if the given file does not exist, for example.
from __future__ import absolute_import, print_function, unicode_literals

import codecs
import io
import os.path
import re
import shlex
import sys
from collections import namedtuple

try:
    from html.parser import HTMLParser
except ImportError:  # Python 2
    from HTMLParser import HTMLParser

try:
    from xml.etree import cElementTree as ET
except ImportError:  # no cElementTree on this interpreter
    from xml.etree import ElementTree as ET

try:
    from html.entities import name2codepoint
except ImportError:  # Python 2
    from htmlentitydefs import name2codepoint
# "void elements" (no closing tag) from the HTML Standard section 12.1.2
VOID_ELEMENTS = {'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
                 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr'}

# Python 2 -> 3 compatibility: `unichr` does not exist on Python 3, where
# `chr` already accepts any code point.
try:
    unichr
except NameError:
    unichr = chr

# Value substituted for every `{{channel}}` placeholder in XPaths, patterns
# and snapshots. The variable is mandatory: importing this module raises
# `KeyError` when it is not set.
channel = os.environ["DOC_RUST_LANG_ORG_CHANNEL"]

# Initialized in main
rust_test_path = None
bless = None
class CustomHTMLParser(HTMLParser):
    """simplified HTML parser.

    this is possible because we are dealing with very regular HTML from
    rustdoc; we only have to deal with i) void elements and ii) empty
    attributes.

    Every parser event is forwarded to an ElementTree builder, so that
    `close()` returns whatever the builder produced.
    """

    def __init__(self, target=None):
        """`target` is an optional tree builder; a fresh `ET.TreeBuilder` is
        used when it is omitted."""
        HTMLParser.__init__(self)
        self.__builder = target or ET.TreeBuilder()

    def handle_starttag(self, tag, attrs):
        # Attributes without a value are reported as None; store '' instead.
        attrs = {k: v or '' for k, v in attrs}
        self.__builder.start(tag, attrs)
        # Void elements never get an end tag, so close them right away.
        if tag in VOID_ELEMENTS:
            self.__builder.end(tag)

    def handle_endtag(self, tag):
        self.__builder.end(tag)

    def handle_startendtag(self, tag, attrs):
        # Self-closing form (`<tag/>`): open and close in one go.
        attrs = {k: v or '' for k, v in attrs}
        self.__builder.start(tag, attrs)
        self.__builder.end(tag)

    def handle_data(self, data):
        self.__builder.data(data)

    def handle_entityref(self, name):
        # An unknown entity name raises KeyError here.
        self.__builder.data(unichr(name2codepoint[name]))

    def handle_charref(self, name):
        # `&#xHH;` / `&#XHH;` are hexadecimal, `&#DD;` is decimal.
        code = int(name[1:], 16) if name.startswith(('x', 'X')) else int(name, 10)
        self.__builder.data(unichr(code))

    def close(self):
        """Finishes parsing and returns the result of the builder's `close()`."""
        HTMLParser.close(self)
        return self.__builder.close()
# One parsed `@command`: whether it was negated with `!`, the command name,
# its argument list, the line number reported in errors and the source line.
Command = namedtuple('Command', 'negated cmd args lineno context')
class FailedCheck(Exception):
    """Raised when a well-formed check does not hold for the generated output."""
class InvalidCheck(Exception):
    """Raised when a check itself is malformed (bad arguments, unknown command, ...)."""
def concat_multi_lines(f):
    """returns a generator out of the file object, which
    - joins a line ending with a backslash with the following line, after
      removing the backslash, the line break, the prefix the following line
      shares with the first line of the group, and leading whitespace;
    - keeps a line number (starting from 0) of the first line being
      concatenated.

    Yields `(first_line_number, joined_line)` pairs. A backslash on the very
    last line is reported through `print_err` and that unfinished group is
    not yielded."""
    lastline = None  # first line of the pending group while a backslash is open
    firstlineno = None
    catenated = ''
    for lineno, line in enumerate(f):
        line = line.rstrip('\r\n')

        # strip the common prefix from the current line if needed
        if lastline is not None:
            common_prefix = os.path.commonprefix([line, lastline])
            line = line[len(common_prefix):].lstrip()

        # Compare against None explicitly: line number 0 is falsy, and
        # `firstlineno or lineno` would replace it with a later line number.
        if firstlineno is None:
            firstlineno = lineno
        if line.endswith('\\'):
            if lastline is None:
                lastline = line[:-1]
            catenated += line[:-1]
        else:
            yield firstlineno, catenated + line
            lastline = None
            firstlineno = None
            catenated = ''

    if lastline is not None:
        print_err(lineno, line, 'Trailing backslash at the end of the file')
# Matches one command inside a template line. The `@` (or an erroneous `!`
# placed before it) must not follow a non-whitespace character.
#   invalid: a `!` written before the `@` (wrong spelling of a negation)
#   negated: a `!` written after the `@`
#   cmd:     letters, optionally hyphen-separated
#   args:    the rest of the line, still unsplit
LINE_PATTERN = re.compile(r'''
    (?<=(?<!\S))(?P<invalid>!?)@(?P<negated>!?)
    (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
    (?P<args>.*)$
''', re.X | re.UNICODE)
def get_commands(template):
    """Yields a `Command` for every well-formed `@command` found in the
    UTF-8 file `template`.

    Lines are first joined by `concat_multi_lines`. Lines without a command
    are skipped. Malformed commands (`!@cmd`, arguments glued to the command
    name, unbalanced quotes) are reported through `print_err` and skipped.
    `Command.lineno` is 1-based."""
    with io.open(template, encoding='utf-8') as f:
        for lineno, line in concat_multi_lines(f):
            m = LINE_PATTERN.search(line)
            if not m:
                continue

            negated = (m.group('negated') == '!')
            cmd = m.group('cmd')
            if m.group('invalid') == '!':
                print_err(
                    lineno,
                    line,
                    'Invalid command: `!@{0}{1}`, (help: try with `@!{1}`)'.format(
                        '!' if negated else '',
                        cmd,
                    ),
                )
                continue
            args = m.group('args')
            if args and not args[:1].isspace():
                print_err(lineno, line, 'Invalid template syntax')
                continue
            try:
                args = shlex.split(args)
            except UnicodeEncodeError:
                # Fallback for a `shlex` that only accepts byte strings.
                args = [arg.decode('utf-8') for arg in shlex.split(args.encode('utf-8'))]
            except ValueError as err:
                # e.g. "No closing quotation": report it like any other
                # template error instead of aborting with a traceback.
                print_err(lineno, line, 'Invalid template syntax: {}'.format(err))
                continue
            yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno+1, context=line)
277 def _flatten(node, acc):
279 acc.append(node.text)
def make_xml(text):
    """Parses the markup fragment `text` and returns it wrapped in a
    synthetic `<xml>` root element, so that a fragment with several top-level
    nodes (or only text) still parses."""
    xml = ET.XML('<xml>%s</xml>' % text)
    return xml
def normalize_xpath(path):
    """Returns `path` in the form expected by `findall`.

    `{{channel}}` is replaced by the configured channel first. A path
    starting with `//` gets a leading `.`; a path starting with `.//` is
    returned as is. Anything else raises `InvalidCheck`."""
    path = path.replace("{{channel}}", channel)
    if path.startswith('//'):
        return '.' + path  # avoid warnings
    elif path.startswith('.//'):
        return path
    else:
        raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')
class CachedFiles(object):
    """Loads files below an output directory, caching file contents and
    parsed trees, and remembers the last path used so that `-` can repeat it."""

    def __init__(self, root):
        self.root = root
        self.files = {}  # normalized relative path -> file contents
        self.trees = {}  # normalized relative path -> parsed tree
        self.last_path = None

    def resolve_path(self, path):
        """Returns the normalized form of `path` and remembers it, or the
        remembered path when `path` is `-`.

        Raises `InvalidCheck` when `-` is used before any real path."""
        if path != '-':
            path = os.path.normpath(path)
            self.last_path = path
            return path
        elif self.last_path is None:
            raise InvalidCheck('Tried to use the previous path in the first command')
        else:
            return self.last_path

    def _existing_file(self, path):
        """Returns the absolute location of the resolved `path`, raising
        `FailedCheck` unless it is a regular file."""
        abspath = os.path.join(self.root, path)
        # `isfile` is already False for a path that does not exist.
        if not os.path.isfile(abspath):
            raise FailedCheck('File does not exist {!r}'.format(path))
        return abspath

    def get_file(self, path):
        """Returns the UTF-8 decoded contents of `path` (cached).

        Raises `FailedCheck` when the file does not exist."""
        path = self.resolve_path(path)
        if path in self.files:
            return self.files[path]

        with io.open(self._existing_file(path), encoding='utf-8') as f:
            data = f.read()
        self.files[path] = data
        return data

    def get_tree(self, path):
        """Returns the tree parsed from `path` (cached).

        Raises `FailedCheck` when the file does not exist and `RuntimeError`
        when it cannot be parsed."""
        path = self.resolve_path(path)
        if path in self.trees:
            return self.trees[path]

        with io.open(self._existing_file(path), encoding='utf-8') as f:
            try:
                tree = ET.fromstringlist(f.readlines(), CustomHTMLParser())
            except Exception as e:
                raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
        self.trees[path] = tree
        return tree

    def get_dir(self, path):
        """Raises `FailedCheck` unless `path` is an existing directory."""
        path = self.resolve_path(path)
        abspath = os.path.join(self.root, path)
        if not os.path.isdir(abspath):
            raise FailedCheck('Directory does not exist {!r}'.format(path))
def check_string(data, pat, regexp):
    """Tells whether `pat` occurs in `data`.

    `{{channel}}` in `pat` is replaced by the configured channel. An empty
    pattern always matches. With `regexp` set, `pat` is searched as a regular
    expression in the unmodified `data`; otherwise both strings are
    whitespace-normalized and `pat` must be a substring of `data`."""
    pat = pat.replace("{{channel}}", channel)
    if not pat:
        return True  # special case a presence testing
    elif regexp:
        return re.search(pat, data, flags=re.UNICODE) is not None
    else:
        data = ' '.join(data.split())
        pat = ' '.join(pat.split())
        return pat in data
def check_tree_attr(tree, path, attr, pat, regexp):
    """Tells whether some element matched by `path` carries an attribute
    `attr` whose value matches `pat` (see `check_string`).

    Elements without that attribute are ignored; the search stops at the
    first match."""
    path = normalize_xpath(path)
    return any(
        check_string(e.attrib[attr], pat, regexp)
        for e in tree.findall(path)
        if attr in e.attrib
    )
def check_tree_text(tree, path, pat, regexp, stop_at_first):
    """Returns the number of elements matched by `path` whose flattened text
    matches `pat` (a regular expression when `regexp` is set, see
    `check_string`).

    With `stop_at_first` set, counting stops after the first match, so the
    result is 0 or 1. Any error while querying the tree is announced on
    standard output and re-raised."""
    path = normalize_xpath(path)
    match_count = 0
    try:
        for e in tree.findall(path):
            try:
                value = flatten(e)
            except KeyError:
                continue
            else:
                if check_string(value, pat, regexp):
                    match_count += 1
                    if stop_at_first:
                        break
    except Exception:
        print('Failed to get path "{}"'.format(path))
        raise
    return match_count
def get_tree_count(tree, path):
    """Returns how many elements of `tree` the XPath `path` matches."""
    matches = tree.findall(normalize_xpath(path))
    return len(matches)
def check_snapshot(snapshot_name, actual_tree, normalize_to_text):
    """Compares `actual_tree` with the snapshot saved next to the test file.

    The snapshot lives in the test's path with `.rs` replaced by
    `.<snapshot_name>.html`. `actual_tree` is compared as markup, or as
    flattened text when `normalize_to_text` is set; `{{channel}}` in the
    saved snapshot is replaced by the configured channel first.

    When the snapshot is missing, empty or different: under `--bless` the
    snapshot file is (re)written with the actual value, otherwise both values
    are printed and `FailedCheck` is raised."""
    assert rust_test_path.endswith('.rs')
    snapshot_path = '{}.{}.{}'.format(rust_test_path[:-3], snapshot_name, 'html')
    try:
        # Explicit UTF-8, like every other file this script reads.
        with io.open(snapshot_path, 'r', encoding='utf-8') as snapshot_file:
            expected_str = snapshot_file.read()
    except FileNotFoundError:
        if bless:
            expected_str = None
        else:
            raise FailedCheck('No saved snapshot value')

    if not normalize_to_text:
        actual_str = ET.tostring(actual_tree).decode('utf-8')
    else:
        actual_str = flatten(actual_tree)

    # There is nothing to substitute in when the snapshot is being created.
    if expected_str is not None:
        expected_str = expected_str.replace("{{channel}}", channel)

    # The snapshot needs attention when:
    #  1. there is no (or an empty) expected value,
    #  2. the actual and expected trees differ, or
    #  3. the actual and expected texts differ.
    if not expected_str:
        differs = True
    elif normalize_to_text:
        differs = actual_str != expected_str
    else:
        differs = not compare_tree(make_xml(actual_str), make_xml(expected_str), stderr)

    if not differs:
        return

    if bless:
        with io.open(snapshot_path, 'w', encoding='utf-8') as snapshot_file:
            snapshot_file.write(actual_str)
    else:
        print('--- expected ---\n')
        print(expected_str)
        print('\n\n--- actual ---\n')
        print(actual_str)
        print()
        raise FailedCheck('Actual snapshot value is different than expected')
# Adapted from https://github.com/formencode/formencode/blob/3a1ba9de2fdd494dd945510a4568a3afeddb0b2e/formencode/doctest_xml_compare.py#L72-L120
def compare_tree(x1, x2, reporter=None):
    """Tells whether the element trees `x1` and `x2` are equivalent.

    Tags, attributes, text, tail text (both compared with `text_compare`) and
    children are compared recursively. On the first difference, a
    description is passed to `reporter` (if given) and False is returned;
    differences inside children are reported innermost first."""
    if x1.tag != x2.tag:
        if reporter:
            reporter('Tags do not match: %s and %s' % (x1.tag, x2.tag))
        return False
    for name, value in x1.attrib.items():
        if x2.attrib.get(name) != value:
            if reporter:
                reporter('Attributes do not match: %s=%r, %s=%r'
                         % (name, value, name, x2.attrib.get(name)))
            return False
    for name in x2.attrib:
        if name not in x1.attrib:
            if reporter:
                reporter('x2 has an attribute x1 is missing: %s'
                         % name)
            return False
    if not text_compare(x1.text, x2.text):
        if reporter:
            reporter('text: %r != %r' % (x1.text, x2.text))
        return False
    if not text_compare(x1.tail, x2.tail):
        if reporter:
            reporter('tail: %r != %r' % (x1.tail, x2.tail))
        return False
    cl1 = list(x1)
    cl2 = list(x2)
    if len(cl1) != len(cl2):
        if reporter:
            reporter('children length differs, %i != %i'
                     % (len(cl1), len(cl2)))
        return False
    # Children are numbered from 1 in the report.
    for i, (c1, c2) in enumerate(zip(cl1, cl2), 1):
        if not compare_tree(c1, c2, reporter=reporter):
            if reporter:
                reporter('children %i do not match: %s'
                         % (i, c1.tag))
            return False
    return True
def text_compare(t1, t2):
    """Tells whether two text values are equivalent.

    Two empty values (None or '') are equal, a `*` on either side matches
    anything, and otherwise the values are compared after stripping
    surrounding whitespace (None counting as '')."""
    if not t1 and not t2:
        return True
    if t1 == '*' or t2 == '*':
        return True
    return (t1 or '').strip() == (t2 or '').strip()
def stderr(*args):
    """Prints `args` to standard error, like `print` does.

    On Python 2 the stream is wrapped in a UTF-8 writer first."""
    if sys.version_info.major < 3:
        stream = codecs.getwriter('utf-8')(sys.stderr)
    else:
        stream = sys.stderr

    print(*args, file=stream)
# Number of errors reported so far; bumped by `print_err` and read by the
# entry point to decide on the exit status.
ERR_COUNT = 0


def print_err(lineno, context, err, message=None):
    """Reports one error on standard error and counts it in `ERR_COUNT`.

    The first line is `<lineno>: <message or err>`. When both `message` and
    `err` are given, `err` follows on an indented line; a non-empty
    `context` (the offending source line) is printed last, indented too."""
    global ERR_COUNT
    ERR_COUNT += 1
    stderr("{}: {}".format(lineno, message or err))
    if message and err:
        stderr("\t{}".format(err))

    if context:
        stderr("\t{}".format(context))
def get_nb_matching_elements(cache, c, regexp, stop_at_first):
    """Returns the number of matches for the XPath check described by `c`.

    `c.args` holds the file path, the XPath and the pattern. An XPath
    containing `/@` is an attribute check and yields 1 or 0; any other XPath
    (with an optional trailing `/text()` dropped) is a text check and yields
    the number of matching elements, at most 1 when `stop_at_first` is set.
    `regexp` selects regular-expression matching in both cases."""
    tree = cache.get_tree(c.args[0])
    pat, sep, attr = c.args[1].partition('/@')
    if sep:  # attribute
        # Forward `regexp` so that `@matches` treats the pattern as a regular
        # expression for attributes as well; `int` keeps the result a count.
        return int(check_tree_attr(tree, pat, attr, c.args[2], regexp))

    # normalized text
    pat = c.args[1]
    if pat.endswith('/text()'):
        pat = pat[:-7]
    return check_tree_text(tree, pat, c.args[2], regexp, stop_at_first)
def _parse_count(text):
    """Converts the `COUNT` argument of `@count`; a non-integer value is an
    invalid check rather than a crash."""
    try:
        return int(text)
    except ValueError:
        raise InvalidCheck('Invalid @count COUNT {!r}: expected an integer'.format(text))


def check_command(c, cache):
    """Runs the single command `c` against the files served by `cache`.

    Nothing is returned: a check whose outcome equals `c.negated` (that is,
    a failed plain check or a successful negated one) and any malformed
    command are reported through `print_err`."""
    try:
        cerr = ""
        if c.cmd == 'has' or c.cmd == 'matches':  # string test
            regexp = (c.cmd == 'matches')
            if len(c.args) == 1 and not regexp:  # @has <path> = file existence
                try:
                    cache.get_file(c.args[0])
                    ret = True
                except FailedCheck as err:
                    cerr = str(err)
                    ret = False
            elif len(c.args) == 2:  # @has/matches <path> <pat> = string test
                cerr = "`PATTERN` did not match"
                ret = check_string(cache.get_file(c.args[0]), c.args[1], regexp)
            elif len(c.args) == 3:  # @has/matches <path> <pat> <match> = XML tree test
                cerr = "`XPATH PATTERN` did not match"
                ret = get_nb_matching_elements(cache, c, regexp, True) != 0
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))

        elif c.cmd == 'count':  # count test
            if len(c.args) == 3:  # @count <path> <pat> <count> = count test
                expected = _parse_count(c.args[2])
                found = get_tree_count(cache.get_tree(c.args[0]), c.args[1])
                cerr = "Expected {} occurrences but found {}".format(expected, found)
                ret = expected == found
            elif len(c.args) == 4:  # @count <path> <pat> <text> <count> = count test
                expected = _parse_count(c.args[3])
                found = get_nb_matching_elements(cache, c, False, False)
                cerr = "Expected {} occurrences but found {}".format(expected, found)
                ret = found == expected
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))

        elif c.cmd == 'snapshot':  # snapshot test
            if len(c.args) == 3:  # @snapshot <snapshot-name> <html-path> <xpath>
                [snapshot_name, html_path, pattern] = c.args
                tree = cache.get_tree(html_path)
                xpath = normalize_xpath(pattern)
                normalize_to_text = False
                if xpath.endswith('/text()'):
                    xpath = xpath[:-7]
                    normalize_to_text = True

                subtrees = tree.findall(xpath)
                if len(subtrees) == 1:
                    [subtree] = subtrees
                    try:
                        check_snapshot(snapshot_name, subtree, normalize_to_text)
                        ret = True
                    except FailedCheck as err:
                        cerr = str(err)
                        ret = False
                elif len(subtrees) == 0:
                    raise FailedCheck('XPATH did not match')
                else:
                    raise FailedCheck('Expected 1 match, but found {}'.format(len(subtrees)))
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))

        elif c.cmd == 'has-dir':  # has-dir test
            if len(c.args) == 1:  # @has-dir <path> = has-dir test
                try:
                    cache.get_dir(c.args[0])
                    ret = True
                except FailedCheck as err:
                    cerr = str(err)
                    ret = False
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))

        elif c.cmd == 'valid-html':
            raise InvalidCheck('Unimplemented @valid-html')

        elif c.cmd == 'valid-links':
            raise InvalidCheck('Unimplemented @valid-links')

        else:
            raise InvalidCheck('Unrecognized @{}'.format(c.cmd))

        # A plain check fails when it is false, a negated one when it is true.
        if ret == c.negated:
            raise FailedCheck(cerr)

    except FailedCheck as err:
        message = '@{}{} check failed'.format('!' if c.negated else '', c.cmd)
        print_err(c.lineno, c.context, str(err), message)
    except InvalidCheck as err:
        print_err(c.lineno, c.context, str(err))
def check(target, commands):
    """Runs every command of the iterable `commands` against the output
    directory `target`, sharing one file cache between them."""
    cache = CachedFiles(target)
    for c in commands:
        check_command(c, cache)
if __name__ == '__main__':
    # Usage: htmldocck.py <doc dir> <template> [--bless]
    if len(sys.argv) not in [3, 4]:
        stderr('Usage: {} <doc dir> <template> [--bless]'.format(sys.argv[0]))
        raise SystemExit(1)

    rust_test_path = sys.argv[2]
    if len(sys.argv) > 3 and sys.argv[3] == '--bless':
        bless = True
    else:
        # We only support `--bless` at the end of the arguments.
        # This assert is to prevent silent failures.
        assert '--bless' not in sys.argv
        bless = False
    check(sys.argv[1], get_commands(rust_test_path))
    # Errors were already printed one by one; finish with a summary and a
    # failing exit status.
    if ERR_COUNT:
        stderr("\nEncountered {} errors".format(ERR_COUNT))
        raise SystemExit(1)