# -*- coding: utf-8 -*-

r"""
htmldocck.py is a custom checker script for Rustdoc HTML outputs.

The principle is simple: This script receives a path to generated HTML
documentation and a "template" script, which has a series of check
commands like `@has` or `@matches`. Each command is used to check if
some pattern is present or not present in the particular file or in
a particular node of the HTML tree. In many cases, the template script
happens to be the source code given to rustdoc.

While it indeed is possible to test in smaller portions, it has been
hard to construct tests in this fashion and major rendering errors were
discovered much later. This script is designed to make black-box and
regression testing of Rustdoc easy. This does not preclude the need for
unit testing, but can be used to complement related tests by quickly
showing the expected renderings.

In order to avoid one-off dependencies for this task, this script uses
a reasonably working HTML parser and the existing XPath implementation
from Python's standard library. Hopefully, we won't render
non-well-formed HTML.

Commands start with an `@` followed by a command name (letters and
hyphens), and zero or more arguments separated by one or more whitespace
characters and optionally delimited with single or double quotes. The `@`
mark cannot be preceded by a non-whitespace character. Other lines
(including every text up to the first `@`) are ignored, but it is
recommended to avoid the use of `@` in the template file.

There are a number of supported commands:

* `@has PATH` checks for the existence of the given file.

  `PATH` is relative to the output directory. It can be given as `-`
  which repeats the most recently used `PATH`.

* `@hasraw PATH PATTERN` and `@matchesraw PATH PATTERN` check
  for the occurrence of the given pattern `PATTERN` in the specified file.
  Only one occurrence of the pattern is enough.

  For `@hasraw`, `PATTERN` is a whitespace-normalized (every consecutive
  whitespace being replaced by one single space character) string.
  The entire file is also whitespace-normalized including newlines.

  For `@matchesraw`, `PATTERN` is a Python-supported regular expression.
  The file remains intact but the regexp is matched without the `MULTILINE`
  and `IGNORECASE` options. You can still use a prefix `(?m)` or `(?i)`
  to override them, and `\A` and `\Z` for definitely matching
  the beginning and end of the file.

  (The same distinction goes to other variants of these commands.)

* `@has PATH XPATH PATTERN` and `@matches PATH XPATH PATTERN` check for
  the presence of the given XPath `XPATH` in the specified HTML file,
  and also the occurrence of the given pattern `PATTERN` in the matching
  node or attribute. Only one occurrence of the pattern in the match
  is enough.

  `PATH` should be a valid and well-formed HTML file. It does *not*
  accept arbitrary HTML5; it should have matching open and close tags
  and correct entity references at least.

  `XPATH` is an XPath expression to match. The XPath is fairly limited:
  `tag`, `*`, `.`, `//`, `..`, `[@attr]`, `[@attr='value']`, `[tag]`,
  `[POS]` (element located in given `POS`), `[last()-POS]`, `text()`
  and `@attr` (both as the last segment) are supported. Some examples:

  - `//pre` or `.//pre` matches any element with a name `pre`.
  - `//a[@href]` matches any element with an `href` attribute.
  - `//*[@class="impl"]//code` matches any element with a name `code`,
    which is a descendant of some element whose `class` attr is `impl`.
  - `//h1[@class="fqn"]/span[1]/a[last()]/@class` matches a value of
    `class` attribute in the last `a` element (can be followed by more
    elements that are not `a`) inside the first `span` in the `h1` with
    a class of `fqn`. Note that there cannot be any additional elements
    between them due to the use of `/` instead of `//`.

  Do not try to use non-absolute paths, it won't work due to the flawed
  ElementTree implementation. The script rejects them.

  For the text matches (i.e. paths not ending with `@attr`), any
  subelements are flattened into one string; this is handy for ignoring
  highlights for example. If you want to simply check for the presence of
  a given node or attribute, use an empty string (`""`) as a `PATTERN`.

* `@count PATH XPATH COUNT` checks for the occurrence of the given XPath
  in the specified file. The number of occurrences must match the given
  count.

* `@count PATH XPATH TEXT COUNT` checks for the occurrence of the given XPath
  with the given text in the specified file. The number of occurrences must
  match the given count.

* `@snapshot NAME PATH XPATH` creates a snapshot test named NAME.
  A snapshot test captures a subtree of the DOM, at the location
  determined by the XPath, and compares it to a pre-recorded value
  in a file. The file's name is the test's name with the `.rs` extension
  replaced with `.NAME.html`, where NAME is the snapshot's name.

  htmldocck supports the `--bless` option to accept the current subtree
  as expected, saving it to the file determined by the snapshot's name.
  compiletest's `--bless` flag is forwarded to htmldocck.

* `@has-dir PATH` checks for the existence of the given directory.

All conditions can be negated with `!`. `@!has foo/type.NoSuch.html`
checks if the given file does not exist, for example.

"""
from __future__ import absolute_import, print_function, unicode_literals

import codecs
import io
import os.path
import re
import shlex
import sys
from collections import namedtuple

try:
    from html.parser import HTMLParser
except ImportError:
    from HTMLParser import HTMLParser
try:
    from xml.etree import cElementTree as ET
except ImportError:
    from xml.etree import ElementTree as ET
try:
    from html.entities import name2codepoint
except ImportError:
    from htmlentitydefs import name2codepoint
# "void elements" (no closing tag) from the HTML Standard section 12.1.2
VOID_ELEMENTS = {'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
                 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr'}
# Python 2 -> 3 compatibility
try:
    unichr
except NameError:
    unichr = chr


# Value substituted for the literal `{{channel}}` placeholder in XPaths,
# patterns and snapshot files. The lookup fails with KeyError at import time
# when the environment variable is not set.
channel = os.environ["DOC_RUST_LANG_ORG_CHANNEL"]

# Initialized in main
rust_test_path = None
bless = None
class CustomHTMLParser(HTMLParser):
    """simplified HTML parser.

    this is possible because we are dealing with very regular HTML from
    rustdoc; we only have to deal with i) void elements and ii) empty
    attributes.

    Parser events are forwarded to an ElementTree builder (`target`, or a
    fresh `ET.TreeBuilder()`), so the instance can be handed to
    `ET.fromstringlist` as its parser argument.
    """

    def __init__(self, target=None):
        HTMLParser.__init__(self)
        self.__builder = target or ET.TreeBuilder()

    def handle_starttag(self, tag, attrs):
        # Attributes without a value arrive as None; store them as ''.
        attrs = {k: v or '' for k, v in attrs}
        self.__builder.start(tag, attrs)
        # Void elements never get an end tag, so close them immediately.
        if tag in VOID_ELEMENTS:
            self.__builder.end(tag)

    def handle_endtag(self, tag):
        self.__builder.end(tag)

    def handle_startendtag(self, tag, attrs):
        attrs = {k: v or '' for k, v in attrs}
        self.__builder.start(tag, attrs)
        self.__builder.end(tag)

    def handle_data(self, data):
        self.__builder.data(data)

    def handle_entityref(self, name):
        self.__builder.data(unichr(name2codepoint[name]))

    def handle_charref(self, name):
        # `name` is the reference without `&#`: hexadecimal when it starts
        # with x/X, decimal otherwise.
        code = int(name[1:], 16) if name.startswith(('x', 'X')) else int(name, 10)
        self.__builder.data(unichr(code))

    def close(self):
        """Finish parsing and return whatever the builder's `close()` returns."""
        HTMLParser.close(self)
        return self.__builder.close()
# One parsed template command:
#   negated - True when the command was written as `@!cmd`
#   cmd     - the command name, e.g. 'has' or 'has-dir'
#   args    - list of shell-split argument strings
#   lineno  - 1-based line number of the command in the template
#   context - the (possibly concatenated) source line, used in error output
Command = namedtuple('Command', 'negated cmd args lineno context')
class FailedCheck(Exception):
    """Raised when a check ran but its condition did not hold."""
class InvalidCheck(Exception):
    """Raised when a check itself is malformed (bad arguments, unknown command)."""
def concat_multi_lines(f):
    """Return a generator over the lines of `f` that joins continued lines.

    A line ending with a backslash is joined with the following line after
    removing the backslash, the line break, any prefix the following line
    shares with the first continued line, and then leading whitespace.

    Yields `(firstlineno, text)` where `firstlineno` is the 0-based number
    of the first physical line that contributed to `text`. If the input ends
    in the middle of a continuation, an error is reported via `print_err`.
    """
    lastline = None  # set to the last line when the last line has a backslash
    firstlineno = None
    catenated = ''
    for lineno, line in enumerate(f):
        line = line.rstrip('\r\n')

        # strip the common prefix from the current line if needed
        if lastline is not None:
            common_prefix = os.path.commonprefix([line, lastline])
            line = line[len(common_prefix):].lstrip()

        # Compare against None explicitly: line number 0 is falsy, and
        # `firstlineno or lineno` would lose it for a continuation that
        # starts on the very first line.
        if firstlineno is None:
            firstlineno = lineno

        if line.endswith('\\'):
            if lastline is None:
                lastline = line[:-1]
            catenated += line[:-1]
        else:
            yield firstlineno, catenated + line
            lastline = None
            firstlineno = None
            catenated = ''

    if lastline is not None:
        print_err(lineno, line, 'Trailing backslash at the end of the file')
# Matches one command inside a template line. `@` must be at the start of the
# line or preceded by whitespace. Named groups:
#   invalid - a stray `!` written before the `@` (reported as an error)
#   negated - the `!` of `@!cmd`
#   cmd     - the command name (letters, optionally hyphen-separated)
#   args    - everything after the command name up to the end of the line
LINE_PATTERN = re.compile(r'''
    (?<=(?<!\S))(?P<invalid>!?)@(?P<negated>!?)
    (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
    (?P<args>.*)$
''', re.X | re.UNICODE)
def get_commands(template):
    """Yield a `Command` for every check command found in `template`.

    `template` is the path of a UTF-8 text file. Lines without a command are
    skipped. Malformed commands (a `!` before the `@`, or arguments not
    separated from the command name by whitespace) are reported through
    `print_err` and skipped. The yielded `lineno` is 1-based.
    """
    with io.open(template, encoding='utf-8') as f:
        for lineno, line in concat_multi_lines(f):
            m = LINE_PATTERN.search(line)
            if not m:
                continue

            negated = (m.group('negated') == '!')
            cmd = m.group('cmd')
            if m.group('invalid') == '!':
                print_err(
                    lineno,
                    line,
                    'Invalid command: `!@{0}{1}`, (help: try with `@!{1}`)'.format(
                        '!' if negated else '',
                        cmd,
                    ),
                )
                continue
            args = m.group('args')
            if args and not args[:1].isspace():
                print_err(lineno, line, 'Invalid template syntax')
                continue
            try:
                args = shlex.split(args)
            except UnicodeEncodeError:
                # Fallback for shlex implementations that cannot split
                # non-ASCII text: split the encoded bytes, then decode.
                args = [arg.decode('utf-8') for arg in shlex.split(args.encode('utf-8'))]
            yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno+1, context=line)
def _flatten(node, acc):
    """Append the text of `node` and all of its descendants to list `acc`.

    Text is collected in document order: the node's own text, then for each
    child its flattened content followed by that child's tail text.
    """
    if node.text:
        acc.append(node.text)
    for e in node:
        _flatten(e, acc)
        if e.tail:
            acc.append(e.tail)


def flatten(node):
    """Return all text inside `node` as one string, ignoring markup."""
    acc = []
    _flatten(node, acc)
    return ''.join(acc)


def make_xml(text):
    """Parse `text` as an XML fragment wrapped in a synthetic `<xml>` root."""
    xml = ET.XML('<xml>%s</xml>' % text)
    return xml
def normalize_xpath(path):
    """Return `path` in the `.//...` form that ElementTree's `findall` accepts.

    The `{{channel}}` placeholder is replaced with the module-level `channel`
    first. Paths starting with `//` get a leading `.`; paths already starting
    with `.//` are returned as is.

    Raises:
        InvalidCheck: if the path starts with neither `//` nor `.//`.
    """
    path = path.replace("{{channel}}", channel)
    if path.startswith('//'):
        return '.' + path  # avoid warnings
    elif path.startswith('.//'):
        return path
    else:
        raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')
class CachedFiles(object):
    """Loads files below a root directory, caching their text and parsed trees.

    The most recently resolved path is remembered so that a command can pass
    `-` to mean "the same file as the previous command".
    """

    def __init__(self, root):
        self.root = root
        self.files = {}  # normalized relative path -> file contents
        self.trees = {}  # normalized relative path -> parsed element tree
        self.last_path = None

    def resolve_path(self, path):
        """Normalize `path`, or return the previously used path for `-`.

        Raises:
            InvalidCheck: if `-` is used before any real path was resolved.
        """
        if path != '-':
            path = os.path.normpath(path)
            self.last_path = path
            return path
        elif self.last_path is None:
            raise InvalidCheck('Tried to use the previous path in the first command')
        else:
            return self.last_path

    def get_file(self, path):
        """Return the UTF-8 decoded contents of `path` (relative to the root).

        Raises:
            FailedCheck: if the path is not an existing regular file.
        """
        path = self.resolve_path(path)
        if path in self.files:
            return self.files[path]

        abspath = os.path.join(self.root, path)
        # isfile() is already False for a path that does not exist.
        if not os.path.isfile(abspath):
            raise FailedCheck('File does not exist {!r}'.format(path))

        with io.open(abspath, encoding='utf-8') as f:
            data = f.read()
            self.files[path] = data
            return data

    def get_tree(self, path):
        """Return the element tree parsed from the HTML file at `path`.

        Raises:
            FailedCheck: if the path is not an existing regular file.
            RuntimeError: if the file cannot be parsed.
        """
        path = self.resolve_path(path)
        if path in self.trees:
            return self.trees[path]

        abspath = os.path.join(self.root, path)
        if not os.path.isfile(abspath):
            raise FailedCheck('File does not exist {!r}'.format(path))

        with io.open(abspath, encoding='utf-8') as f:
            try:
                tree = ET.fromstringlist(f.readlines(), CustomHTMLParser())
            except Exception as e:
                raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
            self.trees[path] = tree
            return self.trees[path]

    def get_dir(self, path):
        """Check that `path` (relative to the root) is an existing directory.

        Raises:
            FailedCheck: if it is not.
        """
        path = self.resolve_path(path)
        abspath = os.path.join(self.root, path)
        if not os.path.isdir(abspath):
            raise FailedCheck('Directory does not exist {!r}'.format(path))
def check_string(data, pat, regexp):
    """Return whether `pat` occurs in `data`.

    `{{channel}}` in `pat` is replaced with the module-level `channel`. An
    empty pattern always matches (pure presence test). With `regexp` true,
    `pat` is searched as a regular expression; otherwise both strings are
    whitespace-normalized and a plain substring test is used.
    """
    pat = pat.replace("{{channel}}", channel)
    if not pat:
        return True  # special case a presence testing
    elif regexp:
        return re.search(pat, data, flags=re.UNICODE) is not None
    else:
        data = ' '.join(data.split())
        pat = ' '.join(pat.split())
        return pat in data
def check_tree_attr(tree, path, attr, pat, regexp):
    """Return True if some element matching `path` has `attr` matching `pat`.

    Elements that lack the attribute are skipped; the search stops at the
    first element whose attribute value satisfies `check_string`.
    """
    path = normalize_xpath(path)
    for e in tree.findall(path):
        if attr in e.attrib and check_string(e.attrib[attr], pat, regexp):
            return True
    return False
# Returns the number of occurrences matching the regex (`regexp`) and the text (`pat`).
def check_tree_text(tree, path, pat, regexp, stop_at_first):
    """Count the elements at `path` whose flattened text matches `pat`.

    With `stop_at_first` true the count stops at 1, which is enough for a
    presence test. Any exception raised while searching is re-raised after
    the offending path has been printed.
    """
    path = normalize_xpath(path)
    match_count = 0
    try:
        for e in tree.findall(path):
            try:
                value = flatten(e)
            except KeyError:
                continue
            else:
                if check_string(value, pat, regexp):
                    match_count += 1
                    if stop_at_first:
                        break
    except Exception:
        print('Failed to get path "{}"'.format(path))
        raise
    return match_count
def get_tree_count(tree, path):
    """Return how many elements of `tree` match the XPath `path`."""
    path = normalize_xpath(path)
    return len(tree.findall(path))
def check_snapshot(snapshot_name, actual_tree, normalize_to_text):
    """Compare `actual_tree` with the snapshot saved next to the test file.

    The snapshot path is the test path with `.rs` replaced by
    `.<snapshot_name>.html`. With `normalize_to_text` true the flattened text
    is compared; otherwise the serialized subtree is compared structurally.
    When the module-level `bless` flag is set, a missing or differing
    snapshot is (re)written instead of failing.

    Raises:
        FailedCheck: if no snapshot exists or it differs, and `bless` is off.
    """
    assert rust_test_path.endswith('.rs')
    snapshot_path = '{}.{}.{}'.format(rust_test_path[:-3], snapshot_name, 'html')
    try:
        # Explicit UTF-8, like every other file read in this script, so the
        # result does not depend on the platform's default encoding.
        with io.open(snapshot_path, 'r', encoding='utf-8') as snapshot_file:
            expected_str = snapshot_file.read().replace("{{channel}}", channel)
    except FileNotFoundError:
        if bless:
            expected_str = None
        else:
            raise FailedCheck('No saved snapshot value')

    if not normalize_to_text:
        actual_str = ET.tostring(actual_tree).decode('utf-8')
    else:
        actual_str = flatten(actual_tree)

    # Conditions:
    #  1. Is --bless (no snapshot yet) or is the snapshot empty
    #  2. Are actual and expected tree different
    #  3. Are actual and expected text different
    if not expected_str:
        differs = True
    elif normalize_to_text:
        differs = actual_str != expected_str
    else:
        differs = not compare_tree(make_xml(actual_str), make_xml(expected_str), stderr)

    if differs:
        if bless:
            with io.open(snapshot_path, 'w', encoding='utf-8') as snapshot_file:
                actual_str = actual_str.replace(channel, "{{channel}}")
                snapshot_file.write(actual_str)
        else:
            print('--- expected ---\n')
            print(expected_str)
            print('\n\n--- actual ---\n')
            print(actual_str)
            print()
            raise FailedCheck('Actual snapshot value is different than expected')
# Adapted from https://github.com/formencode/formencode/blob/3a1ba9de2fdd494dd945510a4568a3afeddb0b2e/formencode/doctest_xml_compare.py#L72-L120
def compare_tree(x1, x2, reporter=None):
    """Return True if element trees `x1` and `x2` are equivalent.

    Tags, attributes, text, tail and children are compared recursively, with
    text compared through `text_compare`. On the first difference found, a
    description is passed to `reporter` (when given) and False is returned.
    """
    if x1.tag != x2.tag:
        if reporter:
            reporter('Tags do not match: %s and %s' % (x1.tag, x2.tag))
        return False
    for name, value in x1.attrib.items():
        if x2.attrib.get(name) != value:
            if reporter:
                reporter('Attributes do not match: %s=%r, %s=%r'
                         % (name, value, name, x2.attrib.get(name)))
            return False
    for name in x2.attrib:
        if name not in x1.attrib:
            if reporter:
                reporter('x2 has an attribute x1 is missing: %s'
                         % name)
            return False
    if not text_compare(x1.text, x2.text):
        if reporter:
            reporter('text: %r != %r' % (x1.text, x2.text))
        return False
    if not text_compare(x1.tail, x2.tail):
        if reporter:
            reporter('tail: %r != %r' % (x1.tail, x2.tail))
        return False
    cl1 = list(x1)
    cl2 = list(x2)
    if len(cl1) != len(cl2):
        if reporter:
            reporter('children length differs, %i != %i'
                     % (len(cl1), len(cl2)))
        return False
    # Children are numbered from 1 in the report.
    for i, (c1, c2) in enumerate(zip(cl1, cl2), 1):
        if not compare_tree(c1, c2, reporter=reporter):
            if reporter:
                reporter('children %i do not match: %s'
                         % (i, c1.tag))
            return False
    return True
def text_compare(t1, t2):
    """Return True if two text values are considered equal.

    Empty and None are interchangeable, a value of `*` on either side matches
    anything, and otherwise surrounding whitespace is ignored.
    """
    if not t1 and not t2:
        return True
    if t1 == '*' or t2 == '*':
        return True
    return (t1 or '').strip() == (t2 or '').strip()
def stderr(*args):
    """Print `args` to standard error, like `print` does for stdout.

    On Python 2 the stream is wrapped in a UTF-8 writer first.
    """
    if sys.version_info.major < 3:
        file = codecs.getwriter('utf-8')(sys.stderr)
    else:
        file = sys.stderr

    print(*args, file=file)
# Number of errors reported through `print_err` so far.
ERR_COUNT = 0


def print_err(lineno, context, err, message=None):
    """Report one error on standard error and count it in `ERR_COUNT`.

    The first line is `lineno: message` (or `lineno: err` when no message is
    given). When both are given, `err` follows on its own tab-indented line.
    A non-empty `context` is printed last, also tab-indented.
    """
    global ERR_COUNT
    ERR_COUNT += 1
    stderr("{}: {}".format(lineno, message or err))
    if message and err:
        stderr("\t{}".format(err))

    if context:
        stderr("\t{}".format(context))
def get_nb_matching_elements(cache, c, regexp, stop_at_first):
    """Evaluate the `PATH XPATH PATTERN` arguments of command `c`.

    If the XPath contains `/@`, the part after it names an attribute and the
    result is the boolean returned by `check_tree_attr`. Otherwise a trailing
    `/text()` is dropped and the result is the match count returned by
    `check_tree_text`.
    """
    tree = cache.get_tree(c.args[0])
    pat, sep, attr = c.args[1].partition('/@')
    if sep:  # attribute
        # Pass `regexp` through so that `@matches` treats the pattern as a
        # regular expression for attributes too, as it does for text.
        return check_tree_attr(tree, pat, attr, c.args[2], regexp)
    else:  # normalized text
        pat = c.args[1]
        if pat.endswith('/text()'):
            pat = pat[:-7]
        return check_tree_text(tree, pat, c.args[2], regexp, stop_at_first)
def check_command(c, cache):
    """Run the single command `c` against the files served by `cache`.

    Nothing is returned or raised for check outcomes: a failed or invalid
    check is reported through `print_err`. A check fails when its result
    equals `c.negated`, i.e. a plain check that did not hold or a negated
    check that did.
    """
    try:
        cerr = ""
        if c.cmd in ['has', 'hasraw', 'matches', 'matchesraw']:  # string test
            regexp = c.cmd.startswith('matches')

            # @has <path> = file existence
            if len(c.args) == 1 and not regexp and 'raw' not in c.cmd:
                try:
                    cache.get_file(c.args[0])
                    ret = True
                except FailedCheck as err:
                    cerr = str(err)
                    ret = False
            # @hasraw/matchesraw <path> <pat> = string test
            elif len(c.args) == 2 and 'raw' in c.cmd:
                cerr = "`PATTERN` did not match"
                ret = check_string(cache.get_file(c.args[0]), c.args[1], regexp)
            # @has/matches <path> <pat> <match> = XML tree test
            elif len(c.args) == 3 and 'raw' not in c.cmd:
                cerr = "`XPATH PATTERN` did not match"
                ret = get_nb_matching_elements(cache, c, regexp, True) != 0
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))

        elif c.cmd == 'count':  # count test
            if len(c.args) == 3:  # @count <path> <pat> <count> = count test
                expected = int(c.args[2])
                found = get_tree_count(cache.get_tree(c.args[0]), c.args[1])
                cerr = "Expected {} occurrences but found {}".format(expected, found)
                ret = expected == found
            elif len(c.args) == 4:  # @count <path> <pat> <text> <count> = count test
                expected = int(c.args[3])
                found = get_nb_matching_elements(cache, c, False, False)
                cerr = "Expected {} occurrences but found {}".format(expected, found)
                ret = found == expected
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))

        elif c.cmd == 'snapshot':  # snapshot test
            if len(c.args) == 3:  # @snapshot <snapshot-name> <html-path> <xpath>
                [snapshot_name, html_path, pattern] = c.args
                tree = cache.get_tree(html_path)
                xpath = normalize_xpath(pattern)
                normalize_to_text = False
                if xpath.endswith('/text()'):
                    xpath = xpath[:-7]
                    normalize_to_text = True

                subtrees = tree.findall(xpath)
                if len(subtrees) == 1:
                    [subtree] = subtrees
                    try:
                        check_snapshot(snapshot_name, subtree, normalize_to_text)
                        ret = True
                    except FailedCheck as err:
                        cerr = str(err)
                        ret = False
                elif len(subtrees) == 0:
                    raise FailedCheck('XPATH did not match')
                else:
                    raise FailedCheck('Expected 1 match, but found {}'.format(len(subtrees)))
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))

        elif c.cmd == 'has-dir':  # has-dir test
            if len(c.args) == 1:  # @has-dir <path> = has-dir test
                try:
                    cache.get_dir(c.args[0])
                    ret = True
                except FailedCheck as err:
                    cerr = str(err)
                    ret = False
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))

        elif c.cmd == 'valid-html':
            raise InvalidCheck('Unimplemented @valid-html')

        elif c.cmd == 'valid-links':
            raise InvalidCheck('Unimplemented @valid-links')

        else:
            raise InvalidCheck('Unrecognized @{}'.format(c.cmd))

        if ret == c.negated:
            raise FailedCheck(cerr)

    except FailedCheck as err:
        message = '@{}{} check failed'.format('!' if c.negated else '', c.cmd)
        print_err(c.lineno, c.context, str(err), message)
    except InvalidCheck as err:
        print_err(c.lineno, c.context, str(err))
def check(target, commands):
    """Run every command in `commands` against the output directory `target`.

    One `CachedFiles` instance is shared by all commands, so files are loaded
    once and `-` can refer to the path used by the previous command.
    """
    cache = CachedFiles(target)
    for c in commands:
        check_command(c, cache)
if __name__ == '__main__':
    if len(sys.argv) not in [3, 4]:
        stderr('Usage: {} <doc dir> <template> [--bless]'.format(sys.argv[0]))
        raise SystemExit(1)

    rust_test_path = sys.argv[2]
    if len(sys.argv) > 3 and sys.argv[3] == '--bless':
        bless = True
    else:
        # We only support `--bless` at the end of the arguments.
        # This assert is to prevent silent failures.
        assert '--bless' not in sys.argv
        bless = False
    check(sys.argv[1], get_commands(rust_test_path))
    # Exit with a failure status if any check reported an error.
    if ERR_COUNT:
        stderr("\nEncountered {} errors".format(ERR_COUNT))
        raise SystemExit(1)