2 # -*- coding: utf-8 -*-
5 htmldocck.py is a custom checker script for Rustdoc HTML outputs.
9 The principle is simple: This script receives a path to generated HTML
10 documentation and a "template" script, which has a series of check
11 commands like `@has` or `@matches`. Each command is used to check if
12 some pattern is present or not present in the particular file or in
13 a particular node of the HTML tree. In many cases, the template script
14 happens to be the source code given to rustdoc.
16 While it indeed is possible to test in smaller portions, it has been
17 hard to construct tests in this fashion and major rendering errors were
18 discovered much later. This script is designed to make black-box and
19 regression testing of Rustdoc easy. This does not preclude the needs for
20 unit testing, but can be used to complement related tests by quickly
21 showing the expected renderings.
23 In order to avoid one-off dependencies for this task, this script uses
24 a reasonably working HTML parser and the existing XPath implementation
from Python's standard library. Hopefully, we won't render
non-well-formed HTML.
30 Commands start with an `@` followed by a command name (letters and
31 hyphens), and zero or more arguments separated by one or more whitespace
32 characters and optionally delimited with single or double quotes. The `@`
33 mark cannot be preceded by a non-whitespace character. Other lines
34 (including every text up to the first `@`) are ignored, but it is
35 recommended to avoid the use of `@` in the template file.
37 There are a number of supported commands:
39 * `@has PATH` checks for the existence of the given file.
41 `PATH` is relative to the output directory. It can be given as `-`
42 which repeats the most recently used `PATH`.
44 * `@has PATH PATTERN` and `@matches PATH PATTERN` checks for
45 the occurrence of the given pattern `PATTERN` in the specified file.
46 Only one occurrence of the pattern is enough.
48 For `@has`, `PATTERN` is a whitespace-normalized (every consecutive
49 whitespace being replaced by one single space character) string.
50 The entire file is also whitespace-normalized including newlines.
52 For `@matches`, `PATTERN` is a Python-supported regular expression.
53 The file remains intact but the regexp is matched without the `MULTILINE`
54 and `IGNORECASE` options. You can still use a prefix `(?m)` or `(?i)`
55 to override them, and `\A` and `\Z` for definitely matching
56 the beginning and end of the file.
58 (The same distinction goes to other variants of these commands.)
60 * `@has PATH XPATH PATTERN` and `@matches PATH XPATH PATTERN` checks for
61 the presence of the given XPath `XPATH` in the specified HTML file,
62 and also the occurrence of the given pattern `PATTERN` in the matching
node or attribute. Only one occurrence of the pattern in the match
is enough.
66 `PATH` should be a valid and well-formed HTML file. It does *not*
67 accept arbitrary HTML5; it should have matching open and close tags
68 and correct entity references at least.
70 `XPATH` is an XPath expression to match. The XPath is fairly limited:
71 `tag`, `*`, `.`, `//`, `..`, `[@attr]`, `[@attr='value']`, `[tag]`,
72 `[POS]` (element located in given `POS`), `[last()-POS]`, `text()`
73 and `@attr` (both as the last segment) are supported. Some examples:
75 - `//pre` or `.//pre` matches any element with a name `pre`.
76 - `//a[@href]` matches any element with an `href` attribute.
77 - `//*[@class="impl"]//code` matches any element with a name `code`,
which is a descendant of some element whose `class` attr is `impl`.
79 - `//h1[@class="fqn"]/span[1]/a[last()]/@class` matches a value of
80 `class` attribute in the last `a` element (can be followed by more
81 elements that are not `a`) inside the first `span` in the `h1` with
82 a class of `fqn`. Note that there cannot be any additional elements
83 between them due to the use of `/` instead of `//`.
85 Do not try to use non-absolute paths, it won't work due to the flawed
86 ElementTree implementation. The script rejects them.
88 For the text matches (i.e. paths not ending with `@attr`), any
89 subelements are flattened into one string; this is handy for ignoring
90 highlights for example. If you want to simply check for the presence of
91 a given node or attribute, use an empty string (`""`) as a `PATTERN`.
93 * `@count PATH XPATH COUNT` checks for the occurrence of the given XPath
in the specified file. The number of occurrences must match the given
count.
97 * `@count PATH XPATH TEXT COUNT` checks for the occurrence of the given XPath
98 with the given text in the specified file. The number of occurrences must
99 match the given count.
101 * `@snapshot NAME PATH XPATH` creates a snapshot test named NAME.
102 A snapshot test captures a subtree of the DOM, at the location
103 determined by the XPath, and compares it to a pre-recorded value
104 in a file. The file's name is the test's name with the `.rs` extension
105 replaced with `.NAME.html`, where NAME is the snapshot's name.
107 htmldocck supports the `--bless` option to accept the current subtree
108 as expected, saving it to the file determined by the snapshot's name.
109 compiletest's `--bless` flag is forwarded to htmldocck.
111 * `@has-dir PATH` checks for the existence of the given directory.
113 All conditions can be negated with `!`. `@!has foo/type.NoSuch.html`
114 checks if the given file does not exist, for example.
118 from __future__ import absolute_import, print_function, unicode_literals
126 from collections import namedtuple
128 from html.parser import HTMLParser
130 from HTMLParser import HTMLParser
132 from xml.etree import cElementTree as ET
134 from xml.etree import ElementTree as ET
137 from html.entities import name2codepoint
139 from htmlentitydefs import name2codepoint
# HTML "void elements": tags that never take a closing tag
# (HTML Standard, section 12.1.2).
VOID_ELEMENTS = {
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
    'keygen', 'link', 'menuitem', 'meta', 'param', 'source',
    'track', 'wbr',
}
# Python 2 -> 3 compatibility: Python 3 has no `unichr` builtin, so alias it
# to `chr`. The HTML parser below relies on this name to turn entity and
# character references into text.
try:
    unichr
except NameError:
    unichr = chr
# Release channel name. It replaces every `{{channel}}` placeholder found in
# XPaths, patterns and snapshot files. The lookup happens at import time, so
# a missing environment variable surfaces immediately as a KeyError.
channel = os.environ["DOC_RUST_LANG_ORG_CHANNEL"]

# Path of the template (test) file being checked.
# Initialized in main
rust_test_path = None
class CustomHTMLParser(HTMLParser):
    """Minimal HTML parser that feeds an ElementTree builder.

    This is sufficient because rustdoc emits very regular HTML: the only
    irregularities handled here are i) void elements and ii) empty
    attributes.
    """

    def __init__(self, target=None):
        HTMLParser.__init__(self)
        self.__builder = target or ET.TreeBuilder()

    def __open(self, tag, attrs):
        # Attribute values that are missing or empty are stored as ''.
        self.__builder.start(tag, {key: value or '' for key, value in attrs})

    def handle_starttag(self, tag, attrs):
        self.__open(tag, attrs)
        if tag in VOID_ELEMENTS:
            # No end tag will follow a void element, so close it right away.
            self.__builder.end(tag)

    def handle_endtag(self, tag):
        self.__builder.end(tag)

    def handle_startendtag(self, tag, attrs):
        self.__open(tag, attrs)
        self.__builder.end(tag)

    def handle_data(self, data):
        self.__builder.data(data)

    def handle_entityref(self, name):
        codepoint = name2codepoint[name]
        self.__builder.data(unichr(codepoint))

    def handle_charref(self, name):
        # `name` is the reference without `&#` and `;`; a leading x/X marks hex.
        if name.startswith(('x', 'X')):
            codepoint = int(name[1:], 16)
        else:
            codepoint = int(name, 10)
        self.__builder.data(unichr(codepoint))

    def close(self):
        """Finish parsing and return whatever the tree builder produced."""
        HTMLParser.close(self)
        return self.__builder.close()
# One parsed `@command`: whether it is negated, the command name, its list of
# arguments, the 1-based line number it starts on and the line it came from.
Command = namedtuple('Command', ['negated', 'cmd', 'args', 'lineno', 'context'])
class FailedCheck(Exception):
    """Raised when a check was carried out and its condition did not hold."""
class InvalidCheck(Exception):
    """Raised when a check command itself is malformed or unsupported."""
def concat_multi_lines(f):
    """Return a generator over the logical lines of the file object `f`.

    A physical line that ends with a backslash is joined with the line that
    follows it: the backslash and the line break are removed, then the prefix
    the following line shares with the first line of the group, then any
    whitespace right after that prefix.

    Yields `(lineno, text)` pairs where `lineno` is the line number (starting
    from 0) of the first physical line being concatenated. A backslash on
    the very last line of the input is reported through `print_err`.
    """
    lastline = None  # set to the last line when the last line has a backslash
    firstlineno = None
    catenated = ''
    for lineno, line in enumerate(f):
        line = line.rstrip('\r\n')

        # strip the common prefix from the current line if needed
        if lastline is not None:
            common_prefix = os.path.commonprefix([line, lastline])
            line = line[len(common_prefix):].lstrip()

        # Compare against None explicitly: `firstlineno or lineno` would throw
        # away a perfectly valid line number 0.
        if firstlineno is None:
            firstlineno = lineno

        if line.endswith('\\'):
            if lastline is None:
                lastline = line[:-1]
            catenated += line[:-1]
        else:
            yield firstlineno, catenated + line
            lastline = None
            firstlineno = None
            catenated = ''

    if lastline is not None:
        print_err(lineno, line, 'Trailing backslash at the end of the file')
# Matches one check command: an `@` that is not preceded by a non-whitespace
# character, an optional `!` negation, the command name (letters and
# hyphens) and the rest of the line as the raw argument string. A `!` put
# *before* the `@` is captured as `invalid` so the mistake can be reported.
LINE_PATTERN = re.compile(r'''
    (?<=(?<!\S))(?P<invalid>!?)@(?P<negated>!?)
    (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
    (?P<args>.*)$
''', re.X | re.UNICODE)
def get_commands(template):
    """Yield a `Command` for every check command in the file `template`.

    The file is read as UTF-8 and backslash-continued lines are joined first.
    Lines without a command are skipped. A command written as `!@cmd`, or
    whose arguments are not separated from its name by whitespace, is
    reported through `print_err` and skipped. `Command.lineno` is 1-based.
    """
    with io.open(template, encoding='utf-8') as template_file:
        for lineno, line in concat_multi_lines(template_file):
            found = LINE_PATTERN.search(line)
            if found is None:
                continue

            cmd = found.group('cmd')
            negated = found.group('negated') == '!'
            if found.group('invalid') == '!':
                hint = 'Invalid command: `!@{0}{1}`, (help: try with `@!{1}`)'.format(
                    '!' if negated else '',
                    cmd,
                )
                print_err(lineno, line, hint)
                continue

            raw_args = found.group('args')
            if raw_args and not raw_args[:1].isspace():
                print_err(lineno, line, 'Invalid template syntax')
                continue

            try:
                args = shlex.split(raw_args)
            except UnicodeEncodeError:
                # Fallback: split the UTF-8 bytes and decode each piece
                # (presumably only reachable on Python 2).
                args = [arg.decode('utf-8') for arg in shlex.split(raw_args.encode('utf-8'))]
            yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno+1, context=line)
277 def _flatten(node, acc):
279 acc.append(node.text)
def make_xml(text):
    """Parse `text` as XML wrapped in a single synthetic `<xml>` root element."""
    return ET.XML('<xml>%s</xml>' % text)
def normalize_xpath(path):
    """Return `path` in the `.//...` form expected by `findall`.

    `{{channel}}` placeholders are replaced by the channel name first. A
    path starting with `//` gets a leading `.`; a path already starting with
    `.//` is returned as is.

    Raises `InvalidCheck` for any other (non-absolute) path.
    """
    path = path.replace("{{channel}}", channel)
    if path.startswith('.//'):
        return path
    if path.startswith('//'):
        return '.' + path  # avoid warnings
    raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')
class CachedFiles(object):
    """Loads files below a root directory on demand and keeps them cached.

    Paths are relative to the root. The special path `-` stands for the path
    that was resolved most recently.
    """

    def __init__(self, root):
        self.root = root
        self.files = {}  # normalized relative path -> file contents
        self.trees = {}  # normalized relative path -> parsed element tree
        self.last_path = None

    def resolve_path(self, path):
        """Normalize `path` and remember it, or return the remembered one for `-`.

        Raises `InvalidCheck` when `-` is used before any real path.
        """
        if path == '-':
            if self.last_path is None:
                raise InvalidCheck('Tried to use the previous path in the first command')
            return self.last_path

        self.last_path = os.path.normpath(path)
        return self.last_path

    def get_file(self, path):
        """Return the UTF-8 decoded contents of `path`.

        Raises `FailedCheck` when `path` is not an existing regular file.
        """
        path = self.resolve_path(path)
        if path not in self.files:
            abspath = os.path.join(self.root, path)
            if not os.path.isfile(abspath):
                raise FailedCheck('File does not exist {!r}'.format(path))

            with io.open(abspath, encoding='utf-8') as f:
                self.files[path] = f.read()
        return self.files[path]

    def get_tree(self, path):
        """Return the element tree parsed from the HTML file `path`.

        Raises `FailedCheck` when `path` is not an existing regular file and
        `RuntimeError` when the file cannot be parsed.
        """
        path = self.resolve_path(path)
        if path not in self.trees:
            abspath = os.path.join(self.root, path)
            if not os.path.isfile(abspath):
                raise FailedCheck('File does not exist {!r}'.format(path))

            with io.open(abspath, encoding='utf-8') as f:
                try:
                    tree = ET.fromstringlist(f.readlines(), CustomHTMLParser())
                except Exception as e:
                    raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
            self.trees[path] = tree
        return self.trees[path]

    def get_dir(self, path):
        """Raise `FailedCheck` unless `path` is an existing directory."""
        path = self.resolve_path(path)
        abspath = os.path.join(self.root, path)
        if not os.path.isdir(abspath):
            raise FailedCheck('Directory does not exist {!r}'.format(path))
def check_string(data, pat, regexp):
    """Tell whether the pattern `pat` occurs in `data`.

    `{{channel}}` in `pat` is replaced by the channel name first. An empty
    pattern always matches (pure presence test). With `regexp` set, `pat`
    is searched as a regular expression in the untouched `data`; otherwise
    both strings are whitespace-normalized and a substring test is made.
    """
    pat = pat.replace("{{channel}}", channel)
    if not pat:
        return True  # special case a presence testing
    if regexp:
        return re.search(pat, data, flags=re.UNICODE) is not None

    haystack = ' '.join(data.split())
    needle = ' '.join(pat.split())
    return needle in haystack
def check_tree_attr(tree, path, attr, pat, regexp):
    """Tell whether some node selected by `path` has an `attr` matching `pat`.

    Nodes without the attribute are skipped, and the search stops at the
    first attribute value accepted by `check_string`.
    """
    path = normalize_xpath(path)
    for element in tree.findall(path):
        if attr not in element.attrib:
            continue
        if check_string(element.attrib[attr], pat, regexp):
            return True
    return False
# Returns the number of occurrences matching the regex (`regexp`) and the text (`pat`).
def check_tree_text(tree, path, pat, regexp, stop_at_first):
    """Count the nodes selected by `path` whose flattened text matches `pat`.

    With `stop_at_first` set, counting stops after the first match, so the
    result is then 0 or 1. Any error raised while walking the tree is
    re-raised after the offending path has been printed.
    """
    xpath = normalize_xpath(path)
    matches = 0
    try:
        for element in tree.findall(xpath):
            try:
                text = flatten(element)
            except KeyError:
                continue
            if not check_string(text, pat, regexp):
                continue
            matches += 1
            if stop_at_first:
                break
    except Exception:
        print('Failed to get path "{}"'.format(xpath))
        raise
    return matches
def get_tree_count(tree, path):
    """Return how many nodes of `tree` the XPath `path` selects."""
    matches = tree.findall(normalize_xpath(path))
    return len(matches)
def check_snapshot(snapshot_name, actual_tree, normalize_to_text):
    """Compare `actual_tree` with the snapshot recorded under `snapshot_name`.

    The snapshot lives next to the test file: the `.rs` extension of
    `rust_test_path` is replaced with `.<snapshot_name>.html`. Snapshot
    files are read and written as UTF-8, like every other file this script
    opens. `{{channel}}` placeholders in the recorded snapshot are replaced
    by the channel name before comparing.

    With `normalize_to_text` set, the flattened text of the subtree is
    compared as a string; otherwise the serialized subtree is compared
    structurally with `compare_tree`.

    In bless mode a missing or differing snapshot is (re)written instead of
    failing. Otherwise raises `FailedCheck` when there is no recorded
    snapshot or when it differs from the actual value.
    """
    assert rust_test_path.endswith('.rs')
    snapshot_path = '{}.{}.{}'.format(rust_test_path[:-3], snapshot_name, 'html')
    try:
        with io.open(snapshot_path, 'r', encoding='utf-8') as snapshot_file:
            expected_str = snapshot_file.read().replace("{{channel}}", channel)
    except FileNotFoundError:
        if bless:
            expected_str = None
        else:
            raise FailedCheck('No saved snapshot value')

    if not normalize_to_text:
        actual_str = ET.tostring(actual_tree).decode('utf-8')
    else:
        actual_str = flatten(actual_tree)

    # Conditions:
    #  1. There is no (or an empty) recorded snapshot
    #  2. Are actual and expected tree different
    #  3. Are actual and expected text different
    if not expected_str \
        or (not normalize_to_text and \
            not compare_tree(make_xml(actual_str), make_xml(expected_str), stderr)) \
        or (normalize_to_text and actual_str != expected_str):

        if bless:
            with io.open(snapshot_path, 'w', encoding='utf-8') as snapshot_file:
                snapshot_file.write(actual_str)
        else:
            print('--- expected ---\n')
            print(expected_str)
            print('\n\n--- actual ---\n')
            print(actual_str)
            print()
            raise FailedCheck('Actual snapshot value is different than expected')
# Adapted from https://github.com/formencode/formencode/blob/3a1ba9de2fdd494dd945510a4568a3afeddb0b2e/formencode/doctest_xml_compare.py#L72-L120
def compare_tree(x1, x2, reporter=None):
    """Tell whether the element trees `x1` and `x2` are equivalent.

    Tags, attributes, text, tail and children are compared in that order
    (text and tail through `text_compare`). On the first difference found,
    a description is passed to `reporter` (when given) and False is
    returned; a difference in a child is additionally reported at every
    enclosing level.
    """
    def differ(message):
        if reporter:
            reporter(message)
        return False

    if x1.tag != x2.tag:
        return differ('Tags do not match: %s and %s' % (x1.tag, x2.tag))

    for name, value in x1.attrib.items():
        other = x2.attrib.get(name)
        if other != value:
            return differ('Attributes do not match: %s=%r, %s=%r'
                          % (name, value, name, other))

    for name in x2.attrib:
        if name not in x1.attrib:
            return differ('x2 has an attribute x1 is missing: %s'
                          % name)

    if not text_compare(x1.text, x2.text):
        return differ('text: %r != %r' % (x1.text, x2.text))

    if not text_compare(x1.tail, x2.tail):
        return differ('tail: %r != %r' % (x1.tail, x2.tail))

    children1 = list(x1)
    children2 = list(x2)
    if len(children1) != len(children2):
        return differ('children length differs, %i != %i'
                      % (len(children1), len(children2)))

    # Child positions are reported starting from 1.
    for position, (c1, c2) in enumerate(zip(children1, children2), 1):
        if not compare_tree(c1, c2, reporter=reporter):
            return differ('children %i do not match: %s'
                          % (position, c1.tag))
    return True
def text_compare(t1, t2):
    """Tell whether two text values count as equal.

    Two empty (or missing) values are equal, `*` on either side matches
    anything, and otherwise the values are compared with surrounding
    whitespace stripped.
    """
    if not (t1 or t2):
        return True
    if '*' in (t1, t2):
        return True
    left = (t1 or '').strip()
    right = (t2 or '').strip()
    return left == right
def stderr(*args):
    """Print `args` to standard error, through a UTF-8 writer on Python 2."""
    out = sys.stderr
    if sys.version_info.major < 3:
        out = codecs.getwriter('utf-8')(out)

    print(*args, file=out)
def print_err(lineno, context, err, message=None):
    """Report one error on standard error and bump the global error counter.

    The first line shows `lineno` with `message` (or `err` when no message
    is given). `err` follows on its own indented line when both are
    present, then `context` when it is not empty.
    """
    global ERR_COUNT
    ERR_COUNT += 1

    headline = message or err
    stderr("{}: {}".format(lineno, headline))
    if message and err:
        stderr("\t{}".format(err))

    if context:
        stderr("\t{}".format(context))
def get_nb_matching_elements(cache, c, regexp, stop_at_first):
    """Return how many nodes matched by an `XPATH PATTERN` command were found.

    `c.args` is `[PATH, XPATH, PATTERN, ...]`. An XPath of the form
    `.../@attr` tests the attribute `attr` and yields 1 or 0, since the
    attribute search stops at the first match. Any other XPath (an optional
    trailing `/text()` is dropped) counts nodes by their flattened text.

    `regexp` selects regular-expression matching for both kinds of test,
    and `stop_at_first` stops the text count after the first match.
    """
    tree = cache.get_tree(c.args[0])
    pat, sep, attr = c.args[1].partition('/@')
    if sep:  # attribute
        # Forward `regexp` so that `@matches` also applies to attributes, and
        # turn the boolean into a count so error messages read "found 1".
        return int(check_tree_attr(tree, pat, attr, c.args[2], regexp))

    # normalized text
    pat = c.args[1]
    if pat.endswith('/text()'):
        pat = pat[:-7]
    return check_tree_text(tree, pat, c.args[2], regexp, stop_at_first)
def _parse_count(c, raw):
    """Return the COUNT argument `raw` as an int, or raise `InvalidCheck`."""
    try:
        return int(raw)
    except ValueError:
        raise InvalidCheck('Invalid @{} count: {}'.format(c.cmd, raw))


def check_command(c, cache):
    """Run the single check command `c` against the files held by `cache`.

    Nothing is returned and no check exception escapes: a check whose
    outcome equals `c.negated`, or that raises `FailedCheck`, is reported
    through `print_err` as a failed check, and an `InvalidCheck` (wrong
    number of arguments, non-numeric count, unknown or unimplemented
    command) is reported as is.
    """
    try:
        cerr = ""
        if c.cmd == 'has' or c.cmd == 'matches':  # string test
            regexp = (c.cmd == 'matches')
            if len(c.args) == 1 and not regexp:  # @has <path> = file existence
                try:
                    cache.get_file(c.args[0])
                    ret = True
                except FailedCheck as err:
                    cerr = str(err)
                    ret = False
            elif len(c.args) == 2:  # @has/matches <path> <pat> = string test
                cerr = "`PATTERN` did not match"
                ret = check_string(cache.get_file(c.args[0]), c.args[1], regexp)
            elif len(c.args) == 3:  # @has/matches <path> <pat> <match> = XML tree test
                cerr = "`XPATH PATTERN` did not match"
                ret = get_nb_matching_elements(cache, c, regexp, True) != 0
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))

        elif c.cmd == 'count':  # count test
            if len(c.args) == 3:  # @count <path> <pat> <count> = count test
                expected = _parse_count(c, c.args[2])
                found = get_tree_count(cache.get_tree(c.args[0]), c.args[1])
                cerr = "Expected {} occurrences but found {}".format(expected, found)
                ret = expected == found
            elif len(c.args) == 4:  # @count <path> <pat> <text> <count> = count test
                expected = _parse_count(c, c.args[3])
                found = get_nb_matching_elements(cache, c, False, False)
                cerr = "Expected {} occurrences but found {}".format(expected, found)
                ret = found == expected
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))

        elif c.cmd == 'snapshot':  # snapshot test
            if len(c.args) == 3:  # @snapshot <snapshot-name> <html-path> <xpath>
                [snapshot_name, html_path, pattern] = c.args
                tree = cache.get_tree(html_path)
                xpath = normalize_xpath(pattern)
                normalize_to_text = False
                if xpath.endswith('/text()'):
                    xpath = xpath[:-7]
                    normalize_to_text = True

                subtrees = tree.findall(xpath)
                if len(subtrees) == 1:
                    [subtree] = subtrees
                    try:
                        check_snapshot(snapshot_name, subtree, normalize_to_text)
                        ret = True
                    except FailedCheck as err:
                        cerr = str(err)
                        ret = False
                elif len(subtrees) == 0:
                    raise FailedCheck('XPATH did not match')
                else:
                    raise FailedCheck('Expected 1 match, but found {}'.format(len(subtrees)))
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))

        elif c.cmd == 'has-dir':  # has-dir test
            if len(c.args) == 1:  # @has-dir <path> = has-dir test
                try:
                    cache.get_dir(c.args[0])
                    ret = True
                except FailedCheck as err:
                    cerr = str(err)
                    ret = False
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))

        elif c.cmd == 'valid-html':
            raise InvalidCheck('Unimplemented @valid-html')

        elif c.cmd == 'valid-links':
            raise InvalidCheck('Unimplemented @valid-links')

        else:
            raise InvalidCheck('Unrecognized @{}'.format(c.cmd))

        # A plain check fails when it is false, a negated one when it is true.
        if ret == c.negated:
            raise FailedCheck(cerr)

    except FailedCheck as err:
        message = '@{}{} check failed'.format('!' if c.negated else '', c.cmd)
        print_err(c.lineno, c.context, str(err), message)
    except InvalidCheck as err:
        print_err(c.lineno, c.context, str(err))
def check(target, commands):
    """Run every command of `commands` against the documentation in `target`.

    All commands share one file cache rooted at `target`.
    """
    cache = CachedFiles(target)
    for command in commands:
        check_command(command, cache)
# Script entry point: htmldocck.py <doc dir> <template> [--bless]
# Exits with status 1 on a usage error or when at least one check failed.
if __name__ == '__main__':
    if len(sys.argv) not in [3, 4]:
        stderr('Usage: {} <doc dir> <template> [--bless]'.format(sys.argv[0]))
        raise SystemExit(1)

    rust_test_path = sys.argv[2]
    if len(sys.argv) > 3 and sys.argv[3] == '--bless':
        bless = True
    else:
        # We only support `--bless` at the end of the arguments.
        # Reject it anywhere else explicitly to prevent silent failures; an
        # `assert` would be stripped when Python runs with `-O`.
        if '--bless' in sys.argv:
            stderr('Usage: {} <doc dir> <template> [--bless]'.format(sys.argv[0]))
            raise SystemExit(1)
        bless = False
    check(sys.argv[1], get_commands(rust_test_path))
    if ERR_COUNT:
        stderr("\nEncountered {} errors".format(ERR_COUNT))
        raise SystemExit(1)