2 # -*- coding: utf-8 -*-
5 htmldocck.py is a custom checker script for Rustdoc HTML outputs.
9 The principle is simple: This script receives a path to generated HTML
10 documentation and a "template" script, which has a series of check
11 commands like `@has` or `@matches`. Each command is used to check if
12 some pattern is present or not present in the particular file or in
13 a particular node of the HTML tree. In many cases, the template script
14 happens to be the source code given to rustdoc.
16 While it indeed is possible to test in smaller portions, it has been
17 hard to construct tests in this fashion and major rendering errors were
18 discovered much later. This script is designed to make black-box and
regression testing of Rustdoc easy. This does not preclude the need for
20 unit testing, but can be used to complement related tests by quickly
21 showing the expected renderings.
23 In order to avoid one-off dependencies for this task, this script uses
24 a reasonably working HTML parser and the existing XPath implementation
from Python's standard library. Hopefully, we won't render
non-well-formed HTML.
30 Commands start with an `@` followed by a command name (letters and
31 hyphens), and zero or more arguments separated by one or more whitespace
32 characters and optionally delimited with single or double quotes. The `@`
33 mark cannot be preceded by a non-whitespace character. Other lines
34 (including every text up to the first `@`) are ignored, but it is
35 recommended to avoid the use of `@` in the template file.
37 There are a number of supported commands:
39 * `@has PATH` checks for the existence of the given file.
41 `PATH` is relative to the output directory. It can be given as `-`
42 which repeats the most recently used `PATH`.
* `@has PATH PATTERN` and `@matches PATH PATTERN` check for
45 the occurrence of the given pattern `PATTERN` in the specified file.
46 Only one occurrence of the pattern is enough.
48 For `@has`, `PATTERN` is a whitespace-normalized (every consecutive
49 whitespace being replaced by one single space character) string.
50 The entire file is also whitespace-normalized including newlines.
52 For `@matches`, `PATTERN` is a Python-supported regular expression.
53 The file remains intact but the regexp is matched without the `MULTILINE`
54 and `IGNORECASE` options. You can still use a prefix `(?m)` or `(?i)`
55 to override them, and `\A` and `\Z` for definitely matching
56 the beginning and end of the file.
(The same distinction applies to the other variants of these commands.)
* `@has PATH XPATH PATTERN` and `@matches PATH XPATH PATTERN` check for
61 the presence of the given XPath `XPATH` in the specified HTML file,
62 and also the occurrence of the given pattern `PATTERN` in the matching
node or attribute. Only one occurrence of the pattern in the match
is enough.
66 `PATH` should be a valid and well-formed HTML file. It does *not*
67 accept arbitrary HTML5; it should have matching open and close tags
68 and correct entity references at least.
70 `XPATH` is an XPath expression to match. The XPath is fairly limited:
71 `tag`, `*`, `.`, `//`, `..`, `[@attr]`, `[@attr='value']`, `[tag]`,
72 `[POS]` (element located in given `POS`), `[last()-POS]`, `text()`
73 and `@attr` (both as the last segment) are supported. Some examples:
75 - `//pre` or `.//pre` matches any element with a name `pre`.
76 - `//a[@href]` matches any element with an `href` attribute.
- `//*[@class="impl"]//code` matches any element with a name `code`,
which is a descendant of some element whose `class` attr is `impl`.
79 - `//h1[@class="fqn"]/span[1]/a[last()]/@class` matches a value of
80 `class` attribute in the last `a` element (can be followed by more
81 elements that are not `a`) inside the first `span` in the `h1` with
82 a class of `fqn`. Note that there cannot be any additional elements
83 between them due to the use of `/` instead of `//`.
85 Do not try to use non-absolute paths, it won't work due to the flawed
86 ElementTree implementation. The script rejects them.
88 For the text matches (i.e. paths not ending with `@attr`), any
89 subelements are flattened into one string; this is handy for ignoring
90 highlights for example. If you want to simply check for the presence of
91 a given node or attribute, use an empty string (`""`) as a `PATTERN`.
93 * `@count PATH XPATH COUNT` checks for the occurrence of the given XPath
in the specified file. The number of occurrences must match the given
`COUNT`.
97 * `@snapshot NAME PATH XPATH` creates a snapshot test named NAME.
98 A snapshot test captures a subtree of the DOM, at the location
99 determined by the XPath, and compares it to a pre-recorded value
100 in a file. The file's name is the test's name with the `.rs` extension
101 replaced with `.NAME.html`, where NAME is the snapshot's name.
103 htmldocck supports the `--bless` option to accept the current subtree
104 as expected, saving it to the file determined by the snapshot's name.
105 compiletest's `--bless` flag is forwarded to htmldocck.
107 * `@has-dir PATH` checks for the existence of the given directory.
109 All conditions can be negated with `!`. `@!has foo/type.NoSuch.html`
110 checks if the given file does not exist, for example.
114 from __future__ import absolute_import, print_function, unicode_literals
122 from collections import namedtuple
124 from html.parser import HTMLParser
126 from HTMLParser import HTMLParser
128 from xml.etree import cElementTree as ET
130 from xml.etree import ElementTree as ET
133 from html.entities import name2codepoint
135 from htmlentitydefs import name2codepoint
# "void elements" (no closing tag) from the HTML Standard section 12.1.2
VOID_ELEMENTS = {'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
                 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr'}

# Python 2 -> 3 compatibility
try:
    unichr
except NameError:
    unichr = chr


# Value substituted for the `{{channel}}` placeholder in XPaths and patterns.
# The environment variable must be set; a missing one raises `KeyError` here.
channel = os.environ["DOC_RUST_LANG_ORG_CHANNEL"]

# Initialized in main
rust_test_path = None
bless = None
class CustomHTMLParser(HTMLParser):
    """Simplified HTML parser that drives an ElementTree builder.

    This is possible because we are dealing with very regular HTML from
    rustdoc; we only have to deal with i) void elements and ii) empty
    attributes.
    """

    def __init__(self, target=None):
        HTMLParser.__init__(self)
        self.__builder = target or ET.TreeBuilder()

    @staticmethod
    def _attrs_to_dict(attrs):
        # Valueless attributes arrive as `None`; store them as empty strings.
        return {k: v or '' for k, v in attrs}

    def handle_starttag(self, tag, attrs):
        self.__builder.start(tag, self._attrs_to_dict(attrs))
        # Void elements never get an end tag, so close them right away.
        if tag in VOID_ELEMENTS:
            self.__builder.end(tag)

    def handle_endtag(self, tag):
        self.__builder.end(tag)

    def handle_startendtag(self, tag, attrs):
        self.__builder.start(tag, self._attrs_to_dict(attrs))
        self.__builder.end(tag)

    def handle_data(self, data):
        self.__builder.data(data)

    def handle_entityref(self, name):
        self.__builder.data(unichr(name2codepoint[name]))

    def handle_charref(self, name):
        # `name` is the reference without `&#` and `;`, e.g. `x41` or `65`.
        code = int(name[1:], 16) if name.startswith(('x', 'X')) else int(name, 10)
        self.__builder.data(unichr(code))

    def close(self):
        """Finish parsing and return whatever the builder's `close()` returns."""
        HTMLParser.close(self)
        return self.__builder.close()
# One parsed `@command`: whether it is negated, its name, its arguments,
# the line number it came from, and the source line it was found in.
Command = namedtuple('Command', 'negated cmd args lineno context')
class FailedCheck(Exception):
    """Raised when a well-formed check does not hold (missing file, no match, ...)."""
    pass
class InvalidCheck(Exception):
    """Raised when a command itself is malformed (bad XPath, wrong argument count, ...)."""
    pass
def concat_multi_lines(f):
    """Return a generator of `(lineno, text)` pairs out of the file object.

    - A line ending with a backslash is joined with the following line: the
      backslash and the newline are removed, then the prefix the next line
      shares with the first line of the run, then optional whitespace.
    - `lineno` (starting from 0) is the number of the first line being
      concatenated.

    A trailing backslash on the last line of the file is reported through
    `print_err` and the unfinished run is not yielded.
    """
    lastline = None  # set to the first line of a run while a backslash is pending
    firstlineno = None
    catenated = ''
    for lineno, line in enumerate(f):
        line = line.rstrip('\r\n')

        # strip the common prefix from the current line if needed
        if lastline is not None:
            common_prefix = os.path.commonprefix([line, lastline])
            line = line[len(common_prefix):].lstrip()

        # Compare against None explicitly: line number 0 is falsy, so
        # `firstlineno or lineno` would lose a run that starts on line 0.
        if firstlineno is None:
            firstlineno = lineno

        if line.endswith('\\'):
            if lastline is None:
                lastline = line[:-1]
            catenated += line[:-1]
        else:
            yield firstlineno, catenated + line
            lastline = None
            firstlineno = None
            catenated = ''

    if lastline is not None:
        print_err(lineno, line, 'Trailing backslash at the end of the file')
# Matches one command in a template line. The `@` (or a misplaced leading `!`)
# must not be preceded by a non-whitespace character. Groups:
#   invalid - `!` written before the `@` (wrong spelling of a negation)
#   negated - `!` written after the `@`
#   cmd     - command name: letters, optionally hyphen-separated
#   args    - the rest of the line, still unparsed
LINE_PATTERN = re.compile(r'''
    (?<=(?<!\S))(?P<invalid>!?)@(?P<negated>!?)
    (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
    (?P<args>.*)$
''', re.X | re.UNICODE)
def get_commands(template):
    """Yield a `Command` for every valid command found in the template file.

    The file is read as UTF-8 and backslash-continued lines are joined first.
    Lines without a command are skipped. Malformed commands (`!@cmd`, or
    arguments not separated from the command name by whitespace) are reported
    through `print_err` and skipped. The yielded `lineno` is 1-based.
    """
    with io.open(template, encoding='utf-8') as f:
        for lineno, line in concat_multi_lines(f):
            m = LINE_PATTERN.search(line)
            if not m:
                continue

            negated = (m.group('negated') == '!')
            cmd = m.group('cmd')
            if m.group('invalid') == '!':
                print_err(
                    lineno,
                    line,
                    'Invalid command: `!@{0}{1}`, (help: try with `@!{1}`)'.format(
                        '!' if negated else '',
                        cmd,
                    ),
                )
                continue
            args = m.group('args')
            if args and not args[:1].isspace():
                print_err(lineno, line, 'Invalid template syntax')
                continue
            try:
                args = shlex.split(args)
            except UnicodeEncodeError:
                # Fallback for a `shlex` that cannot split non-ASCII text
                # directly: split the UTF-8 bytes and decode each argument.
                args = [arg.decode('utf-8') for arg in shlex.split(args.encode('utf-8'))]
            yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno+1, context=line)
273 def _flatten(node, acc):
275 acc.append(node.text)
289 xml = ET.XML('<xml>%s</xml>' % text)
def normalize_xpath(path):
    """Return `path` in the `.//...` form that `findall` is called with.

    Every `{{channel}}` placeholder is replaced with the module-level
    `channel` first. A path starting with `//` gets a leading `.`; a path
    already starting with `.//` is returned as is.

    Raises `InvalidCheck` for any other (non-absolute) path.
    """
    path = path.replace("{{channel}}", channel)
    if path.startswith('//'):
        return '.' + path  # avoid warnings
    elif path.startswith('.//'):
        return path
    else:
        raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')
class CachedFiles(object):
    """Cache of file contents and parsed HTML trees below the directory `root`.

    The path of the most recently resolved file is remembered so that a
    command can refer to it again with `-`.
    """

    def __init__(self, root):
        self.root = root
        self.files = {}  # normalized relative path -> file contents
        self.trees = {}  # normalized relative path -> parsed tree
        self.last_path = None

    def resolve_path(self, path):
        """Normalize `path`, or return the previously used path for `-`.

        Raises `InvalidCheck` if `-` is used before any real path.
        """
        if path != '-':
            path = os.path.normpath(path)
            self.last_path = path
            return path
        elif self.last_path is None:
            raise InvalidCheck('Tried to use the previous path in the first command')
        else:
            return self.last_path

    def _existing_file(self, path):
        # Shared by get_file/get_tree: map a resolved path to its location
        # under the root, failing the check when it is not a regular file.
        abspath = os.path.join(self.root, path)
        if not os.path.isfile(abspath):
            raise FailedCheck('File does not exist {!r}'.format(path))
        return abspath

    def get_file(self, path):
        """Return the UTF-8 decoded contents of `path`, reading it at most once.

        Raises `FailedCheck` if the file does not exist.
        """
        path = self.resolve_path(path)
        if path in self.files:
            return self.files[path]

        abspath = self._existing_file(path)
        with io.open(abspath, encoding='utf-8') as f:
            data = f.read()
        self.files[path] = data
        return data

    def get_tree(self, path):
        """Return the parsed HTML tree of `path`, parsing it at most once.

        Raises `FailedCheck` if the file does not exist and `RuntimeError`
        if it cannot be parsed.
        """
        path = self.resolve_path(path)
        if path in self.trees:
            return self.trees[path]

        abspath = self._existing_file(path)
        with io.open(abspath, encoding='utf-8') as f:
            try:
                tree = ET.fromstringlist(f.readlines(), CustomHTMLParser())
            except Exception as e:
                raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
        self.trees[path] = tree
        return tree

    def get_dir(self, path):
        """Check that `path` is an existing directory; raise `FailedCheck` otherwise."""
        path = self.resolve_path(path)
        abspath = os.path.join(self.root, path)
        if not os.path.isdir(abspath):
            raise FailedCheck('Directory does not exist {!r}'.format(path))
def check_string(data, pat, regexp):
    """Return whether the pattern `pat` occurs in `data`.

    `{{channel}}` in `pat` is replaced with the module-level `channel` first.
    An empty pattern always matches, which is how a pure presence test is
    written. With `regexp` set, `pat` is searched as a regular expression in
    the untouched `data`; otherwise both strings are whitespace-normalized
    and `pat` must be a substring of `data`.
    """
    pat = pat.replace("{{channel}}", channel)
    if not pat:
        return True  # special case a presence testing
    elif regexp:
        return re.search(pat, data, flags=re.UNICODE) is not None
    else:
        data = ' '.join(data.split())
        pat = ' '.join(pat.split())
        return pat in data
def check_tree_attr(tree, path, attr, pat, regexp):
    """Return whether some element matched by `path` has a matching `attr`.

    Elements without the attribute are skipped; the first attribute value
    accepted by `check_string` makes the check succeed.
    """
    path = normalize_xpath(path)
    for e in tree.findall(path):
        if attr not in e.attrib:
            continue
        if check_string(e.attrib[attr], pat, regexp):
            return True
    return False
def check_tree_text(tree, path, pat, regexp):
    """Return whether some element matched by `path` has matching text.

    The text of each element is flattened (subelements included) before it
    is handed to `check_string`. If anything unexpected is raised, the
    offending path is printed and the exception is re-raised.
    """
    path = normalize_xpath(path)
    ret = False
    try:
        for e in tree.findall(path):
            try:
                value = flatten(e)
            except KeyError:
                continue
            else:
                ret = check_string(value, pat, regexp)
                if ret:
                    break
    except Exception:
        print('Failed to get path "{}"'.format(path))
        raise
    return ret
def get_tree_count(tree, path):
    """Return the number of elements in `tree` matched by the XPath `path`."""
    path = normalize_xpath(path)
    return len(tree.findall(path))
def check_snapshot(snapshot_name, actual_tree, normalize_to_text):
    """Compare `actual_tree` with the snapshot saved next to the test file.

    The snapshot lives at the test path with `.rs` replaced by
    `.<snapshot_name>.html`. With `normalize_to_text` the flattened text of
    the tree is compared; otherwise the serialized subtree is compared
    structurally with `compare_tree`.

    When the module-level `bless` flag is set, a missing or different
    snapshot is (re)written instead of failing.

    Raises `FailedCheck` if there is no saved snapshot or if it differs from
    the actual value (and `bless` is not set).
    """
    assert rust_test_path.endswith('.rs')
    snapshot_path = '{}.{}.{}'.format(rust_test_path[:-3], snapshot_name, 'html')
    try:
        # Read as UTF-8 explicitly, like every other file this script
        # touches, instead of depending on the locale's default encoding.
        with io.open(snapshot_path, 'r', encoding='utf-8') as snapshot_file:
            expected_str = snapshot_file.read()
    except FileNotFoundError:
        if bless:
            expected_str = None
        else:
            raise FailedCheck('No saved snapshot value')

    if not normalize_to_text:
        actual_str = ET.tostring(actual_tree).decode('utf-8')
    else:
        actual_str = flatten(actual_tree)

    # Conditions:
    #  1. Is --bless
    #  2. Are actual and expected tree different
    #  3. Are actual and expected text different
    if not expected_str \
        or (not normalize_to_text and \
            not compare_tree(make_xml(actual_str), make_xml(expected_str), stderr)) \
        or (normalize_to_text and actual_str != expected_str):

        if bless:
            with io.open(snapshot_path, 'w', encoding='utf-8') as snapshot_file:
                snapshot_file.write(actual_str)
        else:
            print('--- expected ---\n')
            print(expected_str)
            print('\n\n--- actual ---\n')
            print(actual_str)
            print()
            raise FailedCheck('Actual snapshot value is different than expected')
# Adapted from https://github.com/formencode/formencode/blob/3a1ba9de2fdd494dd945510a4568a3afeddb0b2e/formencode/doctest_xml_compare.py#L72-L120
def compare_tree(x1, x2, reporter=None):
    """Return whether the element trees `x1` and `x2` are equivalent.

    Tags, attributes, text, tail and children are compared recursively; text
    and tail are compared with `text_compare`. The first difference found is
    passed to `reporter` (if given) as a message, once per enclosing level.
    """
    if x1.tag != x2.tag:
        if reporter:
            reporter('Tags do not match: %s and %s' % (x1.tag, x2.tag))
        return False
    for name, value in x1.attrib.items():
        if x2.attrib.get(name) != value:
            if reporter:
                reporter('Attributes do not match: %s=%r, %s=%r'
                         % (name, value, name, x2.attrib.get(name)))
            return False
    for name in x2.attrib:
        if name not in x1.attrib:
            if reporter:
                reporter('x2 has an attribute x1 is missing: %s'
                         % name)
            return False
    if not text_compare(x1.text, x2.text):
        if reporter:
            reporter('text: %r != %r' % (x1.text, x2.text))
        return False
    if not text_compare(x1.tail, x2.tail):
        if reporter:
            reporter('tail: %r != %r' % (x1.tail, x2.tail))
        return False
    cl1 = list(x1)
    cl2 = list(x2)
    if len(cl1) != len(cl2):
        if reporter:
            reporter('children length differs, %i != %i'
                     % (len(cl1), len(cl2)))
        return False
    # Children are numbered from 1 in the report.
    for i, (c1, c2) in enumerate(zip(cl1, cl2), 1):
        if not compare_tree(c1, c2, reporter=reporter):
            if reporter:
                reporter('children %i do not match: %s'
                         % (i, c1.tag))
            return False
    return True
def text_compare(t1, t2):
    """Return whether two text/tail values count as equal.

    Two empty values (`None` or `''`) are equal, a literal `*` on either side
    matches anything, and otherwise the values are compared with surrounding
    whitespace stripped.
    """
    if not t1 and not t2:
        return True
    if t1 == '*' or t2 == '*':
        return True
    return (t1 or '').strip() == (t2 or '').strip()
def stderr(*args):
    """Print `args` to standard error, like `print` does.

    On Python 2 the stream is wrapped in a UTF-8 writer so that unicode
    messages can be written.
    """
    if sys.version_info.major < 3:
        stream = codecs.getwriter('utf-8')(sys.stderr)
    else:
        stream = sys.stderr

    print(*args, file=stream)
def print_err(lineno, context, err, message=None):
    """Report one error on standard error and count it in `ERR_COUNT`.

    The first line printed is `lineno: message` (or `lineno: err` when no
    message is given). When both are given, `err` follows on its own
    tab-indented line. `context`, if not empty, is printed last, also
    tab-indented.
    """
    global ERR_COUNT
    ERR_COUNT += 1
    stderr("{}: {}".format(lineno, message or err))
    if message and err:
        stderr("\t{}".format(err))

    if context:
        stderr("\t{}".format(context))


# Number of errors reported so far through `print_err`.
ERR_COUNT = 0
def check_command(c, cache):
    """Run the single command `c`, looking files up through `cache`.

    Nothing is returned and nothing is raised for an ordinary failure: a
    check that does not hold (taking negation into account) and a malformed
    command are both reported through `print_err`.
    """
    try:
        cerr = ""
        if c.cmd == 'has' or c.cmd == 'matches':  # string test
            regexp = (c.cmd == 'matches')
            if len(c.args) == 1 and not regexp:  # @has <path> = file existence
                try:
                    cache.get_file(c.args[0])
                    ret = True
                except FailedCheck as err:
                    cerr = str(err)
                    ret = False
            elif len(c.args) == 2:  # @has/matches <path> <pat> = string test
                cerr = "`PATTERN` did not match"
                ret = check_string(cache.get_file(c.args[0]), c.args[1], regexp)
            elif len(c.args) == 3:  # @has/matches <path> <pat> <match> = XML tree test
                cerr = "`XPATH PATTERN` did not match"
                tree = cache.get_tree(c.args[0])
                pat, sep, attr = c.args[1].partition('/@')
                if sep:  # attribute
                    ret = check_tree_attr(tree, pat, attr, c.args[2], regexp)
                else:  # normalized text
                    # Without a separator `pat` is the whole XPath argument.
                    if pat.endswith('/text()'):
                        pat = pat[:-7]
                    ret = check_tree_text(tree, pat, c.args[2], regexp)
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))

        elif c.cmd == 'count':  # count test
            if len(c.args) == 3:  # @count <path> <pat> <count> = count test
                try:
                    expected = int(c.args[2])
                except ValueError:
                    # Report a bad COUNT like any other malformed command
                    # instead of aborting the whole run with a traceback.
                    raise InvalidCheck('Invalid @count COUNT {!r}'.format(c.args[2]))
                found = get_tree_count(cache.get_tree(c.args[0]), c.args[1])
                cerr = "Expected {} occurrences but found {}".format(expected, found)
                ret = expected == found
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))

        elif c.cmd == 'snapshot':  # snapshot test
            if len(c.args) == 3:  # @snapshot <snapshot-name> <html-path> <xpath>
                [snapshot_name, html_path, pattern] = c.args
                tree = cache.get_tree(html_path)
                xpath = normalize_xpath(pattern)
                normalize_to_text = False
                if xpath.endswith('/text()'):
                    xpath = xpath[:-7]
                    normalize_to_text = True

                subtrees = tree.findall(xpath)
                if len(subtrees) == 1:
                    [subtree] = subtrees
                    try:
                        check_snapshot(snapshot_name, subtree, normalize_to_text)
                        ret = True
                    except FailedCheck as err:
                        cerr = str(err)
                        ret = False
                elif len(subtrees) == 0:
                    raise FailedCheck('XPATH did not match')
                else:
                    raise FailedCheck('Expected 1 match, but found {}'.format(len(subtrees)))
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))

        elif c.cmd == 'has-dir':  # has-dir test
            if len(c.args) == 1:  # @has-dir <path> = has-dir test
                try:
                    cache.get_dir(c.args[0])
                    ret = True
                except FailedCheck as err:
                    cerr = str(err)
                    ret = False
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))

        elif c.cmd == 'valid-html':
            raise InvalidCheck('Unimplemented @valid-html')

        elif c.cmd == 'valid-links':
            raise InvalidCheck('Unimplemented @valid-links')

        else:
            raise InvalidCheck('Unrecognized @{}'.format(c.cmd))

        # A plain check fails when it is false; a negated one when it is true.
        if ret == c.negated:
            raise FailedCheck(cerr)

    except FailedCheck as err:
        message = '@{}{} check failed'.format('!' if c.negated else '', c.cmd)
        print_err(c.lineno, c.context, str(err), message)
    except InvalidCheck as err:
        print_err(c.lineno, c.context, str(err))
def check(target, commands):
    """Run every command in `commands` against the documentation in `target`.

    All commands share one `CachedFiles` instance, so files are read and
    parsed once and `-` can refer to the previous command's path.
    """
    cache = CachedFiles(target)
    for c in commands:
        check_command(c, cache)
if __name__ == '__main__':
    if len(sys.argv) not in [3, 4]:
        stderr('Usage: {} <doc dir> <template> [--bless]'.format(sys.argv[0]))
        raise SystemExit(1)

    rust_test_path = sys.argv[2]
    if len(sys.argv) > 3 and sys.argv[3] == '--bless':
        bless = True
    else:
        # We only support `--bless` at the end of the arguments.
        # This assert is to prevent silent failures.
        assert '--bless' not in sys.argv
        bless = False
    check(sys.argv[1], get_commands(rust_test_path))
    # Exit non-zero if any command was reported as failed or invalid.
    if ERR_COUNT:
        stderr("\nEncountered {} errors".format(ERR_COUNT))
        raise SystemExit(1)