2 htmldocck.py is a custom checker script for Rustdoc HTML outputs.
6 The principle is simple: This script receives a path to generated HTML
7 documentation and a "template" script, which has a series of check
8 commands like `@has` or `@matches`. Each command is used to check if
9 some pattern is present or not present in the particular file or in
10 a particular node of the HTML tree. In many cases, the template script
11 happens to be the source code given to rustdoc.
13 While it indeed is possible to test in smaller portions, it has been
14 hard to construct tests in this fashion and major rendering errors were
15 discovered much later. This script is designed to make black-box and
regression testing of Rustdoc easy. This does not preclude the need for
17 unit testing, but can be used to complement related tests by quickly
18 showing the expected renderings.
20 In order to avoid one-off dependencies for this task, this script uses
21 a reasonably working HTML parser and the existing XPath implementation
from Python's standard library. Hopefully, we won't render
non-well-formed HTML.
27 Commands start with an `@` followed by a command name (letters and
28 hyphens), and zero or more arguments separated by one or more whitespace
29 characters and optionally delimited with single or double quotes. The `@`
30 mark cannot be preceded by a non-whitespace character. Other lines
31 (including every text up to the first `@`) are ignored, but it is
32 recommended to avoid the use of `@` in the template file.
34 There are a number of supported commands:
36 * `@has PATH` checks for the existence of the given file.
38 `PATH` is relative to the output directory. It can be given as `-`
39 which repeats the most recently used `PATH`.
* `@has PATH PATTERN` and `@matches PATH PATTERN` check for
42 the occurrence of the given pattern `PATTERN` in the specified file.
43 Only one occurrence of the pattern is enough.
45 For `@has`, `PATTERN` is a whitespace-normalized (every consecutive
46 whitespace being replaced by one single space character) string.
47 The entire file is also whitespace-normalized including newlines.
49 For `@matches`, `PATTERN` is a Python-supported regular expression.
50 The file remains intact but the regexp is matched without the `MULTILINE`
51 and `IGNORECASE` options. You can still use a prefix `(?m)` or `(?i)`
52 to override them, and `\A` and `\Z` for definitely matching
53 the beginning and end of the file.
55 (The same distinction goes to other variants of these commands.)
* `@has PATH XPATH PATTERN` and `@matches PATH XPATH PATTERN` check for
the presence of the given XPath `XPATH` in the specified HTML file,
and also the occurrence of the given pattern `PATTERN` in the matching
node or attribute. Only one occurrence of the pattern in the match
is enough.
63 `PATH` should be a valid and well-formed HTML file. It does *not*
64 accept arbitrary HTML5; it should have matching open and close tags
65 and correct entity references at least.
67 `XPATH` is an XPath expression to match. The XPath is fairly limited:
68 `tag`, `*`, `.`, `//`, `..`, `[@attr]`, `[@attr='value']`, `[tag]`,
69 `[POS]` (element located in given `POS`), `[last()-POS]`, `text()`
70 and `@attr` (both as the last segment) are supported. Some examples:
72 - `//pre` or `.//pre` matches any element with a name `pre`.
73 - `//a[@href]` matches any element with an `href` attribute.
- `//*[@class="impl"]//code` matches any element with a name `code`,
which is a descendant of some element whose `class` attr is `impl`.
76 - `//h1[@class="fqn"]/span[1]/a[last()]/@class` matches a value of
77 `class` attribute in the last `a` element (can be followed by more
78 elements that are not `a`) inside the first `span` in the `h1` with
79 a class of `fqn`. Note that there cannot be any additional elements
80 between them due to the use of `/` instead of `//`.
82 Do not try to use non-absolute paths, it won't work due to the flawed
83 ElementTree implementation. The script rejects them.
85 For the text matches (i.e. paths not ending with `@attr`), any
86 subelements are flattened into one string; this is handy for ignoring
87 highlights for example. If you want to simply check for the presence of
88 a given node or attribute, use an empty string (`""`) as a `PATTERN`.
* `@count PATH XPATH COUNT` checks for the occurrence of the given XPath
in the specified file. The number of occurrences must match the given
count.
94 * `@has-dir PATH` checks for the existence of the given directory.
96 All conditions can be negated with `!`. `@!has foo/type.NoSuch.html`
97 checks if the given file does not exist, for example.
101 from __future__ import print_function
106 from collections import namedtuple
108 from html.parser import HTMLParser
110 from HTMLParser import HTMLParser
111 from xml.etree import cElementTree as ET
113 # ⇤/⇥ are not in HTML 4 but are in HTML 5
115 from html.entities import entitydefs
117 from htmlentitydefs import entitydefs
118 entitydefs['larrb'] = u'\u21e4'
119 entitydefs['rarrb'] = u'\u21e5'
120 entitydefs['nbsp'] = ' '
# "void elements" (no closing tag) from the HTML Standard section 12.1.2
VOID_ELEMENTS = {
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
    'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
}
126 # Python 2 -> 3 compatibility
class CustomHTMLParser(HTMLParser):
    """Simplified HTML parser that forwards parse events to a tree builder.

    This is possible because we are dealing with very regular HTML from
    rustdoc; we only have to deal with i) void elements and ii) empty
    attributes.

    `target` is any object with the `start`, `end`, `data` and `close`
    methods of `ET.TreeBuilder`; a fresh `ET.TreeBuilder` is used when it is
    omitted.
    """

    def __init__(self, target=None):
        HTMLParser.__init__(self)
        self.__builder = target or ET.TreeBuilder()

    def handle_starttag(self, tag, attrs):
        # Attributes without a value are reported as None; store '' instead.
        attrs = dict((k, v or '') for k, v in attrs)
        self.__builder.start(tag, attrs)
        if tag in VOID_ELEMENTS:
            # Void elements never get an end tag, so close them right away.
            self.__builder.end(tag)

    def handle_endtag(self, tag):
        self.__builder.end(tag)

    def handle_startendtag(self, tag, attrs):
        # Self-closing form (`<tag ... />`): open and close in one step.
        attrs = dict((k, v or '') for k, v in attrs)
        self.__builder.start(tag, attrs)
        self.__builder.end(tag)

    def handle_data(self, data):
        self.__builder.data(data)

    def handle_entityref(self, name):
        # An unknown entity name raises KeyError here.
        self.__builder.data(entitydefs[name])

    def handle_charref(self, name):
        # `name` is the reference without `&#` and `;`: decimal digits, or
        # hexadecimal digits prefixed with `x`/`X`.
        code = int(name[1:], 16) if name.startswith(('x', 'X')) else int(name, 10)
        if str is bytes:
            # Python 2: keep the historical behavior of emitting UTF-8 bytes.
            char = unichr(code).encode('utf-8')
        else:
            # Python 3: the builder joins text fragments, so it must get text;
            # bytes would make that join fail.
            char = chr(code)
        self.__builder.data(char)

    def close(self):
        """Finish parsing and return what the builder's `close()` returns."""
        HTMLParser.close(self)
        return self.__builder.close()
# One parsed check command: the negation flag, the command name, its argument
# list, its line number and the template line it came from.
Command = namedtuple('Command', ['negated', 'cmd', 'args', 'lineno', 'context'])
class FailedCheck(Exception):
    """Raised when a well-formed check does not hold for the documentation."""
class InvalidCheck(Exception):
    """Raised when a check command itself is malformed or unsupported."""
def concat_multi_lines(f):
    """Return a generator over the logical lines of the file object `f`.

    A line ending in a backslash is joined with the following line(s): the
    backslash and the line break are removed, and each continuation line
    loses the prefix it shares with the first line of the group plus any
    whitespace after that prefix.

    Yields `(lineno, text)` pairs, where `lineno` is the line number
    (starting from 0) of the first physical line that went into `text`.
    If the file ends while a continuation is still open, the error is
    reported through `print_err` and the unfinished line is not yielded.
    """
    lastline = None # set to the first line of a group while it is being continued
    firstlineno = None
    catenated = ''
    for lineno, line in enumerate(f):
        line = line.rstrip('\r\n')

        # strip the common prefix from the current line if needed
        if lastline is not None:
            common_prefix = os.path.commonprefix([line, lastline])
            line = line[len(common_prefix):].lstrip()

        # Compare against None explicitly: line number 0 is falsy, so
        # `firstlineno or lineno` would forget a group that starts on the
        # very first line.
        if firstlineno is None:
            firstlineno = lineno
        if line.endswith('\\'):
            if lastline is None:
                lastline = line[:-1]
            catenated += line[:-1]
        else:
            yield firstlineno, catenated + line
            lastline = None
            firstlineno = None
            catenated = ''

    if lastline is not None:
        # Reported 1-based, like the line numbers stored in `Command`.
        print_err(lineno + 1, line, 'Trailing backslash at the end of the file')
# Matches one check command inside a template line. The `@` must not directly
# follow a non-whitespace character. After it come an optional `!` (negation),
# the command name (letters, optionally joined by single hyphens) and the rest
# of the line as the still-unsplit argument string.
LINE_PATTERN = re.compile(r'''
    (?<=(?<!\S)@)(?P<negated>!?)
    (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
    (?P<args>.*)$
''', re.X)
def get_commands(template):
    """Yield a `Command` for every check command found in the file `template`.

    Lines joined by `concat_multi_lines` are searched with `LINE_PATTERN`;
    lines without a command are skipped. The argument string is split with
    `shlex.split`, so arguments may be quoted. A command name that is
    followed directly by non-whitespace text is reported through `print_err`
    and skipped. `Command.lineno` is 1-based.
    """
    # The 'U' flag was removed from open() in Python 3.11 and raises
    # ValueError there; Python 3 text mode already translates all newline
    # styles. Python 2 still needs the flag for the same behavior.
    mode = 'rU' if str is bytes else 'r'
    with open(template, mode) as f:
        for lineno, line in concat_multi_lines(f):
            m = LINE_PATTERN.search(line)
            if not m:
                continue

            negated = (m.group('negated') == '!')
            cmd = m.group('cmd')
            args = m.group('args')
            if args and not args[:1].isspace():
                # Report 1-based, consistent with `Command.lineno` below.
                print_err(lineno + 1, line, 'Invalid template syntax')
                continue
            args = shlex.split(args)
            yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno+1, context=line)
233 def _flatten(node, acc):
235 acc.append(node.text)
def normalize_xpath(path):
    """Return `path` in the `.//...` form used for the `findall` calls here.

    A path starting with `//` gets a leading `.`; a path already starting
    with `.//` is returned unchanged.

    Raises `InvalidCheck` for any other path.
    """
    if path.startswith('//'):
        return '.' + path # avoid warnings
    if path.startswith('.//'):
        return path
    raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')
class CachedFiles(object):
    """Reads files below a root directory on demand and caches the results.

    Paths are given relative to `root`. The special path `-` stands for the
    path used by the previous lookup.
    """

    def __init__(self, root):
        self.root = root
        self.files = {} # normalized relative path -> file contents
        self.trees = {} # normalized relative path -> parsed tree
        self.last_path = None # most recently resolved path, reused for `-`

    def resolve_path(self, path):
        """Normalize `path` and remember it, or return the previous one for `-`.

        Raises `InvalidCheck` if `-` is used before any real path.
        """
        if path != '-':
            path = os.path.normpath(path)
            self.last_path = path
            return path
        elif self.last_path is None:
            raise InvalidCheck('Tried to use the previous path in the first command')
        else:
            return self.last_path

    def _open_file(self, path):
        """Open the regular file at the resolved `path` for reading as text.

        Raises `FailedCheck` if there is no regular file at that path.
        """
        abspath = os.path.join(self.root, path)
        # isfile() is already False for a missing path.
        if not os.path.isfile(abspath):
            raise FailedCheck('File does not exist {!r}'.format(path))
        if str is bytes:
            # Python 2: files are handled as byte strings, as before.
            return open(abspath)
        # Python 3: decode as UTF-8 instead of the locale's encoding, so the
        # result does not depend on the environment the checker runs in.
        return open(abspath, encoding='utf-8')

    def get_file(self, path):
        """Return the contents of the file at `path`, reading it at most once.

        Raises `FailedCheck` if the file does not exist.
        """
        path = self.resolve_path(path)
        if path in self.files:
            return self.files[path]

        with self._open_file(path) as f:
            data = f.read()
        self.files[path] = data
        return data

    def get_tree(self, path):
        """Return the parsed HTML tree of the file at `path`, parsing it at most once.

        Raises `FailedCheck` if the file does not exist and `RuntimeError`
        if it cannot be parsed.
        """
        path = self.resolve_path(path)
        if path in self.trees:
            return self.trees[path]

        with self._open_file(path) as f:
            try:
                tree = ET.parse(f, CustomHTMLParser())
            except Exception as e:
                raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
        self.trees[path] = tree
        return self.trees[path]

    def get_dir(self, path):
        """Check that `path` is an existing directory; returns nothing.

        Raises `FailedCheck` otherwise.
        """
        path = self.resolve_path(path)
        abspath = os.path.join(self.root, path)
        # isdir() is already False for a missing path.
        if not os.path.isdir(abspath):
            raise FailedCheck('Directory does not exist {!r}'.format(path))
def check_string(data, pat, regexp):
    """Return whether the pattern `pat` occurs in the string `data`.

    An empty `pat` always matches. With `regexp` set, `pat` is searched as
    a regular expression in the unmodified `data`. Otherwise both strings
    are whitespace-normalized (every run of whitespace becomes one space,
    leading and trailing whitespace is dropped) and `pat` must be a
    substring of `data`.
    """
    if not pat:
        return True # special case a presence testing
    if regexp:
        return re.search(pat, data) is not None

    data = ' '.join(data.split())
    pat = ' '.join(pat.split())
    return pat in data
def check_tree_attr(tree, path, attr, pat, regexp):
    """Return whether some element matching `path` has a matching `attr`.

    Elements of `tree` found by the XPath `path` that lack the attribute
    `attr` are skipped; for the others the attribute value is tested with
    `check_string(value, pat, regexp)`. The first match wins.

    Raises `InvalidCheck` (from `normalize_xpath`) for unsupported paths.
    """
    path = normalize_xpath(path)
    for e in tree.findall(path):
        if attr not in e.attrib:
            continue
        if check_string(e.attrib[attr], pat, regexp):
            return True
    return False
def check_tree_text(tree, path, pat, regexp):
    """Return whether the text of some element matching `path` matches `pat`.

    The text of each element found by the XPath `path`, with all
    subelements flattened into one string, is tested with
    `check_string(value, pat, regexp)`. The first match wins.

    Raises `InvalidCheck` (from `normalize_xpath`) for unsupported paths.
    Any exception raised during the lookup is re-raised after the offending
    path has been printed.
    """
    path = normalize_xpath(path)
    ret = False
    try:
        for e in tree.findall(path):
            try:
                fragments = []
                _flatten(e, fragments)
                value = ''.join(fragments)
            except KeyError:
                continue
            else:
                ret = check_string(value, pat, regexp)
                if ret:
                    break
    except Exception:
        print('Failed to get path "{}"'.format(path))
        raise
    return ret
def get_tree_count(tree, path):
    """Return how many elements of `tree` match the XPath `path`."""
    matches = tree.findall(normalize_xpath(path))
    return len(matches)
def stderr(*args):
    """Print `args` to standard error, formatted the way `print` does."""
    print(*args, file=sys.stderr)
def print_err(lineno, context, err, message=None):
    """Report one error on standard error and count it in `ERR_COUNT`.

    The first output line is `lineno: message`, falling back to
    `lineno: err` when no `message` is given. If both `message` and `err`
    are non-empty, `err` follows on its own tab-indented line. A non-empty
    `context` (the offending template line) is printed last, also
    tab-indented.
    """
    global ERR_COUNT
    ERR_COUNT += 1
    stderr("{}: {}".format(lineno, message or err))
    if message and err:
        stderr("\t{}".format(err))

    if context:
        stderr("\t{}".format(context))

# Number of errors reported through `print_err` so far.
ERR_COUNT = 0
def check_command(c, cache):
    """Run the check command `c` against the files provided by `cache`.

    `c` is a `Command`; `cache` is a `CachedFiles`. The command's result is
    compared with `c.negated`: a plain command fails when the condition does
    not hold, a negated one fails when it does.

    `FailedCheck` and `InvalidCheck` are not propagated; both are reported
    through `print_err` together with the command's line number and text.
    Other exceptions (for example the `RuntimeError` for an unparsable HTML
    file) propagate to the caller.
    """
    try:
        cerr = ""
        if c.cmd == 'has' or c.cmd == 'matches': # string test
            regexp = (c.cmd == 'matches')
            if len(c.args) == 1 and not regexp: # @has <path> = file existence
                try:
                    cache.get_file(c.args[0])
                    ret = True
                except FailedCheck as err:
                    # A missing file is the (possibly expected) result here,
                    # so it must go through the negation test below.
                    cerr = str(err)
                    ret = False
            elif len(c.args) == 2: # @has/matches <path> <pat> = string test
                cerr = "`PATTERN` did not match"
                ret = check_string(cache.get_file(c.args[0]), c.args[1], regexp)
            elif len(c.args) == 3: # @has/matches <path> <pat> <match> = XML tree test
                cerr = "`XPATH PATTERN` did not match"
                tree = cache.get_tree(c.args[0])
                pat, sep, attr = c.args[1].partition('/@')
                if sep: # attribute
                    ret = check_tree_attr(tree, pat, attr, c.args[2], regexp)
                else: # normalized text
                    pat = c.args[1]
                    if pat.endswith('/text()'):
                        # `text()` is implied for text matches; drop it.
                        pat = pat[:-len('/text()')]
                    ret = check_tree_text(tree, pat, c.args[2], regexp)
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))

        elif c.cmd == 'count': # count test
            if len(c.args) == 3: # @count <path> <pat> <count> = count test
                try:
                    expected = int(c.args[2])
                except ValueError:
                    # Report a malformed count like any other invalid command
                    # instead of aborting the whole run with a traceback.
                    raise InvalidCheck('Invalid @count argument {!r}: '
                                       'not an integer'.format(c.args[2]))
                found = get_tree_count(cache.get_tree(c.args[0]), c.args[1])
                cerr = "Expected {} occurrences but found {}".format(expected, found)
                ret = expected == found
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
        elif c.cmd == 'has-dir': # has-dir test
            if len(c.args) == 1: # @has-dir <path> = has-dir test
                try:
                    cache.get_dir(c.args[0])
                    ret = True
                except FailedCheck as err:
                    cerr = str(err)
                    ret = False
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
        elif c.cmd == 'valid-html':
            raise InvalidCheck('Unimplemented @valid-html')

        elif c.cmd == 'valid-links':
            raise InvalidCheck('Unimplemented @valid-links')
        else:
            raise InvalidCheck('Unrecognized @{}'.format(c.cmd))

        # The check fails when its result equals the negation flag.
        if ret == c.negated:
            raise FailedCheck(cerr)

    except FailedCheck as err:
        message = '@{}{} check failed'.format('!' if c.negated else '', c.cmd)
        print_err(c.lineno, c.context, str(err), message)
    except InvalidCheck as err:
        print_err(c.lineno, c.context, str(err))
def check(target, commands):
    """Run every command in `commands` against the documentation in `target`.

    `target` is the root directory handed to `CachedFiles`; `commands` is an
    iterable of `Command` objects. All commands share one cache, so `-` in a
    command refers to the path used by the command before it.
    """
    cache = CachedFiles(target)
    for c in commands:
        check_command(c, cache)
if __name__ == '__main__':
    # Exactly two arguments are expected: the doc directory and the template.
    if len(sys.argv) != 3:
        stderr('Usage: {} <doc dir> <template>'.format(sys.argv[0]))
        raise SystemExit(1)

    check(sys.argv[1], get_commands(sys.argv[2]))
    # Exit with a failure status if any check reported an error.
    if ERR_COUNT:
        stderr("\nEncountered {} errors".format(ERR_COUNT))
        raise SystemExit(1)