2 # -*- coding: utf-8 -*-
5 htmldocck.py is a custom checker script for Rustdoc HTML outputs.
9 The principle is simple: This script receives a path to generated HTML
10 documentation and a "template" script, which has a series of check
11 commands like `@has` or `@matches`. Each command is used to check if
12 some pattern is present or not present in the particular file or in
13 a particular node of the HTML tree. In many cases, the template script
14 happens to be the source code given to rustdoc.
16 While it indeed is possible to test in smaller portions, it has been
17 hard to construct tests in this fashion and major rendering errors were
18 discovered much later. This script is designed to make black-box and
19 regression testing of Rustdoc easy. This does not preclude the need for
20 unit testing, but can be used to complement related tests by quickly
21 showing the expected renderings.
23 In order to avoid one-off dependencies for this task, this script uses
24 a reasonably working HTML parser and the existing XPath implementation
25 from Python's standard library. Hopefully, we won't render
30 Commands start with an `@` followed by a command name (letters and
31 hyphens), and zero or more arguments separated by one or more whitespace
32 characters and optionally delimited with single or double quotes. The `@`
33 mark cannot be preceded by a non-whitespace character. Other lines
34 (including every text up to the first `@`) are ignored, but it is
35 recommended to avoid the use of `@` in the template file.
37 There are a number of supported commands:
39 * `@has PATH` checks for the existence of the given file.
41 `PATH` is relative to the output directory. It can be given as `-`
42 which repeats the most recently used `PATH`.
44 * `@has PATH PATTERN` and `@matches PATH PATTERN` check for
45 the occurrence of the given pattern `PATTERN` in the specified file.
46 Only one occurrence of the pattern is enough.
48 For `@has`, `PATTERN` is a whitespace-normalized (every consecutive
49 whitespace being replaced by one single space character) string.
50 The entire file is also whitespace-normalized including newlines.
52 For `@matches`, `PATTERN` is a Python-supported regular expression.
53 The file remains intact but the regexp is matched without the `MULTILINE`
54 and `IGNORECASE` options. You can still use a prefix `(?m)` or `(?i)`
55 to override them, and `\A` and `\Z` for definitely matching
56 the beginning and end of the file.
58 (The same distinction goes to other variants of these commands.)
60 * `@has PATH XPATH PATTERN` and `@matches PATH XPATH PATTERN` check for
61 the presence of the given XPath `XPATH` in the specified HTML file,
62 and also the occurrence of the given pattern `PATTERN` in the matching
63 node or attribute. Only one occurrence of the pattern in the match
66 `PATH` should be a valid and well-formed HTML file. It does *not*
67 accept arbitrary HTML5; it should have matching open and close tags
68 and correct entity references at least.
70 `XPATH` is an XPath expression to match. The XPath is fairly limited:
71 `tag`, `*`, `.`, `//`, `..`, `[@attr]`, `[@attr='value']`, `[tag]`,
72 `[POS]` (element located in given `POS`), `[last()-POS]`, `text()`
73 and `@attr` (both as the last segment) are supported. Some examples:
75 - `//pre` or `.//pre` matches any element with a name `pre`.
76 - `//a[@href]` matches any element with an `href` attribute.
77 - `//*[@class="impl"]//code` matches any element with a name `code`,
78 which is a descendant of some element whose `class` attr is `impl`.
79 - `//h1[@class="fqn"]/span[1]/a[last()]/@class` matches a value of
80 `class` attribute in the last `a` element (can be followed by more
81 elements that are not `a`) inside the first `span` in the `h1` with
82 a class of `fqn`. Note that there cannot be any additional elements
83 between them due to the use of `/` instead of `//`.
85 Do not try to use non-absolute paths; they won't work due to the flawed
86 ElementTree implementation. The script rejects them.
88 For the text matches (i.e. paths not ending with `@attr`), any
89 subelements are flattened into one string; this is handy for ignoring
90 highlights for example. If you want to simply check for the presence of
91 a given node or attribute, use an empty string (`""`) as a `PATTERN`.
93 * `@count PATH XPATH COUNT` checks for the occurrence of the given XPath
94 in the specified file. The number of occurrences must match the given
97 * `@has-dir PATH` checks for the existence of the given directory.
99 All conditions can be negated with `!`. `@!has foo/type.NoSuch.html`
100 checks if the given file does not exist, for example.
104 from __future__ import absolute_import, print_function, unicode_literals
112 from collections import namedtuple
114 from html.parser import HTMLParser
116 from HTMLParser import HTMLParser
118 from xml.etree import cElementTree as ET
120 from xml.etree import ElementTree as ET
123 from html.entities import name2codepoint
125 from htmlentitydefs import name2codepoint
# Elements that never take a closing tag ("void elements"), as listed in
# section 12.1.2 of the HTML Standard.
VOID_ELEMENTS = {
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'keygen',
    'link',
    'menuitem',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
}
131 # Python 2 -> 3 compatibility
138 channel = os.environ["DOC_RUST_LANG_ORG_CHANNEL"]
140 class CustomHTMLParser(HTMLParser):
141 """simplified HTML parser.
143 this is possible because we are dealing with very regular HTML from
144 rustdoc; we only have to deal with i) void elements and ii) empty
146 def __init__(self, target=None):
147 HTMLParser.__init__(self)
148 self.__builder = target or ET.TreeBuilder()
def handle_starttag(self, tag, attrs):
    """open an element for `tag`, closing it at once for void elements.

    `attrs` is an iterable of `(name, value)` pairs; any falsy value is
    stored as an empty string.
    """
    attributes = dict((name, value or '') for name, value in attrs)
    builder = self.__builder
    builder.start(tag, attributes)
    if tag not in VOID_ELEMENTS:
        return
    # void elements have no closing tag in the input, so close them here
    builder.end(tag)
def handle_endtag(self, tag):
    """forward the closing of `tag` to the tree builder."""
    builder = self.__builder
    builder.end(tag)
def handle_startendtag(self, tag, attrs):
    """emit a self-closing `tag` as an element that is opened and closed.

    `attrs` is an iterable of `(name, value)` pairs; any falsy value is
    stored as an empty string.
    """
    attributes = dict((name, value or '') for name, value in attrs)
    builder = self.__builder
    builder.start(tag, attributes)
    builder.end(tag)
def handle_data(self, data):
    """pass the text `data` on to the tree builder unchanged."""
    builder = self.__builder
    builder.data(data)
def handle_entityref(self, name):
    """feed the character for the named entity `name` to the tree builder.

    The code point is looked up in `name2codepoint`; a name that is not
    in that table raises `KeyError`.
    """
    codepoint = name2codepoint[name]
    self.__builder.data(unichr(codepoint))
def handle_charref(self, name):
    """feed the character for the numeric reference `name` to the builder.

    A leading `x` or `X` marks the rest of `name` as hexadecimal digits;
    otherwise the whole of `name` is read as a decimal number.
    """
    if name.startswith(('x', 'X')):
        codepoint = int(name[1:], 16)
    else:
        codepoint = int(name, 10)
    self.__builder.data(unichr(codepoint))
175 HTMLParser.close(self)
176 return self.__builder.close()
# One parsed `@` command: whether it was negated with `!`, the command name,
# its argument list, the line number it was found on and the source line.
Command = namedtuple(
    'Command',
    ['negated', 'cmd', 'args', 'lineno', 'context'],
)
182 class FailedCheck(Exception):
186 class InvalidCheck(Exception):
190 def concat_multi_lines(f):
191 """returns a generator out of the file object, which
192 - removes `\\` then `\n` then a shared prefix with the previous line then
194 - keeps a line number (starting from 0) of the first line being
196 lastline = None # set to the last line when the last line has a backslash
199 for lineno, line in enumerate(f):
200 line = line.rstrip('\r\n')
202 # strip the common prefix from the current line if needed
203 if lastline is not None:
204 common_prefix = os.path.commonprefix([line, lastline])
205 line = line[len(common_prefix):].lstrip()
207 firstlineno = firstlineno or lineno
208 if line.endswith('\\'):
211 catenated += line[:-1]
213 yield firstlineno, catenated + line
218 if lastline is not None:
219 print_err(lineno, line, 'Trailing backslash at the end of the file')
222 LINE_PATTERN = re.compile(r'''
223 (?<=(?<!\S))(?P<invalid>!?)@(?P<negated>!?)
224 (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
226 ''', re.X | re.UNICODE)
229 def get_commands(template):
230 with io.open(template, encoding='utf-8') as f:
231 for lineno, line in concat_multi_lines(f):
232 m = LINE_PATTERN.search(line)
236 negated = (m.group('negated') == '!')
238 if m.group('invalid') == '!':
242 'Invalid command: `!@{0}{1}`, (help: try with `@!{1}`)'.format(
243 '!' if negated else '',
248 args = m.group('args')
249 if args and not args[:1].isspace():
250 print_err(lineno, line, 'Invalid template syntax')
253 args = shlex.split(args)
254 except UnicodeEncodeError:
255 args = [arg.decode('utf-8') for arg in shlex.split(args.encode('utf-8'))]
256 yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno+1, context=line)
259 def _flatten(node, acc):
261 acc.append(node.text)
274 def normalize_xpath(path):
275 path = path.replace("{{channel}}", channel)
276 if path.startswith('//'):
277 return '.' + path # avoid warnings
278 elif path.startswith('.//'):
281 raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')
284 class CachedFiles(object):
285 def __init__(self, root):
289 self.last_path = None
291 def resolve_path(self, path):
293 path = os.path.normpath(path)
294 self.last_path = path
296 elif self.last_path is None:
297 raise InvalidCheck('Tried to use the previous path in the first command')
299 return self.last_path
301 def get_file(self, path):
302 path = self.resolve_path(path)
303 if path in self.files:
304 return self.files[path]
306 abspath = os.path.join(self.root, path)
307 if not(os.path.exists(abspath) and os.path.isfile(abspath)):
308 raise FailedCheck('File does not exist {!r}'.format(path))
310 with io.open(abspath, encoding='utf-8') as f:
312 self.files[path] = data
315 def get_tree(self, path):
316 path = self.resolve_path(path)
317 if path in self.trees:
318 return self.trees[path]
320 abspath = os.path.join(self.root, path)
321 if not(os.path.exists(abspath) and os.path.isfile(abspath)):
322 raise FailedCheck('File does not exist {!r}'.format(path))
324 with io.open(abspath, encoding='utf-8') as f:
326 tree = ET.fromstringlist(f.readlines(), CustomHTMLParser())
327 except Exception as e:
328 raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
329 self.trees[path] = tree
330 return self.trees[path]
def get_dir(self, path):
    """check that `path` names an existing directory below `self.root`.

    `path` is first passed through `resolve_path`. Nothing is returned on
    success; `FailedCheck` is raised when the location does not exist or
    is not a directory.
    """
    relative = self.resolve_path(path)
    location = os.path.join(self.root, relative)
    if os.path.exists(location) and os.path.isdir(location):
        return
    raise FailedCheck('Directory does not exist {!r}'.format(relative))
339 def check_string(data, pat, regexp):
340 pat = pat.replace("{{channel}}", channel)
342 return True # special case a presence testing
344 return re.search(pat, data, flags=re.UNICODE) is not None
346 data = ' '.join(data.split())
347 pat = ' '.join(pat.split())
351 def check_tree_attr(tree, path, attr, pat, regexp):
352 path = normalize_xpath(path)
354 for e in tree.findall(path):
356 value = e.attrib[attr]
360 ret = check_string(value, pat, regexp)
366 def check_tree_text(tree, path, pat, regexp):
367 path = normalize_xpath(path)
370 for e in tree.findall(path):
376 ret = check_string(value, pat, regexp)
380 print('Failed to get path "{}"'.format(path))
def get_tree_count(tree, path):
    """return how many nodes of `tree` match the XPath `path`.

    The path is run through `normalize_xpath` before it is handed to
    `tree.findall`.
    """
    matches = tree.findall(normalize_xpath(path))
    return len(matches)
391 if sys.version_info.major < 3:
392 file = codecs.getwriter('utf-8')(sys.stderr)
396 print(*args, file=file)
399 def print_err(lineno, context, err, message=None):
402 stderr("{}: {}".format(lineno, message or err))
404 stderr("\t{}".format(err))
407 stderr("\t{}".format(context))
413 def check_command(c, cache):
416 if c.cmd == 'has' or c.cmd == 'matches': # string test
417 regexp = (c.cmd == 'matches')
418 if len(c.args) == 1 and not regexp: # @has <path> = file existence
420 cache.get_file(c.args[0])
422 except FailedCheck as err:
425 elif len(c.args) == 2: # @has/matches <path> <pat> = string test
426 cerr = "`PATTERN` did not match"
427 ret = check_string(cache.get_file(c.args[0]), c.args[1], regexp)
428 elif len(c.args) == 3: # @has/matches <path> <pat> <match> = XML tree test
429 cerr = "`XPATH PATTERN` did not match"
430 tree = cache.get_tree(c.args[0])
431 pat, sep, attr = c.args[1].partition('/@')
433 tree = cache.get_tree(c.args[0])
434 ret = check_tree_attr(tree, pat, attr, c.args[2], regexp)
435 else: # normalized text
437 if pat.endswith('/text()'):
439 ret = check_tree_text(cache.get_tree(c.args[0]), pat, c.args[2], regexp)
441 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
443 elif c.cmd == 'count': # count test
444 if len(c.args) == 3: # @count <path> <pat> <count> = count test
445 expected = int(c.args[2])
446 found = get_tree_count(cache.get_tree(c.args[0]), c.args[1])
447 cerr = "Expected {} occurrences but found {}".format(expected, found)
448 ret = expected == found
450 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
451 elif c.cmd == 'has-dir': # has-dir test
452 if len(c.args) == 1: # @has-dir <path> = has-dir test
454 cache.get_dir(c.args[0])
456 except FailedCheck as err:
460 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
461 elif c.cmd == 'valid-html':
462 raise InvalidCheck('Unimplemented @valid-html')
464 elif c.cmd == 'valid-links':
465 raise InvalidCheck('Unimplemented @valid-links')
467 raise InvalidCheck('Unrecognized @{}'.format(c.cmd))
470 raise FailedCheck(cerr)
472 except FailedCheck as err:
473 message = '@{}{} check failed'.format('!' if c.negated else '', c.cmd)
474 print_err(c.lineno, c.context, str(err), message)
475 except InvalidCheck as err:
476 print_err(c.lineno, c.context, str(err))
479 def check(target, commands):
480 cache = CachedFiles(target)
482 check_command(c, cache)
485 if __name__ == '__main__':
486 if len(sys.argv) != 3:
487 stderr('Usage: {} <doc dir> <template>'.format(sys.argv[0]))
490 check(sys.argv[1], get_commands(sys.argv[2]))
492 stderr("\nEncountered {} errors".format(ERR_COUNT))