2 # -*- coding: utf-8 -*-
5 htmldocck.py is a custom checker script for Rustdoc HTML outputs.
9 The principle is simple: This script receives a path to generated HTML
10 documentation and a "template" script, which has a series of check
11 commands like `@has` or `@matches`. Each command is used to check if
12 some pattern is present or not present in the particular file or in
13 a particular node of the HTML tree. In many cases, the template script
14 happens to be the source code given to rustdoc.
16 While it indeed is possible to test in smaller portions, it has been
17 hard to construct tests in this fashion and major rendering errors were
18 discovered much later. This script is designed to make black-box and
19 regression testing of Rustdoc easy. This does not preclude the need for
20 unit testing, but can be used to complement related tests by quickly
21 showing the expected renderings.
23 In order to avoid one-off dependencies for this task, this script uses
24 a reasonably working HTML parser and the existing XPath implementation
25 from Python's standard library. Hopefully, we won't render non-well-formed HTML.
30 Commands start with an `@` followed by a command name (letters and
31 hyphens), and zero or more arguments separated by one or more whitespace
32 characters and optionally delimited with single or double quotes. The `@`
33 mark cannot be preceded by a non-whitespace character. Other lines
34 (including every text up to the first `@`) are ignored, but it is
35 recommended to avoid the use of `@` in the template file.
37 There are a number of supported commands:
39 * `@has PATH` checks for the existence of the given file.
41 `PATH` is relative to the output directory. It can be given as `-`
42 which repeats the most recently used `PATH`.
44 * `@has PATH PATTERN` and `@matches PATH PATTERN` check for
45 the occurrence of the given pattern `PATTERN` in the specified file.
46 Only one occurrence of the pattern is enough.
48 For `@has`, `PATTERN` is a whitespace-normalized (every consecutive
49 whitespace being replaced by one single space character) string.
50 The entire file is also whitespace-normalized including newlines.
52 For `@matches`, `PATTERN` is a Python-supported regular expression.
53 The file remains intact but the regexp is matched without the `MULTILINE`
54 and `IGNORECASE` options. You can still use a prefix `(?m)` or `(?i)`
55 to override them, and `\A` and `\Z` for definitely matching
56 the beginning and end of the file.
58 (The same distinction goes to other variants of these commands.)
60 * `@has PATH XPATH PATTERN` and `@matches PATH XPATH PATTERN` check for
61 the presence of the given XPath `XPATH` in the specified HTML file,
62 and also the occurrence of the given pattern `PATTERN` in the matching
63 node or attribute. Only one occurrence of the pattern in the match is enough.
66 `PATH` should be a valid and well-formed HTML file. It does *not*
67 accept arbitrary HTML5; it should have matching open and close tags
68 and correct entity references at least.
70 `XPATH` is an XPath expression to match. The XPath is fairly limited:
71 `tag`, `*`, `.`, `//`, `..`, `[@attr]`, `[@attr='value']`, `[tag]`,
72 `[POS]` (element located in given `POS`), `[last()-POS]`, `text()`
73 and `@attr` (both as the last segment) are supported. Some examples:
75 - `//pre` or `.//pre` matches any element with a name `pre`.
76 - `//a[@href]` matches any element with an `href` attribute.
77 - `//*[@class="impl"]//code` matches any element with a name `code`,
78 which is a descendant of some element whose `class` attr is `impl`.
79 - `//h1[@class="fqn"]/span[1]/a[last()]/@class` matches a value of
80 `class` attribute in the last `a` element (can be followed by more
81 elements that are not `a`) inside the first `span` in the `h1` with
82 a class of `fqn`. Note that there cannot be any additional elements
83 between them due to the use of `/` instead of `//`.
85 Do not try to use non-absolute paths, it won't work due to the flawed
86 ElementTree implementation. The script rejects them.
88 For the text matches (i.e. paths not ending with `@attr`), any
89 subelements are flattened into one string; this is handy for ignoring
90 highlights for example. If you want to simply check for the presence of
91 a given node or attribute, use an empty string (`""`) as a `PATTERN`.
93 * `@count PATH XPATH COUNT` checks for the occurrence of the given XPath
94 in the specified file. The number of occurrences must match the given count.
97 * `@has-dir PATH` checks for the existence of the given directory.
99 All conditions can be negated with `!`. `@!has foo/type.NoSuch.html`
100 checks if the given file does not exist, for example.
from __future__ import absolute_import, print_function, unicode_literals

import codecs
import io
import os.path
import re
import shlex
import sys
from collections import namedtuple
try:
    from html.parser import HTMLParser
except ImportError:
    from HTMLParser import HTMLParser
try:
    from xml.etree import cElementTree as ET
except ImportError:
    # xml.etree.cElementTree was removed in Python 3.9
    from xml.etree import ElementTree as ET

try:
    from html.entities import name2codepoint
except ImportError:
    from htmlentitydefs import name2codepoint
# "void elements" (no closing tag) from the HTML Standard section 12.1.2
VOID_ELEMENTS = {'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
                 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr'}

# Python 2 -> 3 compatibility
# NOTE(review): the statements under this comment were missing from the
# listing; the fallback below is restored because the HTML parser in this
# file calls `unichr`. Confirm against the upstream script.
try:
    unichr
except NameError:
    # Python 3 has no `unichr`; `chr` already accepts any Unicode code point.
    unichr = chr
class CustomHTMLParser(HTMLParser):
    """simplified HTML parser.

    this is possible because we are dealing with very regular HTML from
    rustdoc; we only have to deal with i) void elements and ii) empty
    attributes.

    Every parser event is forwarded to a tree builder (`target`, or a fresh
    `ET.TreeBuilder()` when none is given), and `close()` returns whatever
    the builder's own `close()` returns.
    """
    def __init__(self, target=None):
        HTMLParser.__init__(self)
        self.__builder = target or ET.TreeBuilder()

    def handle_starttag(self, tag, attrs):
        # a missing attribute value is stored as an empty string
        attrs = {k: v or '' for k, v in attrs}
        self.__builder.start(tag, attrs)
        if tag in VOID_ELEMENTS:
            # void elements never get an end tag, so close them right away
            self.__builder.end(tag)

    def handle_endtag(self, tag):
        self.__builder.end(tag)

    def handle_startendtag(self, tag, attrs):
        # self-closing form (`<tag ... />`): open and close in one step
        attrs = {k: v or '' for k, v in attrs}
        self.__builder.start(tag, attrs)
        self.__builder.end(tag)

    def handle_data(self, data):
        self.__builder.data(data)

    def handle_entityref(self, name):
        # named reference such as `&amp;`; an unknown name raises KeyError
        self.__builder.data(unichr(name2codepoint[name]))

    def handle_charref(self, name):
        # numeric reference: `x`/`X` prefix means hexadecimal, else decimal
        code = int(name[1:], 16) if name.startswith(('x', 'X')) else int(name, 10)
        self.__builder.data(unichr(code))

    def close(self):
        HTMLParser.close(self)
        return self.__builder.close()
# One parsed `@command` from the template: whether it was negated with `!`,
# the command name, its argument list, the line number used in error reports
# and the template line it came from.
Command = namedtuple('Command', 'negated cmd args lineno context')
class FailedCheck(Exception):
    """Raised when a check does not hold, e.g. a file or directory is
    missing or a pattern did not match."""
class InvalidCheck(Exception):
    """Raised when a check itself is malformed, e.g. an unknown command,
    a wrong number of arguments or an unsupported XPath."""
def concat_multi_lines(f):
    r"""Yield `(lineno, text)` pairs with backslash continuations joined.

    For a line ending in `\`, the backslash and the newline are removed and
    the following line is appended after stripping the prefix it shares with
    the line that opened the continuation, then any leading whitespace.

    `lineno` (starting from 0) is the index of the first line being
    concatenated.

    If the input ends in the middle of a continuation, the problem is
    reported through `print_err` and the unfinished text is not yielded.
    """
    lastline = None  # set to the last line when the last line has a backslash
    firstlineno = None
    catenated = ''
    for lineno, line in enumerate(f):
        line = line.rstrip('\r\n')

        # strip the common prefix from the current line if needed
        if lastline is not None:
            common_prefix = os.path.commonprefix([line, lastline])
            line = line[len(common_prefix):].lstrip()

        # An explicit None test is required here: `firstlineno or lineno`
        # would throw away a legitimate 0 and misreport a continuation that
        # starts on the very first line.
        if firstlineno is None:
            firstlineno = lineno
        if line.endswith('\\'):
            if lastline is None:
                lastline = line[:-1]
            catenated += line[:-1]
        else:
            yield firstlineno, catenated + line
            lastline = None
            firstlineno = None
            catenated = ''

    if lastline is not None:
        # reported 1-based, like the line numbers attached to commands
        print_err(lineno + 1, line, 'Trailing backslash at the end of the file')
# Matches one check command inside a template line. The `@` must not be
# preceded by a non-whitespace character; it may be followed by `!`
# (negation), then the command name (letters and hyphens), then the rest of
# the line as the still-unsplit argument string.
LINE_PATTERN = re.compile(r'''
    (?<=(?<!\S)@)(?P<negated>!?)
    (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
    (?P<args>.*)$
''', re.X | re.UNICODE)
def get_commands(template):
    """Parse the template file and yield one `Command` per check found.

    The file is read as UTF-8, backslash continuations are joined by
    `concat_multi_lines`, and each resulting line is searched with
    `LINE_PATTERN`. Lines without a command are skipped. A command whose
    name is not followed by whitespace is reported through `print_err` and
    skipped. Arguments are split with shell-like quoting rules.

    The `lineno` stored in each `Command` is 1-based.
    """
    with io.open(template, encoding='utf-8') as f:
        for lineno, line in concat_multi_lines(f):
            m = LINE_PATTERN.search(line)
            if not m:
                continue

            negated = (m.group('negated') == '!')
            cmd = m.group('cmd')
            args = m.group('args')
            if args and not args[:1].isspace():
                # report 1-based, consistently with Command.lineno below
                print_err(lineno + 1, line, 'Invalid template syntax')
                continue
            try:
                args = shlex.split(args)
            except UnicodeEncodeError:
                # presumably only hit on Python 2, where shlex may reject
                # non-ASCII unicode input; split the UTF-8 bytes instead
                args = [arg.decode('utf-8') for arg in shlex.split(args.encode('utf-8'))]
            yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno+1, context=line)
244 def _flatten(node, acc):
246 acc.append(node.text)
def normalize_xpath(path):
    """Return `path` in the `.//...` form passed to `findall`.

    A path starting with `//` gets a leading `.`; a path already starting
    with `.//` is returned unchanged.

    Raises `InvalidCheck` for any other path.
    """
    if path.startswith('//'):
        return '.' + path  # avoid warnings
    elif path.startswith('.//'):
        return path
    else:
        raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')
class CachedFiles(object):
    """Loads files below a root directory and remembers what it has read.

    File contents and parsed trees are cached per normalized relative path.
    The path `-` stands for the most recently used explicit path.
    """

    def __init__(self, root):
        self.root = root
        self.files = {}  # normalized relative path -> file content
        self.trees = {}  # normalized relative path -> parsed tree
        self.last_path = None  # last explicit path, reused by `-`

    def resolve_path(self, path):
        """Normalize `path`, or return the previous path when given `-`.

        Raises `InvalidCheck` if `-` is used before any explicit path.
        """
        if path != '-':
            path = os.path.normpath(path)
            self.last_path = path
            return path
        elif self.last_path is None:
            raise InvalidCheck('Tried to use the previous path in the first command')
        else:
            return self.last_path

    def _require_file(self, path):
        """Return the location of `path` under the root; raise `FailedCheck`
        unless it is an existing regular file."""
        abspath = os.path.join(self.root, path)
        if not os.path.isfile(abspath):
            raise FailedCheck('File does not exist {!r}'.format(path))
        return abspath

    def get_file(self, path):
        """Return the UTF-8 decoded content of `path`, reading it only once.

        Raises `FailedCheck` if the file does not exist.
        """
        path = self.resolve_path(path)
        if path in self.files:
            return self.files[path]

        abspath = self._require_file(path)
        with io.open(abspath, encoding='utf-8') as f:
            data = f.read()
        self.files[path] = data
        return data

    def get_tree(self, path):
        """Return the tree parsed from the HTML file `path`, parsing it only once.

        Raises `FailedCheck` if the file does not exist and `RuntimeError`
        if it cannot be parsed.
        """
        path = self.resolve_path(path)
        if path in self.trees:
            return self.trees[path]

        abspath = self._require_file(path)
        with io.open(abspath, encoding='utf-8') as f:
            try:
                tree = ET.fromstringlist(f.readlines(), CustomHTMLParser())
            except Exception as e:
                raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
        self.trees[path] = tree
        return tree

    def get_dir(self, path):
        """Raise `FailedCheck` unless `path` is an existing directory."""
        path = self.resolve_path(path)
        abspath = os.path.join(self.root, path)
        if not os.path.isdir(abspath):
            raise FailedCheck('Directory does not exist {!r}'.format(path))
def check_string(data, pat, regexp):
    """Tell whether the pattern `pat` occurs in the string `data`.

    - An empty `pat` always succeeds.
    - With `regexp` set, `pat` is searched for as a regular expression.
    - Otherwise both strings are whitespace-normalized (every run of
      whitespace becomes one space) and a plain substring test is done.
    """
    if not pat:
        return True  # special case a presence testing
    elif regexp:
        return re.search(pat, data, flags=re.UNICODE) is not None
    else:
        data = ' '.join(data.split())
        pat = ' '.join(pat.split())
        return pat in data
def check_tree_attr(tree, path, attr, pat, regexp):
    """Tell whether some element matched by `path` has an attribute `attr`
    whose value satisfies `pat` (see `check_string`).

    Matching elements without the attribute are skipped; the first
    successful match ends the search.
    """
    path = normalize_xpath(path)
    for e in tree.findall(path):
        if attr not in e.attrib:
            continue
        if check_string(e.attrib[attr], pat, regexp):
            return True
    return False
def check_tree_text(tree, path, pat, regexp):
    """Tell whether some element matched by `path` has text satisfying `pat`
    (see `check_string`).

    The text of each element is collected with `_flatten`. If looking up or
    checking a match raises, the offending path is printed and the
    exception is re-raised.
    """
    # NOTE(review): the line producing the text was missing from the
    # listing; the visible `_flatten` helper is used here. Confirm.
    path = normalize_xpath(path)
    try:
        for e in tree.findall(path):
            acc = []
            _flatten(e, acc)
            if check_string(''.join(acc), pat, regexp):
                return True
    except Exception:
        print('Failed to get path "{}"'.format(path))
        raise
    return False
def get_tree_count(tree, path):
    """Return how many elements of `tree` match the XPath `path`."""
    path = normalize_xpath(path)
    return len(tree.findall(path))
def stderr(*args):
    """Print `args` to standard error, like `print` does for standard output."""
    if sys.version_info.major < 3:
        # wrap the stream so that text is written out encoded as UTF-8
        stream = codecs.getwriter('utf-8')(sys.stderr)
    else:
        stream = sys.stderr

    print(*args, file=stream)
# Number of errors reported so far; read at the end of the run to print the
# summary line.
# NOTE(review): the definition and the increment below were missing from
# the listing and are restored from how ERR_COUNT is used; confirm.
ERR_COUNT = 0


def print_err(lineno, context, err, message=None):
    """Report one error on standard error and count it.

    The first line is `lineno: message` (or `lineno: err` when there is no
    message). When both `message` and `err` are given, `err` follows on an
    indented line. A non-empty `context` is printed last, also indented.
    """
    global ERR_COUNT
    ERR_COUNT += 1
    stderr("{}: {}".format(lineno, message or err))
    if message and err:
        stderr("\t{}".format(err))

    if context:
        stderr("\t{}".format(context))
def check_command(c, cache):
    """Run the command `c` against the files served by `cache`.

    Supported forms:

    - `@has PATH`: the file exists.
    - `@has|@matches PATH PATTERN`: the pattern occurs in the file.
    - `@has|@matches PATH XPATH PATTERN`: the pattern occurs in the text
      (or, for an XPath ending in `/@attr`, the attribute) of a match.
    - `@count PATH XPATH COUNT`: the XPath matches exactly COUNT times.
    - `@has-dir PATH`: the directory exists.

    A check fails when its outcome equals `c.negated`. Failed and invalid
    checks are not raised to the caller; they are reported with `print_err`
    together with the command's line number and source line.
    """
    try:
        cerr = ""
        if c.cmd == 'has' or c.cmd == 'matches':  # string test
            regexp = (c.cmd == 'matches')
            if len(c.args) == 1 and not regexp:  # @has <path> = file existence
                try:
                    cache.get_file(c.args[0])
                    ret = True
                except FailedCheck as err:
                    cerr = str(err)
                    ret = False
            elif len(c.args) == 2:  # @has/matches <path> <pat> = string test
                cerr = "`PATTERN` did not match"
                ret = check_string(cache.get_file(c.args[0]), c.args[1], regexp)
            elif len(c.args) == 3:  # @has/matches <path> <pat> <match> = XML tree test
                cerr = "`XPATH PATTERN` did not match"
                tree = cache.get_tree(c.args[0])
                pat, sep, attr = c.args[1].partition('/@')
                if sep:  # attribute
                    ret = check_tree_attr(tree, pat, attr, c.args[2], regexp)
                else:  # normalized text
                    # without a separator `pat` is still the whole XPath
                    if pat.endswith('/text()'):
                        pat = pat[:-len('/text()')]
                    ret = check_tree_text(tree, pat, c.args[2], regexp)
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))

        elif c.cmd == 'count':  # count test
            if len(c.args) == 3:  # @count <path> <pat> <count> = count test
                try:
                    expected = int(c.args[2])
                except ValueError:
                    # report a bad COUNT like any other malformed check
                    # instead of aborting the run with a traceback
                    raise InvalidCheck('Invalid @count COUNT {!r}'.format(c.args[2]))
                found = get_tree_count(cache.get_tree(c.args[0]), c.args[1])
                cerr = "Expected {} occurrences but found {}".format(expected, found)
                ret = expected == found
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
        elif c.cmd == 'has-dir':  # has-dir test
            if len(c.args) == 1:  # @has-dir <path> = has-dir test
                try:
                    cache.get_dir(c.args[0])
                    ret = True
                except FailedCheck as err:
                    cerr = str(err)
                    ret = False
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
        elif c.cmd == 'valid-html':
            raise InvalidCheck('Unimplemented @valid-html')

        elif c.cmd == 'valid-links':
            raise InvalidCheck('Unimplemented @valid-links')
        else:
            raise InvalidCheck('Unrecognized @{}'.format(c.cmd))

        # a plain check fails when false, a negated one fails when true
        if ret == c.negated:
            raise FailedCheck(cerr)

    except FailedCheck as err:
        message = '@{}{} check failed'.format('!' if c.negated else '', c.cmd)
        print_err(c.lineno, c.context, str(err), message)
    except InvalidCheck as err:
        print_err(c.lineno, c.context, str(err))
def check(target, commands):
    """Run every command in `commands` against the documentation directory
    `target`, sharing one file cache between them."""
    cache = CachedFiles(target)
    for c in commands:
        check_command(c, cache)
if __name__ == '__main__':
    # NOTE(review): the exit statements and the guard around the summary
    # were missing from the listing; a non-zero exit status is assumed for
    # both the usage error and a run with reported errors. Confirm.
    if len(sys.argv) != 3:
        stderr('Usage: {} <doc dir> <template>'.format(sys.argv[0]))
        raise SystemExit(1)

    check(sys.argv[1], get_commands(sys.argv[2]))
    if ERR_COUNT:
        stderr("\nEncountered {} errors".format(ERR_COUNT))
        raise SystemExit(1)