2 # -*- coding: utf-8 -*-
5 htmldocck.py is a custom checker script for Rustdoc HTML outputs.
9 The principle is simple: This script receives a path to generated HTML
10 documentation and a "template" script, which has a series of check
11 commands like `@has` or `@matches`. Each command is used to check if
12 some pattern is present or not present in the particular file or in
13 a particular node of the HTML tree. In many cases, the template script
14 happens to be the source code given to rustdoc.
16 While it indeed is possible to test in smaller portions, it has been
17 hard to construct tests in this fashion and major rendering errors were
18 discovered much later. This script is designed to make black-box and
19 regression testing of Rustdoc easy. This does not preclude the need for
20 unit testing, but can be used to complement related tests by quickly
21 showing the expected renderings.
23 In order to avoid one-off dependencies for this task, this script uses
24 a reasonably working HTML parser and the existing XPath implementation
25 from Python's standard library. Hopefully, we won't render malformed HTML.
30 Commands start with an `@` followed by a command name (letters and
31 hyphens), and zero or more arguments separated by one or more whitespace
32 characters and optionally delimited with single or double quotes. The `@`
33 mark cannot be preceded by a non-whitespace character. Other lines
34 (including every text up to the first `@`) are ignored, but it is
35 recommended to avoid the use of `@` in the template file.
37 There are a number of supported commands:
39 * `@has PATH` checks for the existence of the given file.
41 `PATH` is relative to the output directory. It can be given as `-`
42 which repeats the most recently used `PATH`.
44 * `@has PATH PATTERN` and `@matches PATH PATTERN` check for
45 the occurrence of the given pattern `PATTERN` in the specified file.
46 Only one occurrence of the pattern is enough.
48 For `@has`, `PATTERN` is a whitespace-normalized (every consecutive
49 whitespace being replaced by one single space character) string.
50 The entire file is also whitespace-normalized including newlines.
52 For `@matches`, `PATTERN` is a Python-supported regular expression.
53 The file remains intact but the regexp is matched without the `MULTILINE`
54 and `IGNORECASE` options. You can still use a prefix `(?m)` or `(?i)`
55 to override them, and `\A` and `\Z` for definitely matching
56 the beginning and end of the file.
58 (The same distinction goes to other variants of these commands.)
60 * `@has PATH XPATH PATTERN` and `@matches PATH XPATH PATTERN` check for
61 the presence of the given XPath `XPATH` in the specified HTML file,
62 and also the occurrence of the given pattern `PATTERN` in the matching
63 node or attribute. Only one occurrence of the pattern in the match is enough.
66 `PATH` should be a valid and well-formed HTML file. It does *not*
67 accept arbitrary HTML5; it should have matching open and close tags
68 and correct entity references at least.
70 `XPATH` is an XPath expression to match. The XPath is fairly limited:
71 `tag`, `*`, `.`, `//`, `..`, `[@attr]`, `[@attr='value']`, `[tag]`,
72 `[POS]` (element located in given `POS`), `[last()-POS]`, `text()`
73 and `@attr` (both as the last segment) are supported. Some examples:
75 - `//pre` or `.//pre` matches any element with a name `pre`.
76 - `//a[@href]` matches any element with an `href` attribute.
77 - `//*[@class="impl"]//code` matches any element with a name `code`,
78 which is a descendant of some element whose `class` attribute is `impl`.
79 - `//h1[@class="fqn"]/span[1]/a[last()]/@class` matches a value of
80 `class` attribute in the last `a` element (can be followed by more
81 elements that are not `a`) inside the first `span` in the `h1` with
82 a class of `fqn`. Note that there cannot be any additional elements
83 between them due to the use of `/` instead of `//`.
85 Do not try to use non-absolute paths, it won't work due to the flawed
86 ElementTree implementation. The script rejects them.
88 For the text matches (i.e. paths not ending with `@attr`), any
89 subelements are flattened into one string; this is handy for ignoring
90 highlights for example. If you want to simply check for the presence of
91 a given node or attribute, use an empty string (`""`) as a `PATTERN`.
93 * `@count PATH XPATH COUNT` checks for the occurrence of the given XPath
94 in the specified file. The number of occurrences must match the given `COUNT`.
97 * `@has-dir PATH` checks for the existence of the given directory.
99 All conditions can be negated with `!`. `@!has foo/type.NoSuch.html`
100 checks if the given file does not exist, for example.
from __future__ import absolute_import, print_function, unicode_literals

import codecs
import io
import os.path
import re
import shlex
import sys
from collections import namedtuple

try:
    from html.parser import HTMLParser
except ImportError:  # Python 2
    from HTMLParser import HTMLParser

try:
    from xml.etree import cElementTree as ET
except ImportError:  # cElementTree was removed in Python 3.9
    from xml.etree import ElementTree as ET

try:
    from html.entities import name2codepoint
except ImportError:  # Python 2
    from htmlentitydefs import name2codepoint
# "void elements" (no closing tag) from the HTML Standard section 12.1.2
VOID_ELEMENTS = {
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
    'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
}

# Python 2 -> 3 compatibility: `unichr` only exists on Python 2; on Python 3
# the built-in `chr` already returns a Unicode character.
try:
    unichr
except NameError:
    unichr = chr
class CustomHTMLParser(HTMLParser):
    """Simplified HTML parser that feeds an ElementTree tree builder.

    This is possible because we are dealing with very regular HTML from
    rustdoc; we only have to deal with i) void elements and ii) empty
    attributes.

    `target` is an optional builder object exposing `start`, `end`, `data`
    and `close`; a fresh `ET.TreeBuilder` is used when it is omitted.
    """

    def __init__(self, target=None):
        HTMLParser.__init__(self)
        self.__builder = target or ET.TreeBuilder()

    @staticmethod
    def _attr_dict(attrs):
        """Turn `(name, value)` pairs into a dict, mapping valueless
        attributes (reported as `None`) to the empty string."""
        return dict((k, v or '') for k, v in attrs)

    def handle_starttag(self, tag, attrs):
        self.__builder.start(tag, self._attr_dict(attrs))
        # Void elements never get an end tag, so close them right away to
        # keep the tree balanced.
        if tag in VOID_ELEMENTS:
            self.__builder.end(tag)

    def handle_endtag(self, tag):
        self.__builder.end(tag)

    def handle_startendtag(self, tag, attrs):
        # Self-closing syntax (`<tag ... />`): open and close in one go.
        self.__builder.start(tag, self._attr_dict(attrs))
        self.__builder.end(tag)

    def handle_data(self, data):
        self.__builder.data(data)

    def handle_entityref(self, name):
        # Named reference such as `&amp;`; an unknown name raises KeyError.
        self.__builder.data(unichr(name2codepoint[name]))

    def handle_charref(self, name):
        # Numeric reference: `&#xHH;` is hexadecimal, `&#DD;` is decimal.
        code = int(name[1:], 16) if name.startswith(('x', 'X')) else int(name, 10)
        self.__builder.data(unichr(code))

    def close(self):
        """Finish parsing and return whatever the builder's `close` returns
        (the root element for `ET.TreeBuilder`)."""
        HTMLParser.close(self)
        return self.__builder.close()
# One parsed `@command` from a template: whether it was negated with `!`,
# the command name, its argument list, the line number it was found on and
# the full source line it was found in (used as context in error output).
Command = namedtuple('Command', ['negated', 'cmd', 'args', 'lineno', 'context'])
class FailedCheck(Exception):
    """Raised when a check ran but its condition did not hold."""
class InvalidCheck(Exception):
    """Raised when a check itself is malformed and cannot be evaluated."""
def concat_multi_lines(f):
    r"""Yield `(lineno, text)` pairs with backslash continuations joined.

    For every line ending in a backslash, the backslash and the newline are
    removed, and the next line is appended after stripping the prefix it
    shares with the first line of the group plus any leading whitespace.

    `lineno` is the 0-based index of the first physical line of each joined
    group. `f` may be any iterable of lines (e.g. an open text file).

    If the input ends in the middle of a continuation, the pending text is
    not yielded and an error is reported through `print_err`.
    """
    lastline = None  # first line of the current continuation group, if any
    firstlineno = None
    catenated = ''
    for lineno, line in enumerate(f):
        line = line.rstrip('\r\n')

        # strip the common prefix from the current line if needed
        if lastline is not None:
            common_prefix = os.path.commonprefix([line, lastline])
            line = line[len(common_prefix):].lstrip()

        # Compare against None explicitly: line number 0 is falsy, so the
        # `x or y` idiom would forget a group that starts on the first line.
        if firstlineno is None:
            firstlineno = lineno

        if line.endswith('\\'):
            if lastline is None:
                lastline = line[:-1]
            catenated += line[:-1]
        else:
            yield firstlineno, catenated + line
            lastline = None
            firstlineno = None
            catenated = ''

    if lastline is not None:
        # Report a 1-based line number, like the ones stored in `Command`.
        print_err(lineno + 1, line, 'Trailing backslash at the end of the file')
# Matches one command inside a template line. The match starts right after
# an `@` that is not preceded by a non-whitespace character, and captures:
#   negated - an optional `!`
#   cmd     - the command name (letters, optionally hyphen-separated)
#   args    - the unparsed remainder of the line
LINE_PATTERN = re.compile(r'''
    (?<=(?<!\S)@)(?P<negated>!?)
    (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
    (?P<args>.*)$
''', re.X | re.UNICODE)
def get_commands(template):
    """Yield a `Command` for every `@command` found in the template file.

    `template` is the path of a UTF-8 text file. Lines joined by backslash
    continuations are treated as one line. Lines without a command are
    skipped; a command whose name is directly followed by a non-whitespace
    character is reported through `print_err` and skipped.

    `Command.lineno` is 1-based and `Command.context` is the joined line.
    """
    with io.open(template, encoding='utf-8') as f:
        for lineno, line in concat_multi_lines(f):
            m = LINE_PATTERN.search(line)
            if not m:
                continue

            negated = (m.group('negated') == '!')
            cmd = m.group('cmd')
            args = m.group('args')
            if args and not args[:1].isspace():
                # Report the same 1-based number that `Command` carries.
                print_err(lineno + 1, line, 'Invalid template syntax')
                continue
            try:
                args = shlex.split(args)
            except UnicodeEncodeError:
                # Fallback for a `shlex` that cannot take non-ASCII text
                # (presumably Python 2): split the bytes, then decode.
                args = [arg.decode('utf-8') for arg in shlex.split(args.encode('utf-8'))]
            yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno+1, context=line)
def _flatten(node, acc):
    """Append to the list `acc`, in document order, the text of `node` and
    of all its descendants (including the tail text that follows each
    descendant, but not the tail of `node` itself)."""
    if node.text:
        acc.append(node.text)
    for e in node:
        _flatten(e, acc)
        if e.tail:
            acc.append(e.tail)


def flatten(node):
    """Return all text inside `node`, with sub-elements flattened into one
    string."""
    acc = []
    _flatten(node, acc)
    return ''.join(acc)
def normalize_xpath(path):
    """Return `path` in the `.//...` form.

    A leading `//` gets a `.` prepended; a path already starting with `.//`
    is returned unchanged.

    Raises `InvalidCheck` for any other path.
    """
    if path.startswith('//'):
        return '.' + path  # avoid warnings
    if path.startswith('.//'):
        return path
    raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')
class CachedFiles(object):
    """Loads files below a root directory and caches their contents.

    Paths are given relative to `root`. The special path `-` refers to the
    most recently resolved path.
    """

    def __init__(self, root):
        self.root = root
        self.files = {}  # normalized relative path -> file contents (text)
        self.trees = {}  # normalized relative path -> parsed element tree
        self.last_path = None  # most recently resolved path, for `-`

    def resolve_path(self, path):
        """Normalize `path` and remember it, or return the remembered path
        when `path` is `-`.

        Raises `InvalidCheck` if `-` is used before any real path.
        """
        if path != '-':
            path = os.path.normpath(path)
            self.last_path = path
            return path
        if self.last_path is None:
            raise InvalidCheck('Tried to use the previous path in the first command')
        return self.last_path

    def _existing_file(self, path):
        """Return the absolute location of `path`, raising `FailedCheck` if
        it is not an existing regular file."""
        abspath = os.path.join(self.root, path)
        # `isfile` is already false for a missing path.
        if not os.path.isfile(abspath):
            raise FailedCheck('File does not exist {!r}'.format(path))
        return abspath

    def get_file(self, path):
        """Return the UTF-8 decoded contents of the file, reading it at most
        once. Raises `FailedCheck` if the file does not exist."""
        path = self.resolve_path(path)
        if path in self.files:
            return self.files[path]

        abspath = self._existing_file(path)
        with io.open(abspath, encoding='utf-8') as f:
            data = f.read()
        self.files[path] = data
        return data

    def get_tree(self, path):
        """Return the element tree parsed from the HTML file, parsing it at
        most once.

        Raises `FailedCheck` if the file does not exist and `RuntimeError`
        if it cannot be parsed.
        """
        path = self.resolve_path(path)
        if path in self.trees:
            return self.trees[path]

        abspath = self._existing_file(path)
        with io.open(abspath, encoding='utf-8') as f:
            try:
                tree = ET.fromstringlist(f.readlines(), CustomHTMLParser())
            except Exception as e:
                raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
        self.trees[path] = tree
        return tree

    def get_dir(self, path):
        """Raise `FailedCheck` unless `path` is an existing directory."""
        path = self.resolve_path(path)
        abspath = os.path.join(self.root, path)
        if not os.path.isdir(abspath):
            raise FailedCheck('Directory does not exist {!r}'.format(path))
def check_string(data, pat, regexp):
    """Return whether `pat` occurs in `data`.

    - An empty `pat` always matches (pure presence test).
    - With `regexp` true, `pat` is a regular expression searched in the
      unmodified `data`.
    - Otherwise both strings are whitespace-normalized (every run of
      whitespace becomes one space, leading/trailing whitespace is dropped)
      and `pat` must be a substring of `data`.
    """
    if not pat:
        return True  # special case a presence testing
    if regexp:
        return re.search(pat, data, flags=re.UNICODE) is not None

    data = ' '.join(data.split())
    pat = ' '.join(pat.split())
    return pat in data
def check_tree_attr(tree, path, attr, pat, regexp):
    """Return whether some element matched by `path` has an attribute
    `attr` whose value matches `pat` (see `check_string` for `regexp`).

    Matched elements lacking the attribute are skipped.
    """
    path = normalize_xpath(path)
    for e in tree.findall(path):
        if attr in e.attrib and check_string(e.attrib[attr], pat, regexp):
            return True
    return False
def check_tree_text(tree, path, pat, regexp):
    """Return whether some element matched by `path` contains text matching
    `pat` (see `check_string` for `regexp`).

    The text of an element is the concatenation of its own text and that of
    all its descendants. Any exception raised while searching is re-raised
    after printing the offending path.
    """
    path = normalize_xpath(path)
    try:
        for e in tree.findall(path):
            # `itertext` walks the element and its descendants in document
            # order, flattening sub-elements into one string.
            value = ''.join(e.itertext())
            if check_string(value, pat, regexp):
                return True
    except Exception:
        print('Failed to get path "{}"'.format(path))
        raise
    return False
def get_tree_count(tree, path):
    """Return the number of elements in `tree` matched by the XPath `path`.

    Raises `InvalidCheck` (from `normalize_xpath`) for an unsupported path.
    """
    matches = tree.findall(normalize_xpath(path))
    return len(matches)
def stderr(*args):
    """Print `args` to standard error, like `print` does.

    On Python 2 the output goes through a UTF-8 encoding writer so that
    non-ASCII text can be written.
    """
    if sys.version_info.major < 3:
        out = codecs.getwriter('utf-8')(sys.stderr)
    else:
        out = sys.stderr

    print(*args, file=out)
# Number of errors reported so far through `print_err`.
ERR_COUNT = 0


def print_err(lineno, context, err, message=None):
    """Report one error on standard error and count it.

    The first output line is `lineno: message`, falling back to `err` when
    no `message` is given. When both are given, `err` follows on its own
    tab-indented line. A non-empty `context` (typically the offending
    template line) is printed last, also tab-indented.
    """
    global ERR_COUNT
    ERR_COUNT += 1
    stderr("{}: {}".format(lineno, message or err))
    if message and err:
        stderr("\t{}".format(err))

    if context:
        stderr("\t{}".format(context))
def check_command(c, cache):
    """Run one parsed command `c` against the files served by `cache`.

    Nothing is returned. A check whose outcome does not match the command's
    negation flag, or a command that is malformed, is reported through
    `print_err`.
    """
    try:
        cerr = ""
        if c.cmd in ('has', 'matches'):  # string test
            regexp = (c.cmd == 'matches')
            if len(c.args) == 1 and not regexp:  # @has <path> = file existence
                try:
                    cache.get_file(c.args[0])
                    ret = True
                except FailedCheck as err:
                    cerr = str(err)
                    ret = False
            elif len(c.args) == 2:  # @has/matches <path> <pat> = string test
                cerr = "`PATTERN` did not match"
                ret = check_string(cache.get_file(c.args[0]), c.args[1], regexp)
            elif len(c.args) == 3:  # @has/matches <path> <pat> <match> = XML tree test
                cerr = "`XPATH PATTERN` did not match"
                tree = cache.get_tree(c.args[0])
                pat, sep, attr = c.args[1].partition('/@')
                if sep:  # attribute
                    ret = check_tree_attr(tree, pat, attr, c.args[2], regexp)
                else:  # normalized text
                    pat = c.args[1]
                    # A trailing `/text()` step is dropped: the element's
                    # text is what gets compared.
                    if pat.endswith('/text()'):
                        pat = pat[:-len('/text()')]
                    ret = check_tree_text(tree, pat, c.args[2], regexp)
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))

        elif c.cmd == 'count':  # count test
            if len(c.args) == 3:  # @count <path> <pat> <count> = count test
                try:
                    expected = int(c.args[2])
                except ValueError:
                    # Report a malformed COUNT as an invalid check instead
                    # of aborting the whole run with a traceback.
                    raise InvalidCheck('Invalid @count COUNT {!r}'.format(c.args[2]))
                found = get_tree_count(cache.get_tree(c.args[0]), c.args[1])
                cerr = "Expected {} occurrences but found {}".format(expected, found)
                ret = expected == found
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
        elif c.cmd == 'has-dir':  # has-dir test
            if len(c.args) == 1:  # @has-dir <path> = has-dir test
                try:
                    cache.get_dir(c.args[0])
                    ret = True
                except FailedCheck as err:
                    cerr = str(err)
                    ret = False
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
        elif c.cmd == 'valid-html':
            raise InvalidCheck('Unimplemented @valid-html')

        elif c.cmd == 'valid-links':
            raise InvalidCheck('Unimplemented @valid-links')
        else:
            raise InvalidCheck('Unrecognized @{}'.format(c.cmd))

        # A plain check fails when the result is false; a negated one fails
        # when the result is true.
        if ret == c.negated:
            raise FailedCheck(cerr)

    except FailedCheck as err:
        message = '@{}{} check failed'.format('!' if c.negated else '', c.cmd)
        print_err(c.lineno, c.context, str(err), message)
    except InvalidCheck as err:
        print_err(c.lineno, c.context, str(err))
def check(target, commands):
    """Run every command in the iterable `commands` against the
    documentation rooted at directory `target`, sharing one file cache."""
    cache = CachedFiles(target)
    for c in commands:
        check_command(c, cache)
# Script entry point: `htmldocck.py <doc dir> <template>`.
# Exits with status 1 on bad usage or when any check reported an error.
if __name__ == '__main__':
    if len(sys.argv) != 3:
        stderr('Usage: {} <doc dir> <template>'.format(sys.argv[0]))
        raise SystemExit(1)

    check(sys.argv[1], get_commands(sys.argv[2]))
    if ERR_COUNT:
        stderr("\nEncountered {} errors".format(ERR_COUNT))
        raise SystemExit(1)