src/etc/htmldocck.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 r"""
   5 htmldocck.py is a custom checker script for Rustdoc HTML outputs.
   6
   7 # How and why?
   8
   9 The principle is simple: This script receives a path to generated HTML
  10 documentation and a "template" script, which has a series of check
  11 commands like `@has` or `@matches`. Each command is used to check if
  12 some pattern is present or not present in the particular file or in
  13 a particular node of the HTML tree. In many cases, the template script
  14 happens to be the source code given to rustdoc.
  15
  16 While it indeed is possible to test in smaller portions, it has been
  17 hard to construct tests in this fashion and major rendering errors were
  18 discovered much later. This script is designed to make black-box and
  19 regression testing of Rustdoc easy. This does not preclude the need for
  20 unit testing, but can be used to complement related tests by quickly
  21 showing the expected renderings.
  22
  23 In order to avoid one-off dependencies for this task, this script uses
  24 a reasonably working HTML parser and the existing XPath implementation
  25 from Python's standard library. Hopefully, we won't render
  26 non-well-formed HTML.
  27
  28 # Commands
  29
  30 Commands start with an `@` followed by a command name (letters and
  31 hyphens), and zero or more arguments separated by one or more whitespace
  32 characters and optionally delimited with single or double quotes. The `@`
  33 mark cannot be preceded by a non-whitespace character. Other lines
  34 (including every text up to the first `@`) are ignored, but it is
  35 recommended to avoid the use of `@` in the template file.
  36
  37 There are a number of supported commands:
  38
  39 * `@has PATH` checks for the existence of the given file.
  40
  41   `PATH` is relative to the output directory. It can be given as `-`
  42   which repeats the most recently used `PATH`.
  43
  44 * `@has PATH PATTERN` and `@matches PATH PATTERN` checks for
  45   the occurrence of the given pattern `PATTERN` in the specified file.
  46   Only one occurrence of the pattern is enough.
  47
  48   For `@has`, `PATTERN` is a whitespace-normalized (every consecutive
  49   whitespace being replaced by one single space character) string.
  50   The entire file is also whitespace-normalized including newlines.
  51
  52   For `@matches`, `PATTERN` is a Python-supported regular expression.
  53   The file remains intact but the regexp is matched without the `MULTILINE`
  54   and `IGNORECASE` options. You can still use a prefix `(?m)` or `(?i)`
  55   to override them, and `\A` and `\Z` for definitely matching
  56   the beginning and end of the file.
  57
  58   (The same distinction goes to other variants of these commands.)
  59
  60 * `@has PATH XPATH PATTERN` and `@matches PATH XPATH PATTERN` checks for
  61   the presence of the given XPath `XPATH` in the specified HTML file,
  62   and also the occurrence of the given pattern `PATTERN` in the matching
  63   node or attribute. Only one occurrence of the pattern in the match
  64   is enough.
  65
  66   `PATH` should be a valid and well-formed HTML file. It does *not*
  67   accept arbitrary HTML5; it should have matching open and close tags
  68   and correct entity references at least.
  69
  70   `XPATH` is an XPath expression to match. The XPath is fairly limited:
  71   `tag`, `*`, `.`, `//`, `..`, `[@attr]`, `[@attr='value']`, `[tag]`,
  72   `[POS]` (element located in given `POS`), `[last()-POS]`, `text()`
  73   and `@attr` (both as the last segment) are supported. Some examples:
  74
  75   - `//pre` or `.//pre` matches any element with a name `pre`.
  76   - `//a[@href]` matches any element with an `href` attribute.
  77   - `//*[@class="impl"]//code` matches any element with a name `code`,
  78     which is a descendant of some element whose `class` attr is `impl`.
  79   - `//h1[@class="fqn"]/span[1]/a[last()]/@class` matches a value of
  80     `class` attribute in the last `a` element (can be followed by more
  81     elements that are not `a`) inside the first `span` in the `h1` with
  82     a class of `fqn`. Note that there cannot be any additional elements
  83     between them due to the use of `/` instead of `//`.
  84
  85   Do not try to use non-absolute paths, it won't work due to the flawed
  86   ElementTree implementation. The script rejects them.
  87
  88   For the text matches (i.e. paths not ending with `@attr`), any
  89   subelements are flattened into one string; this is handy for ignoring
  90   highlights for example. If you want to simply check for the presence of
  91   a given node or attribute, use an empty string (`""`) as a `PATTERN`.
  92
  93 * `@count PATH XPATH COUNT` checks for the occurrence of the given XPath
  94   in the specified file. The number of occurrences must match the given
  95   count.
  96
  97 * `@has-dir PATH` checks for the existence of the given directory.
  98
  99 All conditions can be negated with `!`. `@!has foo/type.NoSuch.html`
 100 checks if the given file does not exist, for example.
 101
 102 """
 103
 104 from __future__ import absolute_import, print_function, unicode_literals
 105
 106 import codecs
 107 import io
 108 import sys
 109 import os.path
 110 import re
 111 import shlex
 112 from collections import namedtuple
 113 try:
 114     from html.parser import HTMLParser
 115 except ImportError:
 116     from HTMLParser import HTMLParser
 117 from xml.etree import cElementTree as ET
 118
 119 try:
 120     from html.entities import name2codepoint
 121 except ImportError:
 122     from htmlentitydefs import name2codepoint
 123
 124 # "void elements" (no closing tag) from the HTML Standard section 12.1.2
 125 VOID_ELEMENTS = set(['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
 126                      'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr'])
 127
 128 # Python 2 -> 3 compatibility
 129 try:
 130     unichr
 131 except NameError:
 132     unichr = chr
 133
 134
 135 class CustomHTMLParser(HTMLParser):
 136     """simplified HTML parser.
 137
 138     this is possible because we are dealing with very regular HTML from
 139     rustdoc; we only have to deal with i) void elements and ii) empty
 140     attributes."""
 141     def __init__(self, target=None):
 142         HTMLParser.__init__(self)
 143         self.__builder = target or ET.TreeBuilder()
 144
 145     def handle_starttag(self, tag, attrs):
 146         attrs = dict((k, v or '') for k, v in attrs)
 147         self.__builder.start(tag, attrs)
 148         if tag in VOID_ELEMENTS:
 149             self.__builder.end(tag)
 150
 151     def handle_endtag(self, tag):
 152         self.__builder.end(tag)
 153
 154     def handle_startendtag(self, tag, attrs):
 155         attrs = dict((k, v or '') for k, v in attrs)
 156         self.__builder.start(tag, attrs)
 157         self.__builder.end(tag)
 158
 159     def handle_data(self, data):
 160         self.__builder.data(data)
 161
 162     def handle_entityref(self, name):
 163         self.__builder.data(unichr(name2codepoint[name]))
 164
 165     def handle_charref(self, name):
 166         code = int(name[1:], 16) if name.startswith(('x', 'X')) else int(name, 10)
 167         self.__builder.data(unichr(code))
 168
 169     def close(self):
 170         HTMLParser.close(self)
 171         return self.__builder.close()
 172
 173
 174 Command = namedtuple('Command', 'negated cmd args lineno context')
 175
 176
 177 class FailedCheck(Exception):
 178     pass
 179
 180
 181 class InvalidCheck(Exception):
 182     pass
 183
 184
 185 def concat_multi_lines(f):
 186     """returns a generator out of the file object, which
 187     - removes `\\` then `\n` then a shared prefix with the previous line then
 188       optional whitespace;
 189     - keeps a line number (starting from 0) of the first line being
 190       concatenated."""
 191     lastline = None  # set to the last line when the last line has a backslash
 192     firstlineno = None
 193     catenated = ''
 194     for lineno, line in enumerate(f):
 195         line = line.rstrip('\r\n')
 196
 197         # strip the common prefix from the current line if needed
 198         if lastline is not None:
 199             common_prefix = os.path.commonprefix([line, lastline])
 200             line = line[len(common_prefix):].lstrip()
 201
 202         firstlineno = firstlineno or lineno
 203         if line.endswith('\\'):
 204             if lastline is None:
 205                 lastline = line[:-1]
 206             catenated += line[:-1]
 207         else:
 208             yield firstlineno, catenated + line
 209             lastline = None
 210             firstlineno = None
 211             catenated = ''
 212
 213     if lastline is not None:
 214         print_err(lineno, line, 'Trailing backslash at the end of the file')
 215
 216
 217 LINE_PATTERN = re.compile(r'''
 218     (?<=(?<!\S)@)(?P<negated>!?)
 219     (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
 220     (?P<args>.*)$
 221 ''', re.X | re.UNICODE)
 222
 223
 224 def get_commands(template):
 225     with io.open(template, encoding='utf-8') as f:
 226         for lineno, line in concat_multi_lines(f):
 227             m = LINE_PATTERN.search(line)
 228             if not m:
 229                 continue
 230
 231             negated = (m.group('negated') == '!')
 232             cmd = m.group('cmd')
 233             args = m.group('args')
 234             if args and not args[:1].isspace():
 235                 print_err(lineno, line, 'Invalid template syntax')
 236                 continue
 237             try:
 238                 args = shlex.split(args)
 239             except UnicodeEncodeError:
 240                 args = [arg.decode('utf-8') for arg in shlex.split(args.encode('utf-8'))]
 241             yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno+1, context=line)
 242
 243
 244 def _flatten(node, acc):
 245     if node.text:
 246         acc.append(node.text)
 247     for e in node:
 248         _flatten(e, acc)
 249         if e.tail:
 250             acc.append(e.tail)
 251
 252
 253 def flatten(node):
 254     acc = []
 255     _flatten(node, acc)
 256     return ''.join(acc)
 257
 258
 259 def normalize_xpath(path):
 260     if path.startswith('//'):
 261         return '.' + path  # avoid warnings
 262     elif path.startswith('.//'):
 263         return path
 264     else:
 265         raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')
 266
 267
 268 class CachedFiles(object):
 269     def __init__(self, root):
 270         self.root = root
 271         self.files = {}
 272         self.trees = {}
 273         self.last_path = None
 274
 275     def resolve_path(self, path):
 276         if path != '-':
 277             path = os.path.normpath(path)
 278             self.last_path = path
 279             return path
 280         elif self.last_path is None:
 281             raise InvalidCheck('Tried to use the previous path in the first command')
 282         else:
 283             return self.last_path
 284
 285     def get_file(self, path):
 286         path = self.resolve_path(path)
 287         if path in self.files:
 288             return self.files[path]
 289
 290         abspath = os.path.join(self.root, path)
 291         if not(os.path.exists(abspath) and os.path.isfile(abspath)):
 292             raise FailedCheck('File does not exist {!r}'.format(path))
 293
 294         with io.open(abspath, encoding='utf-8') as f:
 295             data = f.read()
 296             self.files[path] = data
 297             return data
 298
 299     def get_tree(self, path):
 300         path = self.resolve_path(path)
 301         if path in self.trees:
 302             return self.trees[path]
 303
 304         abspath = os.path.join(self.root, path)
 305         if not(os.path.exists(abspath) and os.path.isfile(abspath)):
 306             raise FailedCheck('File does not exist {!r}'.format(path))
 307
 308         with io.open(abspath, encoding='utf-8') as f:
 309             try:
 310                 tree = ET.fromstringlist(f.readlines(), CustomHTMLParser())
 311             except Exception as e:
 312                 raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
 313             self.trees[path] = tree
 314             return self.trees[path]
 315
 316     def get_dir(self, path):
 317         path = self.resolve_path(path)
 318         abspath = os.path.join(self.root, path)
 319         if not(os.path.exists(abspath) and os.path.isdir(abspath)):
 320             raise FailedCheck('Directory does not exist {!r}'.format(path))
 321
 322
 323 def check_string(data, pat, regexp):
 324     if not pat:
 325         return True  # special case a presence testing
 326     elif regexp:
 327         return re.search(pat, data, flags=re.UNICODE) is not None
 328     else:
 329         data = ' '.join(data.split())
 330         pat = ' '.join(pat.split())
 331         return pat in data
 332
 333
 334 def check_tree_attr(tree, path, attr, pat, regexp):
 335     path = normalize_xpath(path)
 336     ret = False
 337     for e in tree.findall(path):
 338         if attr in e.attrib:
 339             value = e.attrib[attr]
 340         else:
 341             continue
 342
 343         ret = check_string(value, pat, regexp)
 344         if ret:
 345             break
 346     return ret
 347
 348
 349 def check_tree_text(tree, path, pat, regexp):
 350     path = normalize_xpath(path)
 351     ret = False
 352     try:
 353         for e in tree.findall(path):
 354             try:
 355                 value = flatten(e)
 356             except KeyError:
 357                 continue
 358             else:
 359                 ret = check_string(value, pat, regexp)
 360                 if ret:
 361                     break
 362     except Exception:
 363         print('Failed to get path "{}"'.format(path))
 364         raise
 365     return ret
 366
 367
 368 def get_tree_count(tree, path):
 369     path = normalize_xpath(path)
 370     return len(tree.findall(path))
 371
 372
 373 def stderr(*args):
 374     if sys.version_info.major < 3:
 375         file = codecs.getwriter('utf-8')(sys.stderr)
 376     else:
 377         file = sys.stderr
 378
 379     print(*args, file=file)
 380
 381
 382 def print_err(lineno, context, err, message=None):
 383     global ERR_COUNT
 384     ERR_COUNT += 1
 385     stderr("{}: {}".format(lineno, message or err))
 386     if message and err:
 387         stderr("\t{}".format(err))
 388
 389     if context:
 390         stderr("\t{}".format(context))
 391
 392
 393 ERR_COUNT = 0
 394
 395
 396 def check_command(c, cache):
 397     try:
 398         cerr = ""
 399         if c.cmd == 'has' or c.cmd == 'matches':  # string test
 400             regexp = (c.cmd == 'matches')
 401             if len(c.args) == 1 and not regexp:  # @has <path> = file existence
 402                 try:
 403                     cache.get_file(c.args[0])
 404                     ret = True
 405                 except FailedCheck as err:
 406                     cerr = str(err)
 407                     ret = False
 408             elif len(c.args) == 2:  # @has/matches <path> <pat> = string test
 409                 cerr = "`PATTERN` did not match"
 410                 ret = check_string(cache.get_file(c.args[0]), c.args[1], regexp)
 411             elif len(c.args) == 3:  # @has/matches <path> <pat> <match> = XML tree test
 412                 cerr = "`XPATH PATTERN` did not match"
 413                 tree = cache.get_tree(c.args[0])
 414                 pat, sep, attr = c.args[1].partition('/@')
 415                 if sep:  # attribute
 416                     tree = cache.get_tree(c.args[0])
 417                     ret = check_tree_attr(tree, pat, attr, c.args[2], regexp)
 418                 else:  # normalized text
 419                     pat = c.args[1]
 420                     if pat.endswith('/text()'):
 421                         pat = pat[:-7]
 422                     ret = check_tree_text(cache.get_tree(c.args[0]), pat, c.args[2], regexp)
 423             else:
 424                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 425
 426         elif c.cmd == 'count':  # count test
 427             if len(c.args) == 3:  # @count <path> <pat> <count> = count test
 428                 expected = int(c.args[2])
 429                 found = get_tree_count(cache.get_tree(c.args[0]), c.args[1])
 430                 cerr = "Expected {} occurrences but found {}".format(expected, found)
 431                 ret = expected == found
 432             else:
 433                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 434         elif c.cmd == 'has-dir':  # has-dir test
 435             if len(c.args) == 1:  # @has-dir <path> = has-dir test
 436                 try:
 437                     cache.get_dir(c.args[0])
 438                     ret = True
 439                 except FailedCheck as err:
 440                     cerr = str(err)
 441                     ret = False
 442             else:
 443                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 444         elif c.cmd == 'valid-html':
 445             raise InvalidCheck('Unimplemented @valid-html')
 446
 447         elif c.cmd == 'valid-links':
 448             raise InvalidCheck('Unimplemented @valid-links')
 449         else:
 450             raise InvalidCheck('Unrecognized @{}'.format(c.cmd))
 451
 452         if ret == c.negated:
 453             raise FailedCheck(cerr)
 454
 455     except FailedCheck as err:
 456         message = '@{}{} check failed'.format('!' if c.negated else '', c.cmd)
 457         print_err(c.lineno, c.context, str(err), message)
 458     except InvalidCheck as err:
 459         print_err(c.lineno, c.context, str(err))
 460
 461
 462 def check(target, commands):
 463     cache = CachedFiles(target)
 464     for c in commands:
 465         check_command(c, cache)
 466
 467
 468 if __name__ == '__main__':
 469     if len(sys.argv) != 3:
 470         stderr('Usage: {} <doc dir> <template>'.format(sys.argv[0]))
 471         raise SystemExit(1)
 472
 473     check(sys.argv[1], get_commands(sys.argv[2]))
 474     if ERR_COUNT:
 475         stderr("\nEncountered {} errors".format(ERR_COUNT))
 476         raise SystemExit(1)