src/etc/htmldocck.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 r"""
   5 htmldocck.py is a custom checker script for Rustdoc HTML outputs.
   6
   7 # How and why?
   8
   9 The principle is simple: This script receives a path to generated HTML
  10 documentation and a "template" script, which has a series of check
  11 commands like `@has` or `@matches`. Each command is used to check if
  12 some pattern is present or not present in the particular file or in
  13 a particular node of the HTML tree. In many cases, the template script
  14 happens to be the source code given to rustdoc.
  15
  16 While it indeed is possible to test in smaller portions, it has been
  17 hard to construct tests in this fashion and major rendering errors were
  18 discovered much later. This script is designed to make black-box and
  19 regression testing of Rustdoc easy. This does not preclude the needs for
  20 unit testing, but can be used to complement related tests by quickly
  21 showing the expected renderings.
  22
  23 In order to avoid one-off dependencies for this task, this script uses
  24 a reasonably working HTML parser and the existing XPath implementation
  25 from Python's standard library. Hopefully, we won't render
  26 non-well-formed HTML.
  27
  28 # Commands
  29
  30 Commands start with an `@` followed by a command name (letters and
  31 hyphens), and zero or more arguments separated by one or more whitespace
  32 characters and optionally delimited with single or double quotes. The `@`
  33 mark cannot be preceded by a non-whitespace character. Other lines
  34 (including every text up to the first `@`) are ignored, but it is
  35 recommended to avoid the use of `@` in the template file.
  36
  37 There are a number of supported commands:
  38
  39 * `@has PATH` checks for the existence of the given file.
  40
  41   `PATH` is relative to the output directory. It can be given as `-`
  42   which repeats the most recently used `PATH`.
  43
  44 * `@has PATH PATTERN` and `@matches PATH PATTERN` checks for
  45   the occurrence of the given pattern `PATTERN` in the specified file.
  46   Only one occurrence of the pattern is enough.
  47
  48   For `@has`, `PATTERN` is a whitespace-normalized (every consecutive
  49   whitespace being replaced by one single space character) string.
  50   The entire file is also whitespace-normalized including newlines.
  51
  52   For `@matches`, `PATTERN` is a Python-supported regular expression.
  53   The file remains intact but the regexp is matched without the `MULTILINE`
  54   and `IGNORECASE` options. You can still use a prefix `(?m)` or `(?i)`
  55   to override them, and `\A` and `\Z` for definitely matching
  56   the beginning and end of the file.
  57
  58   (The same distinction goes to other variants of these commands.)
  59
  60 * `@has PATH XPATH PATTERN` and `@matches PATH XPATH PATTERN` checks for
  61   the presence of the given XPath `XPATH` in the specified HTML file,
  62   and also the occurrence of the given pattern `PATTERN` in the matching
  63   node or attribute. Only one occurrence of the pattern in the match
  64   is enough.
  65
  66   `PATH` should be a valid and well-formed HTML file. It does *not*
  67   accept arbitrary HTML5; it should have matching open and close tags
  68   and correct entity references at least.
  69
  70   `XPATH` is an XPath expression to match. The XPath is fairly limited:
  71   `tag`, `*`, `.`, `//`, `..`, `[@attr]`, `[@attr='value']`, `[tag]`,
  72   `[POS]` (element located in given `POS`), `[last()-POS]`, `text()`
  73   and `@attr` (both as the last segment) are supported. Some examples:
  74
  75   - `//pre` or `.//pre` matches any element with a name `pre`.
  76   - `//a[@href]` matches any element with an `href` attribute.
  77   - `//*[@class="impl"]//code` matches any element with a name `code`,
  78     which is a descendant of some element whose `class` attr is `impl`.
  79   - `//h1[@class="fqn"]/span[1]/a[last()]/@class` matches a value of
  80     `class` attribute in the last `a` element (can be followed by more
  81     elements that are not `a`) inside the first `span` in the `h1` with
  82     a class of `fqn`. Note that there cannot be any additional elements
  83     between them due to the use of `/` instead of `//`.
  84
  85   Do not try to use non-absolute paths; they won't work due to the flawed
  86   ElementTree implementation. The script rejects them.
  87
  88   For the text matches (i.e. paths not ending with `@attr`), any
  89   subelements are flattened into one string; this is handy for ignoring
  90   highlights for example. If you want to simply check for the presence of
  91   a given node or attribute, use an empty string (`""`) as a `PATTERN`.
  92
  93 * `@count PATH XPATH COUNT` checks for the occurrence of the given XPath
  94   in the specified file. The number of occurrences must match the given
  95   count.
  96
  97 * `@has-dir PATH` checks for the existence of the given directory.
  98
  99 All conditions can be negated with `!`. `@!has foo/type.NoSuch.html`
 100 checks if the given file does not exist, for example.
 101
 102 """
 103
 104 from __future__ import absolute_import, print_function, unicode_literals
 105
 106 import codecs
 107 import io
 108 import sys
 109 import os.path
 110 import re
 111 import shlex
 112 from collections import namedtuple
 113 try:
 114     from html.parser import HTMLParser
 115 except ImportError:
 116     from HTMLParser import HTMLParser
 117 try:
 118     from xml.etree import cElementTree as ET
 119 except ImportError:
 120     from xml.etree import ElementTree as ET
 121
 122 try:
 123     from html.entities import name2codepoint
 124 except ImportError:
 125     from htmlentitydefs import name2codepoint
 126
 127 # "void elements" (no closing tag) from the HTML Standard section 12.1.2
 128 VOID_ELEMENTS = {'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
 129                      'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr'}
 130
 131 # Python 2 -> 3 compatibility
 132 try:
 133     unichr
 134 except NameError:
 135     unichr = chr
 136
 137
 138 class CustomHTMLParser(HTMLParser):
 139     """simplified HTML parser.
 140
 141     this is possible because we are dealing with very regular HTML from
 142     rustdoc; we only have to deal with i) void elements and ii) empty
 143     attributes."""
 144     def __init__(self, target=None):
 145         HTMLParser.__init__(self)
 146         self.__builder = target or ET.TreeBuilder()
 147
 148     def handle_starttag(self, tag, attrs):
 149         attrs = {k: v or '' for k, v in attrs}
 150         self.__builder.start(tag, attrs)
 151         if tag in VOID_ELEMENTS:
 152             self.__builder.end(tag)
 153
 154     def handle_endtag(self, tag):
 155         self.__builder.end(tag)
 156
 157     def handle_startendtag(self, tag, attrs):
 158         attrs = {k: v or '' for k, v in attrs}
 159         self.__builder.start(tag, attrs)
 160         self.__builder.end(tag)
 161
 162     def handle_data(self, data):
 163         self.__builder.data(data)
 164
 165     def handle_entityref(self, name):
 166         self.__builder.data(unichr(name2codepoint[name]))
 167
 168     def handle_charref(self, name):
 169         code = int(name[1:], 16) if name.startswith(('x', 'X')) else int(name, 10)
 170         self.__builder.data(unichr(code))
 171
 172     def close(self):
 173         HTMLParser.close(self)
 174         return self.__builder.close()
 175
 176
 177 Command = namedtuple('Command', 'negated cmd args lineno context')
 178
 179
 180 class FailedCheck(Exception):
 181     pass
 182
 183
 184 class InvalidCheck(Exception):
 185     pass
 186
 187
 188 def concat_multi_lines(f):
 189     """returns a generator out of the file object, which
 190     - removes `\\` then `\n` then a shared prefix with the previous line then
 191       optional whitespace;
 192     - keeps a line number (starting from 0) of the first line being
 193       concatenated."""
 194     lastline = None  # set to the last line when the last line has a backslash
 195     firstlineno = None
 196     catenated = ''
 197     for lineno, line in enumerate(f):
 198         line = line.rstrip('\r\n')
 199
 200         # strip the common prefix from the current line if needed
 201         if lastline is not None:
 202             common_prefix = os.path.commonprefix([line, lastline])
 203             line = line[len(common_prefix):].lstrip()
 204
 205         firstlineno = firstlineno or lineno
 206         if line.endswith('\\'):
 207             if lastline is None:
 208                 lastline = line[:-1]
 209             catenated += line[:-1]
 210         else:
 211             yield firstlineno, catenated + line
 212             lastline = None
 213             firstlineno = None
 214             catenated = ''
 215
 216     if lastline is not None:
 217         print_err(lineno, line, 'Trailing backslash at the end of the file')
 218
 219
 220 LINE_PATTERN = re.compile(r'''
 221     (?<=(?<!\S)@)(?P<negated>!?)
 222     (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
 223     (?P<args>.*)$
 224 ''', re.X | re.UNICODE)
 225
 226
 227 def get_commands(template):
 228     with io.open(template, encoding='utf-8') as f:
 229         for lineno, line in concat_multi_lines(f):
 230             m = LINE_PATTERN.search(line)
 231             if not m:
 232                 continue
 233
 234             negated = (m.group('negated') == '!')
 235             cmd = m.group('cmd')
 236             args = m.group('args')
 237             if args and not args[:1].isspace():
 238                 print_err(lineno, line, 'Invalid template syntax')
 239                 continue
 240             try:
 241                 args = shlex.split(args)
 242             except UnicodeEncodeError:
 243                 args = [arg.decode('utf-8') for arg in shlex.split(args.encode('utf-8'))]
 244             yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno+1, context=line)
 245
 246
 247 def _flatten(node, acc):
 248     if node.text:
 249         acc.append(node.text)
 250     for e in node:
 251         _flatten(e, acc)
 252         if e.tail:
 253             acc.append(e.tail)
 254
 255
 256 def flatten(node):
 257     acc = []
 258     _flatten(node, acc)
 259     return ''.join(acc)
 260
 261
 262 def normalize_xpath(path):
 263     if path.startswith('//'):
 264         return '.' + path  # avoid warnings
 265     elif path.startswith('.//'):
 266         return path
 267     else:
 268         raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')
 269
 270
 271 class CachedFiles(object):
 272     def __init__(self, root):
 273         self.root = root
 274         self.files = {}
 275         self.trees = {}
 276         self.last_path = None
 277
 278     def resolve_path(self, path):
 279         if path != '-':
 280             path = os.path.normpath(path)
 281             self.last_path = path
 282             return path
 283         elif self.last_path is None:
 284             raise InvalidCheck('Tried to use the previous path in the first command')
 285         else:
 286             return self.last_path
 287
 288     def get_file(self, path):
 289         path = self.resolve_path(path)
 290         if path in self.files:
 291             return self.files[path]
 292
 293         abspath = os.path.join(self.root, path)
 294         if not(os.path.exists(abspath) and os.path.isfile(abspath)):
 295             raise FailedCheck('File does not exist {!r}'.format(path))
 296
 297         with io.open(abspath, encoding='utf-8') as f:
 298             data = f.read()
 299             self.files[path] = data
 300             return data
 301
 302     def get_tree(self, path):
 303         path = self.resolve_path(path)
 304         if path in self.trees:
 305             return self.trees[path]
 306
 307         abspath = os.path.join(self.root, path)
 308         if not(os.path.exists(abspath) and os.path.isfile(abspath)):
 309             raise FailedCheck('File does not exist {!r}'.format(path))
 310
 311         with io.open(abspath, encoding='utf-8') as f:
 312             try:
 313                 tree = ET.fromstringlist(f.readlines(), CustomHTMLParser())
 314             except Exception as e:
 315                 raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
 316             self.trees[path] = tree
 317             return self.trees[path]
 318
 319     def get_dir(self, path):
 320         path = self.resolve_path(path)
 321         abspath = os.path.join(self.root, path)
 322         if not(os.path.exists(abspath) and os.path.isdir(abspath)):
 323             raise FailedCheck('Directory does not exist {!r}'.format(path))
 324
 325
 326 def check_string(data, pat, regexp):
 327     if not pat:
 328         return True  # special case a presence testing
 329     elif regexp:
 330         return re.search(pat, data, flags=re.UNICODE) is not None
 331     else:
 332         data = ' '.join(data.split())
 333         pat = ' '.join(pat.split())
 334         return pat in data
 335
 336
 337 def check_tree_attr(tree, path, attr, pat, regexp):
 338     path = normalize_xpath(path)
 339     ret = False
 340     for e in tree.findall(path):
 341         if attr in e.attrib:
 342             value = e.attrib[attr]
 343         else:
 344             continue
 345
 346         ret = check_string(value, pat, regexp)
 347         if ret:
 348             break
 349     return ret
 350
 351
 352 def check_tree_text(tree, path, pat, regexp):
 353     path = normalize_xpath(path)
 354     ret = False
 355     try:
 356         for e in tree.findall(path):
 357             try:
 358                 value = flatten(e)
 359             except KeyError:
 360                 continue
 361             else:
 362                 ret = check_string(value, pat, regexp)
 363                 if ret:
 364                     break
 365     except Exception:
 366         print('Failed to get path "{}"'.format(path))
 367         raise
 368     return ret
 369
 370
 371 def get_tree_count(tree, path):
 372     path = normalize_xpath(path)
 373     return len(tree.findall(path))
 374
 375
 376 def stderr(*args):
 377     if sys.version_info.major < 3:
 378         file = codecs.getwriter('utf-8')(sys.stderr)
 379     else:
 380         file = sys.stderr
 381
 382     print(*args, file=file)
 383
 384
 385 def print_err(lineno, context, err, message=None):
 386     global ERR_COUNT
 387     ERR_COUNT += 1
 388     stderr("{}: {}".format(lineno, message or err))
 389     if message and err:
 390         stderr("\t{}".format(err))
 391
 392     if context:
 393         stderr("\t{}".format(context))
 394
 395
 396 ERR_COUNT = 0
 397
 398
 399 def check_command(c, cache):
 400     try:
 401         cerr = ""
 402         if c.cmd == 'has' or c.cmd == 'matches':  # string test
 403             regexp = (c.cmd == 'matches')
 404             if len(c.args) == 1 and not regexp:  # @has <path> = file existence
 405                 try:
 406                     cache.get_file(c.args[0])
 407                     ret = True
 408                 except FailedCheck as err:
 409                     cerr = str(err)
 410                     ret = False
 411             elif len(c.args) == 2:  # @has/matches <path> <pat> = string test
 412                 cerr = "`PATTERN` did not match"
 413                 ret = check_string(cache.get_file(c.args[0]), c.args[1], regexp)
 414             elif len(c.args) == 3:  # @has/matches <path> <pat> <match> = XML tree test
 415                 cerr = "`XPATH PATTERN` did not match"
 416                 tree = cache.get_tree(c.args[0])
 417                 pat, sep, attr = c.args[1].partition('/@')
 418                 if sep:  # attribute
 419                     tree = cache.get_tree(c.args[0])
 420                     ret = check_tree_attr(tree, pat, attr, c.args[2], regexp)
 421                 else:  # normalized text
 422                     pat = c.args[1]
 423                     if pat.endswith('/text()'):
 424                         pat = pat[:-7]
 425                     ret = check_tree_text(cache.get_tree(c.args[0]), pat, c.args[2], regexp)
 426             else:
 427                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 428
 429         elif c.cmd == 'count':  # count test
 430             if len(c.args) == 3:  # @count <path> <pat> <count> = count test
 431                 expected = int(c.args[2])
 432                 found = get_tree_count(cache.get_tree(c.args[0]), c.args[1])
 433                 cerr = "Expected {} occurrences but found {}".format(expected, found)
 434                 ret = expected == found
 435             else:
 436                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 437         elif c.cmd == 'has-dir':  # has-dir test
 438             if len(c.args) == 1:  # @has-dir <path> = has-dir test
 439                 try:
 440                     cache.get_dir(c.args[0])
 441                     ret = True
 442                 except FailedCheck as err:
 443                     cerr = str(err)
 444                     ret = False
 445             else:
 446                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 447         elif c.cmd == 'valid-html':
 448             raise InvalidCheck('Unimplemented @valid-html')
 449
 450         elif c.cmd == 'valid-links':
 451             raise InvalidCheck('Unimplemented @valid-links')
 452         else:
 453             raise InvalidCheck('Unrecognized @{}'.format(c.cmd))
 454
 455         if ret == c.negated:
 456             raise FailedCheck(cerr)
 457
 458     except FailedCheck as err:
 459         message = '@{}{} check failed'.format('!' if c.negated else '', c.cmd)
 460         print_err(c.lineno, c.context, str(err), message)
 461     except InvalidCheck as err:
 462         print_err(c.lineno, c.context, str(err))
 463
 464
 465 def check(target, commands):
 466     cache = CachedFiles(target)
 467     for c in commands:
 468         check_command(c, cache)
 469
 470
 471 if __name__ == '__main__':
 472     if len(sys.argv) != 3:
 473         stderr('Usage: {} <doc dir> <template>'.format(sys.argv[0]))
 474         raise SystemExit(1)
 475
 476     check(sys.argv[1], get_commands(sys.argv[2]))
 477     if ERR_COUNT:
 478         stderr("\nEncountered {} errors".format(ERR_COUNT))
 479         raise SystemExit(1)