src/etc/htmldocck.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 r"""
   5 htmldocck.py is a custom checker script for Rustdoc HTML outputs.
   6
   7 # How and why?
   8
   9 The principle is simple: This script receives a path to generated HTML
  10 documentation and a "template" script, which has a series of check
  11 commands like `@has` or `@matches`. Each command is used to check if
  12 some pattern is present or not present in the particular file or in
  13 a particular node of the HTML tree. In many cases, the template script
  14 happens to be the source code given to rustdoc.
  15
  16 While it indeed is possible to test in smaller portions, it has been
  17 hard to construct tests in this fashion and major rendering errors were
  18 discovered much later. This script is designed to make black-box and
  19 regression testing of Rustdoc easy. This does not preclude the needs for
  20 unit testing, but can be used to complement related tests by quickly
  21 showing the expected renderings.
  22
  23 In order to avoid one-off dependencies for this task, this script uses
  24 a reasonably working HTML parser and the existing XPath implementation
  25 from Python's standard library. Hopefully, we won't render
  26 non-well-formed HTML.
  27
  28 # Commands
  29
  30 Commands start with an `@` followed by a command name (letters and
  31 hyphens), and zero or more arguments separated by one or more whitespace
  32 characters and optionally delimited with single or double quotes. The `@`
  33 mark cannot be preceded by a non-whitespace character. Other lines
  34 (including every text up to the first `@`) are ignored, but it is
  35 recommended to avoid the use of `@` in the template file.
  36
  37 There are a number of supported commands:
  38
  39 * `@has PATH` checks for the existence of the given file.
  40
  41   `PATH` is relative to the output directory. It can be given as `-`
  42   which repeats the most recently used `PATH`.
  43
  44 * `@has PATH PATTERN` and `@matches PATH PATTERN` checks for
  45   the occurrence of the given pattern `PATTERN` in the specified file.
  46   Only one occurrence of the pattern is enough.
  47
  48   For `@has`, `PATTERN` is a whitespace-normalized (every consecutive
  49   whitespace being replaced by one single space character) string.
  50   The entire file is also whitespace-normalized including newlines.
  51
  52   For `@matches`, `PATTERN` is a Python-supported regular expression.
  53   The file remains intact but the regexp is matched without the `MULTILINE`
  54   and `IGNORECASE` options. You can still use a prefix `(?m)` or `(?i)`
  55   to override them, and `\A` and `\Z` for definitely matching
  56   the beginning and end of the file.
  57
  58   (The same distinction goes to other variants of these commands.)
  59
  60 * `@has PATH XPATH PATTERN` and `@matches PATH XPATH PATTERN` checks for
  61   the presence of the given XPath `XPATH` in the specified HTML file,
  62   and also the occurrence of the given pattern `PATTERN` in the matching
  63   node or attribute. Only one occurrence of the pattern in the match
  64   is enough.
  65
  66   `PATH` should be a valid and well-formed HTML file. It does *not*
  67   accept arbitrary HTML5; it should have matching open and close tags
  68   and correct entity references at least.
  69
  70   `XPATH` is an XPath expression to match. The XPath is fairly limited:
  71   `tag`, `*`, `.`, `//`, `..`, `[@attr]`, `[@attr='value']`, `[tag]`,
  72   `[POS]` (element located in given `POS`), `[last()-POS]`, `text()`
  73   and `@attr` (both as the last segment) are supported. Some examples:
  74
  75   - `//pre` or `.//pre` matches any element with a name `pre`.
  76   - `//a[@href]` matches any element with an `href` attribute.
  77   - `//*[@class="impl"]//code` matches any element with a name `code`,
  78     which is a descendant of some element whose `class` attr is `impl`.
  79   - `//h1[@class="fqn"]/span[1]/a[last()]/@class` matches a value of
  80     `class` attribute in the last `a` element (can be followed by more
  81     elements that are not `a`) inside the first `span` in the `h1` with
  82     a class of `fqn`. Note that there cannot be any additional elements
  83     between them due to the use of `/` instead of `//`.
  84
  85   Do not try to use non-absolute paths, it won't work due to the flawed
  86   ElementTree implementation. The script rejects them.
  87
  88   For the text matches (i.e. paths not ending with `@attr`), any
  89   subelements are flattened into one string; this is handy for ignoring
  90   highlights for example. If you want to simply check for the presence of
  91   a given node or attribute, use an empty string (`""`) as a `PATTERN`.
  92
  93 * `@count PATH XPATH COUNT` checks for the occurrence of the given XPath
  94   in the specified file. The number of occurrences must match the given
  95   count.
  96
  97 * `@has-dir PATH` checks for the existence of the given directory.
  98
  99 All conditions can be negated with `!`. `@!has foo/type.NoSuch.html`
 100 checks if the given file does not exist, for example.
 101
 102 """
 103
 104 from __future__ import absolute_import, print_function, unicode_literals
 105
 106 import codecs
 107 import io
 108 import sys
 109 import os.path
 110 import re
 111 import shlex
 112 from collections import namedtuple
 113 try:
 114     from html.parser import HTMLParser
 115 except ImportError:
 116     from HTMLParser import HTMLParser
 117 from xml.etree import cElementTree as ET
 118
 119 try:
 120     from html.entities import name2codepoint
 121 except ImportError:
 122     from htmlentitydefs import name2codepoint
 123
 124 # "void elements" (no closing tag) from the HTML Standard section 12.1.2
 125 VOID_ELEMENTS = set(['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
 126                      'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr'])
 127
 128 # Python 2 -> 3 compatibility
 129 try:
 130     unichr
 131 except NameError:
 132     unichr = chr
 133
 134 class CustomHTMLParser(HTMLParser):
 135     """simplified HTML parser.
 136
 137     this is possible because we are dealing with very regular HTML from
 138     rustdoc; we only have to deal with i) void elements and ii) empty
 139     attributes."""
 140     def __init__(self, target=None):
 141         HTMLParser.__init__(self)
 142         self.__builder = target or ET.TreeBuilder()
 143
 144     def handle_starttag(self, tag, attrs):
 145         attrs = dict((k, v or '') for k, v in attrs)
 146         self.__builder.start(tag, attrs)
 147         if tag in VOID_ELEMENTS:
 148             self.__builder.end(tag)
 149
 150     def handle_endtag(self, tag):
 151         self.__builder.end(tag)
 152
 153     def handle_startendtag(self, tag, attrs):
 154         attrs = dict((k, v or '') for k, v in attrs)
 155         self.__builder.start(tag, attrs)
 156         self.__builder.end(tag)
 157
 158     def handle_data(self, data):
 159         self.__builder.data(data)
 160
 161     def handle_entityref(self, name):
 162         self.__builder.data(unichr(name2codepoint[name]))
 163
 164     def handle_charref(self, name):
 165         code = int(name[1:], 16) if name.startswith(('x', 'X')) else int(name, 10)
 166         self.__builder.data(unichr(code))
 167
 168     def close(self):
 169         HTMLParser.close(self)
 170         return self.__builder.close()
 171
 172 Command = namedtuple('Command', 'negated cmd args lineno context')
 173
 174 class FailedCheck(Exception):
 175     pass
 176
 177 class InvalidCheck(Exception):
 178     pass
 179
 180 def concat_multi_lines(f):
 181     """returns a generator out of the file object, which
 182     - removes `\\` then `\n` then a shared prefix with the previous line then
 183       optional whitespace;
 184     - keeps a line number (starting from 0) of the first line being
 185       concatenated."""
 186     lastline = None # set to the last line when the last line has a backslash
 187     firstlineno = None
 188     catenated = ''
 189     for lineno, line in enumerate(f):
 190         line = line.rstrip('\r\n')
 191
 192         # strip the common prefix from the current line if needed
 193         if lastline is not None:
 194             common_prefix = os.path.commonprefix([line, lastline])
 195             line = line[len(common_prefix):].lstrip()
 196
 197         firstlineno = firstlineno or lineno
 198         if line.endswith('\\'):
 199             if lastline is None:
 200                 lastline = line[:-1]
 201             catenated += line[:-1]
 202         else:
 203             yield firstlineno, catenated + line
 204             lastline = None
 205             firstlineno = None
 206             catenated = ''
 207
 208     if lastline is not None:
 209         print_err(lineno, line, 'Trailing backslash at the end of the file')
 210
 211 LINE_PATTERN = re.compile(r'''
 212     (?<=(?<!\S)@)(?P<negated>!?)
 213     (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
 214     (?P<args>.*)$
 215 ''', re.X | re.UNICODE)
 216
 217
 218 def get_commands(template):
 219     with io.open(template, encoding='utf-8') as f:
 220         for lineno, line in concat_multi_lines(f):
 221             m = LINE_PATTERN.search(line)
 222             if not m:
 223                 continue
 224
 225             negated = (m.group('negated') == '!')
 226             cmd = m.group('cmd')
 227             args = m.group('args')
 228             if args and not args[:1].isspace():
 229                 print_err(lineno, line, 'Invalid template syntax')
 230                 continue
 231             try:
 232                 args = shlex.split(args)
 233             except UnicodeEncodeError:
 234                 args = [arg.decode('utf-8') for arg in shlex.split(args.encode('utf-8'))]
 235             yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno+1, context=line)
 236
 237
 238 def _flatten(node, acc):
 239     if node.text:
 240         acc.append(node.text)
 241     for e in node:
 242         _flatten(e, acc)
 243         if e.tail:
 244             acc.append(e.tail)
 245
 246
 247 def flatten(node):
 248     acc = []
 249     _flatten(node, acc)
 250     return ''.join(acc)
 251
 252
 253 def normalize_xpath(path):
 254     if path.startswith('//'):
 255         return '.' + path # avoid warnings
 256     elif path.startswith('.//'):
 257         return path
 258     else:
 259         raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')
 260
 261
 262 class CachedFiles(object):
 263     def __init__(self, root):
 264         self.root = root
 265         self.files = {}
 266         self.trees = {}
 267         self.last_path = None
 268
 269     def resolve_path(self, path):
 270         if path != '-':
 271             path = os.path.normpath(path)
 272             self.last_path = path
 273             return path
 274         elif self.last_path is None:
 275             raise InvalidCheck('Tried to use the previous path in the first command')
 276         else:
 277             return self.last_path
 278
 279     def get_file(self, path):
 280         path = self.resolve_path(path)
 281         if path in self.files:
 282             return self.files[path]
 283
 284         abspath = os.path.join(self.root, path)
 285         if not(os.path.exists(abspath) and os.path.isfile(abspath)):
 286             raise FailedCheck('File does not exist {!r}'.format(path))
 287
 288         with io.open(abspath, encoding='utf-8') as f:
 289             data = f.read()
 290             self.files[path] = data
 291             return data
 292
 293     def get_tree(self, path):
 294         path = self.resolve_path(path)
 295         if path in self.trees:
 296             return self.trees[path]
 297
 298         abspath = os.path.join(self.root, path)
 299         if not(os.path.exists(abspath) and os.path.isfile(abspath)):
 300             raise FailedCheck('File does not exist {!r}'.format(path))
 301
 302         with io.open(abspath, encoding='utf-8') as f:
 303             try:
 304                 tree = ET.fromstringlist(f.readlines(), CustomHTMLParser())
 305             except Exception as e:
 306                 raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
 307             self.trees[path] = tree
 308             return self.trees[path]
 309
 310     def get_dir(self, path):
 311         path = self.resolve_path(path)
 312         abspath = os.path.join(self.root, path)
 313         if not(os.path.exists(abspath) and os.path.isdir(abspath)):
 314             raise FailedCheck('Directory does not exist {!r}'.format(path))
 315
 316
 317 def check_string(data, pat, regexp):
 318     if not pat:
 319         return True # special case a presence testing
 320     elif regexp:
 321         return re.search(pat, data, flags=re.UNICODE) is not None
 322     else:
 323         data = ' '.join(data.split())
 324         pat = ' '.join(pat.split())
 325         return pat in data
 326
 327
 328 def check_tree_attr(tree, path, attr, pat, regexp):
 329     path = normalize_xpath(path)
 330     ret = False
 331     for e in tree.findall(path):
 332         if attr in e.attrib:
 333             value = e.attrib[attr]
 334         else:
 335             continue
 336
 337         ret = check_string(value, pat, regexp)
 338         if ret:
 339             break
 340     return ret
 341
 342
 343 def check_tree_text(tree, path, pat, regexp):
 344     path = normalize_xpath(path)
 345     ret = False
 346     try:
 347         for e in tree.findall(path):
 348             try:
 349                 value = flatten(e)
 350             except KeyError:
 351                 continue
 352             else:
 353                 ret = check_string(value, pat, regexp)
 354                 if ret:
 355                     break
 356     except Exception as e:
 357         print('Failed to get path "{}"'.format(path))
 358         raise
 359     return ret
 360
 361
 362 def get_tree_count(tree, path):
 363     path = normalize_xpath(path)
 364     return len(tree.findall(path))
 365
 366 def stderr(*args):
 367     if sys.version_info.major < 3:
 368         file = codecs.getwriter('utf-8')(sys.stderr)
 369     else:
 370         file = sys.stderr
 371
 372     print(*args, file=file)
 373
 374 def print_err(lineno, context, err, message=None):
 375     global ERR_COUNT
 376     ERR_COUNT += 1
 377     stderr("{}: {}".format(lineno, message or err))
 378     if message and err:
 379         stderr("\t{}".format(err))
 380
 381     if context:
 382         stderr("\t{}".format(context))
 383
 384 ERR_COUNT = 0
 385
 386 def check_command(c, cache):
 387     try:
 388         cerr = ""
 389         if c.cmd == 'has' or c.cmd == 'matches': # string test
 390             regexp = (c.cmd == 'matches')
 391             if len(c.args) == 1 and not regexp: # @has <path> = file existence
 392                 try:
 393                     cache.get_file(c.args[0])
 394                     ret = True
 395                 except FailedCheck as err:
 396                     cerr = str(err)
 397                     ret = False
 398             elif len(c.args) == 2: # @has/matches <path> <pat> = string test
 399                 cerr = "`PATTERN` did not match"
 400                 ret = check_string(cache.get_file(c.args[0]), c.args[1], regexp)
 401             elif len(c.args) == 3: # @has/matches <path> <pat> <match> = XML tree test
 402                 cerr = "`XPATH PATTERN` did not match"
 403                 tree = cache.get_tree(c.args[0])
 404                 pat, sep, attr = c.args[1].partition('/@')
 405                 if sep: # attribute
 406                     tree = cache.get_tree(c.args[0])
 407                     ret = check_tree_attr(tree, pat, attr, c.args[2], regexp)
 408                 else: # normalized text
 409                     pat = c.args[1]
 410                     if pat.endswith('/text()'):
 411                         pat = pat[:-7]
 412                     ret = check_tree_text(cache.get_tree(c.args[0]), pat, c.args[2], regexp)
 413             else:
 414                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 415
 416         elif c.cmd == 'count': # count test
 417             if len(c.args) == 3: # @count <path> <pat> <count> = count test
 418                 expected = int(c.args[2])
 419                 found = get_tree_count(cache.get_tree(c.args[0]), c.args[1])
 420                 cerr = "Expected {} occurrences but found {}".format(expected, found)
 421                 ret = expected == found
 422             else:
 423                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 424         elif c.cmd == 'has-dir': # has-dir test
 425             if len(c.args) == 1: # @has-dir <path> = has-dir test
 426                 try:
 427                     cache.get_dir(c.args[0])
 428                     ret = True
 429                 except FailedCheck as err:
 430                     cerr = str(err)
 431                     ret = False
 432             else:
 433                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 434         elif c.cmd == 'valid-html':
 435             raise InvalidCheck('Unimplemented @valid-html')
 436
 437         elif c.cmd == 'valid-links':
 438             raise InvalidCheck('Unimplemented @valid-links')
 439         else:
 440             raise InvalidCheck('Unrecognized @{}'.format(c.cmd))
 441
 442         if ret == c.negated:
 443             raise FailedCheck(cerr)
 444
 445     except FailedCheck as err:
 446         message = '@{}{} check failed'.format('!' if c.negated else '', c.cmd)
 447         print_err(c.lineno, c.context, str(err), message)
 448     except InvalidCheck as err:
 449         print_err(c.lineno, c.context, str(err))
 450
 451 def check(target, commands):
 452     cache = CachedFiles(target)
 453     for c in commands:
 454         check_command(c, cache)
 455
 456 if __name__ == '__main__':
 457     if len(sys.argv) != 3:
 458         stderr('Usage: {} <doc dir> <template>'.format(sys.argv[0]))
 459         raise SystemExit(1)
 460
 461     check(sys.argv[1], get_commands(sys.argv[2]))
 462     if ERR_COUNT:
 463         stderr("\nEncountered {} errors".format(ERR_COUNT))
 464         raise SystemExit(1)