src/etc/htmldocck.py

   1 # Copyright 2015 The Rust Project Developers. See the COPYRIGHT
   2 # file at the top-level directory of this distribution and at
   3 # http://rust-lang.org/COPYRIGHT.
   4 #
   5 # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 # option. This file may not be copied, modified, or distributed
   9 # except according to those terms.
  10
  11 r"""
  12 htmldocck.py is a custom checker script for Rustdoc HTML outputs.
  13
  14 # How and why?
  15
  16 The principle is simple: This script receives a path to generated HTML
  17 documentation and a "template" script, which has a series of check
  18 commands like `@has` or `@matches`. Each command is used to check if
  19 some pattern is present or not present in the particular file or in
  20 a particular node of the HTML tree. In many cases, the template script
  21 happens to be the source code given to rustdoc.
  22
  23 While it indeed is possible to test in smaller portions, it has been
  24 hard to construct tests in this fashion and major rendering errors were
  25 discovered much later. This script is designed to make black-box and
  26 regression testing of Rustdoc easy. This does not preclude the need for
  27 unit testing, but can be used to complement related tests by quickly
  28 showing the expected renderings.
  29
  30 In order to avoid one-off dependencies for this task, this script uses
  31 a reasonably working HTML parser and the existing XPath implementation
  32 from Python's standard library. Hopefully, we won't render
  33 non-well-formed HTML.
  34
  35 # Commands
  36
  37 Commands start with an `@` followed by a command name (letters and
  38 hyphens), and zero or more arguments separated by one or more whitespace
  39 characters and optionally delimited with single or double quotes. The `@`
  40 mark cannot be preceded by a non-whitespace character. Other lines
  41 (including every text up to the first `@`) are ignored, but it is
  42 recommended to avoid the use of `@` in the template file.
  43
  44 There are a number of supported commands:
  45
  46 * `@has PATH` checks for the existence of the given file.
  47
  48   `PATH` is relative to the output directory. It can be given as `-`
  49   which repeats the most recently used `PATH`.
  50
  51 * `@has PATH PATTERN` and `@matches PATH PATTERN` checks for
  52   the occurrence of the given pattern `PATTERN` in the specified file.
  53   Only one occurrence of the pattern is enough.
  54
  55   For `@has`, `PATTERN` is a whitespace-normalized (every consecutive
  56   whitespace being replaced by one single space character) string.
  57   The entire file is also whitespace-normalized including newlines.
  58
  59   For `@matches`, `PATTERN` is a Python-supported regular expression.
  60   The file remains intact but the regexp is matched without the `MULTILINE`
  61   and `IGNORECASE` options. You can still use a prefix `(?m)` or `(?i)`
  62   to override them, and `\A` and `\Z` for definitely matching
  63   the beginning and end of the file.
  64
  65   (The same distinction goes to other variants of these commands.)
  66
  67 * `@has PATH XPATH PATTERN` and `@matches PATH XPATH PATTERN` checks for
  68   the presence of the given XPath `XPATH` in the specified HTML file,
  69   and also the occurrence of the given pattern `PATTERN` in the matching
  70   node or attribute. Only one occurrence of the pattern in the match
  71   is enough.
  72
  73   `PATH` should be a valid and well-formed HTML file. It does *not*
  74   accept arbitrary HTML5; it should have matching open and close tags
  75   and correct entity references at least.
  76
  77   `XPATH` is an XPath expression to match. The XPath is fairly limited:
  78   `tag`, `*`, `.`, `//`, `..`, `[@attr]`, `[@attr='value']`, `[tag]`,
  79   `[POS]` (element located in given `POS`), `[last()-POS]`, `text()`
  80   and `@attr` (both as the last segment) are supported. Some examples:
  81
  82   - `//pre` or `.//pre` matches any element with a name `pre`.
  83   - `//a[@href]` matches any element with an `href` attribute.
  84   - `//*[@class="impl"]//code` matches any element with a name `code`,
  85     which is a descendant of some element whose `class` attr is `impl`.
  86   - `//h1[@class="fqn"]/span[1]/a[last()]/@class` matches a value of
  87     `class` attribute in the last `a` element (can be followed by more
  88     elements that are not `a`) inside the first `span` in the `h1` with
  89     a class of `fqn`. Note that there cannot be any additional elements
  90     between them due to the use of `/` instead of `//`.
  91
  92   Do not try to use non-absolute paths, it won't work due to the flawed
  93   ElementTree implementation. The script rejects them.
  94
  95   For the text matches (i.e. paths not ending with `@attr`), any
  96   subelements are flattened into one string; this is handy for ignoring
  97   highlights for example. If you want to simply check for the presence of
  98   a given node or attribute, use an empty string (`""`) as a `PATTERN`.
  99
 100 * `@count PATH XPATH COUNT` checks for the occurrence of the given XPath
 101   in the specified file. The number of occurrences must match the given
 102   count.
 103
 104 * `@has-dir PATH` checks for the existence of the given directory.
 105
 106 All conditions can be negated with `!`. `@!has foo/type.NoSuch.html`
 107 checks if the given file does not exist, for example.
 108
 109 """
 110
 111 from __future__ import print_function
 112 import sys
 113 import os.path
 114 import re
 115 import shlex
 116 from collections import namedtuple
 117 try:
 118     from html.parser import HTMLParser
 119 except ImportError:
 120     from HTMLParser import HTMLParser
 121 from xml.etree import cElementTree as ET
 122
 123 # &larrb;/&rarrb; are not in HTML 4 but are in HTML 5
 124 try:
 125     from html.entities import entitydefs
 126 except ImportError:
 127     from htmlentitydefs import entitydefs
 128 entitydefs['larrb'] = u'\u21e4'
 129 entitydefs['rarrb'] = u'\u21e5'
 130 entitydefs['nbsp'] = ' '
 131
 132 # "void elements" (no closing tag) from the HTML Standard section 12.1.2
 133 VOID_ELEMENTS = set(['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
 134                      'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr'])
 135
 136 # Python 2 -> 3 compatibility
 137 try:
 138     unichr
 139 except NameError:
 140     unichr = chr
 141
 142 class CustomHTMLParser(HTMLParser):
 143     """simplified HTML parser.
 144
 145     this is possible because we are dealing with very regular HTML from
 146     rustdoc; we only have to deal with i) void elements and ii) empty
 147     attributes."""
 148     def __init__(self, target=None):
 149         HTMLParser.__init__(self)
 150         self.__builder = target or ET.TreeBuilder()
 151
 152     def handle_starttag(self, tag, attrs):
 153         attrs = dict((k, v or '') for k, v in attrs)
 154         self.__builder.start(tag, attrs)
 155         if tag in VOID_ELEMENTS:
 156             self.__builder.end(tag)
 157
 158     def handle_endtag(self, tag):
 159         self.__builder.end(tag)
 160
 161     def handle_startendtag(self, tag, attrs):
 162         attrs = dict((k, v or '') for k, v in attrs)
 163         self.__builder.start(tag, attrs)
 164         self.__builder.end(tag)
 165
 166     def handle_data(self, data):
 167         self.__builder.data(data)
 168
 169     def handle_entityref(self, name):
 170         self.__builder.data(entitydefs[name])
 171
 172     def handle_charref(self, name):
 173         code = int(name[1:], 16) if name.startswith(('x', 'X')) else int(name, 10)
 174         self.__builder.data(unichr(code).encode('utf-8'))
 175
 176     def close(self):
 177         HTMLParser.close(self)
 178         return self.__builder.close()
 179
 180 Command = namedtuple('Command', 'negated cmd args lineno context')
 181
 182 class FailedCheck(Exception):
 183     pass
 184
 185 class InvalidCheck(Exception):
 186     pass
 187
 188 def concat_multi_lines(f):
 189     """returns a generator out of the file object, which
 190     - removes `\\` then `\n` then a shared prefix with the previous line then
 191       optional whitespace;
 192     - keeps a line number (starting from 0) of the first line being
 193       concatenated."""
 194     lastline = None # set to the last line when the last line has a backslash
 195     firstlineno = None
 196     catenated = ''
 197     for lineno, line in enumerate(f):
 198         line = line.rstrip('\r\n')
 199
 200         # strip the common prefix from the current line if needed
 201         if lastline is not None:
 202             common_prefix = os.path.commonprefix([line, lastline])
 203             line = line[len(common_prefix):].lstrip()
 204
 205         firstlineno = firstlineno or lineno
 206         if line.endswith('\\'):
 207             if lastline is None:
 208                 lastline = line[:-1]
 209             catenated += line[:-1]
 210         else:
 211             yield firstlineno, catenated + line
 212             lastline = None
 213             firstlineno = None
 214             catenated = ''
 215
 216     if lastline is not None:
 217         print_err(lineno, line, 'Trailing backslash at the end of the file')
 218
# Matches a check command inside a template line (verbose regex):
# - the lookbehinds require an `@` that is not preceded by a non-whitespace
#   character, and start the match right after it;
# - `negated` is an optional `!`;
# - `cmd` is one or more runs of ASCII letters joined by single hyphens;
# - `args` is the rest of the line, split into arguments by the caller.
LINE_PATTERN = re.compile(r'''
    (?<=(?<!\S)@)(?P<negated>!?)
    (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
    (?P<args>.*)$
''', re.X)
 224
 225
 226 def get_commands(template):
 227     with open(template, 'rU') as f:
 228         for lineno, line in concat_multi_lines(f):
 229             m = LINE_PATTERN.search(line)
 230             if not m:
 231                 continue
 232
 233             negated = (m.group('negated') == '!')
 234             cmd = m.group('cmd')
 235             args = m.group('args')
 236             if args and not args[:1].isspace():
 237                 print_err(lineno, line, 'Invalid template syntax')
 238                 continue
 239             args = shlex.split(args)
 240             yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno+1, context=line)
 241
 242
 243 def _flatten(node, acc):
 244     if node.text:
 245         acc.append(node.text)
 246     for e in node:
 247         _flatten(e, acc)
 248         if e.tail:
 249             acc.append(e.tail)
 250
 251
 252 def flatten(node):
 253     acc = []
 254     _flatten(node, acc)
 255     return ''.join(acc)
 256
 257
 258 def normalize_xpath(path):
 259     if path.startswith('//'):
 260         return '.' + path # avoid warnings
 261     elif path.startswith('.//'):
 262         return path
 263     else:
 264         raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')
 265
 266
 267 class CachedFiles(object):
 268     def __init__(self, root):
 269         self.root = root
 270         self.files = {}
 271         self.trees = {}
 272         self.last_path = None
 273
 274     def resolve_path(self, path):
 275         if path != '-':
 276             path = os.path.normpath(path)
 277             self.last_path = path
 278             return path
 279         elif self.last_path is None:
 280             raise InvalidCheck('Tried to use the previous path in the first command')
 281         else:
 282             return self.last_path
 283
 284     def get_file(self, path):
 285         path = self.resolve_path(path)
 286         if path in self.files:
 287             return self.files[path]
 288
 289         abspath = os.path.join(self.root, path)
 290         if not(os.path.exists(abspath) and os.path.isfile(abspath)):
 291             raise FailedCheck('File does not exist {!r}'.format(path))
 292
 293         with open(abspath) as f:
 294             data = f.read()
 295             self.files[path] = data
 296             return data
 297
 298     def get_tree(self, path):
 299         path = self.resolve_path(path)
 300         if path in self.trees:
 301             return self.trees[path]
 302
 303         abspath = os.path.join(self.root, path)
 304         if not(os.path.exists(abspath) and os.path.isfile(abspath)):
 305             raise FailedCheck('File does not exist {!r}'.format(path))
 306
 307         with open(abspath) as f:
 308             try:
 309                 tree = ET.parse(f, CustomHTMLParser())
 310             except Exception as e:
 311                 raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
 312             self.trees[path] = tree
 313             return self.trees[path]
 314
 315     def get_dir(self, path):
 316         path = self.resolve_path(path)
 317         abspath = os.path.join(self.root, path)
 318         if not(os.path.exists(abspath) and os.path.isdir(abspath)):
 319             raise FailedCheck('Directory does not exist {!r}'.format(path))
 320
 321
 322 def check_string(data, pat, regexp):
 323     if not pat:
 324         return True # special case a presence testing
 325     elif regexp:
 326         return re.search(pat, data) is not None
 327     else:
 328         data = ' '.join(data.split())
 329         pat = ' '.join(pat.split())
 330         return pat in data
 331
 332
 333 def check_tree_attr(tree, path, attr, pat, regexp):
 334     path = normalize_xpath(path)
 335     ret = False
 336     for e in tree.findall(path):
 337         if attr in e.attrib:
 338             value = e.attrib[attr]
 339         else:
 340             continue
 341
 342         ret = check_string(value, pat, regexp)
 343         if ret:
 344             break
 345     return ret
 346
 347
 348 def check_tree_text(tree, path, pat, regexp):
 349     path = normalize_xpath(path)
 350     ret = False
 351     try:
 352         for e in tree.findall(path):
 353             try:
 354                 value = flatten(e)
 355             except KeyError:
 356                 continue
 357             else:
 358                 ret = check_string(value, pat, regexp)
 359                 if ret:
 360                     break
 361     except Exception as e:
 362         print('Failed to get path "{}"'.format(path))
 363         raise e
 364     return ret
 365
 366
 367 def get_tree_count(tree, path):
 368     path = normalize_xpath(path)
 369     return len(tree.findall(path))
 370
 371 def stderr(*args):
 372     print(*args, file=sys.stderr)
 373
 374 def print_err(lineno, context, err, message=None):
 375     global ERR_COUNT
 376     ERR_COUNT += 1
 377     stderr("{}: {}".format(lineno, message or err))
 378     if message and err:
 379         stderr("\t{}".format(err))
 380
 381     if context:
 382         stderr("\t{}".format(context))
 383
 384 ERR_COUNT = 0
 385
 386 def check_command(c, cache):
 387     try:
 388         cerr = ""
 389         if c.cmd == 'has' or c.cmd == 'matches': # string test
 390             regexp = (c.cmd == 'matches')
 391             if len(c.args) == 1 and not regexp: # @has <path> = file existence
 392                 try:
 393                     cache.get_file(c.args[0])
 394                     ret = True
 395                 except FailedCheck as err:
 396                     cerr = str(err)
 397                     ret = False
 398             elif len(c.args) == 2: # @has/matches <path> <pat> = string test
 399                 cerr = "`PATTERN` did not match"
 400                 ret = check_string(cache.get_file(c.args[0]), c.args[1], regexp)
 401             elif len(c.args) == 3: # @has/matches <path> <pat> <match> = XML tree test
 402                 cerr = "`XPATH PATTERN` did not match"
 403                 tree = cache.get_tree(c.args[0])
 404                 pat, sep, attr = c.args[1].partition('/@')
 405                 if sep: # attribute
 406                     tree = cache.get_tree(c.args[0])
 407                     ret = check_tree_attr(tree, pat, attr, c.args[2], regexp)
 408                 else: # normalized text
 409                     pat = c.args[1]
 410                     if pat.endswith('/text()'):
 411                         pat = pat[:-7]
 412                     ret = check_tree_text(cache.get_tree(c.args[0]), pat, c.args[2], regexp)
 413             else:
 414                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 415
 416         elif c.cmd == 'count': # count test
 417             if len(c.args) == 3: # @count <path> <pat> <count> = count test
 418                 expected = int(c.args[2])
 419                 found = get_tree_count(cache.get_tree(c.args[0]), c.args[1])
 420                 cerr = "Expected {} occurrences but found {}".format(expected, found)
 421                 ret = expected == found
 422             else:
 423                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 424         elif c.cmd == 'has-dir': # has-dir test
 425             if len(c.args) == 1: # @has-dir <path> = has-dir test
 426                 try:
 427                     cache.get_dir(c.args[0])
 428                     ret = True
 429                 except FailedCheck as err:
 430                     cerr = str(err)
 431                     ret = False
 432             else:
 433                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 434         elif c.cmd == 'valid-html':
 435             raise InvalidCheck('Unimplemented @valid-html')
 436
 437         elif c.cmd == 'valid-links':
 438             raise InvalidCheck('Unimplemented @valid-links')
 439         else:
 440             raise InvalidCheck('Unrecognized @{}'.format(c.cmd))
 441
 442         if ret == c.negated:
 443             raise FailedCheck(cerr)
 444
 445     except FailedCheck as err:
 446         message = '@{}{} check failed'.format('!' if c.negated else '', c.cmd)
 447         print_err(c.lineno, c.context, str(err), message)
 448     except InvalidCheck as err:
 449         print_err(c.lineno, c.context, str(err))
 450
 451 def check(target, commands):
 452     cache = CachedFiles(target)
 453     for c in commands:
 454         check_command(c, cache)
 455
 456 if __name__ == '__main__':
 457     if len(sys.argv) != 3:
 458         stderr('Usage: {} <doc dir> <template>'.format(sys.argv[0]))
 459         raise SystemExit(1)
 460
 461     check(sys.argv[1], get_commands(sys.argv[2]))
 462     if ERR_COUNT:
 463         stderr("\nEncountered {} errors".format(ERR_COUNT))
 464         raise SystemExit(1)