src/etc/htmldocck.py

   1 # Copyright 2015 The Rust Project Developers. See the COPYRIGHT
   2 # file at the top-level directory of this distribution and at
   3 # http://rust-lang.org/COPYRIGHT.
   4 #
   5 # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 # option. This file may not be copied, modified, or distributed
   9 # except according to those terms.
  10
  11 r"""
  12 htmldocck.py is a custom checker script for Rustdoc HTML outputs.
  13
  14 # How and why?
  15
  16 The principle is simple: This script receives a path to generated HTML
  17 documentation and a "template" script, which has a series of check
  18 commands like `@has` or `@matches`. Each command can be used to check if
  19 some pattern is present or not present in the particular file or in
  20 the particular node of HTML tree. In many cases, the template script
  21 happens to be a source code given to rustdoc.
  22
  23 While it indeed is possible to test in smaller portions, it has been
  24 hard to construct tests in this fashion and major rendering errors were
  25 discovered much later. This script is designed for making the black-box
  26 and regression testing of Rustdoc easy. This does not preclude the needs
  27 for unit testing, but can be used to complement related tests by quickly
  28 showing the expected renderings.
  29
  30 In order to avoid one-off dependencies for this task, this script uses
  31 a reasonably working HTML parser and the existing XPath implementation
  32 from Python's standard library. Hopefully we won't render
  33 non-well-formed HTML.
  34
  35 # Commands
  36
  37 Commands start with an `@` followed by a command name (letters and
  38 hyphens), and zero or more arguments separated by one or more whitespace
  39 and optionally delimited with single or double quotes. The `@` mark
  40 cannot be preceded by a non-whitespace character. Other lines (including
  41 every text up to the first `@`) are ignored, but it is recommended to
  42 avoid the use of `@` in the template file.
  43
  44 There are a number of supported commands:
  45
  46 * `@has PATH` checks for the existence of given file.
  47
  48   `PATH` is relative to the output directory. It can be given as `-`
  49   which repeats the most recently used `PATH`.
  50
  51 * `@has PATH PATTERN` and `@matches PATH PATTERN` checks for
  52   the occurrence of given `PATTERN` in the given file. Only one
  53   occurrence of given pattern is enough.
  54
  55   For `@has`, `PATTERN` is a whitespace-normalized (every consecutive
  56   whitespace being replaced by one single space character) string.
  57   The entire file is also whitespace-normalized including newlines.
  58
  59   For `@matches`, `PATTERN` is a Python-supported regular expression.
  60   The file remains intact but the regexp is matched with no `MULTILINE`
  61   and `IGNORECASE` option. You can still use a prefix `(?m)` or `(?i)`
  62   to override them, and `\A` and `\Z` for definitely matching
  63   the beginning and end of the file.
  64
  65   (The same distinction goes to other variants of these commands.)
  66
  67 * `@has PATH XPATH PATTERN` and `@matches PATH XPATH PATTERN` checks for
  68   the presence of given `XPATH` in the given HTML file, and also
  69   the occurrence of given `PATTERN` in the matching node or attribute.
  70   Only one occurrence of given pattern in the match is enough.
  71
  72   `PATH` should be a valid and well-formed HTML file. It does *not*
  73   accept arbitrary HTML5; it should have matching open and close tags
  74   and correct entity references at least.
  75
  76   `XPATH` is an XPath expression to match. This is fairly limited:
  77   `tag`, `*`, `.`, `//`, `..`, `[@attr]`, `[@attr='value']`, `[tag]`,
  78   `[POS]` (element located in given `POS`), `[last()-POS]`, `text()`
  79   and `@attr` (both as the last segment) are supported. Some examples:
  80
  81   - `//pre` or `.//pre` matches any element with a name `pre`.
  82   - `//a[@href]` matches any element with an `href` attribute.
  83   - `//*[@class="impl"]//code` matches any element with a name `code`,
  84     which is a descendant of some element whose `class` attr is `impl`.
  85   - `//h1[@class="fqn"]/span[1]/a[last()]/@class` matches a value of
  86     `class` attribute in the last `a` element (can be followed by more
  87     elements that are not `a`) inside the first `span` in the `h1` with
  88     a class of `fqn`. Note that there cannot be any additional elements
  89     between them due to the use of `/` instead of `//`.
  90
  91   Do not try to use non-absolute paths, it won't work due to the flawed
  92   ElementTree implementation. The script rejects them.
  93
  94   For the text matches (i.e. paths not ending with `@attr`), any
  95   subelements are flattened into one string; this is handy for ignoring
  96   highlights for example. If you want to simply check the presence of
  97   given node or attribute, use an empty string (`""`) as a `PATTERN`.
  98
  99 * `@count PATH XPATH COUNT` checks for the occurrence of given XPath
 100   in the given file. The number of occurrences must match the given count.
 101
 102 * `@has-dir PATH` checks for the existence of the given directory.
 103
 104 All conditions can be negated with `!`. `@!has foo/type.NoSuch.html`
 105 checks if the given file does not exist, for example.
 106
 107 """
 108
 109 from __future__ import print_function
 110 import sys
 111 import os.path
 112 import re
 113 import shlex
 114 from collections import namedtuple
 115 try:
 116     from html.parser import HTMLParser
 117 except ImportError:
 118     from HTMLParser import HTMLParser
 119 from xml.etree import cElementTree as ET
 120
 121 # &larrb;/&rarrb; are not in HTML 4 but are in HTML 5
 122 try:
 123     from html.entities import entitydefs
 124 except ImportError:
 125     from htmlentitydefs import entitydefs
 126 entitydefs['larrb'] = u'\u21e4'
 127 entitydefs['rarrb'] = u'\u21e5'
 128 entitydefs['nbsp'] = ' '
 129
 130 # "void elements" (no closing tag) from the HTML Standard section 12.1.2
 131 VOID_ELEMENTS = set(['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
 132                      'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr'])
 133
 134 # Python 2 -> 3 compatibility
 135 try:
 136     unichr
 137 except NameError:
 138     unichr = chr
 139
 140 class CustomHTMLParser(HTMLParser):
 141     """simplified HTML parser.
 142
 143     this is possible because we are dealing with very regular HTML from
 144     rustdoc; we only have to deal with i) void elements and ii) empty
 145     attributes."""
 146     def __init__(self, target=None):
 147         HTMLParser.__init__(self)
 148         self.__builder = target or ET.TreeBuilder()
 149
 150     def handle_starttag(self, tag, attrs):
 151         attrs = dict((k, v or '') for k, v in attrs)
 152         self.__builder.start(tag, attrs)
 153         if tag in VOID_ELEMENTS:
 154             self.__builder.end(tag)
 155
 156     def handle_endtag(self, tag):
 157         self.__builder.end(tag)
 158
 159     def handle_startendtag(self, tag, attrs):
 160         attrs = dict((k, v or '') for k, v in attrs)
 161         self.__builder.start(tag, attrs)
 162         self.__builder.end(tag)
 163
 164     def handle_data(self, data):
 165         self.__builder.data(data)
 166
 167     def handle_entityref(self, name):
 168         self.__builder.data(entitydefs[name])
 169
 170     def handle_charref(self, name):
 171         code = int(name[1:], 16) if name.startswith(('x', 'X')) else int(name, 10)
 172         self.__builder.data(unichr(code).encode('utf-8'))
 173
 174     def close(self):
 175         HTMLParser.close(self)
 176         return self.__builder.close()
 177
 178 Command = namedtuple('Command', 'negated cmd args lineno context')
 179
 180 class FailedCheck(Exception):
 181     pass
 182
 183 class InvalidCheck(Exception):
 184     pass
 185
 186 def concat_multi_lines(f):
 187     """returns a generator out of the file object, which
 188     - removes `\\` then `\n` then a shared prefix with the previous line then
 189       optional whitespace;
 190     - keeps a line number (starting from 0) of the first line being
 191       concatenated."""
 192     lastline = None # set to the last line when the last line has a backslash
 193     firstlineno = None
 194     catenated = ''
 195     for lineno, line in enumerate(f):
 196         line = line.rstrip('\r\n')
 197
 198         # strip the common prefix from the current line if needed
 199         if lastline is not None:
 200             common_prefix = os.path.commonprefix([line, lastline])
 201             line = line[len(common_prefix):].lstrip()
 202
 203         firstlineno = firstlineno or lineno
 204         if line.endswith('\\'):
 205             if lastline is None:
 206                 lastline = line[:-1]
 207             catenated += line[:-1]
 208         else:
 209             yield firstlineno, catenated + line
 210             lastline = None
 211             firstlineno = None
 212             catenated = ''
 213
 214     if lastline is not None:
 215         print_err(lineno, line, 'Trailing backslash at the end of the file')
 216
 217 LINE_PATTERN = re.compile(r'''
 218     (?<=(?<!\S)@)(?P<negated>!?)
 219     (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
 220     (?P<args>.*)$
 221 ''', re.X)
 222
 223
 224 def get_commands(template):
 225     with open(template, 'rU') as f:
 226         for lineno, line in concat_multi_lines(f):
 227             m = LINE_PATTERN.search(line)
 228             if not m:
 229                 continue
 230
 231             negated = (m.group('negated') == '!')
 232             cmd = m.group('cmd')
 233             args = m.group('args')
 234             if args and not args[:1].isspace():
 235                 print_err(lineno, line, 'Invalid template syntax')
 236                 continue
 237             args = shlex.split(args)
 238             yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno+1, context=line)
 239
 240
 241 def _flatten(node, acc):
 242     if node.text:
 243         acc.append(node.text)
 244     for e in node:
 245         _flatten(e, acc)
 246         if e.tail:
 247             acc.append(e.tail)
 248
 249
 250 def flatten(node):
 251     acc = []
 252     _flatten(node, acc)
 253     return ''.join(acc)
 254
 255
 256 def normalize_xpath(path):
 257     if path.startswith('//'):
 258         return '.' + path # avoid warnings
 259     elif path.startswith('.//'):
 260         return path
 261     else:
 262         raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')
 263
 264
 265 class CachedFiles(object):
 266     def __init__(self, root):
 267         self.root = root
 268         self.files = {}
 269         self.trees = {}
 270         self.last_path = None
 271
 272     def resolve_path(self, path):
 273         if path != '-':
 274             path = os.path.normpath(path)
 275             self.last_path = path
 276             return path
 277         elif self.last_path is None:
 278             raise InvalidCheck('Tried to use the previous path in the first command')
 279         else:
 280             return self.last_path
 281
 282     def get_file(self, path):
 283         path = self.resolve_path(path)
 284         if path in self.files:
 285             return self.files[path]
 286
 287         abspath = os.path.join(self.root, path)
 288         if not(os.path.exists(abspath) and os.path.isfile(abspath)):
 289             raise FailedCheck('File does not exist {!r}'.format(path))
 290
 291         with open(abspath) as f:
 292             data = f.read()
 293             self.files[path] = data
 294             return data
 295
 296     def get_tree(self, path):
 297         path = self.resolve_path(path)
 298         if path in self.trees:
 299             return self.trees[path]
 300
 301         abspath = os.path.join(self.root, path)
 302         if not(os.path.exists(abspath) and os.path.isfile(abspath)):
 303             raise FailedCheck('File does not exist {!r}'.format(path))
 304
 305         with open(abspath) as f:
 306             try:
 307                 tree = ET.parse(f, CustomHTMLParser())
 308             except Exception as e:
 309                 raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
 310             self.trees[path] = tree
 311             return self.trees[path]
 312
 313     def get_dir(self, path):
 314         path = self.resolve_path(path)
 315         abspath = os.path.join(self.root, path)
 316         if not(os.path.exists(abspath) and os.path.isdir(abspath)):
 317             raise FailedCheck('Directory does not exist {!r}'.format(path))
 318
 319
 320 def check_string(data, pat, regexp):
 321     if not pat:
 322         return True # special case a presence testing
 323     elif regexp:
 324         return re.search(pat, data) is not None
 325     else:
 326         data = ' '.join(data.split())
 327         pat = ' '.join(pat.split())
 328         return pat in data
 329
 330
 331 def check_tree_attr(tree, path, attr, pat, regexp):
 332     path = normalize_xpath(path)
 333     ret = False
 334     for e in tree.findall(path):
 335         if attr in e.attrib:
 336             value = e.attrib[attr]
 337         else:
 338             continue
 339
 340         ret = check_string(value, pat, regexp)
 341         if ret:
 342             break
 343     return ret
 344
 345
 346 def check_tree_text(tree, path, pat, regexp):
 347     path = normalize_xpath(path)
 348     ret = False
 349     try:
 350         for e in tree.findall(path):
 351             try:
 352                 value = flatten(e)
 353             except KeyError:
 354                 continue
 355             else:
 356                 ret = check_string(value, pat, regexp)
 357                 if ret:
 358                     break
 359     except Exception as e:
 360         print('Failed to get path "{}"'.format(path))
 361         raise e
 362     return ret
 363
 364
 365 def get_tree_count(tree, path):
 366     path = normalize_xpath(path)
 367     return len(tree.findall(path))
 368
 369 def stderr(*args):
 370     print(*args, file=sys.stderr)
 371
 372 def print_err(lineno, context, err, message=None):
 373     global ERR_COUNT
 374     ERR_COUNT += 1
 375     stderr("{}: {}".format(lineno, message or err))
 376     if message and err:
 377         stderr("\t{}".format(err))
 378
 379     if context:
 380         stderr("\t{}".format(context))
 381
 382 ERR_COUNT = 0
 383
 384 def check_command(c, cache):
 385     try:
 386         cerr = ""
 387         if c.cmd == 'has' or c.cmd == 'matches': # string test
 388             regexp = (c.cmd == 'matches')
 389             if len(c.args) == 1 and not regexp: # @has <path> = file existence
 390                 try:
 391                     cache.get_file(c.args[0])
 392                     ret = True
 393                 except FailedCheck as err:
 394                     cerr = str(err)
 395                     ret = False
 396             elif len(c.args) == 2: # @has/matches <path> <pat> = string test
 397                 cerr = "`PATTERN` did not match"
 398                 ret = check_string(cache.get_file(c.args[0]), c.args[1], regexp)
 399             elif len(c.args) == 3: # @has/matches <path> <pat> <match> = XML tree test
 400                 cerr = "`XPATH PATTERN` did not match"
 401                 tree = cache.get_tree(c.args[0])
 402                 pat, sep, attr = c.args[1].partition('/@')
 403                 if sep: # attribute
 404                     tree = cache.get_tree(c.args[0])
 405                     ret = check_tree_attr(tree, pat, attr, c.args[2], regexp)
 406                 else: # normalized text
 407                     pat = c.args[1]
 408                     if pat.endswith('/text()'):
 409                         pat = pat[:-7]
 410                     ret = check_tree_text(cache.get_tree(c.args[0]), pat, c.args[2], regexp)
 411             else:
 412                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 413
 414         elif c.cmd == 'count': # count test
 415             if len(c.args) == 3: # @count <path> <pat> <count> = count test
 416                 expected = int(c.args[2])
 417                 found = get_tree_count(cache.get_tree(c.args[0]), c.args[1])
 418                 cerr = "Expected {} occurrences but found {}".format(expected, found)
 419                 ret = expected == found
 420             else:
 421                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 422         elif c.cmd == 'has-dir': # has-dir test
 423             if len(c.args) == 1: # @has-dir <path> = has-dir test
 424                 try:
 425                     cache.get_dir(c.args[0])
 426                     ret = True
 427                 except FailedCheck as err:
 428                     cerr = str(err)
 429                     ret = False
 430             else:
 431                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 432         elif c.cmd == 'valid-html':
 433             raise InvalidCheck('Unimplemented @valid-html')
 434
 435         elif c.cmd == 'valid-links':
 436             raise InvalidCheck('Unimplemented @valid-links')
 437         else:
 438             raise InvalidCheck('Unrecognized @{}'.format(c.cmd))
 439
 440         if ret == c.negated:
 441             raise FailedCheck(cerr)
 442
 443     except FailedCheck as err:
 444         message = '@{}{} check failed'.format('!' if c.negated else '', c.cmd)
 445         print_err(c.lineno, c.context, str(err), message)
 446     except InvalidCheck as err:
 447         print_err(c.lineno, c.context, str(err))
 448
 449 def check(target, commands):
 450     cache = CachedFiles(target)
 451     for c in commands:
 452         check_command(c, cache)
 453
 454 if __name__ == '__main__':
 455     if len(sys.argv) != 3:
 456         stderr('Usage: {} <doc dir> <template>'.format(sys.argv[0]))
 457         raise SystemExit(1)
 458
 459     check(sys.argv[1], get_commands(sys.argv[2]))
 460     if ERR_COUNT:
 461         stderr("\nEncountered {} errors".format(ERR_COUNT))
 462         raise SystemExit(1)