src/etc/htmldocck.py

   1 # Copyright 2015 The Rust Project Developers. See the COPYRIGHT
   2 # file at the top-level directory of this distribution and at
   3 # http://rust-lang.org/COPYRIGHT.
   4 #
   5 # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 # option. This file may not be copied, modified, or distributed
   9 # except according to those terms.
  10
  11 r"""
  12 htmldocck.py is a custom checker script for Rustdoc HTML outputs.
  13
  14 # How and why?
  15
  16 The principle is simple: This script receives a path to generated HTML
  17 documentation and a "template" script, which has a series of check
  18 commands like `@has` or `@matches`. Each command can be used to check if
  19 some pattern is present or not present in the particular file or in
  20 the particular node of HTML tree. In many cases, the template script
  21 happens to be a source code given to rustdoc.
  22
  23 While it indeed is possible to test in smaller portions, it has been
  24 hard to construct tests in this fashion and major rendering errors were
  25 discovered much later. This script is designed for making the black-box
  26 and regression testing of Rustdoc easy. This does not preclude the needs
  27 for unit testing, but can be used to complement related tests by quickly
  28 showing the expected renderings.
  29
  30 In order to avoid one-off dependencies for this task, this script uses
  31 a reasonably working HTML parser and the existing XPath implementation
  32 from Python 2's standard library. Hopefully we won't render
  33 non-well-formed HTML.
  34
  35 # Commands
  36
  37 Commands start with an `@` followed by a command name (letters and
  38 hyphens), and zero or more arguments separated by one or more whitespace
  39 and optionally delimited with single or double quotes. The `@` mark
  40 cannot be preceded by a non-whitespace character. Other lines (including
  41 every text up to the first `@`) are ignored, but it is recommended to
  42 avoid the use of `@` in the template file.
  43
  44 There are a number of supported commands:
  45
  46 * `@has PATH` checks for the existence of given file.
  47
  48   `PATH` is relative to the output directory. It can be given as `-`
  49   which repeats the most recently used `PATH`.
  50
  51 * `@has PATH PATTERN` and `@matches PATH PATTERN` checks for
  52   the occurrence of given `PATTERN` in the given file. Only one
  53   occurrence of given pattern is enough.
  54
  55   For `@has`, `PATTERN` is a whitespace-normalized (every consecutive
  56   whitespace being replaced by one single space character) string.
  57   The entire file is also whitespace-normalized including newlines.
  58
  59   For `@matches`, `PATTERN` is a Python-supported regular expression.
  60   The file remains intact but the regexp is matched with no `MULTILINE`
  61   and `IGNORECASE` option. You can still use a prefix `(?m)` or `(?i)`
  62   to override them, and `\A` and `\Z` for definitely matching
  63   the beginning and end of the file.
  64
  65   (The same distinction goes to other variants of these commands.)
  66
  67 * `@has PATH XPATH PATTERN` and `@matches PATH XPATH PATTERN` checks for
  68   the presence of given `XPATH` in the given HTML file, and also
  69   the occurrence of given `PATTERN` in the matching node or attribute.
  70   Only one occurrence of given pattern in the match is enough.
  71
  72   `PATH` should be a valid and well-formed HTML file. It does *not*
  73   accept arbitrary HTML5; it should have matching open and close tags
  74   and correct entity references at least.
  75
  76   `XPATH` is an XPath expression to match. This is fairly limited:
  77   `tag`, `*`, `.`, `//`, `..`, `[@attr]`, `[@attr='value']`, `[tag]`,
  78   `[POS]` (element located in given `POS`), `[last()-POS]`, `text()`
  79   and `@attr` (both as the last segment) are supported. Some examples:
  80
  81   - `//pre` or `.//pre` matches any element with a name `pre`.
  82   - `//a[@href]` matches any element with an `href` attribute.
  83   - `//*[@class="impl"]//code` matches any element with a name `code`,
  84     which is a descendant of some element whose `class` attr is `impl`.
  85   - `//h1[@class="fqn"]/span[1]/a[last()]/@class` matches a value of
  86     `class` attribute in the last `a` element (can be followed by more
  87     elements that are not `a`) inside the first `span` in the `h1` with
  88     a class of `fqn`. Note that there cannot be any additional elements
  89     between them due to the use of `/` instead of `//`.
  90
  91   Do not try to use non-absolute paths, it won't work due to the flawed
  92   ElementTree implementation. The script rejects them.
  93
  94   For the text matches (i.e. paths not ending with `@attr`), any
  95   subelements are flattened into one string; this is handy for ignoring
  96   highlights for example. If you want to simply check the presence of
  97   given node or attribute, use an empty string (`""`) as a `PATTERN`.
  98
  99 * `@count PATH XPATH COUNT` checks for the occurrence of given XPath
 100   in the given file. The number of occurrences must match the given count.
 101
 102 All conditions can be negated with `!`. `@!has foo/type.NoSuch.html`
 103 checks if the given file does not exist, for example.
 104
 105 """
 106
 107 from __future__ import print_function
 108 import sys
 109 import os.path
 110 import re
 111 import shlex
 112 from collections import namedtuple
 113 from HTMLParser import HTMLParser
 114 from xml.etree import cElementTree as ET
 115
 116 # &larrb;/&rarrb; are not in HTML 4 but are in HTML 5
 117 from htmlentitydefs import entitydefs
 118 entitydefs['larrb'] = u'\u21e4'
 119 entitydefs['rarrb'] = u'\u21e5'
 120
 121 # "void elements" (no closing tag) from the HTML Standard section 12.1.2
 122 VOID_ELEMENTS = set(['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
 123                      'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr'])
 124
 125
 126 class CustomHTMLParser(HTMLParser):
 127     """simplified HTML parser.
 128
 129     this is possible because we are dealing with very regular HTML from
 130     rustdoc; we only have to deal with i) void elements and ii) empty
 131     attributes."""
 132     def __init__(self, target=None):
 133         HTMLParser.__init__(self)
 134         self.__builder = target or ET.TreeBuilder()
 135
 136     def handle_starttag(self, tag, attrs):
 137         attrs = dict((k, v or '') for k, v in attrs)
 138         self.__builder.start(tag, attrs)
 139         if tag in VOID_ELEMENTS:
 140             self.__builder.end(tag)
 141
 142     def handle_endtag(self, tag):
 143         self.__builder.end(tag)
 144
 145     def handle_startendtag(self, tag, attrs):
 146         attrs = dict((k, v or '') for k, v in attrs)
 147         self.__builder.start(tag, attrs)
 148         self.__builder.end(tag)
 149
 150     def handle_data(self, data):
 151         self.__builder.data(data)
 152
 153     def handle_entityref(self, name):
 154         self.__builder.data(entitydefs[name])
 155
 156     def handle_charref(self, name):
 157         code = int(name[1:], 16) if name.startswith(('x', 'X')) else int(name, 10)
 158         self.__builder.data(unichr(code).encode('utf-8'))
 159
 160     def close(self):
 161         HTMLParser.close(self)
 162         return self.__builder.close()
 163
 164 Command = namedtuple('Command', 'negated cmd args lineno context')
 165
 166 class FailedCheck(Exception):
 167     pass
 168
 169 class InvalidCheck(Exception):
 170     pass
 171
 172 def concat_multi_lines(f):
 173     """returns a generator out of the file object, which
 174     - removes `\\` then `\n` then a shared prefix with the previous line then
 175       optional whitespace;
 176     - keeps a line number (starting from 0) of the first line being
 177       concatenated."""
 178     lastline = None # set to the last line when the last line has a backslash
 179     firstlineno = None
 180     catenated = ''
 181     for lineno, line in enumerate(f):
 182         line = line.rstrip('\r\n')
 183
 184         # strip the common prefix from the current line if needed
 185         if lastline is not None:
 186             maxprefix = 0
 187             for i in xrange(min(len(line), len(lastline))):
 188                 if line[i] != lastline[i]:
 189                     break
 190                 maxprefix += 1
 191             line = line[maxprefix:].lstrip()
 192
 193         firstlineno = firstlineno or lineno
 194         if line.endswith('\\'):
 195             if lastline is None:
 196                 lastline = line[:-1]
 197             catenated += line[:-1]
 198         else:
 199             yield firstlineno, catenated + line
 200             lastline = None
 201             firstlineno = None
 202             catenated = ''
 203
 204     if lastline is not None:
 205         print_err(lineno, line, 'Trailing backslash at the end of the file')
 206
 207 LINE_PATTERN = re.compile(r'''
 208     (?<=(?<!\S)@)(?P<negated>!?)
 209     (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
 210     (?P<args>.*)$
 211 ''', re.X)
 212
 213
 214 def get_commands(template):
 215     with open(template, 'rUb') as f:
 216         for lineno, line in concat_multi_lines(f):
 217             m = LINE_PATTERN.search(line)
 218             if not m:
 219                 continue
 220
 221             negated = (m.group('negated') == '!')
 222             cmd = m.group('cmd')
 223             args = m.group('args')
 224             if args and not args[:1].isspace():
 225                 print_err(lineno, line, 'Invalid template syntax')
 226                 continue
 227             args = shlex.split(args)
 228             yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno+1, context=line)
 229
 230
 231 def _flatten(node, acc):
 232     if node.text:
 233         acc.append(node.text)
 234     for e in node:
 235         _flatten(e, acc)
 236         if e.tail:
 237             acc.append(e.tail)
 238
 239
 240 def flatten(node):
 241     acc = []
 242     _flatten(node, acc)
 243     return ''.join(acc)
 244
 245
 246 def normalize_xpath(path):
 247     if path.startswith('//'):
 248         return '.' + path # avoid warnings
 249     elif path.startswith('.//'):
 250         return path
 251     else:
 252         raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')
 253
 254
 255 class CachedFiles(object):
 256     def __init__(self, root):
 257         self.root = root
 258         self.files = {}
 259         self.trees = {}
 260         self.last_path = None
 261
 262     def resolve_path(self, path):
 263         if path != '-':
 264             path = os.path.normpath(path)
 265             self.last_path = path
 266             return path
 267         elif self.last_path is None:
 268             raise InvalidCheck('Tried to use the previous path in the first command')
 269         else:
 270             return self.last_path
 271
 272     def get_file(self, path):
 273         path = self.resolve_path(path)
 274         if path in self.files:
 275             return self.files[path]
 276
 277         abspath = os.path.join(self.root, path)
 278         if not(os.path.exists(abspath) and os.path.isfile(abspath)):
 279             raise FailedCheck('File does not exist {!r}'.format(path))
 280
 281         with open(abspath) as f:
 282             data = f.read()
 283             self.files[path] = data
 284             return data
 285
 286     def get_tree(self, path):
 287         path = self.resolve_path(path)
 288         if path in self.trees:
 289             return self.trees[path]
 290
 291         abspath = os.path.join(self.root, path)
 292         if not(os.path.exists(abspath) and os.path.isfile(abspath)):
 293             raise FailedCheck('File does not exist {!r}'.format(path))
 294
 295         with open(abspath) as f:
 296             try:
 297                 tree = ET.parse(f, CustomHTMLParser())
 298             except Exception as e:
 299                 raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
 300             self.trees[path] = tree
 301             return self.trees[path]
 302
 303
 304 def check_string(data, pat, regexp):
 305     if not pat:
 306         return True # special case a presence testing
 307     elif regexp:
 308         return re.search(pat, data) is not None
 309     else:
 310         data = ' '.join(data.split())
 311         pat = ' '.join(pat.split())
 312         return pat in data
 313
 314
 315 def check_tree_attr(tree, path, attr, pat, regexp):
 316     path = normalize_xpath(path)
 317     ret = False
 318     for e in tree.findall(path):
 319         if attr in e.attrib:
 320             value = e.attrib[attr]
 321         else:
 322             continue
 323
 324         ret = check_string(value, pat, regexp)
 325         if ret:
 326             break
 327     return ret
 328
 329
 330 def check_tree_text(tree, path, pat, regexp):
 331     path = normalize_xpath(path)
 332     ret = False
 333     for e in tree.findall(path):
 334         try:
 335             value = flatten(e)
 336         except KeyError:
 337             continue
 338         else:
 339             ret = check_string(value, pat, regexp)
 340             if ret:
 341                 break
 342     return ret
 343
 344
 345 def check_tree_count(tree, path, count):
 346     path = normalize_xpath(path)
 347     return len(tree.findall(path)) == count
 348
 349 def stderr(*args):
 350     print(*args, file=sys.stderr)
 351
 352 def print_err(lineno, context, err, message=None):
 353     global ERR_COUNT
 354     ERR_COUNT += 1
 355     stderr("{}: {}".format(lineno, message or err))
 356     if message and err:
 357         stderr("\t{}".format(err))
 358
 359     if context:
 360         stderr("\t{}".format(context))
 361
 362 ERR_COUNT = 0
 363
 364 def check_command(c, cache):
 365     try:
 366         cerr = ""
 367         if c.cmd == 'has' or c.cmd == 'matches': # string test
 368             regexp = (c.cmd == 'matches')
 369             if len(c.args) == 1 and not regexp: # @has <path> = file existence
 370                 try:
 371                     cache.get_file(c.args[0])
 372                     ret = True
 373                 except FailedCheck as err:
 374                     cerr = err.message
 375                     ret = False
 376             elif len(c.args) == 2: # @has/matches <path> <pat> = string test
 377                 cerr = "`PATTERN` did not match"
 378                 ret = check_string(cache.get_file(c.args[0]), c.args[1], regexp)
 379             elif len(c.args) == 3: # @has/matches <path> <pat> <match> = XML tree test
 380                 cerr = "`XPATH PATTERN` did not match"
 381                 tree = cache.get_tree(c.args[0])
 382                 pat, sep, attr = c.args[1].partition('/@')
 383                 if sep: # attribute
 384                     tree = cache.get_tree(c.args[0])
 385                     ret = check_tree_attr(tree, pat, attr, c.args[2], regexp)
 386                 else: # normalized text
 387                     pat = c.args[1]
 388                     if pat.endswith('/text()'):
 389                         pat = pat[:-7]
 390                     ret = check_tree_text(cache.get_tree(c.args[0]), pat, c.args[2], regexp)
 391             else:
 392                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 393
 394         elif c.cmd == 'count': # count test
 395             if len(c.args) == 3: # @count <path> <pat> <count> = count test
 396                 ret = check_tree_count(cache.get_tree(c.args[0]), c.args[1], int(c.args[2]))
 397             else:
 398                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 399         elif c.cmd == 'valid-html':
 400             raise InvalidCheck('Unimplemented @valid-html')
 401
 402         elif c.cmd == 'valid-links':
 403             raise InvalidCheck('Unimplemented @valid-links')
 404         else:
 405             raise InvalidCheck('Unrecognized @{}'.format(c.cmd))
 406
 407         if ret == c.negated:
 408             raise FailedCheck(cerr)
 409
 410     except FailedCheck as err:
 411         message = '@{}{} check failed'.format('!' if c.negated else '', c.cmd)
 412         print_err(c.lineno, c.context, err.message, message)
 413     except InvalidCheck as err:
 414         print_err(c.lineno, c.context, err.message)
 415
 416 def check(target, commands):
 417     cache = CachedFiles(target)
 418     for c in commands:
 419         check_command(c, cache)
 420
 421 if __name__ == '__main__':
 422     if len(sys.argv) != 3:
 423         stderr('Usage: {} <doc dir> <template>'.format(sys.argv[0]))
 424         raise SystemExit(1)
 425
 426     check(sys.argv[1], get_commands(sys.argv[2]))
 427     if ERR_COUNT:
 428         stderr("\nEncountered {} errors".format(ERR_COUNT))
 429         raise SystemExit(1)