src/etc/htmldocck.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 r"""
   5 htmldocck.py is a custom checker script for Rustdoc HTML outputs.
   6
   7 # How and why?
   8
   9 The principle is simple: This script receives a path to generated HTML
  10 documentation and a "template" script, which has a series of check
  11 commands like `@has` or `@matches`. Each command is used to check if
  12 some pattern is present or not present in the particular file or in
  13 a particular node of the HTML tree. In many cases, the template script
  14 happens to be the source code given to rustdoc.
  15
  16 While it indeed is possible to test in smaller portions, it has been
  17 hard to construct tests in this fashion and major rendering errors were
  18 discovered much later. This script is designed to make black-box and
  19 regression testing of Rustdoc easy. This does not preclude the needs for
  20 unit testing, but can be used to complement related tests by quickly
  21 showing the expected renderings.
  22
  23 In order to avoid one-off dependencies for this task, this script uses
  24 a reasonably working HTML parser and the existing XPath implementation
  25 from Python's standard library. Hopefully, we won't render
  26 non-well-formed HTML.
  27
  28 # Commands
  29
  30 Commands start with an `@` followed by a command name (letters and
  31 hyphens), and zero or more arguments separated by one or more whitespace
  32 characters and optionally delimited with single or double quotes. The `@`
  33 mark cannot be preceded by a non-whitespace character. Other lines
  34 (including every text up to the first `@`) are ignored, but it is
  35 recommended to avoid the use of `@` in the template file.
  36
  37 There are a number of supported commands:
  38
  39 * `@has PATH` checks for the existence of the given file.
  40
  41   `PATH` is relative to the output directory. It can be given as `-`
  42   which repeats the most recently used `PATH`.
  43
  44 * `@has PATH PATTERN` and `@matches PATH PATTERN` checks for
  45   the occurrence of the given pattern `PATTERN` in the specified file.
  46   Only one occurrence of the pattern is enough.
  47
  48   For `@has`, `PATTERN` is a whitespace-normalized (every consecutive
  49   whitespace being replaced by one single space character) string.
  50   The entire file is also whitespace-normalized including newlines.
  51
  52   For `@matches`, `PATTERN` is a Python-supported regular expression.
  53   The file remains intact but the regexp is matched without the `MULTILINE`
  54   and `IGNORECASE` options. You can still use a prefix `(?m)` or `(?i)`
  55   to override them, and `\A` and `\Z` for definitely matching
  56   the beginning and end of the file.
  57
  58   (The same distinction goes to other variants of these commands.)
  59
  60 * `@has PATH XPATH PATTERN` and `@matches PATH XPATH PATTERN` checks for
  61   the presence of the given XPath `XPATH` in the specified HTML file,
  62   and also the occurrence of the given pattern `PATTERN` in the matching
  63   node or attribute. Only one occurrence of the pattern in the match
  64   is enough.
  65
  66   `PATH` should be a valid and well-formed HTML file. It does *not*
  67   accept arbitrary HTML5; it should have matching open and close tags
  68   and correct entity references at least.
  69
  70   `XPATH` is an XPath expression to match. The XPath is fairly limited:
  71   `tag`, `*`, `.`, `//`, `..`, `[@attr]`, `[@attr='value']`, `[tag]`,
  72   `[POS]` (element located in given `POS`), `[last()-POS]`, `text()`
  73   and `@attr` (both as the last segment) are supported. Some examples:
  74
  75   - `//pre` or `.//pre` matches any element with a name `pre`.
  76   - `//a[@href]` matches any element with an `href` attribute.
  77   - `//*[@class="impl"]//code` matches any element with a name `code`,
  78     which is a descendant of some element whose `class` attr is `impl`.
  79   - `//h1[@class="fqn"]/span[1]/a[last()]/@class` matches a value of
  80     `class` attribute in the last `a` element (can be followed by more
  81     elements that are not `a`) inside the first `span` in the `h1` with
  82     a class of `fqn`. Note that there cannot be any additional elements
  83     between them due to the use of `/` instead of `//`.
  84
  85   Do not try to use non-absolute paths, it won't work due to the flawed
  86   ElementTree implementation. The script rejects them.
  87
  88   For the text matches (i.e. paths not ending with `@attr`), any
  89   subelements are flattened into one string; this is handy for ignoring
  90   highlights for example. If you want to simply check for the presence of
  91   a given node or attribute, use an empty string (`""`) as a `PATTERN`.
  92
  93 * `@count PATH XPATH COUNT` checks for the occurrence of the given XPath
  94   in the specified file. The number of occurrences must match the given
  95   count.
  96
  97 * `@has-dir PATH` checks for the existence of the given directory.
  98
  99 All conditions can be negated with `!`. `@!has foo/type.NoSuch.html`
 100 checks if the given file does not exist, for example.
 101
 102 """
 103
 104 from __future__ import absolute_import, print_function, unicode_literals
 105
 106 import codecs
 107 import io
 108 import sys
 109 import os.path
 110 import re
 111 import shlex
 112 from collections import namedtuple
 113 try:
 114     from html.parser import HTMLParser
 115 except ImportError:
 116     from HTMLParser import HTMLParser
 117 try:
 118     from xml.etree import cElementTree as ET
 119 except ImportError:
 120     from xml.etree import ElementTree as ET
 121
 122 try:
 123     from html.entities import name2codepoint
 124 except ImportError:
 125     from htmlentitydefs import name2codepoint
 126
 127 # "void elements" (no closing tag) from the HTML Standard section 12.1.2
 128 VOID_ELEMENTS = {'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
 129                      'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr'}
 130
# Python 2 -> 3 compatibility: Python 3 has no `unichr` builtin, so the bare
# name lookup below raises NameError there and `unichr` is aliased to `chr`.
try:
    unichr
except NameError:
    unichr = chr


# Text substituted for the `{{channel}}` placeholder in XPaths and patterns.
# Read once at import time; if DOC_RUST_LANG_ORG_CHANNEL is not set in the
# environment this lookup raises KeyError.
channel = os.environ["DOC_RUST_LANG_ORG_CHANNEL"]
 139
 140 class CustomHTMLParser(HTMLParser):
 141     """simplified HTML parser.
 142
 143     this is possible because we are dealing with very regular HTML from
 144     rustdoc; we only have to deal with i) void elements and ii) empty
 145     attributes."""
 146     def __init__(self, target=None):
 147         HTMLParser.__init__(self)
 148         self.__builder = target or ET.TreeBuilder()
 149
 150     def handle_starttag(self, tag, attrs):
 151         attrs = {k: v or '' for k, v in attrs}
 152         self.__builder.start(tag, attrs)
 153         if tag in VOID_ELEMENTS:
 154             self.__builder.end(tag)
 155
 156     def handle_endtag(self, tag):
 157         self.__builder.end(tag)
 158
 159     def handle_startendtag(self, tag, attrs):
 160         attrs = {k: v or '' for k, v in attrs}
 161         self.__builder.start(tag, attrs)
 162         self.__builder.end(tag)
 163
 164     def handle_data(self, data):
 165         self.__builder.data(data)
 166
 167     def handle_entityref(self, name):
 168         self.__builder.data(unichr(name2codepoint[name]))
 169
 170     def handle_charref(self, name):
 171         code = int(name[1:], 16) if name.startswith(('x', 'X')) else int(name, 10)
 172         self.__builder.data(unichr(code))
 173
 174     def close(self):
 175         HTMLParser.close(self)
 176         return self.__builder.close()
 177
 178
 179 Command = namedtuple('Command', 'negated cmd args lineno context')
 180
 181
 182 class FailedCheck(Exception):
 183     pass
 184
 185
 186 class InvalidCheck(Exception):
 187     pass
 188
 189
 190 def concat_multi_lines(f):
 191     """returns a generator out of the file object, which
 192     - removes `\\` then `\n` then a shared prefix with the previous line then
 193       optional whitespace;
 194     - keeps a line number (starting from 0) of the first line being
 195       concatenated."""
 196     lastline = None  # set to the last line when the last line has a backslash
 197     firstlineno = None
 198     catenated = ''
 199     for lineno, line in enumerate(f):
 200         line = line.rstrip('\r\n')
 201
 202         # strip the common prefix from the current line if needed
 203         if lastline is not None:
 204             common_prefix = os.path.commonprefix([line, lastline])
 205             line = line[len(common_prefix):].lstrip()
 206
 207         firstlineno = firstlineno or lineno
 208         if line.endswith('\\'):
 209             if lastline is None:
 210                 lastline = line[:-1]
 211             catenated += line[:-1]
 212         else:
 213             yield firstlineno, catenated + line
 214             lastline = None
 215             firstlineno = None
 216             catenated = ''
 217
 218     if lastline is not None:
 219         print_err(lineno, line, 'Trailing backslash at the end of the file')
 220
 221
# Recognizes one command inside a (possibly concatenated) template line.
# Named groups:
#   invalid - a `!` written *before* the `@` (the unsupported `!@cmd` form)
#   negated - a `!` written right after the `@` (the `@!cmd` form)
#   cmd     - the command name: letters, optionally joined by single hyphens
#   args    - everything after the command name up to the end of the line
# The leading lookbehind requires that the match is not directly preceded by
# a non-whitespace character.
LINE_PATTERN = re.compile(r'''
    (?<=(?<!\S))(?P<invalid>!?)@(?P<negated>!?)
    (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
    (?P<args>.*)$
''', re.X | re.UNICODE)
 227
 228
 229 def get_commands(template):
 230     with io.open(template, encoding='utf-8') as f:
 231         for lineno, line in concat_multi_lines(f):
 232             m = LINE_PATTERN.search(line)
 233             if not m:
 234                 continue
 235
 236             negated = (m.group('negated') == '!')
 237             cmd = m.group('cmd')
 238             if m.group('invalid') == '!':
 239                 print_err(
 240                     lineno,
 241                     line,
 242                     'Invalid command: `!@{0}{1}`, (help: try with `@!{1}`)'.format(
 243                         '!' if negated else '',
 244                         cmd,
 245                     ),
 246                 )
 247                 continue
 248             args = m.group('args')
 249             if args and not args[:1].isspace():
 250                 print_err(lineno, line, 'Invalid template syntax')
 251                 continue
 252             try:
 253                 args = shlex.split(args)
 254             except UnicodeEncodeError:
 255                 args = [arg.decode('utf-8') for arg in shlex.split(args.encode('utf-8'))]
 256             yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno+1, context=line)
 257
 258
 259 def _flatten(node, acc):
 260     if node.text:
 261         acc.append(node.text)
 262     for e in node:
 263         _flatten(e, acc)
 264         if e.tail:
 265             acc.append(e.tail)
 266
 267
 268 def flatten(node):
 269     acc = []
 270     _flatten(node, acc)
 271     return ''.join(acc)
 272
 273
 274 def normalize_xpath(path):
 275     path = path.replace("{{channel}}", channel)
 276     if path.startswith('//'):
 277         return '.' + path  # avoid warnings
 278     elif path.startswith('.//'):
 279         return path
 280     else:
 281         raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')
 282
 283
 284 class CachedFiles(object):
 285     def __init__(self, root):
 286         self.root = root
 287         self.files = {}
 288         self.trees = {}
 289         self.last_path = None
 290
 291     def resolve_path(self, path):
 292         if path != '-':
 293             path = os.path.normpath(path)
 294             self.last_path = path
 295             return path
 296         elif self.last_path is None:
 297             raise InvalidCheck('Tried to use the previous path in the first command')
 298         else:
 299             return self.last_path
 300
 301     def get_file(self, path):
 302         path = self.resolve_path(path)
 303         if path in self.files:
 304             return self.files[path]
 305
 306         abspath = os.path.join(self.root, path)
 307         if not(os.path.exists(abspath) and os.path.isfile(abspath)):
 308             raise FailedCheck('File does not exist {!r}'.format(path))
 309
 310         with io.open(abspath, encoding='utf-8') as f:
 311             data = f.read()
 312             self.files[path] = data
 313             return data
 314
 315     def get_tree(self, path):
 316         path = self.resolve_path(path)
 317         if path in self.trees:
 318             return self.trees[path]
 319
 320         abspath = os.path.join(self.root, path)
 321         if not(os.path.exists(abspath) and os.path.isfile(abspath)):
 322             raise FailedCheck('File does not exist {!r}'.format(path))
 323
 324         with io.open(abspath, encoding='utf-8') as f:
 325             try:
 326                 tree = ET.fromstringlist(f.readlines(), CustomHTMLParser())
 327             except Exception as e:
 328                 raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
 329             self.trees[path] = tree
 330             return self.trees[path]
 331
 332     def get_dir(self, path):
 333         path = self.resolve_path(path)
 334         abspath = os.path.join(self.root, path)
 335         if not(os.path.exists(abspath) and os.path.isdir(abspath)):
 336             raise FailedCheck('Directory does not exist {!r}'.format(path))
 337
 338
 339 def check_string(data, pat, regexp):
 340     pat = pat.replace("{{channel}}", channel)
 341     if not pat:
 342         return True  # special case a presence testing
 343     elif regexp:
 344         return re.search(pat, data, flags=re.UNICODE) is not None
 345     else:
 346         data = ' '.join(data.split())
 347         pat = ' '.join(pat.split())
 348         return pat in data
 349
 350
 351 def check_tree_attr(tree, path, attr, pat, regexp):
 352     path = normalize_xpath(path)
 353     ret = False
 354     for e in tree.findall(path):
 355         if attr in e.attrib:
 356             value = e.attrib[attr]
 357         else:
 358             continue
 359
 360         ret = check_string(value, pat, regexp)
 361         if ret:
 362             break
 363     return ret
 364
 365
 366 def check_tree_text(tree, path, pat, regexp):
 367     path = normalize_xpath(path)
 368     ret = False
 369     try:
 370         for e in tree.findall(path):
 371             try:
 372                 value = flatten(e)
 373             except KeyError:
 374                 continue
 375             else:
 376                 ret = check_string(value, pat, regexp)
 377                 if ret:
 378                     break
 379     except Exception:
 380         print('Failed to get path "{}"'.format(path))
 381         raise
 382     return ret
 383
 384
 385 def get_tree_count(tree, path):
 386     path = normalize_xpath(path)
 387     return len(tree.findall(path))
 388
 389
 390 def stderr(*args):
 391     if sys.version_info.major < 3:
 392         file = codecs.getwriter('utf-8')(sys.stderr)
 393     else:
 394         file = sys.stderr
 395
 396     print(*args, file=file)
 397
 398
 399 def print_err(lineno, context, err, message=None):
 400     global ERR_COUNT
 401     ERR_COUNT += 1
 402     stderr("{}: {}".format(lineno, message or err))
 403     if message and err:
 404         stderr("\t{}".format(err))
 405
 406     if context:
 407         stderr("\t{}".format(context))
 408
 409
 410 ERR_COUNT = 0
 411
 412
 413 def check_command(c, cache):
 414     try:
 415         cerr = ""
 416         if c.cmd == 'has' or c.cmd == 'matches':  # string test
 417             regexp = (c.cmd == 'matches')
 418             if len(c.args) == 1 and not regexp:  # @has <path> = file existence
 419                 try:
 420                     cache.get_file(c.args[0])
 421                     ret = True
 422                 except FailedCheck as err:
 423                     cerr = str(err)
 424                     ret = False
 425             elif len(c.args) == 2:  # @has/matches <path> <pat> = string test
 426                 cerr = "`PATTERN` did not match"
 427                 ret = check_string(cache.get_file(c.args[0]), c.args[1], regexp)
 428             elif len(c.args) == 3:  # @has/matches <path> <pat> <match> = XML tree test
 429                 cerr = "`XPATH PATTERN` did not match"
 430                 tree = cache.get_tree(c.args[0])
 431                 pat, sep, attr = c.args[1].partition('/@')
 432                 if sep:  # attribute
 433                     tree = cache.get_tree(c.args[0])
 434                     ret = check_tree_attr(tree, pat, attr, c.args[2], regexp)
 435                 else:  # normalized text
 436                     pat = c.args[1]
 437                     if pat.endswith('/text()'):
 438                         pat = pat[:-7]
 439                     ret = check_tree_text(cache.get_tree(c.args[0]), pat, c.args[2], regexp)
 440             else:
 441                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 442
 443         elif c.cmd == 'count':  # count test
 444             if len(c.args) == 3:  # @count <path> <pat> <count> = count test
 445                 expected = int(c.args[2])
 446                 found = get_tree_count(cache.get_tree(c.args[0]), c.args[1])
 447                 cerr = "Expected {} occurrences but found {}".format(expected, found)
 448                 ret = expected == found
 449             else:
 450                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 451         elif c.cmd == 'has-dir':  # has-dir test
 452             if len(c.args) == 1:  # @has-dir <path> = has-dir test
 453                 try:
 454                     cache.get_dir(c.args[0])
 455                     ret = True
 456                 except FailedCheck as err:
 457                     cerr = str(err)
 458                     ret = False
 459             else:
 460                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 461         elif c.cmd == 'valid-html':
 462             raise InvalidCheck('Unimplemented @valid-html')
 463
 464         elif c.cmd == 'valid-links':
 465             raise InvalidCheck('Unimplemented @valid-links')
 466         else:
 467             raise InvalidCheck('Unrecognized @{}'.format(c.cmd))
 468
 469         if ret == c.negated:
 470             raise FailedCheck(cerr)
 471
 472     except FailedCheck as err:
 473         message = '@{}{} check failed'.format('!' if c.negated else '', c.cmd)
 474         print_err(c.lineno, c.context, str(err), message)
 475     except InvalidCheck as err:
 476         print_err(c.lineno, c.context, str(err))
 477
 478
 479 def check(target, commands):
 480     cache = CachedFiles(target)
 481     for c in commands:
 482         check_command(c, cache)
 483
 484
 485 if __name__ == '__main__':
 486     if len(sys.argv) != 3:
 487         stderr('Usage: {} <doc dir> <template>'.format(sys.argv[0]))
 488         raise SystemExit(1)
 489
 490     check(sys.argv[1], get_commands(sys.argv[2]))
 491     if ERR_COUNT:
 492         stderr("\nEncountered {} errors".format(ERR_COUNT))
 493         raise SystemExit(1)