src/etc/htmldocck.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 r"""
   5 htmldocck.py is a custom checker script for Rustdoc HTML outputs.
   6
   7 # How and why?
   8
   9 The principle is simple: This script receives a path to generated HTML
  10 documentation and a "template" script, which has a series of check
  11 commands like `@has` or `@matches`. Each command is used to check if
  12 some pattern is present or not present in the particular file or in
  13 a particular node of the HTML tree. In many cases, the template script
  14 happens to be the source code given to rustdoc.
  15
  16 While it indeed is possible to test in smaller portions, it has been
  17 hard to construct tests in this fashion and major rendering errors were
  18 discovered much later. This script is designed to make black-box and
  19 regression testing of Rustdoc easy. This does not preclude the need for
  20 unit testing, but can be used to complement related tests by quickly
  21 showing the expected renderings.
  22
  23 In order to avoid one-off dependencies for this task, this script uses
  24 a reasonably working HTML parser and the existing XPath implementation
  25 from Python's standard library. Hopefully, we won't render
  26 non-well-formed HTML.
  27
  28 # Commands
  29
  30 Commands start with an `@` followed by a command name (letters and
  31 hyphens), and zero or more arguments separated by one or more whitespace
  32 characters and optionally delimited with single or double quotes. The `@`
  33 mark cannot be preceded by a non-whitespace character. Other lines
  34 (including every text up to the first `@`) are ignored, but it is
  35 recommended to avoid the use of `@` in the template file.
  36
  37 There are a number of supported commands:
  38
  39 * `@has PATH` checks for the existence of the given file.
  40
  41   `PATH` is relative to the output directory. It can be given as `-`
  42   which repeats the most recently used `PATH`.
  43
  44 * `@has PATH PATTERN` and `@matches PATH PATTERN` check for
  45   the occurrence of the given pattern `PATTERN` in the specified file.
  46   Only one occurrence of the pattern is enough.
  47
  48   For `@has`, `PATTERN` is a whitespace-normalized (every consecutive
  49   whitespace being replaced by one single space character) string.
  50   The entire file is also whitespace-normalized including newlines.
  51
  52   For `@matches`, `PATTERN` is a Python-supported regular expression.
  53   The file remains intact but the regexp is matched without the `MULTILINE`
  54   and `IGNORECASE` options. You can still use a prefix `(?m)` or `(?i)`
  55   to override them, and `\A` and `\Z` for definitely matching
  56   the beginning and end of the file.
  57
  58   (The same distinction goes to other variants of these commands.)
  59
  60 * `@has PATH XPATH PATTERN` and `@matches PATH XPATH PATTERN` check for
  61   the presence of the given XPath `XPATH` in the specified HTML file,
  62   and also the occurrence of the given pattern `PATTERN` in the matching
  63   node or attribute. Only one occurrence of the pattern in the match
  64   is enough.
  65
  66   `PATH` should be a valid and well-formed HTML file. It does *not*
  67   accept arbitrary HTML5; it should have matching open and close tags
  68   and correct entity references at least.
  69
  70   `XPATH` is an XPath expression to match. The XPath is fairly limited:
  71   `tag`, `*`, `.`, `//`, `..`, `[@attr]`, `[@attr='value']`, `[tag]`,
  72   `[POS]` (element located in given `POS`), `[last()-POS]`, `text()`
  73   and `@attr` (both as the last segment) are supported. Some examples:
  74
  75   - `//pre` or `.//pre` matches any element with a name `pre`.
  76   - `//a[@href]` matches any element with an `href` attribute.
  77   - `//*[@class="impl"]//code` matches any element with a name `code`,
  78     which is a descendant of some element whose `class` attr is `impl`.
  79   - `//h1[@class="fqn"]/span[1]/a[last()]/@class` matches a value of
  80     `class` attribute in the last `a` element (can be followed by more
  81     elements that are not `a`) inside the first `span` in the `h1` with
  82     a class of `fqn`. Note that there cannot be any additional elements
  83     between them due to the use of `/` instead of `//`.
  84
  85   Do not try to use non-absolute paths, it won't work due to the flawed
  86   ElementTree implementation. The script rejects them.
  87
  88   For the text matches (i.e. paths not ending with `@attr`), any
  89   subelements are flattened into one string; this is handy for ignoring
  90   highlights for example. If you want to simply check for the presence of
  91   a given node or attribute, use an empty string (`""`) as a `PATTERN`.
  92
  93 * `@count PATH XPATH COUNT` checks for the occurrence of the given XPath
  94   in the specified file. The number of occurrences must match the given
  95   count.
  96
  97 * `@has-dir PATH` checks for the existence of the given directory.
  98
  99 All conditions can be negated with `!`. `@!has foo/type.NoSuch.html`
 100 checks if the given file does not exist, for example.
 101
 102 """
 103
 104 from __future__ import absolute_import, print_function, unicode_literals
 105
 106 import codecs
 107 import io
 108 import sys
 109 import os.path
 110 import re
 111 import shlex
 112 from collections import namedtuple
 113 try:
 114     from html.parser import HTMLParser
 115 except ImportError:
 116     from HTMLParser import HTMLParser
 117 try:
 118     from xml.etree import cElementTree as ET
 119 except ImportError:
 120     from xml.etree import ElementTree as ET
 121
 122 try:
 123     from html.entities import name2codepoint
 124 except ImportError:
 125     from htmlentitydefs import name2codepoint
 126
 127 # "void elements" (no closing tag) from the HTML Standard section 12.1.2
 128 VOID_ELEMENTS = {'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
 129                      'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr'}
 130
 131 # Python 2 -> 3 compatibility
 132 try:
 133     unichr
 134 except NameError:
 135     unichr = chr
 136
 137
 138 class CustomHTMLParser(HTMLParser):
 139     """simplified HTML parser.
 140
 141     this is possible because we are dealing with very regular HTML from
 142     rustdoc; we only have to deal with i) void elements and ii) empty
 143     attributes."""
 144     def __init__(self, target=None):
 145         HTMLParser.__init__(self)
 146         self.__builder = target or ET.TreeBuilder()
 147
 148     def handle_starttag(self, tag, attrs):
 149         attrs = {k: v or '' for k, v in attrs}
 150         self.__builder.start(tag, attrs)
 151         if tag in VOID_ELEMENTS:
 152             self.__builder.end(tag)
 153
 154     def handle_endtag(self, tag):
 155         self.__builder.end(tag)
 156
 157     def handle_startendtag(self, tag, attrs):
 158         attrs = {k: v or '' for k, v in attrs}
 159         self.__builder.start(tag, attrs)
 160         self.__builder.end(tag)
 161
 162     def handle_data(self, data):
 163         self.__builder.data(data)
 164
 165     def handle_entityref(self, name):
 166         self.__builder.data(unichr(name2codepoint[name]))
 167
 168     def handle_charref(self, name):
 169         code = int(name[1:], 16) if name.startswith(('x', 'X')) else int(name, 10)
 170         self.__builder.data(unichr(code))
 171
 172     def close(self):
 173         HTMLParser.close(self)
 174         return self.__builder.close()
 175
 176
 177 Command = namedtuple('Command', 'negated cmd args lineno context')
 178
 179
 180 class FailedCheck(Exception):
 181     pass
 182
 183
 184 class InvalidCheck(Exception):
 185     pass
 186
 187
 188 def concat_multi_lines(f):
 189     """returns a generator out of the file object, which
 190     - removes `\\` then `\n` then a shared prefix with the previous line then
 191       optional whitespace;
 192     - keeps a line number (starting from 0) of the first line being
 193       concatenated."""
 194     lastline = None  # set to the last line when the last line has a backslash
 195     firstlineno = None
 196     catenated = ''
 197     for lineno, line in enumerate(f):
 198         line = line.rstrip('\r\n')
 199
 200         # strip the common prefix from the current line if needed
 201         if lastline is not None:
 202             common_prefix = os.path.commonprefix([line, lastline])
 203             line = line[len(common_prefix):].lstrip()
 204
 205         firstlineno = firstlineno or lineno
 206         if line.endswith('\\'):
 207             if lastline is None:
 208                 lastline = line[:-1]
 209             catenated += line[:-1]
 210         else:
 211             yield firstlineno, catenated + line
 212             lastline = None
 213             firstlineno = None
 214             catenated = ''
 215
 216     if lastline is not None:
 217         print_err(lineno, line, 'Trailing backslash at the end of the file')
 218
 219
# Locates a command inside one (already concatenated) template line.
#
# - The match must start where the preceding character, if any, is
#   whitespace (the zero-width look-behind).
# - `invalid` captures a `!` written *before* the `@` (`!@cmd`), which
#   `get_commands` reports as an error.
# - `negated` captures a `!` written right after the `@` (`@!cmd`).
# - `cmd` is the command name: letters, optionally in hyphen-separated groups.
# - `args` is the remainder of the line, split into arguments later on.
LINE_PATTERN = re.compile(r'''
    (?<=(?<!\S))(?P<invalid>!?)@(?P<negated>!?)
    (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
    (?P<args>.*)$
''', re.X | re.UNICODE)
 225
 226
 227 def get_commands(template):
 228     with io.open(template, encoding='utf-8') as f:
 229         for lineno, line in concat_multi_lines(f):
 230             m = LINE_PATTERN.search(line)
 231             if not m:
 232                 continue
 233
 234             negated = (m.group('negated') == '!')
 235             cmd = m.group('cmd')
 236             if m.group('invalid') == '!':
 237                 print_err(
 238                     lineno,
 239                     line,
 240                     'Invalid command: `!@{0}{1}`, (help: try with `@!{1}`)'.format(
 241                         '!' if negated else '',
 242                         cmd,
 243                     ),
 244                 )
 245                 continue
 246             args = m.group('args')
 247             if args and not args[:1].isspace():
 248                 print_err(lineno, line, 'Invalid template syntax')
 249                 continue
 250             try:
 251                 args = shlex.split(args)
 252             except UnicodeEncodeError:
 253                 args = [arg.decode('utf-8') for arg in shlex.split(args.encode('utf-8'))]
 254             yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno+1, context=line)
 255
 256
 257 def _flatten(node, acc):
 258     if node.text:
 259         acc.append(node.text)
 260     for e in node:
 261         _flatten(e, acc)
 262         if e.tail:
 263             acc.append(e.tail)
 264
 265
 266 def flatten(node):
 267     acc = []
 268     _flatten(node, acc)
 269     return ''.join(acc)
 270
 271
 272 def normalize_xpath(path):
 273     if path.startswith('//'):
 274         return '.' + path  # avoid warnings
 275     elif path.startswith('.//'):
 276         return path
 277     else:
 278         raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')
 279
 280
 281 class CachedFiles(object):
 282     def __init__(self, root):
 283         self.root = root
 284         self.files = {}
 285         self.trees = {}
 286         self.last_path = None
 287
 288     def resolve_path(self, path):
 289         if path != '-':
 290             path = os.path.normpath(path)
 291             self.last_path = path
 292             return path
 293         elif self.last_path is None:
 294             raise InvalidCheck('Tried to use the previous path in the first command')
 295         else:
 296             return self.last_path
 297
 298     def get_file(self, path):
 299         path = self.resolve_path(path)
 300         if path in self.files:
 301             return self.files[path]
 302
 303         abspath = os.path.join(self.root, path)
 304         if not(os.path.exists(abspath) and os.path.isfile(abspath)):
 305             raise FailedCheck('File does not exist {!r}'.format(path))
 306
 307         with io.open(abspath, encoding='utf-8') as f:
 308             data = f.read()
 309             self.files[path] = data
 310             return data
 311
 312     def get_tree(self, path):
 313         path = self.resolve_path(path)
 314         if path in self.trees:
 315             return self.trees[path]
 316
 317         abspath = os.path.join(self.root, path)
 318         if not(os.path.exists(abspath) and os.path.isfile(abspath)):
 319             raise FailedCheck('File does not exist {!r}'.format(path))
 320
 321         with io.open(abspath, encoding='utf-8') as f:
 322             try:
 323                 tree = ET.fromstringlist(f.readlines(), CustomHTMLParser())
 324             except Exception as e:
 325                 raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
 326             self.trees[path] = tree
 327             return self.trees[path]
 328
 329     def get_dir(self, path):
 330         path = self.resolve_path(path)
 331         abspath = os.path.join(self.root, path)
 332         if not(os.path.exists(abspath) and os.path.isdir(abspath)):
 333             raise FailedCheck('Directory does not exist {!r}'.format(path))
 334
 335
 336 def check_string(data, pat, regexp):
 337     if not pat:
 338         return True  # special case a presence testing
 339     elif regexp:
 340         return re.search(pat, data, flags=re.UNICODE) is not None
 341     else:
 342         data = ' '.join(data.split())
 343         pat = ' '.join(pat.split())
 344         return pat in data
 345
 346
 347 def check_tree_attr(tree, path, attr, pat, regexp):
 348     path = normalize_xpath(path)
 349     ret = False
 350     for e in tree.findall(path):
 351         if attr in e.attrib:
 352             value = e.attrib[attr]
 353         else:
 354             continue
 355
 356         ret = check_string(value, pat, regexp)
 357         if ret:
 358             break
 359     return ret
 360
 361
 362 def check_tree_text(tree, path, pat, regexp):
 363     path = normalize_xpath(path)
 364     ret = False
 365     try:
 366         for e in tree.findall(path):
 367             try:
 368                 value = flatten(e)
 369             except KeyError:
 370                 continue
 371             else:
 372                 ret = check_string(value, pat, regexp)
 373                 if ret:
 374                     break
 375     except Exception:
 376         print('Failed to get path "{}"'.format(path))
 377         raise
 378     return ret
 379
 380
 381 def get_tree_count(tree, path):
 382     path = normalize_xpath(path)
 383     return len(tree.findall(path))
 384
 385
 386 def stderr(*args):
 387     if sys.version_info.major < 3:
 388         file = codecs.getwriter('utf-8')(sys.stderr)
 389     else:
 390         file = sys.stderr
 391
 392     print(*args, file=file)
 393
 394
 395 def print_err(lineno, context, err, message=None):
 396     global ERR_COUNT
 397     ERR_COUNT += 1
 398     stderr("{}: {}".format(lineno, message or err))
 399     if message and err:
 400         stderr("\t{}".format(err))
 401
 402     if context:
 403         stderr("\t{}".format(context))
 404
 405
# Number of errors reported so far; incremented by `print_err` and inspected
# at the end of the script to decide on the exit status.
ERR_COUNT = 0
 407
 408
 409 def check_command(c, cache):
 410     try:
 411         cerr = ""
 412         if c.cmd == 'has' or c.cmd == 'matches':  # string test
 413             regexp = (c.cmd == 'matches')
 414             if len(c.args) == 1 and not regexp:  # @has <path> = file existence
 415                 try:
 416                     cache.get_file(c.args[0])
 417                     ret = True
 418                 except FailedCheck as err:
 419                     cerr = str(err)
 420                     ret = False
 421             elif len(c.args) == 2:  # @has/matches <path> <pat> = string test
 422                 cerr = "`PATTERN` did not match"
 423                 ret = check_string(cache.get_file(c.args[0]), c.args[1], regexp)
 424             elif len(c.args) == 3:  # @has/matches <path> <pat> <match> = XML tree test
 425                 cerr = "`XPATH PATTERN` did not match"
 426                 tree = cache.get_tree(c.args[0])
 427                 pat, sep, attr = c.args[1].partition('/@')
 428                 if sep:  # attribute
 429                     tree = cache.get_tree(c.args[0])
 430                     ret = check_tree_attr(tree, pat, attr, c.args[2], regexp)
 431                 else:  # normalized text
 432                     pat = c.args[1]
 433                     if pat.endswith('/text()'):
 434                         pat = pat[:-7]
 435                     ret = check_tree_text(cache.get_tree(c.args[0]), pat, c.args[2], regexp)
 436             else:
 437                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 438
 439         elif c.cmd == 'count':  # count test
 440             if len(c.args) == 3:  # @count <path> <pat> <count> = count test
 441                 expected = int(c.args[2])
 442                 found = get_tree_count(cache.get_tree(c.args[0]), c.args[1])
 443                 cerr = "Expected {} occurrences but found {}".format(expected, found)
 444                 ret = expected == found
 445             else:
 446                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 447         elif c.cmd == 'has-dir':  # has-dir test
 448             if len(c.args) == 1:  # @has-dir <path> = has-dir test
 449                 try:
 450                     cache.get_dir(c.args[0])
 451                     ret = True
 452                 except FailedCheck as err:
 453                     cerr = str(err)
 454                     ret = False
 455             else:
 456                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 457         elif c.cmd == 'valid-html':
 458             raise InvalidCheck('Unimplemented @valid-html')
 459
 460         elif c.cmd == 'valid-links':
 461             raise InvalidCheck('Unimplemented @valid-links')
 462         else:
 463             raise InvalidCheck('Unrecognized @{}'.format(c.cmd))
 464
 465         if ret == c.negated:
 466             raise FailedCheck(cerr)
 467
 468     except FailedCheck as err:
 469         message = '@{}{} check failed'.format('!' if c.negated else '', c.cmd)
 470         print_err(c.lineno, c.context, str(err), message)
 471     except InvalidCheck as err:
 472         print_err(c.lineno, c.context, str(err))
 473
 474
 475 def check(target, commands):
 476     cache = CachedFiles(target)
 477     for c in commands:
 478         check_command(c, cache)
 479
 480
 481 if __name__ == '__main__':
 482     if len(sys.argv) != 3:
 483         stderr('Usage: {} <doc dir> <template>'.format(sys.argv[0]))
 484         raise SystemExit(1)
 485
 486     check(sys.argv[1], get_commands(sys.argv[2]))
 487     if ERR_COUNT:
 488         stderr("\nEncountered {} errors".format(ERR_COUNT))
 489         raise SystemExit(1)