src/etc/htmldocck.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 r"""
   5 htmldocck.py is a custom checker script for Rustdoc HTML outputs.
   6
   7 # How and why?
   8
   9 The principle is simple: This script receives a path to generated HTML
  10 documentation and a "template" script, which has a series of check
  11 commands like `@has` or `@matches`. Each command is used to check if
  12 some pattern is present or not present in the particular file or in
  13 a particular node of the HTML tree. In many cases, the template script
  14 happens to be the source code given to rustdoc.
  15
  16 While it indeed is possible to test in smaller portions, it has been
  17 hard to construct tests in this fashion and major rendering errors were
  18 discovered much later. This script is designed to make black-box and
regression testing of Rustdoc easy. This does not preclude the need for
  20 unit testing, but can be used to complement related tests by quickly
  21 showing the expected renderings.
  22
  23 In order to avoid one-off dependencies for this task, this script uses
  24 a reasonably working HTML parser and the existing XPath implementation
  25 from Python's standard library. Hopefully, we won't render
  26 non-well-formed HTML.
  27
  28 # Commands
  29
  30 Commands start with an `@` followed by a command name (letters and
  31 hyphens), and zero or more arguments separated by one or more whitespace
  32 characters and optionally delimited with single or double quotes. The `@`
  33 mark cannot be preceded by a non-whitespace character. Other lines
  34 (including every text up to the first `@`) are ignored, but it is
  35 recommended to avoid the use of `@` in the template file.
  36
  37 There are a number of supported commands:
  38
  39 * `@has PATH` checks for the existence of the given file.
  40
  41   `PATH` is relative to the output directory. It can be given as `-`
  42   which repeats the most recently used `PATH`.
  43
  44 * `@has PATH PATTERN` and `@matches PATH PATTERN` checks for
  45   the occurrence of the given pattern `PATTERN` in the specified file.
  46   Only one occurrence of the pattern is enough.
  47
  48   For `@has`, `PATTERN` is a whitespace-normalized (every consecutive
  49   whitespace being replaced by one single space character) string.
  50   The entire file is also whitespace-normalized including newlines.
  51
  52   For `@matches`, `PATTERN` is a Python-supported regular expression.
  53   The file remains intact but the regexp is matched without the `MULTILINE`
  54   and `IGNORECASE` options. You can still use a prefix `(?m)` or `(?i)`
  55   to override them, and `\A` and `\Z` for definitely matching
  56   the beginning and end of the file.
  57
  58   (The same distinction goes to other variants of these commands.)
  59
  60 * `@has PATH XPATH PATTERN` and `@matches PATH XPATH PATTERN` checks for
  61   the presence of the given XPath `XPATH` in the specified HTML file,
  62   and also the occurrence of the given pattern `PATTERN` in the matching
  63   node or attribute. Only one occurrence of the pattern in the match
  64   is enough.
  65
  66   `PATH` should be a valid and well-formed HTML file. It does *not*
  67   accept arbitrary HTML5; it should have matching open and close tags
  68   and correct entity references at least.
  69
  70   `XPATH` is an XPath expression to match. The XPath is fairly limited:
  71   `tag`, `*`, `.`, `//`, `..`, `[@attr]`, `[@attr='value']`, `[tag]`,
  72   `[POS]` (element located in given `POS`), `[last()-POS]`, `text()`
  73   and `@attr` (both as the last segment) are supported. Some examples:
  74
  75   - `//pre` or `.//pre` matches any element with a name `pre`.
  76   - `//a[@href]` matches any element with an `href` attribute.
  - `//*[@class="impl"]//code` matches any element with a name `code`,
    which is a descendant of some element whose `class` attr is `impl`.
  79   - `//h1[@class="fqn"]/span[1]/a[last()]/@class` matches a value of
  80     `class` attribute in the last `a` element (can be followed by more
  81     elements that are not `a`) inside the first `span` in the `h1` with
  82     a class of `fqn`. Note that there cannot be any additional elements
  83     between them due to the use of `/` instead of `//`.
  84
  85   Do not try to use non-absolute paths, it won't work due to the flawed
  86   ElementTree implementation. The script rejects them.
  87
  88   For the text matches (i.e. paths not ending with `@attr`), any
  89   subelements are flattened into one string; this is handy for ignoring
  90   highlights for example. If you want to simply check for the presence of
  91   a given node or attribute, use an empty string (`""`) as a `PATTERN`.
  92
  93 * `@count PATH XPATH COUNT` checks for the occurrence of the given XPath
  94   in the specified file. The number of occurrences must match the given
  95   count.
  96
  97 * `@snapshot NAME PATH XPATH` creates a snapshot test named NAME.
  98   A snapshot test captures a subtree of the DOM, at the location
  99   determined by the XPath, and compares it to a pre-recorded value
 100   in a file. The file's name is the test's name with the `.rs` extension
 101   replaced with `.NAME.html`, where NAME is the snapshot's name.
 102
 103   htmldocck supports the `--bless` option to accept the current subtree
 104   as expected, saving it to the file determined by the snapshot's name.
 105   compiletest's `--bless` flag is forwarded to htmldocck.
 106
 107 * `@has-dir PATH` checks for the existence of the given directory.
 108
 109 All conditions can be negated with `!`. `@!has foo/type.NoSuch.html`
 110 checks if the given file does not exist, for example.
 111
 112 """
 113
 114 from __future__ import absolute_import, print_function, unicode_literals
 115
 116 import codecs
 117 import io
 118 import sys
 119 import os.path
 120 import re
 121 import shlex
 122 from collections import namedtuple
 123 try:
 124     from html.parser import HTMLParser
 125 except ImportError:
 126     from HTMLParser import HTMLParser
 127 try:
 128     from xml.etree import cElementTree as ET
 129 except ImportError:
 130     from xml.etree import ElementTree as ET
 131
 132 try:
 133     from html.entities import name2codepoint
 134 except ImportError:
 135     from htmlentitydefs import name2codepoint
 136
# "void elements" (no closing tag) from the HTML Standard section 12.1.2
# CustomHTMLParser closes these immediately after opening them, since no
# end tag will follow in the input.
VOID_ELEMENTS = {'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
                     'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr'}

# Python 2 -> 3 compatibility
# (`unichr` does not exist on Python 3, where `chr` already handles the full
# Unicode range.)
try:
    unichr
except NameError:
    unichr = chr


# Value substituted for the `{{channel}}` placeholder in XPaths and patterns.
# Read eagerly at import time: a missing variable raises KeyError here.
channel = os.environ["DOC_RUST_LANG_ORG_CHANNEL"]

# Initialized in main
# rust_test_path: path of the template (test) file, used to locate snapshots.
# bless: whether snapshot files should be (re)written instead of compared.
rust_test_path = None
bless = None
 153
 154 class CustomHTMLParser(HTMLParser):
 155     """simplified HTML parser.
 156
 157     this is possible because we are dealing with very regular HTML from
 158     rustdoc; we only have to deal with i) void elements and ii) empty
 159     attributes."""
 160     def __init__(self, target=None):
 161         HTMLParser.__init__(self)
 162         self.__builder = target or ET.TreeBuilder()
 163
 164     def handle_starttag(self, tag, attrs):
 165         attrs = {k: v or '' for k, v in attrs}
 166         self.__builder.start(tag, attrs)
 167         if tag in VOID_ELEMENTS:
 168             self.__builder.end(tag)
 169
 170     def handle_endtag(self, tag):
 171         self.__builder.end(tag)
 172
 173     def handle_startendtag(self, tag, attrs):
 174         attrs = {k: v or '' for k, v in attrs}
 175         self.__builder.start(tag, attrs)
 176         self.__builder.end(tag)
 177
 178     def handle_data(self, data):
 179         self.__builder.data(data)
 180
 181     def handle_entityref(self, name):
 182         self.__builder.data(unichr(name2codepoint[name]))
 183
 184     def handle_charref(self, name):
 185         code = int(name[1:], 16) if name.startswith(('x', 'X')) else int(name, 10)
 186         self.__builder.data(unichr(code))
 187
 188     def close(self):
 189         HTMLParser.close(self)
 190         return self.__builder.close()
 191
 192
 193 Command = namedtuple('Command', 'negated cmd args lineno context')
 194
 195
 196 class FailedCheck(Exception):
 197     pass
 198
 199
 200 class InvalidCheck(Exception):
 201     pass
 202
 203
 204 def concat_multi_lines(f):
 205     """returns a generator out of the file object, which
 206     - removes `\\` then `\n` then a shared prefix with the previous line then
 207       optional whitespace;
 208     - keeps a line number (starting from 0) of the first line being
 209       concatenated."""
 210     lastline = None  # set to the last line when the last line has a backslash
 211     firstlineno = None
 212     catenated = ''
 213     for lineno, line in enumerate(f):
 214         line = line.rstrip('\r\n')
 215
 216         # strip the common prefix from the current line if needed
 217         if lastline is not None:
 218             common_prefix = os.path.commonprefix([line, lastline])
 219             line = line[len(common_prefix):].lstrip()
 220
 221         firstlineno = firstlineno or lineno
 222         if line.endswith('\\'):
 223             if lastline is None:
 224                 lastline = line[:-1]
 225             catenated += line[:-1]
 226         else:
 227             yield firstlineno, catenated + line
 228             lastline = None
 229             firstlineno = None
 230             catenated = ''
 231
 232     if lastline is not None:
 233         print_err(lineno, line, 'Trailing backslash at the end of the file')
 234
 235
# Matches one command inside a (possibly concatenated) template line.
# Named groups:
#   invalid - a `!` placed before the `@` (the unsupported `!@cmd` spelling)
#   negated - a `!` placed after the `@` (the supported `@!cmd` spelling)
#   cmd     - the command name: letters, optionally joined by single hyphens
#   args    - the rest of the line, split into arguments by the caller
# The lookbehind requires the command to start the line or follow whitespace.
LINE_PATTERN = re.compile(r'''
    (?<=(?<!\S))(?P<invalid>!?)@(?P<negated>!?)
    (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
    (?P<args>.*)$
''', re.X | re.UNICODE)
 241
 242
 243 def get_commands(template):
 244     with io.open(template, encoding='utf-8') as f:
 245         for lineno, line in concat_multi_lines(f):
 246             m = LINE_PATTERN.search(line)
 247             if not m:
 248                 continue
 249
 250             negated = (m.group('negated') == '!')
 251             cmd = m.group('cmd')
 252             if m.group('invalid') == '!':
 253                 print_err(
 254                     lineno,
 255                     line,
 256                     'Invalid command: `!@{0}{1}`, (help: try with `@!{1}`)'.format(
 257                         '!' if negated else '',
 258                         cmd,
 259                     ),
 260                 )
 261                 continue
 262             args = m.group('args')
 263             if args and not args[:1].isspace():
 264                 print_err(lineno, line, 'Invalid template syntax')
 265                 continue
 266             try:
 267                 args = shlex.split(args)
 268             except UnicodeEncodeError:
 269                 args = [arg.decode('utf-8') for arg in shlex.split(args.encode('utf-8'))]
 270             yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno+1, context=line)
 271
 272
 273 def _flatten(node, acc):
 274     if node.text:
 275         acc.append(node.text)
 276     for e in node:
 277         _flatten(e, acc)
 278         if e.tail:
 279             acc.append(e.tail)
 280
 281
 282 def flatten(node):
 283     acc = []
 284     _flatten(node, acc)
 285     return ''.join(acc)
 286
 287
 288 def make_xml(text):
 289     xml = ET.XML('<xml>%s</xml>' % text)
 290     return xml
 291
 292
 293 def normalize_xpath(path):
 294     path = path.replace("{{channel}}", channel)
 295     if path.startswith('//'):
 296         return '.' + path  # avoid warnings
 297     elif path.startswith('.//'):
 298         return path
 299     else:
 300         raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')
 301
 302
 303 class CachedFiles(object):
 304     def __init__(self, root):
 305         self.root = root
 306         self.files = {}
 307         self.trees = {}
 308         self.last_path = None
 309
 310     def resolve_path(self, path):
 311         if path != '-':
 312             path = os.path.normpath(path)
 313             self.last_path = path
 314             return path
 315         elif self.last_path is None:
 316             raise InvalidCheck('Tried to use the previous path in the first command')
 317         else:
 318             return self.last_path
 319
 320     def get_file(self, path):
 321         path = self.resolve_path(path)
 322         if path in self.files:
 323             return self.files[path]
 324
 325         abspath = os.path.join(self.root, path)
 326         if not(os.path.exists(abspath) and os.path.isfile(abspath)):
 327             raise FailedCheck('File does not exist {!r}'.format(path))
 328
 329         with io.open(abspath, encoding='utf-8') as f:
 330             data = f.read()
 331             self.files[path] = data
 332             return data
 333
 334     def get_tree(self, path):
 335         path = self.resolve_path(path)
 336         if path in self.trees:
 337             return self.trees[path]
 338
 339         abspath = os.path.join(self.root, path)
 340         if not(os.path.exists(abspath) and os.path.isfile(abspath)):
 341             raise FailedCheck('File does not exist {!r}'.format(path))
 342
 343         with io.open(abspath, encoding='utf-8') as f:
 344             try:
 345                 tree = ET.fromstringlist(f.readlines(), CustomHTMLParser())
 346             except Exception as e:
 347                 raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
 348             self.trees[path] = tree
 349             return self.trees[path]
 350
 351     def get_dir(self, path):
 352         path = self.resolve_path(path)
 353         abspath = os.path.join(self.root, path)
 354         if not(os.path.exists(abspath) and os.path.isdir(abspath)):
 355             raise FailedCheck('Directory does not exist {!r}'.format(path))
 356
 357
 358 def check_string(data, pat, regexp):
 359     pat = pat.replace("{{channel}}", channel)
 360     if not pat:
 361         return True  # special case a presence testing
 362     elif regexp:
 363         return re.search(pat, data, flags=re.UNICODE) is not None
 364     else:
 365         data = ' '.join(data.split())
 366         pat = ' '.join(pat.split())
 367         return pat in data
 368
 369
 370 def check_tree_attr(tree, path, attr, pat, regexp):
 371     path = normalize_xpath(path)
 372     ret = False
 373     for e in tree.findall(path):
 374         if attr in e.attrib:
 375             value = e.attrib[attr]
 376         else:
 377             continue
 378
 379         ret = check_string(value, pat, regexp)
 380         if ret:
 381             break
 382     return ret
 383
 384
 385 def check_tree_text(tree, path, pat, regexp):
 386     path = normalize_xpath(path)
 387     ret = False
 388     try:
 389         for e in tree.findall(path):
 390             try:
 391                 value = flatten(e)
 392             except KeyError:
 393                 continue
 394             else:
 395                 ret = check_string(value, pat, regexp)
 396                 if ret:
 397                     break
 398     except Exception:
 399         print('Failed to get path "{}"'.format(path))
 400         raise
 401     return ret
 402
 403
 404 def get_tree_count(tree, path):
 405     path = normalize_xpath(path)
 406     return len(tree.findall(path))
 407
 408
 409 def check_snapshot(snapshot_name, actual_tree, normalize_to_text):
 410     assert rust_test_path.endswith('.rs')
 411     snapshot_path = '{}.{}.{}'.format(rust_test_path[:-3], snapshot_name, 'html')
 412     try:
 413         with open(snapshot_path, 'r') as snapshot_file:
 414             expected_str = snapshot_file.read()
 415     except FileNotFoundError:
 416         if bless:
 417             expected_str = None
 418         else:
 419             raise FailedCheck('No saved snapshot value')
 420
 421     if not normalize_to_text:
 422         actual_str = ET.tostring(actual_tree).decode('utf-8')
 423     else:
 424         actual_str = flatten(actual_tree)
 425
 426     # Conditions:
 427     #  1. Is --bless
 428     #  2. Are actual and expected tree different
 429     #  3. Are actual and expected text different
 430     if not expected_str \
 431         or (not normalize_to_text and \
 432             not compare_tree(make_xml(actual_str), make_xml(expected_str), stderr)) \
 433         or (normalize_to_text and actual_str != expected_str):
 434
 435         if bless:
 436             with open(snapshot_path, 'w') as snapshot_file:
 437                 snapshot_file.write(actual_str)
 438         else:
 439             print('--- expected ---\n')
 440             print(expected_str)
 441             print('\n\n--- actual ---\n')
 442             print(actual_str)
 443             print()
 444             raise FailedCheck('Actual snapshot value is different than expected')
 445
 446
 447 # Adapted from https://github.com/formencode/formencode/blob/3a1ba9de2fdd494dd945510a4568a3afeddb0b2e/formencode/doctest_xml_compare.py#L72-L120
 448 def compare_tree(x1, x2, reporter=None):
 449     if x1.tag != x2.tag:
 450         if reporter:
 451             reporter('Tags do not match: %s and %s' % (x1.tag, x2.tag))
 452         return False
 453     for name, value in x1.attrib.items():
 454         if x2.attrib.get(name) != value:
 455             if reporter:
 456                 reporter('Attributes do not match: %s=%r, %s=%r'
 457                          % (name, value, name, x2.attrib.get(name)))
 458             return False
 459     for name in x2.attrib:
 460         if name not in x1.attrib:
 461             if reporter:
 462                 reporter('x2 has an attribute x1 is missing: %s'
 463                          % name)
 464             return False
 465     if not text_compare(x1.text, x2.text):
 466         if reporter:
 467             reporter('text: %r != %r' % (x1.text, x2.text))
 468         return False
 469     if not text_compare(x1.tail, x2.tail):
 470         if reporter:
 471             reporter('tail: %r != %r' % (x1.tail, x2.tail))
 472         return False
 473     cl1 = list(x1)
 474     cl2 = list(x2)
 475     if len(cl1) != len(cl2):
 476         if reporter:
 477             reporter('children length differs, %i != %i'
 478                      % (len(cl1), len(cl2)))
 479         return False
 480     i = 0
 481     for c1, c2 in zip(cl1, cl2):
 482         i += 1
 483         if not compare_tree(c1, c2, reporter=reporter):
 484             if reporter:
 485                 reporter('children %i do not match: %s'
 486                          % (i, c1.tag))
 487             return False
 488     return True
 489
 490
 491 def text_compare(t1, t2):
 492     if not t1 and not t2:
 493         return True
 494     if t1 == '*' or t2 == '*':
 495         return True
 496     return (t1 or '').strip() == (t2 or '').strip()
 497
 498
 499 def stderr(*args):
 500     if sys.version_info.major < 3:
 501         file = codecs.getwriter('utf-8')(sys.stderr)
 502     else:
 503         file = sys.stderr
 504
 505     print(*args, file=file)
 506
 507
 508 def print_err(lineno, context, err, message=None):
 509     global ERR_COUNT
 510     ERR_COUNT += 1
 511     stderr("{}: {}".format(lineno, message or err))
 512     if message and err:
 513         stderr("\t{}".format(err))
 514
 515     if context:
 516         stderr("\t{}".format(context))
 517
 518
# Number of errors reported so far; incremented by print_err and inspected
# by the script entry point to decide the exit status.
ERR_COUNT = 0
 520
 521
 522 def check_command(c, cache):
 523     try:
 524         cerr = ""
 525         if c.cmd == 'has' or c.cmd == 'matches':  # string test
 526             regexp = (c.cmd == 'matches')
 527             if len(c.args) == 1 and not regexp:  # @has <path> = file existence
 528                 try:
 529                     cache.get_file(c.args[0])
 530                     ret = True
 531                 except FailedCheck as err:
 532                     cerr = str(err)
 533                     ret = False
 534             elif len(c.args) == 2:  # @has/matches <path> <pat> = string test
 535                 cerr = "`PATTERN` did not match"
 536                 ret = check_string(cache.get_file(c.args[0]), c.args[1], regexp)
 537             elif len(c.args) == 3:  # @has/matches <path> <pat> <match> = XML tree test
 538                 cerr = "`XPATH PATTERN` did not match"
 539                 tree = cache.get_tree(c.args[0])
 540                 pat, sep, attr = c.args[1].partition('/@')
 541                 if sep:  # attribute
 542                     tree = cache.get_tree(c.args[0])
 543                     ret = check_tree_attr(tree, pat, attr, c.args[2], regexp)
 544                 else:  # normalized text
 545                     pat = c.args[1]
 546                     if pat.endswith('/text()'):
 547                         pat = pat[:-7]
 548                     ret = check_tree_text(cache.get_tree(c.args[0]), pat, c.args[2], regexp)
 549             else:
 550                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 551
 552         elif c.cmd == 'count':  # count test
 553             if len(c.args) == 3:  # @count <path> <pat> <count> = count test
 554                 expected = int(c.args[2])
 555                 found = get_tree_count(cache.get_tree(c.args[0]), c.args[1])
 556                 cerr = "Expected {} occurrences but found {}".format(expected, found)
 557                 ret = expected == found
 558             else:
 559                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 560
 561         elif c.cmd == 'snapshot':  # snapshot test
 562             if len(c.args) == 3:  # @snapshot <snapshot-name> <html-path> <xpath>
 563                 [snapshot_name, html_path, pattern] = c.args
 564                 tree = cache.get_tree(html_path)
 565                 xpath = normalize_xpath(pattern)
 566                 normalize_to_text = False
 567                 if xpath.endswith('/text()'):
 568                     xpath = xpath[:-7]
 569                     normalize_to_text = True
 570
 571                 subtrees = tree.findall(xpath)
 572                 if len(subtrees) == 1:
 573                     [subtree] = subtrees
 574                     try:
 575                         check_snapshot(snapshot_name, subtree, normalize_to_text)
 576                         ret = True
 577                     except FailedCheck as err:
 578                         cerr = str(err)
 579                         ret = False
 580                 elif len(subtrees) == 0:
 581                     raise FailedCheck('XPATH did not match')
 582                 else:
 583                     raise FailedCheck('Expected 1 match, but found {}'.format(len(subtrees)))
 584             else:
 585                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 586
 587         elif c.cmd == 'has-dir':  # has-dir test
 588             if len(c.args) == 1:  # @has-dir <path> = has-dir test
 589                 try:
 590                     cache.get_dir(c.args[0])
 591                     ret = True
 592                 except FailedCheck as err:
 593                     cerr = str(err)
 594                     ret = False
 595             else:
 596                 raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
 597
 598         elif c.cmd == 'valid-html':
 599             raise InvalidCheck('Unimplemented @valid-html')
 600
 601         elif c.cmd == 'valid-links':
 602             raise InvalidCheck('Unimplemented @valid-links')
 603
 604         else:
 605             raise InvalidCheck('Unrecognized @{}'.format(c.cmd))
 606
 607         if ret == c.negated:
 608             raise FailedCheck(cerr)
 609
 610     except FailedCheck as err:
 611         message = '@{}{} check failed'.format('!' if c.negated else '', c.cmd)
 612         print_err(c.lineno, c.context, str(err), message)
 613     except InvalidCheck as err:
 614         print_err(c.lineno, c.context, str(err))
 615
 616
 617 def check(target, commands):
 618     cache = CachedFiles(target)
 619     for c in commands:
 620         check_command(c, cache)
 621
 622
 623 if __name__ == '__main__':
 624     if len(sys.argv) not in [3, 4]:
 625         stderr('Usage: {} <doc dir> <template> [--bless]'.format(sys.argv[0]))
 626         raise SystemExit(1)
 627
 628     rust_test_path = sys.argv[2]
 629     if len(sys.argv) > 3 and sys.argv[3] == '--bless':
 630         bless = True
 631     else:
 632         # We only support `--bless` at the end of the arguments.
 633         # This assert is to prevent silent failures.
 634         assert '--bless' not in sys.argv
 635         bless = False
 636     check(sys.argv[1], get_commands(rust_test_path))
 637     if ERR_COUNT:
 638         stderr("\nEncountered {} errors".format(ERR_COUNT))
 639         raise SystemExit(1)