PhysLean/scripts/lint-style.py
#!/usr/bin/env python3
"""
Taken from Mathlib4.
Lint a file or files from mathlib for style.
Sample usage:
$ ./scripts/lint-style.py $(find Mathlib -name '*.lean')
which will lint all of the Lean files in the specified directories.
The resulting error output will contain one line for each style error
encountered that isn't in the list of allowed / ignored style exceptions.
Paths with no errors will not appear in the output, and the script will
exit with a successful return code if there are no errors encountered in
any provided paths.
Paths emitted in the output will match the paths provided on the
command line for any files containing errors -- in particular, linting
a relative path (like ``Mathlib/Foo/Bar.lean``) will produce errors
that contain the relative path, whilst linting absolute paths (like
``/root/mathlib4/Mathlib/Foo/Bar.lean``) will produce errors with the
absolute path.
This script can also be used to regenerate the list of allowed / ignored style
exceptions by redirecting the output to ``style-exceptions.txt``. Use:
$ ./scripts/update-style-exceptions.sh
to perform this update.
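Passing ``--fix`` additionally rewrites each linted file in place, applying the fixes the
script suggests.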
"""
# TODO: This is adapted from the linter for mathlib3. It should be rewritten in Lean.
from pathlib import Path
import sys
import re
import shutil
ERR_COP = 0 # copyright header
ERR_MOD = 2 # module docstring
ERR_LIN = 3 # line length
ERR_OPT = 6 # set_option
ERR_AUT = 7 # malformed authors list
ERR_TAC = 9 # imported Mathlib.Tactic
ERR_IBY = 11 # isolated by
ERR_DOT = 12 # isolated or low focusing dot
ERR_SEM = 13 # the substring " ;"
ERR_WIN = 14 # Windows line endings "\r\n"
ERR_TWS = 15 # trailing whitespace
ERR_CLN = 16 # line starts with a colon
ERR_IND = 17 # second line not correctly indented
ERR_ARR = 18 # space after "←"
ERR_NUM_LIN = 19 # file is too large
ERR_NSP = 20 # non-terminal simp
exceptions = []
SCRIPTS_DIR = Path(__file__).parent.resolve()
ROOT_DIR = SCRIPTS_DIR.parent
with SCRIPTS_DIR.joinpath("style-exceptions.txt").open(encoding="utf-8") as f:
for exline in f:
filename, _, _, _, _, errno, *extra = exline.split()
path = ROOT_DIR / filename
if errno == "ERR_COP":
exceptions += [(ERR_COP, path, None)]
elif errno == "ERR_MOD":
exceptions += [(ERR_MOD, path, None)]
elif errno == "ERR_LIN":
exceptions += [(ERR_LIN, path, None)]
elif errno == "ERR_OPT":
exceptions += [(ERR_OPT, path, None)]
elif errno == "ERR_AUT":
exceptions += [(ERR_AUT, path, None)]
elif errno == "ERR_TAC":
exceptions += [(ERR_TAC, path, None)]
elif errno == "ERR_NUM_LIN":
exceptions += [(ERR_NUM_LIN, path, extra[1])]
else:
print(f"Error: unexpected errno in style-exceptions.txt: {errno}")
sys.exit(1)
new_exceptions = False
def annotate_comments(enumerate_lines):
"""
Take a list of tuples of enumerated lines of the form
(line_number, line, ...)
and return a list of
(line_number, line, ..., True/False)
where lines have True attached when they are in comments.
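    For example (illustrative, trailing newlines omitted):
        (1, "/- intro"), (2, "-/"), (3, "def f := 1")
    is annotated as
        (1, "/- intro", True), (2, "-/", True), (3, "def f := 1", False)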
"""
nesting_depth = 0 # We're in a comment when `nesting_depth > 0`.
starts_in_comment = False # Whether we're in a comment when starting the line.
for line_nr, line, *rem in enumerate_lines:
# We assume multiline comments do not begin or end within single-line comments.
if line == "\n" or line.lstrip().startswith("--"):
yield line_nr, line, *rem, True
continue
# We assume that "/-/" and "-/-" never occur outside of "--" comments.
# We assume that we do not encounter "... -/ <term> /- ...".
# We also don't account for "/-" and "-/" appearing in strings.
starts_in_comment = (nesting_depth > 0)
nesting_depth = nesting_depth + line.count("/-") - line.count("-/")
in_comment = (starts_in_comment or line.lstrip().startswith("/-")) and \
(nesting_depth > 0 or line.rstrip().endswith("-/"))
yield line_nr, line, *rem, in_comment
def annotate_strings(enumerate_lines):
"""
Take a list of tuples of enumerated lines of the form
(line_number, line, ...)
and return a list of
(line_number, line, ..., True/False)
where lines have True attached when they are in strings.
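    For example (illustrative), any line containing a quote character, such as
        (4, 'example : "a" = "a" := rfl')
    is conservatively annotated with True.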
"""
in_string = False
in_comment = False
for line_nr, line, *rem in enumerate_lines:
# ignore comment markers inside string literals
if not in_string:
if "/-" in line:
in_comment = True
if "-/" in line:
in_comment = False
# ignore quotes inside comments
if not in_comment:
# crude heuristic: if the number of non-escaped quote signs is odd,
# we're starting / ending a string literal
            if (line.count("\"") - line.count("\\\"")) % 2 == 1:
in_string = not in_string
# if there are quote signs in this line,
# a string literal probably begins and / or ends here,
# so we skip this line
if line.count("\"") > 0:
yield line_nr, line, *rem, True
continue
if in_string:
yield line_nr, line, *rem, True
continue
yield line_nr, line, *rem, False
def set_option_check(lines, path):
errors = []
newlines = []
for line_nr, line, in_comment, in_string in annotate_strings(annotate_comments(lines)):
if line.strip().startswith('set_option') and not in_comment and not in_string:
option_prefix = line.strip().split(' ', 2)[1].split('.', 1)[0]
# forbidden options: pp, profiler, trace
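            # e.g. "set_option pp.all true" or "set_option trace.Meta.synthInstance true" are
            # flagged; "set_option maxHeartbeats 400000" (illustrative) is allowed.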
if option_prefix in {'pp', 'profiler', 'trace'}:
errors += [(ERR_OPT, line_nr, path)]
# skip adding this line to newlines so that we suggest removal
continue
newlines.append((line_nr, line))
return errors, newlines
def line_endings_check(lines, path):
errors = []
newlines = []
for line_nr, line in lines:
if "\r\n" in line:
errors += [(ERR_WIN, line_nr, path)]
line = line.replace("\r\n", "\n")
if line.endswith(" \n"):
errors += [(ERR_TWS, line_nr, path)]
line = line.rstrip() + "\n"
newlines.append((line_nr, line))
return errors, newlines
def four_spaces_in_second_line(lines, path):
# TODO: also fix the space for all lines before ":=", right now we only fix the line after
# the first line break
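    # Illustrative examples of the expected layout:
    #   theorem foo (n : Nat) :
    #       n = n := rfl          -- continuation line indented by 4 spaces
    #   def bar : Nat → Nat
    #     | 0 => 0                -- "| ..." alternatives indented by 2 spaces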
errors = []
# We never alter the first line, as it does not occur as next_line in the iteration over the
# zipped lines below, hence we add it here
newlines = [lines[0]]
annotated_lines = list(annotate_comments(lines))
for (_, line, is_comment), (next_line_nr, next_line, _) in zip(annotated_lines,
annotated_lines[1:]):
        # Check if the current line starts a (protected) def/lemma/theorem whose statement has
        # no ":=" on the same line
new_next_line = next_line
if (not is_comment) and re.search(r"^(protected )?(def|lemma|theorem) (?!.*:=).*(where)?$",
line):
# Calculate the number of spaces before the first non-space character in the next line
stripped_next_line = next_line.lstrip()
if not (next_line == '\n' or next_line.startswith("#") or stripped_next_line.startswith("--")):
num_spaces = len(next_line) - len(stripped_next_line)
                # The bare "|" symbol also appears in other contexts (e.g. norms such as |x|),
                # but there it is not followed by a space, so matching the two-character
                # string "| " is enough to identify pattern-matching alternatives.
if stripped_next_line.startswith("| ") or line.endswith("where\n"):
# Check and fix if the number of leading space is not 2
if num_spaces != 2:
errors += [(ERR_IND, next_line_nr, path)]
new_next_line = ' ' * 2 + stripped_next_line
# Check and fix if the number of leading spaces is not 4
else:
if num_spaces != 4:
errors += [(ERR_IND, next_line_nr, path)]
new_next_line = ' ' * 4 + stripped_next_line
newlines.append((next_line_nr, new_next_line))
return errors, newlines
def nonterminal_simp_check(lines, path):
errors = []
newlines = []
annotated_lines = list(annotate_comments(lines))
for (line_nr, line, is_comment), (_, next_line, _) in zip(annotated_lines,
annotated_lines[1:]):
# Check if the current line matches whitespace followed by "simp"
new_line = line
# TODO it would be better to use a regex like r"^\s*simp( \[.*\])?( at .*)?$" and thereby
# catch all possible simp invocations. Adding this will require more initial cleanup or
# nolint.
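        # Illustrative: "  simp" followed by "  exact h" at the same indentation is flagged as
        # non-terminal; "  simp" followed by a less-indented line (or one containing "rfl") is not.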
if (not is_comment) and re.search(r"^\s*simp$", line):
# Calculate the number of spaces before the first non-space character in the line
num_spaces = len(line) - len(line.lstrip())
# Calculate the number of spaces before the first non-space character in the next line
stripped_next_line = next_line.lstrip()
if not (next_line == '\n' or next_line.startswith("#") or stripped_next_line.startswith("--") or "rfl" in next_line):
num_next_spaces = len(next_line) - len(stripped_next_line)
# Check if the number of leading spaces is the same
if num_spaces == num_next_spaces:
# If so, the simp is nonterminal
errors += [(ERR_NSP, line_nr, path)]
new_line = line.replace("simp", "simp?")
newlines.append((line_nr, new_line))
newlines.append(lines[-1])
return errors, newlines
def long_lines_check(lines, path):
errors = []
# TODO: find a good way to break long lines
# TODO: some string literals (in e.g. tactic output messages) can be excepted from this rule
for line_nr, line in lines:
if "http" in line or "#align" in line:
continue
if len(line) > 101:
errors += [(ERR_LIN, line_nr, path)]
return errors, lines
def import_only_check(lines, path):
for _, line, is_comment in annotate_comments(lines):
if is_comment:
continue
imports = line.split()
if imports[0] == "#align_import":
continue
if imports[0] != "import":
return False
return True
def regular_check(lines, path):
errors = []
copy_started = False
copy_done = False
copy_start_line_nr = 1
copy_lines = ""
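    # A well-formed header (illustrative) looks like:
    #   /-
    #   Copyright (c) 2024 Jean Dupont. All rights reserved.
    #   Released under Apache 2.0 license as described in the file LICENSE.
    #   Authors: Jean Dupont
    #   -/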
for line_nr, line in lines:
if not copy_started and line == "\n":
errors += [(ERR_COP, copy_start_line_nr, path)]
continue
if not copy_started and line == "/-\n":
copy_started = True
copy_start_line_nr = line_nr
continue
if not copy_started:
errors += [(ERR_COP, line_nr, path)]
if copy_started and not copy_done:
copy_lines += line
if "Author" in line:
                # Validating names is not a reasonable thing to do, so we just check for the
                # "Authors: " prefix and look for common mistakes: a double space, using
                # ' and ' between names, and a '.' at the end of the line.
                if ((not line.startswith("Authors: ")) or
                        ("  " in line) or
                        (" and " in line) or
                        (line[-2] == '.')):
errors += [(ERR_AUT, line_nr, path)]
if line == "-/\n":
if ((not "Copyright" in copy_lines) or
(not "Apache" in copy_lines) or
(not "Authors: " in copy_lines)):
errors += [(ERR_COP, copy_start_line_nr, path)]
copy_done = True
continue
if copy_done and line == "\n":
continue
words = line.split()
if words[0] != "import" and words[0] != "--" and words[0] != "/-!" and words[0] != "#align_import":
errors += [(ERR_MOD, line_nr, path)]
break
if words[0] == "/-!":
break
return errors, lines
def banned_import_check(lines, path):
errors = []
for line_nr, line, is_comment in annotate_comments(lines):
if is_comment:
continue
imports = line.split()
if imports[0] != "import":
break
if imports[1] in ["Mathlib.Tactic"]:
errors += [(ERR_TAC, line_nr, path)]
return errors, lines
def isolated_by_dot_semicolon_check(lines, path):
errors = []
newlines = []
for line_nr, line in lines:
if line.strip() == "by":
# We excuse those "by"s following a comma or ", fun ... =>", since generally hanging "by"s
# should not be used in the second or later arguments of a tuple/anonymous constructor
# See https://github.com/leanprover-community/mathlib4/pull/3825#discussion_r1186702599
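            # Illustrative: a bare "by" is excused after a previous line ending in "," or in
            # ", fun x =>", but flagged after e.g. "theorem foo : P :=".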
prev_line = lines[line_nr - 2][1].rstrip()
if not prev_line.endswith(",") and not re.search(", fun [^,]* (=>|↦)$", prev_line):
errors += [(ERR_IBY, line_nr, path)]
if line.lstrip().startswith(". "):
errors += [(ERR_DOT, line_nr, path)]
line = line.replace(". ", "· ", 1)
if line.strip() in (".", "·"):
errors += [(ERR_DOT, line_nr, path)]
if " ;" in line:
errors += [(ERR_SEM, line_nr, path)]
line = line.replace(" ;", ";")
if line.lstrip().startswith(":"):
errors += [(ERR_CLN, line_nr, path)]
newlines.append((line_nr, line))
return errors, newlines
def left_arrow_check(lines, path):
errors = []
newlines = []
for line_nr, line, is_comment, in_string in annotate_strings(annotate_comments(lines)):
if is_comment or in_string:
newlines.append((line_nr, line))
continue
# Allow "←" to be followed by "%" or "`", but not by "`(" or "``(" (since "`()" and "``()"
# are used for syntax quotations). Otherwise, insert a space after "←".
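        # Illustrative: "rw [←foo]" becomes "rw [← foo]" and "←`(x)" becomes "← `(x)", while a
        # "←" directly followed by "%" or by a backquote that does not open "`(" is left alone.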
        new_line = re.sub(r'←(?:(?=``?\()|(?![%`]))(\S)', r'← \1', line)
if new_line != line:
errors += [(ERR_ARR, line_nr, path)]
newlines.append((line_nr, new_line))
return errors, newlines
def output_message(path, line_nr, code, msg):
if len(exceptions) == 0:
# we are generating a new exceptions file
# filename first, then line so that we can call "sort" on the output
#print(f"{path} : line {line_nr} : {code} : {msg}")
print(f"{path}:{line_nr} : {code} : {msg}")
else:
if code.startswith("ERR"):
msg_type = "error"
if code.startswith("WRN"):
msg_type = "warning"
# We are outputting for github. We duplicate path, line_nr and code,
# so that they are also visible in the plaintext output.
print(f"::{msg_type} file={path},line={line_nr},code={code}::{path}#L{line_nr}: {code}: {msg}")
def format_errors(errors):
global new_exceptions
for errno, line_nr, path in errors:
if (errno, path.resolve(), None) in exceptions:
continue
new_exceptions = True
if errno == ERR_COP:
output_message(path, line_nr, "ERR_COP", "Malformed or missing copyright header")
if errno == ERR_MOD:
output_message(path, line_nr, "ERR_MOD", "Module docstring missing, or too late")
if errno == ERR_LIN:
output_message(path, line_nr, "ERR_LIN", "Line has more than 100 characters")
if errno == ERR_OPT:
output_message(path, line_nr, "ERR_OPT", "Forbidden set_option command")
if errno == ERR_AUT:
output_message(path, line_nr, "ERR_AUT", "Authors line should look like: 'Authors: Jean Dupont, Иван Иванович Иванов'")
if errno == ERR_TAC:
output_message(path, line_nr, "ERR_TAC", "Files in mathlib cannot import the whole tactic folder")
if errno == ERR_IBY:
output_message(path, line_nr, "ERR_IBY", "Line is an isolated 'by'")
if errno == ERR_DOT:
output_message(path, line_nr, "ERR_DOT", "Line is an isolated focusing dot or uses . instead of ·")
if errno == ERR_SEM:
output_message(path, line_nr, "ERR_SEM", "Line contains a space before a semicolon")
if errno == ERR_WIN:
output_message(path, line_nr, "ERR_WIN", "Windows line endings (\\r\\n) detected")
if errno == ERR_TWS:
output_message(path, line_nr, "ERR_TWS", "Trailing whitespace detected on line")
if errno == ERR_CLN:
output_message(path, line_nr, "ERR_CLN", "Put : and := before line breaks, not after")
if errno == ERR_IND:
output_message(path, line_nr, "ERR_IND", "If the theorem/def statement requires multiple lines, indent it correctly (4 spaces or 2 for `|`)")
if errno == ERR_ARR:
            output_message(path, line_nr, "ERR_ARR", "Missing space after '←'.")
if errno == ERR_NSP:
output_message(path, line_nr, "ERR_NSP", "Non-terminal simp. Replace with `simp?` and use the suggested output")
def lint(path, fix=False):
global new_exceptions
with path.open(encoding="utf-8", newline="") as f:
        # We enumerate the lines so that we can report line numbers in the error messages
        # correctly; since we modify lines as we go, we need to keep track of the original
        # line numbers.
lines = f.readlines()
enum_lines = enumerate(lines, 1)
newlines = enum_lines
for error_check in [line_endings_check,
four_spaces_in_second_line,
long_lines_check,
isolated_by_dot_semicolon_check,
set_option_check,
left_arrow_check,
nonterminal_simp_check]:
errs, newlines = error_check(newlines, path)
format_errors(errs)
if not import_only_check(newlines, path):
        # Check for overly long files: complain if a file is longer than 1500 lines and is not
        # covered by an exception. Each exception records a "watermark"; if the file has grown
        # beyond that, we complain again.
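        # Illustrative: a 1750-line file whose recorded watermark is 1700 is reported again,
        # with a new suggested watermark of 1750 // 100 * 100 + 200 = 1900.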
if len(lines) > 1500:
ex = [e for e in exceptions if e[1] == path.resolve()]
if ex:
(_ERR_NUM, _path, watermark) = list(ex)[0]
assert int(watermark) > 500 # protect against parse error
is_too_long = len(lines) > int(watermark)
else:
is_too_long = True
if is_too_long:
new_exceptions = True
# add up to 200 lines of slack, so simple PRs don't trigger this right away
watermark = len(lines) // 100 * 100 + 200
output_message(path, 1, "ERR_NUM_LIN", f"{watermark} file contains {len(lines)} lines, try to split it up")
errs, newlines = regular_check(newlines, path)
format_errors(errs)
errs, newlines = banned_import_check(newlines, path)
format_errors(errs)
    # Only write the fixed lines back to the file if --fix was passed, new problems were
    # found, and the lines actually changed.
if fix and new_exceptions and enum_lines != newlines:
path.with_name(path.name + '.bak').write_text("".join(l for _,l in newlines), encoding = "utf8")
shutil.move(path.with_name(path.name + '.bak'), path)
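# Command-line driver: an optional "--fix" flag followed by the files to lint, e.g.
# (illustrative) ./scripts/lint-style.py --fix $(find PhysLean -name '*.lean').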
fix = "--fix" in sys.argv
argv = (arg for arg in sys.argv[1:] if arg != "--fix")
for filename in argv:
lint(Path(filename), fix=fix)
if new_exceptions:
exit(1)