PhysLean/scripts/lint-style.py
#!/usr/bin/env python3
"""
Taken from Mathlib4.
Lint a file or files from mathlib for style.
Sample usage:
$ ./scripts/lint-style.py $(find Mathlib -name '*.lean')
which will lint all of the Lean files in the specified directories.
The resulting error output will contain one line for each style error
encountered that isn't in the list of allowed / ignored style exceptions.
Paths with no errors will not appear in the output, and the script will
exit with a successful return code if there are no errors encountered in
any provided paths.
Paths emitted in the output will match the paths provided on the
command line for any files containing errors -- in particular, linting
a relative path (like ``Mathlib/Foo/Bar.lean``) will produce errors
that contain the relative path, whilst linting absolute paths (like
``/root/mathlib4/Mathlib/Foo/Bar.lean``) will produce errors with the
absolute path.
This script can also be used to regenerate the list of allowed / ignored style
exceptions by redirecting the output to ``style-exceptions.txt``. Use:
$ ./scripts/update-style-exceptions.sh
to perform this update.
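Passing ``--fix`` additionally rewrites each linted file in place, applying the fixes the
script suggests.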
"""
# TODO: This is adapted from the linter for mathlib3. It should be rewritten in Lean.
from pathlib import Path
import sys
import re
import shutil
ERR_COP = 0 # copyright header
ERR_MOD = 2 # module docstring
ERR_LIN = 3 # line length
ERR_OPT = 6 # set_option
ERR_AUT = 7 # malformed authors list
ERR_TAC = 9 # imported Mathlib.Tactic
ERR_IBY = 11 # isolated by
ERR_DOT = 12 # isolated or low focusing dot
ERR_SEM = 13 # the substring " ;"
ERR_WIN = 14 # Windows line endings "\r\n"
ERR_TWS = 15 # trailing whitespace
ERR_CLN = 16 # line starts with a colon
ERR_IND = 17 # second line not correctly indented
ERR_ARR = 18 # space after "←"
ERR_NUM_LIN = 19 # file is too large
ERR_NSP = 20 # non-terminal simp
exceptions = []
SCRIPTS_DIR = Path(__file__).parent.resolve()
ROOT_DIR = SCRIPTS_DIR.parent
with SCRIPTS_DIR.joinpath("style-exceptions.txt").open(encoding="utf-8") as f:
for exline in f:
filename, _, _, _, _, errno, *extra = exline.split()
path = ROOT_DIR / filename
if errno == "ERR_COP":
exceptions += [(ERR_COP, path, None)]
elif errno == "ERR_MOD":
exceptions += [(ERR_MOD, path, None)]
elif errno == "ERR_LIN":
exceptions += [(ERR_LIN, path, None)]
elif errno == "ERR_OPT":
exceptions += [(ERR_OPT, path, None)]
elif errno == "ERR_AUT":
exceptions += [(ERR_AUT, path, None)]
elif errno == "ERR_TAC":
exceptions += [(ERR_TAC, path, None)]
elif errno == "ERR_NUM_LIN":
exceptions += [(ERR_NUM_LIN, path, extra[1])]
else:
print(f"Error: unexpected errno in style-exceptions.txt: {errno}")
sys.exit(1)
new_exceptions = False
def annotate_comments(enumerate_lines):
"""
Take a list of tuples of enumerated lines of the form
(line_number, line, ...)
and return a list of
(line_number, line, ..., True/False)
where lines have True attached when they are in comments.
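    For example (illustrative, trailing newlines omitted):
        (1, "/- intro"), (2, "-/"), (3, "def f := 1")
    is annotated as
        (1, "/- intro", True), (2, "-/", True), (3, "def f := 1", False)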
"""
nesting_depth = 0 # We're in a comment when `nesting_depth > 0`.
starts_in_comment = False # Whether we're in a comment when starting the line.
for line_nr, line, *rem in enumerate_lines:
# We assume multiline comments do not begin or end within single-line comments.
if line == "\n" or line.lstrip().startswith("--"):
yield line_nr, line, *rem, True
continue
# We assume that "/-/" and "-/-" never occur outside of "--" comments.
# We assume that we do not encounter "... -/ <term> /- ...".
# We also don't account for "/-" and "-/" appearing in strings.
starts_in_comment = (nesting_depth > 0)
nesting_depth = nesting_depth + line.count("/-") - line.count("-/")
in_comment = (starts_in_comment or line.lstrip().startswith("/-")) and \
(nesting_depth > 0 or line.rstrip().endswith("-/"))
yield line_nr, line, *rem, in_comment
def annotate_strings(enumerate_lines):
"""
Take a list of tuples of enumerated lines of the form
(line_number, line, ...)
and return a list of
(line_number, line, ..., True/False)
where lines have True attached when they are in strings.
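    For example (illustrative), any line containing a quote character, such as
        (4, 'example : "a" = "a" := rfl')
    is conservatively annotated with True.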
"""
in_string = False
in_comment = False
for line_nr, line, *rem in enumerate_lines:
# ignore comment markers inside string literals
if not in_string:
if "/-" in line:
in_comment = True
if "-/" in line:
in_comment = False
# ignore quotes inside comments
if not in_comment:
# crude heuristic: if the number of non-escaped quote signs is odd,
# we're starting / ending a string literal
            if (line.count("\"") - line.count("\\\"")) % 2 == 1:
in_string = not in_string
# if there are quote signs in this line,
# a string literal probably begins and / or ends here,
# so we skip this line
if line.count("\"") > 0:
yield line_nr, line, *rem, True
continue
if in_string:
yield line_nr, line, *rem, True
continue
yield line_nr, line, *rem, False
def set_option_check(lines, path):
errors = []
newlines = []
for line_nr, line, in_comment, in_string in annotate_strings(annotate_comments(lines)):
if line.strip().startswith('set_option') and not in_comment and not in_string:
option_prefix = line.strip().split(' ', 2)[1].split('.', 1)[0]
# forbidden options: pp, profiler, trace
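            # e.g. "set_option pp.all true" or "set_option trace.Meta.synthInstance true" are
            # flagged; "set_option maxHeartbeats 400000" (illustrative) is allowed.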
if option_prefix in {'pp', 'profiler', 'trace'}:
errors += [(ERR_OPT, line_nr, path)]
# skip adding this line to newlines so that we suggest removal
continue
newlines.append((line_nr, line))
return errors, newlines
def line_endings_check(lines, path):
errors = []
newlines = []
for line_nr, line in lines:
if "\r\n" in line:
errors += [(ERR_WIN, line_nr, path)]
line = line.replace("\r\n", "\n")
if line.endswith(" \n"):
errors += [(ERR_TWS, line_nr, path)]
line = line.rstrip() + "\n"
newlines.append((line_nr, line))
return errors, newlines
def four_spaces_in_second_line(lines, path):
# TODO: also fix the space for all lines before ":=", right now we only fix the line after
# the first line break
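    # Illustrative examples of the expected layout:
    #   theorem foo (n : Nat) :
    #       n = n := rfl          -- continuation line indented by 4 spaces
    #   def bar : Nat → Nat
    #     | 0 => 0                -- "| ..." alternatives indented by 2 spaces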
errors = []
# We never alter the first line, as it does not occur as next_line in the iteration over the
# zipped lines below, hence we add it here
newlines = [lines[0]]
annotated_lines = list(annotate_comments(lines))
for (_, line, is_comment), (next_line_nr, next_line, _) in zip(annotated_lines,
annotated_lines[1:]):
        # Check if the current line starts a (protected) def/lemma/theorem whose statement has
        # no ":=" on the same line
new_next_line = next_line
if (not is_comment) and re.search(r"^(protected )?(def|lemma|theorem) (?!.*:=).*(where)?$",
line):
# Calculate the number of spaces before the first non-space character in the next line
stripped_next_line = next_line.lstrip()
if not (next_line == '\n' or next_line.startswith("#") or stripped_next_line.startswith("--")):
num_spaces = len(next_line) - len(stripped_next_line)
                # The bare "|" symbol also appears in other contexts (e.g. norms such as |x|),
                # but there it is not followed by a space, so matching the two-character
                # string "| " is enough to identify pattern-matching alternatives.
if stripped_next_line.startswith("| ") or line.endswith("where\n"):
# Check and fix if the number of leading space is not 2
if num_spaces != 2:
errors += [(ERR_IND, next_line_nr, path)]
new_next_line = ' ' * 2 + stripped_next_line
# Check and fix if the number of leading spaces is not 4
else:
if num_spaces != 4:
errors += [(ERR_IND, next_line_nr, path)]
new_next_line = ' ' * 4 + stripped_next_line
newlines.append((next_line_nr, new_next_line))
return errors, newlines
def nonterminal_simp_check(lines, path):
errors = []
newlines = []
annotated_lines = list(annotate_comments(lines))
for (line_nr, line, is_comment), (_, next_line, _) in zip(annotated_lines,
annotated_lines[1:]):
# Check if the current line matches whitespace followed by "simp"
new_line = line
# TODO it would be better to use a regex like r"^\s*simp( \[.*\])?( at .*)?$" and thereby
# catch all possible simp invocations. Adding this will require more initial cleanup or
# nolint.
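        # Illustrative: "  simp" followed by "  exact h" at the same indentation is flagged as
        # non-terminal; "  simp" followed by a less-indented line (or one containing "rfl") is not.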
if (not is_comment) and re.search(r"^\s*simp$", line):
# Calculate the number of spaces before the first non-space character in the line
num_spaces = len(line) - len(line.lstrip())
# Calculate the number of spaces before the first non-space character in the next line
stripped_next_line = next_line.lstrip()
if not (next_line == '\n' or next_line.startswith("#") or stripped_next_line.startswith("--") or "rfl" in next_line):
num_next_spaces = len(next_line) - len(stripped_next_line)
# Check if the number of leading spaces is the same
if num_spaces == num_next_spaces:
# If so, the simp is nonterminal
errors += [(ERR_NSP, line_nr, path)]
new_line = line.replace("simp", "simp?")
newlines.append((line_nr, new_line))
newlines.append(lines[-1])
return errors, newlines
def long_lines_check(lines, path):
errors = []
# TODO: find a good way to break long lines
# TODO: some string literals (in e.g. tactic output messages) can be excepted from this rule
for line_nr, line in lines:
if "http" in line or "#align" in line:
continue
if len(line) > 101:
errors += [(ERR_LIN, line_nr, path)]
return errors, lines
def import_only_check(lines, path):
for _, line, is_comment in annotate_comments(lines):
if is_comment:
continue
imports = line.split()
if imports[0] == "#align_import":
continue
if imports[0] != "import":
return False
return True
def regular_check(lines, path):
errors = []
copy_started = False
copy_done = False
copy_start_line_nr = 1
copy_lines = ""
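    # A well-formed header (illustrative) looks like:
    #   /-
    #   Copyright (c) 2024 Jean Dupont. All rights reserved.
    #   Released under Apache 2.0 license as described in the file LICENSE.
    #   Authors: Jean Dupont
    #   -/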
for line_nr, line in lines:
if not copy_started and line == "\n":
errors += [(ERR_COP, copy_start_line_nr, path)]
continue
if not copy_started and line == "/-\n":
copy_started = True
copy_start_line_nr = line_nr
continue
if not copy_started:
errors += [(ERR_COP, line_nr, path)]
if copy_started and not copy_done:
copy_lines += line
if "Author" in line:
                # Validating names is not a reasonable thing to do, so we just check for the
                # "Authors: " prefix and look for common mistakes: a double space, using
                # ' and ' between names, and a '.' at the end of the line.
                if ((not line.startswith("Authors: ")) or
                        ("  " in line) or
                        (" and " in line) or
                        (line[-2] == '.')):
errors += [(ERR_AUT, line_nr, path)]
if line == "-/\n":
if ((not "Copyright" in copy_lines) or
(not "Apache" in copy_lines) or
(not "Authors: " in copy_lines)):
errors += [(ERR_COP, copy_start_line_nr, path)]
copy_done = True
continue
if copy_done and line == "\n":
continue
words = line.split()
if words[0] != "import" and words[0] != "--" and words[0] != "/-!" and words[0] != "#align_import":
errors += [(ERR_MOD, line_nr, path)]
break
if words[0] == "/-!":
break
return errors, lines
def banned_import_check(lines, path):
errors = []
for line_nr, line, is_comment in annotate_comments(lines):
if is_comment:
continue
imports = line.split()
if imports[0] != "import":
break
if imports[1] in ["Mathlib.Tactic"]:
errors += [(ERR_TAC, line_nr, path)]
return errors, lines
def isolated_by_dot_semicolon_check(lines, path):
errors = []
newlines = []
for line_nr, line in lines:
if line.strip() == "by":
# We excuse those "by"s following a comma or ", fun ... =>", since generally hanging "by"s
# should not be used in the second or later arguments of a tuple/anonymous constructor
# See https://github.com/leanprover-community/mathlib4/pull/3825#discussion_r1186702599
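            # Illustrative: a bare "by" is excused after a previous line ending in "," or in
            # ", fun x =>", but flagged after e.g. "theorem foo : P :=".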
prev_line = lines[line_nr - 2][1].rstrip()
if not prev_line.endswith(",") and not re.search(", fun [^,]* (=>|↦)$", prev_line):
errors += [(ERR_IBY, line_nr, path)]
if line.lstrip().startswith(". "):
errors += [(ERR_DOT, line_nr, path)]
line = line.replace(". ", "· ", 1)
if line.strip() in (".", "·"):
errors += [(ERR_DOT, line_nr, path)]
if " ;" in line:
errors += [(ERR_SEM, line_nr, path)]
line = line.replace(" ;", ";")
if line.lstrip().startswith(":"):
errors += [(ERR_CLN, line_nr, path)]
newlines.append((line_nr, line))
return errors, newlines
def left_arrow_check(lines, path):
errors = []
newlines = []
for line_nr, line, is_comment, in_string in annotate_strings(annotate_comments(lines)):
if is_comment or in_string:
newlines.append((line_nr, line))
continue
# Allow "←" to be followed by "%" or "`", but not by "`(" or "``(" (since "`()" and "``()"
# are used for syntax quotations). Otherwise, insert a space after "←".
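        # Illustrative: "rw [←foo]" becomes "rw [← foo]" and "←`(x)" becomes "← `(x)", while a
        # "←" directly followed by "%" or by a backquote that does not open "`(" is left alone.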
        new_line = re.sub(r'←(?:(?=``?\()|(?![%`]))(\S)', r'← \1', line)
if new_line != line:
errors += [(ERR_ARR, line_nr, path)]
newlines.append((line_nr, new_line))
return errors, newlines
def output_message(path, line_nr, code, msg):
if len(exceptions) == 0:
# we are generating a new exceptions file
# filename first, then line so that we can call "sort" on the output
#print(f"{path} : line {line_nr} : {code} : {msg}")
print(f"{path}:{line_nr} : {code} : {msg}")
else:
if code.startswith("ERR"):
msg_type = "error"
if code.startswith("WRN"):
msg_type = "warning"
# We are outputting for github. We duplicate path, line_nr and code,
# so that they are also visible in the plaintext output.
print(f"::{msg_type} file={path},line={line_nr},code={code}::{path}#L{line_nr}: {code}: {msg}")
def format_errors(errors):
global new_exceptions
for errno, line_nr, path in errors:
if (errno, path.resolve(), None) in exceptions:
continue
new_exceptions = True
if errno == ERR_COP:
output_message(path, line_nr, "ERR_COP", "Malformed or missing copyright header")
if errno == ERR_MOD:
output_message(path, line_nr, "ERR_MOD", "Module docstring missing, or too late")
if errno == ERR_LIN:
output_message(path, line_nr, "ERR_LIN", "Line has more than 100 characters")
if errno == ERR_OPT:
output_message(path, line_nr, "ERR_OPT", "Forbidden set_option command")
if errno == ERR_AUT:
output_message(path, line_nr, "ERR_AUT", "Authors line should look like: 'Authors: Jean Dupont, Иван Иванович Иванов'")
if errno == ERR_TAC:
output_message(path, line_nr, "ERR_TAC", "Files in mathlib cannot import the whole tactic folder")
if errno == ERR_IBY:
output_message(path, line_nr, "ERR_IBY", "Line is an isolated 'by'")
if errno == ERR_DOT:
output_message(path, line_nr, "ERR_DOT", "Line is an isolated focusing dot or uses . instead of ·")
if errno == ERR_SEM:
output_message(path, line_nr, "ERR_SEM", "Line contains a space before a semicolon")
if errno == ERR_WIN:
output_message(path, line_nr, "ERR_WIN", "Windows line endings (\\r\\n) detected")
if errno == ERR_TWS:
output_message(path, line_nr, "ERR_TWS", "Trailing whitespace detected on line")
if errno == ERR_CLN:
output_message(path, line_nr, "ERR_CLN", "Put : and := before line breaks, not after")
if errno == ERR_IND:
output_message(path, line_nr, "ERR_IND", "If the theorem/def statement requires multiple lines, indent it correctly (4 spaces or 2 for `|`)")
if errno == ERR_ARR:
            output_message(path, line_nr, "ERR_ARR", "Missing space after '←'.")
if errno == ERR_NSP:
output_message(path, line_nr, "ERR_NSP", "Non-terminal simp. Replace with `simp?` and use the suggested output")
def lint(path, fix=False):
global new_exceptions
with path.open(encoding="utf-8", newline="") as f:
        # We enumerate the lines so that we can report line numbers in the error messages
        # correctly; since we modify lines as we go, we need to keep track of the original
        # line numbers.
lines = f.readlines()
enum_lines = enumerate(lines, 1)
newlines = enum_lines
for error_check in [line_endings_check,
four_spaces_in_second_line,
long_lines_check,
isolated_by_dot_semicolon_check,
set_option_check,
left_arrow_check,
nonterminal_simp_check]:
errs, newlines = error_check(newlines, path)
format_errors(errs)
if not import_only_check(newlines, path):
        # Check for overly long files: complain if a file is longer than 1500 lines and is not
        # covered by an exception. Each exception records a "watermark"; if the file has grown
        # beyond that, we complain again.
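        # Illustrative: a 1750-line file whose recorded watermark is 1700 is reported again,
        # with a new suggested watermark of 1750 // 100 * 100 + 200 = 1900.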
if len(lines) > 1500:
ex = [e for e in exceptions if e[1] == path.resolve()]
if ex:
(_ERR_NUM, _path, watermark) = list(ex)[0]
assert int(watermark) > 500 # protect against parse error
is_too_long = len(lines) > int(watermark)
else:
is_too_long = True
if is_too_long:
new_exceptions = True
# add up to 200 lines of slack, so simple PRs don't trigger this right away
watermark = len(lines) // 100 * 100 + 200
output_message(path, 1, "ERR_NUM_LIN", f"{watermark} file contains {len(lines)} lines, try to split it up")
errs, newlines = regular_check(newlines, path)
format_errors(errs)
errs, newlines = banned_import_check(newlines, path)
format_errors(errs)
    # Only write the fixed lines back to the file if --fix was passed, new problems were
    # found, and the lines actually changed.
if fix and new_exceptions and enum_lines != newlines:
path.with_name(path.name + '.bak').write_text("".join(l for _,l in newlines), encoding = "utf8")
shutil.move(path.with_name(path.name + '.bak'), path)
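# Command-line driver: an optional "--fix" flag followed by the files to lint, e.g.
# (illustrative) ./scripts/lint-style.py --fix $(find PhysLean -name '*.lean').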
fix = "--fix" in sys.argv
argv = (arg for arg in sys.argv[1:] if arg != "--fix")
for filename in argv:
lint(Path(filename), fix=fix)
if new_exceptions:
exit(1)