check_files.py (21376B)
#!/usr/bin/env python3

# Copyright The Mbed TLS Contributors
# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later

"""
This script checks the current state of the source code for minor issues,
including incorrect file permissions, presence of tabs, non-Unix line endings,
trailing whitespace, and presence of UTF-8 BOM.
Note: requires python 3, must be run from Mbed TLS root.
"""

import argparse
import codecs
import inspect
import logging
import os
import re
import subprocess
import sys
try:
    from typing import FrozenSet, Optional, Pattern # pylint: disable=unused-import
except ImportError:
    pass

from mbedtls_framework import build_tree


class FileIssueTracker:
    """Base class for file-wide issue tracking.

    To implement a checker that processes a file as a whole, inherit from
    this class and implement `check_file_for_issue` and define ``heading``.

    ``suffix_exemptions``: files whose name ends with a string in this set
     will not be checked.

    ``path_exemptions``: files whose path (relative to the root of the source
    tree) matches this regular expression will not be checked. This can be
    ``None`` to match no path. Paths are normalized and converted to ``/``
    separators before matching.

    ``heading``: human-readable description of the issue

    ``files_with_issues``: dictionary filled in while checking. Each key is
    the path of a file with an issue. The value is either a list of line
    numbers, or ``None`` when the issue is not tied to specific lines.
    """

    suffix_exemptions = frozenset() #type: FrozenSet[str]
    path_exemptions = None #type: Optional[Pattern[str]]
    # heading must be defined in derived classes.
    # pylint: disable=no-member

    def __init__(self):
        self.files_with_issues = {}

    @staticmethod
    def normalize_path(filepath):
        """Normalize ``filepath`` with / as the directory separator."""
        filepath = os.path.normpath(filepath)
        # On Windows, we may have backslashes to separate directories.
        # We need slashes to match exemption lists.
        # Replace each separator character individually: str.split() with
        # a multi-character argument would look for that exact sequence
        # of characters rather than for any one of them.
        for sep in (os.path.sep, os.path.altsep):
            if sep is not None and sep != '/':
                filepath = filepath.replace(sep, '/')
        return filepath

    def should_check_file(self, filepath):
        """Whether the given file name should be checked.

        Files whose name ends with a string listed in ``self.suffix_exemptions``
        or whose path matches ``self.path_exemptions`` will not be checked.
        """
        # str.endswith() accepts a tuple of alternatives. With an empty
        # tuple, it returns False.
        if filepath.endswith(tuple(self.suffix_exemptions)):
            return False
        if self.path_exemptions and \
           re.match(self.path_exemptions, self.normalize_path(filepath)):
            return False
        return True

    def check_file_for_issue(self, filepath):
        """Check the specified file for the issue that this class is for.

        Subclasses must implement this method.
        """
        raise NotImplementedError

    def record_issue(self, filepath, line_number):
        """Record that an issue was found at the specified location."""
        self.files_with_issues.setdefault(filepath, []).append(line_number)

    def output_file_issues(self, logger):
        """Log all the locations where the issue was found.

        Nothing is logged if no issue was recorded. Otherwise log the
        heading, then one line per file in sorted order (with the line
        numbers if there are any), then an empty line.
        """
        if self.files_with_issues:
            logger.info(self.heading)
            for filename, lines in sorted(self.files_with_issues.items()):
                if lines:
                    logger.info("{}: {}".format(
                        filename, ", ".join(str(x) for x in lines)
                    ))
                else:
                    logger.info(filename)
            logger.info("")


BINARY_FILE_PATH_RE_LIST = [
    r'docs/.*\.pdf\Z',
    r'docs/.*\.png\Z',
    r'tf-psa-crypto/docs/.*\.pdf\Z',
    r'tf-psa-crypto/docs/.*\.png\Z',
    r'programs/fuzz/corpuses/[^.]+\Z',
    r'framework/data_files/[^.]+\Z',
    r'framework/data_files/.*\.(crt|csr|db|der|key|pubkey)\Z',
    r'framework/data_files/.*\.req\.[^/]+\Z',
    r'framework/data_files/.*malformed[^/]+\Z',
    r'framework/data_files/format_pkcs12\.fmt\Z',
    r'framework/data_files/.*\.bin\Z',
]
BINARY_FILE_PATH_RE = re.compile('|'.join(BINARY_FILE_PATH_RE_LIST))


class LineIssueTracker(FileIssueTracker):
    """Base class for line-by-line issue tracking.

    To implement a checker that processes files line by line, inherit from
    this class and implement `issue_with_line`.
    """

    # Exclude binary files.
    path_exemptions = BINARY_FILE_PATH_RE

    def issue_with_line(self, line, filepath, line_number):
        """Check the specified line for the issue that this class is for.

        ``line`` is a bytes object: files are read in binary mode.
        ``line_number`` starts at 1.

        Subclasses must implement this method.
        """
        raise NotImplementedError

    def check_file_line(self, filepath, line, line_number):
        """Record an issue for this line if ``issue_with_line`` reports one."""
        if self.issue_with_line(line, filepath, line_number):
            self.record_issue(filepath, line_number)

    def check_file_for_issue(self, filepath):
        """Check the lines of the specified file.

        Subclasses must implement the ``issue_with_line`` method.
        """
        with open(filepath, "rb") as f:
            for line_number, line in enumerate(f, start=1):
                self.check_file_line(filepath, line, line_number)


def is_windows_file(filepath):
    """Whether the file name has one of the Windows-specific extensions."""
    _root, ext = os.path.splitext(filepath)
    return ext in ('.bat', '.dsp', '.dsw', '.sln', '.vcxproj')


class ShebangIssueTracker(FileIssueTracker):
    """Track files with a bad, missing or extraneous shebang line.

    Executable scripts must start with a valid shebang (#!) line.
    """

    heading = "Invalid shebang line:"

    # Allow either /bin/sh, /bin/bash, or /usr/bin/env.
    # Allow at most one argument (this is a Linux limitation).
    # For sh and bash, the argument if present must be options.
    # For env, the argument must be the base name of the interpreter.
    _shebang_re = re.compile(rb'^#! ?(?:/bin/(bash|sh)(?: -[^\n ]*)?'
                             rb'|/usr/bin/env ([^\n /]+))$')
    # Map each accepted interpreter to the required file name extension.
    _extensions = {
        b'bash': 'sh',
        b'perl': 'pl',
        b'python3': 'py',
        b'sh': 'sh',
    }

    path_exemptions = re.compile(r'framework/scripts/quiet/.*')

    def is_valid_shebang(self, first_line, filepath):
        """Whether ``first_line`` is an acceptable shebang for ``filepath``.

        The line must match ``_shebang_re``, the interpreter must be listed
        in ``_extensions``, and the file name must have the extension
        associated with that interpreter.
        """
        m = self._shebang_re.match(first_line)
        if not m:
            return False
        # Exactly one of the two groups is set, depending on which
        # alternative of the regular expression matched.
        interpreter = m.group(1) or m.group(2)
        if interpreter not in self._extensions:
            return False
        if not filepath.endswith('.' + self._extensions[interpreter]):
            return False
        return True

    def check_file_for_issue(self, filepath):
        """Check that the file is executable if and only if it has a shebang.

        An issue recorded as ``None`` has no line number. An issue recorded
        as ``[1]`` means that the shebang line itself is invalid.
        """
        is_executable = os.access(filepath, os.X_OK)
        with open(filepath, "rb") as f:
            first_line = f.readline()
        if first_line.startswith(b'#!'):
            if not is_executable:
                # Shebang on a non-executable file
                self.files_with_issues[filepath] = None
            elif not self.is_valid_shebang(first_line, filepath):
                self.files_with_issues[filepath] = [1]
        elif is_executable:
            # Executable without a shebang
            self.files_with_issues[filepath] = None


class EndOfFileNewlineIssueTracker(FileIssueTracker):
    """Track files that end with an incomplete line
    (no newline character at the end of the last line)."""

    heading = "Missing newline at end of file:"

    path_exemptions = BINARY_FILE_PATH_RE

    def check_file_for_issue(self, filepath):
        """Record the file if its last byte is not a newline character."""
        with open(filepath, "rb") as f:
            try:
                f.seek(-1, os.SEEK_END)
            except OSError:
                # This script only works on regular files. If we can't seek
                # 1 before the end, it means that this position is before
                # the beginning of the file, i.e. that the file is empty.
                return
            if f.read(1) != b"\n":
                self.files_with_issues[filepath] = None


class Utf8BomIssueTracker(FileIssueTracker):
    """Track files that start with a UTF-8 BOM.
    Files should be ASCII or UTF-8. Valid UTF-8 does not start with a BOM."""

    heading = "UTF-8 BOM present:"

    suffix_exemptions = frozenset([".vcxproj", ".sln"])
    path_exemptions = BINARY_FILE_PATH_RE

    def check_file_for_issue(self, filepath):
        """Record the file if it starts with the UTF-8 encoding of the BOM."""
        with open(filepath, "rb") as f:
            # Only the first few bytes matter: don't read the whole file.
            if f.read(len(codecs.BOM_UTF8)) == codecs.BOM_UTF8:
                self.files_with_issues[filepath] = None


class UnicodeIssueTracker(LineIssueTracker):
    """Track lines with invalid characters or invalid text encoding."""

    heading = "Invalid UTF-8 or forbidden character:"

    # Only allow valid UTF-8, and only other explicitly allowed characters.
    # We deliberately exclude all characters that aren't a simple non-blank,
    # non-zero-width glyph, apart from a very small set (tab, ordinary space,
    # line breaks, "basic" no-break space and soft hyphen). In particular,
    # non-ASCII control characters, combining characters, and Unicode state
    # changes (e.g. right-to-left text) are forbidden.
    # Note that we do allow some characters with a risk of visual confusion,
    # for example '-' (U+002D HYPHEN-MINUS) vs U+00AD SOFT HYPHEN (not shown
    # here since it is normally invisible) vs '‐' (U+2010 HYPHEN),
    # or 'A' (U+0041 LATIN CAPITAL LETTER A) vs
    # 'Α' (U+0391 GREEK CAPITAL LETTER ALPHA).
    GOOD_CHARACTERS = ''.join([
        '\t\n\r -~', # ASCII (tabs and line endings are checked separately)
        '\u00A0-\u00FF', # Latin-1 Supplement (for NO-BREAK SPACE and punctuation)
        '\u2010-\u2027\u2030-\u205E', # General Punctuation (printable)
        '\u2070\u2071\u2074-\u208E\u2090-\u209C', # Superscripts and Subscripts
        '\u2190-\u21FF', # Arrows
        '\u2200-\u22FF', # Mathematical Symbols
        '\u2500-\u257F' # Box Drawings characters used in markdown trees
    ])
    # Allow any of the characters and ranges above, and anything classified
    # as a word constituent.
    GOOD_CHARACTERS_RE = re.compile(r'[\w{}]+\Z'.format(GOOD_CHARACTERS))

    def issue_with_line(self, line, _filepath, line_number):
        """Whether the line is invalid UTF-8 or has a forbidden character."""
        try:
            text = line.decode('utf-8')
        except UnicodeDecodeError:
            return True
        if line_number == 1 and text.startswith('\uFEFF'):
            # Strip BOM (U+FEFF ZERO WIDTH NO-BREAK SPACE) at the beginning.
            # Which files are allowed to have a BOM is handled in
            # Utf8BomIssueTracker.
            text = text[1:]
        return not self.GOOD_CHARACTERS_RE.match(text)


class UnixLineEndingIssueTracker(LineIssueTracker):
    """Track files with non-Unix line endings (i.e. files with CR)."""

    heading = "Non-Unix line endings:"

    def should_check_file(self, filepath):
        """Check all non-exempt files except Windows-specific files."""
        if not super().should_check_file(filepath):
            return False
        return not is_windows_file(filepath)

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether the line contains a carriage return."""
        return b"\r" in line


class WindowsLineEndingIssueTracker(LineIssueTracker):
    """Track files with non-Windows line endings (i.e. CR or LF not in CRLF)."""

    heading = "Non-Windows line endings:"

    def should_check_file(self, filepath):
        """Check only non-exempt Windows-specific files."""
        if not super().should_check_file(filepath):
            return False
        return is_windows_file(filepath)

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether the line lacks a final CRLF or has a CR elsewhere."""
        return not line.endswith(b"\r\n") or b"\r" in line[:-2]


class TrailingWhitespaceIssueTracker(LineIssueTracker):
    """Track lines with trailing whitespace."""

    heading = "Trailing whitespace:"
    suffix_exemptions = frozenset([".dsp", ".md"])

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether there is whitespace before the line ending."""
        # Stripping only the line ending and stripping all trailing
        # whitespace give different results if and only if there is
        # whitespace other than the line ending at the end of the line.
        return line.rstrip(b"\r\n") != line.rstrip()


class TabIssueTracker(LineIssueTracker):
    """Track lines with tabs."""

    heading = "Tabs present:"
    suffix_exemptions = frozenset([
        ".make",
        ".pem", # some openssl dumps have tabs
        ".sln",
        "/.gitmodules",
        "/Makefile",
        "/Makefile.inc",
        "/generate_visualc_files.pl",
    ])

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether the line contains a tab character."""
        return b"\t" in line


class MergeArtifactIssueTracker(LineIssueTracker):
    """Track lines with merge artifacts.
    These are leftovers from a ``git merge`` that wasn't fully edited."""

    heading = "Merge artifact:"

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether the line looks like a conflict marker."""
        # Detect leftover git conflict markers.
        if line.startswith(b'<<<<<<< ') or line.startswith(b'>>>>>>> '):
            return True
        if line.startswith(b'||||||| '): # from merge.conflictStyle=diff3
            return True
        # A line consisting only of equal signs is not reported
        # in .md files.
        if line.rstrip(b'\r\n') == b'=======' and \
           not _filepath.endswith('.md'):
            return True
        return False


def this_location():
    """Return the base name of this file and a line number in this function.

    The line number is the one where the frame information is captured
    below. It is used to tell which part of this file comes before the
    license checker class.
    """
    frame = inspect.currentframe()
    assert frame is not None
    info = inspect.getframeinfo(frame)
    return os.path.basename(info.filename), info.lineno
THIS_FILE_BASE_NAME, LINE_NUMBER_BEFORE_LICENSE_ISSUE_TRACKER = this_location()


class LicenseIssueTracker(LineIssueTracker):
    """Check copyright statements and license indications.

    This class only checks that statements are correct if present. It does
    not enforce the presence of statements in each file.
    """

    heading = "License issue:"

    LICENSE_EXEMPTION_RE_LIST = []

    # Exempt third-party drivers which may be under a different license
    if build_tree.looks_like_tf_psa_crypto_root(os.getcwd()):
        LICENSE_EXEMPTION_RE_LIST.append(r'drivers/(?=(everest)/.*)')
    elif build_tree.is_mbedtls_3_6():
        LICENSE_EXEMPTION_RE_LIST.append(r'3rdparty/(?!(p256-m)/.*)')

    LICENSE_EXEMPTION_RE_LIST += [
        # Documentation explaining the license may have accidental
        # false positives.
        r'(ChangeLog|LICENSE|framework\/LICENSE|[-0-9A-Z_a-z]+\.md)\Z',
        # Files imported from TF-M, and not used except in test builds,
        # may be under a different license.
        r'configs/ext/crypto_config_profile_medium\.h\Z',
        r'configs/ext/tfm_mbedcrypto_config_profile_medium\.h\Z',
        r'configs/ext/README\.md\Z',
        # Third-party file.
        r'dco\.txt\Z',
        r'framework\/dco\.txt\Z',
    ]
    path_exemptions = re.compile('|'.join(BINARY_FILE_PATH_RE_LIST +
                                          LICENSE_EXEMPTION_RE_LIST))

    COPYRIGHT_HOLDER = rb'The Mbed TLS Contributors'
    # Catch "Copyright foo", "Copyright (C) foo", "Copyright © foo", etc.
    COPYRIGHT_RE = re.compile(rb'.*\bcopyright\s+((?:\w|\s|[()]|[^ -~])*\w)', re.I)

    SPDX_HEADER_KEY = b'SPDX-License-Identifier'
    LICENSE_IDENTIFIER = b'Apache-2.0 OR GPL-2.0-or-later'
    SPDX_RE = re.compile(br'.*?(' +
                         re.escape(SPDX_HEADER_KEY) +
                         br')(:\s*(.*?)\W*\Z|.*)', re.I)

    LICENSE_MENTION_RE = re.compile(rb'.*(?:' + rb'|'.join([
        rb'Apache License',
        rb'General Public License',
    ]) + rb')', re.I)

    def __init__(self):
        super().__init__()
        # Record what problem was caused. We can't easily report it due to
        # the structure of the script. To be fixed after
        # https://github.com/Mbed-TLS/mbedtls/pull/2506
        self.problem = None

    def issue_with_line(self, line, filepath, line_number):
        """Whether the line has an unexpected copyright or license statement.

        When returning true, a description of the problem is stored
        in ``self.problem``.
        """
        #pylint: disable=too-many-return-statements

        # Use endswith() rather than the more correct os.path.basename()
        # because experimentally, it makes a significant difference to
        # the running time.
        if filepath.endswith(THIS_FILE_BASE_NAME) and \
           line_number > LINE_NUMBER_BEFORE_LICENSE_ISSUE_TRACKER:
            # Avoid false positives from the code in this class.
            # Also skip the rest of this file, which is highly unlikely to
            # contain any problematic statements since we put those near the
            # top of files.
            return False

        m = self.COPYRIGHT_RE.match(line)
        if m and m.group(1) != self.COPYRIGHT_HOLDER:
            self.problem = 'Invalid copyright line'
            return True

        m = self.SPDX_RE.match(line)
        if m:
            # The regular expression is case-insensitive, so compare
            # the key as found in the line with the exact expected spelling.
            if m.group(1) != self.SPDX_HEADER_KEY:
                self.problem = 'Misspelled ' + self.SPDX_HEADER_KEY.decode()
                return True
            if not m.group(3):
                self.problem = 'Improperly formatted SPDX license identifier'
                return True
            if m.group(3) != self.LICENSE_IDENTIFIER:
                self.problem = 'Wrong SPDX license identifier'
                return True

        m = self.LICENSE_MENTION_RE.match(line)
        if m:
            self.problem = 'Suspicious license mention'
            return True

        return False


class ErrorAddIssueTracker(LineIssueTracker):
    """Signal direct additions of error codes.

    Adding a low-level error code with a high-level error code is deprecated
    and should use MBEDTLS_ERROR_ADD.
    """

    heading = "Direct addition of error codes"

    _ERR_PLUS_RE = re.compile(br'MBEDTLS_ERR_\w+ *\+|'
                              br'\+ *MBEDTLS_ERR_')
    _EXCLUDE_RE = re.compile(br' *case ')

    def issue_with_line(self, line, filepath, line_number):
        """Whether the line adds to an error code outside of a ``case``."""
        return bool(self._ERR_PLUS_RE.search(line) and
                    not self._EXCLUDE_RE.match(line))


class IntegrityChecker:
    """Sanity-check files under the current directory."""

    def __init__(self, log_file):
        """Instantiate the sanity checker.
        Check files under the current directory.
        Write a report of issues to log_file."""
        if not build_tree.looks_like_root(os.getcwd()):
            raise Exception("This script must be run from Mbed TLS or TF-PSA-Crypto root")
        self.logger = None
        self.setup_logger(log_file)
        self.issues_to_check = [
            ShebangIssueTracker(),
            EndOfFileNewlineIssueTracker(),
            Utf8BomIssueTracker(),
            UnicodeIssueTracker(),
            UnixLineEndingIssueTracker(),
            WindowsLineEndingIssueTracker(),
            TrailingWhitespaceIssueTracker(),
            TabIssueTracker(),
            MergeArtifactIssueTracker(),
            LicenseIssueTracker(),
        ]

        if not build_tree.is_mbedtls_3_6():
            self.issues_to_check.append(ErrorAddIssueTracker())

    def setup_logger(self, log_file, level=logging.INFO):
        """Log to log_file if provided, or to stderr if None."""
        self.logger = logging.getLogger()
        self.logger.setLevel(level)
        if log_file:
            handler = logging.FileHandler(log_file)
            self.logger.addHandler(handler)
        else:
            console = logging.StreamHandler()
            self.logger.addHandler(console)

    @staticmethod
    def collect_files():
        """Return the list of files to check.

        These are the regular files committed into Git, both in the
        current directory's repository and in the ``framework`` directory.
        """
        bytes_output = subprocess.check_output(['git', '-C', 'framework',
                                                'ls-files', '-z'])
        # With -z, each file name is followed by a null byte, so the last
        # element after splitting is empty.
        bytes_framework_filepaths = [b'framework/' + filepath
                                     for filepath in bytes_output.split(b'\0')[:-1]]

        bytes_output = subprocess.check_output(['git', 'ls-files', '-z'])
        bytes_filepaths = bytes_output.split(b'\0')[:-1] + \
                          bytes_framework_filepaths
        ascii_filepaths = [fp.decode('ascii') for fp in bytes_filepaths]

        # Filter out directories. Normally Git doesn't list directories
        # (it only knows about the files inside them), but there is
        # at least one case where 'git ls-files' includes a directory:
        # submodules. Just skip submodules (and any other directories).
        ascii_filepaths = [fp for fp in ascii_filepaths
                           if os.path.isfile(fp)]
        # Prepend './' to files in the top-level directory so that
        # something like `'/Makefile' in fp` matches in the top-level
        # directory as well as in subdirectories.
        return [fp if os.path.dirname(fp) else os.path.join(os.curdir, fp)
                for fp in ascii_filepaths]

    def check_files(self):
        """Check all files for all issues."""
        # Collect the list of files only once: it requires running Git
        # and querying the file system for each file.
        filepaths = self.collect_files()
        for issue_to_check in self.issues_to_check:
            for filepath in filepaths:
                if issue_to_check.should_check_file(filepath):
                    issue_to_check.check_file_for_issue(filepath)

    def output_issues(self):
        """Log the issues found and their locations.

        Return 1 if there were issues, 0 otherwise.
        """
        integrity_return_code = 0
        for issue_to_check in self.issues_to_check:
            if issue_to_check.files_with_issues:
                integrity_return_code = 1
            issue_to_check.output_file_issues(self.logger)
        return integrity_return_code


def run_main():
    """Parse the command line, run all checks and exit with their status."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "-l", "--log_file", type=str, help="path to optional output log",
    )
    check_args = parser.parse_args()
    integrity_check = IntegrityChecker(check_args.log_file)
    integrity_check.check_files()
    return_code = integrity_check.output_issues()
    sys.exit(return_code)


if __name__ == "__main__":
    run_main()