git_archive_all.py (24365B)
#! /usr/bin/env python3
# coding=utf-8

# The MIT License (MIT)
#
# Copyright (c) 2010 Ilya Kulakov
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
from __future__ import print_function
from __future__ import unicode_literals

import logging
from os import environ, extsep, path, readlink
from subprocess import CalledProcessError, Popen, PIPE
import sys
import re

__version__ = "1.22.0"


try:
    # Python 3.2+
    from os import fsdecode
except ImportError:
    def fsdecode(filename):
        """
        Fallback for os.fsdecode: decode bytes using the filesystem encoding,
        return text unchanged.
        """
        if not isinstance(filename, unicode):
            return filename.decode(sys.getfilesystemencoding(), 'strict')
        else:
            return filename

try:
    # Python 3.2+
    from os import fsencode
except ImportError:
    def fsencode(filename):
        """
        Fallback for os.fsencode: encode text using the filesystem encoding,
        return bytes unchanged.
        """
        if not isinstance(filename, bytes):
            return filename.encode(sys.getfilesystemencoding(), 'strict')
        else:
            return filename


def git_fsdecode(filename):
    """
    Decode filename from git output into str.

    On Windows the bytes are decoded as UTF-8, elsewhere fsdecode is used.
    """
    if sys.platform.startswith('win32'):
        return filename.decode('utf-8')
    else:
        return fsdecode(filename)


def git_fsencode(filename):
    """
    Encode filename from str into git input.

    On Windows the text is encoded as UTF-8, elsewhere fsencode is used.
    """
    if sys.platform.startswith('win32'):
        return filename.encode('utf-8')
    else:
        return fsencode(filename)


try:
    # Python 3.6+
    from os import fspath as _fspath

    def fspath(filename, decoder=fsdecode, encoder=fsencode):
        """
        Convert filename into bytes or str, depending on what's the best type
        to represent paths for current Python and platform.

        @param filename: Path as str, bytes or an os.PathLike object.

        @param decoder: Callable used to turn the path into str.

        @param encoder: Unused by this variant; kept so both variants share one signature.
        """
        # Python 3.6+: str can represent any path (PEP 383)
        #              str is not required on Windows (PEP 529)
        # Decoding is still applied for consistency and to follow PEP 519 recommendation.
        return decoder(_fspath(filename))
except ImportError:
    def fspath(filename, decoder=fsdecode, encoder=fsencode):
        """
        Convert filename into bytes or str, depending on what's the best type
        to represent paths for current Python and platform.

        @param filename: Path as str, bytes or (on Python 3) a pathlib.PurePath.

        @param decoder: Callable used when the result should be str.

        @param encoder: Callable used when the result should be bytes.
        """
        # Python 3.4 and 3.5: str can represent any path (PEP 383),
        #                     but str is required on Windows (no PEP 529)
        #
        # Python 2.6 and 2.7: str cannot represent any path (no PEP 383),
        #                     str is required on Windows (no PEP 529)
        #                     bytes is required on POSIX (no PEP 383)
        if sys.version_info > (3,):
            import pathlib
            if isinstance(filename, pathlib.PurePath):
                return str(filename)
            else:
                return decoder(filename)
        elif sys.platform.startswith('win32'):
            return decoder(filename)
        else:
            return encoder(filename)


def git_fspath(filename):
    """
    fspath representation of git output.
    """
    return fspath(filename, git_fsdecode, git_fsencode)


class GitArchiver(object):
    """
    GitArchiver

    Scan a git repository and export all tracked files, and submodules.
    Checks for .gitattributes files in each directory and uses 'export-ignore'
    pattern entries for ignore files in the archive.

    >>> archiver = GitArchiver(main_repo_abspath='my/repo/path')
    >>> archiver.create('output.zip')
    """
    # Maps a supported output format to the mode string handed to tarfile.open.
    TARFILE_FORMATS = {
        'tar': 'w',
        'tbz2': 'w:bz2',
        'tgz': 'w:gz',
        'txz': 'w:xz',
        'bz2': 'w:bz2',
        'gz': 'w:gz',
        'xz': 'w:xz'
    }
    ZIPFILE_FORMATS = ('zip',)

    LOG = logging.getLogger('GitArchiver')

    def __init__(self, prefix='', exclude=True, force_sub=False, extra=None, main_repo_abspath=None, git_version=None):
        """
        @param prefix: Prefix used to prepend all paths in the resulting archive.
            Extra file paths are only prefixed if they are not relative.
            E.g. if prefix is 'foo' and extra is ['bar', '/baz'] the resulting archive will look like this:
            /
              baz
              foo/
                bar

        @param exclude: Determines whether archiver should follow rules specified in .gitattributes files.

        @param force_sub: Determines whether submodules are initialized and updated before archiving.

        @param extra: List of extra paths to include in the resulting archive.

        @param main_repo_abspath: Absolute path to the main repository (or one of subdirectories).
            If given path is path to a subdirectory (but not a submodule directory!) it will be replaced
            with abspath to top-level directory of the repository.
            If None, current cwd is used.

        @param git_version: Version of Git that determines whether various workarounds are on.
            If None, tries to resolve via Git's CLI.

        @raise ValueError: If the git version is older than 1.6.1, if main_repo_abspath is not absolute,
            or if it is not part of a git repository.
        """
        # One running `git check-attr` generator per repository that is currently being walked.
        self._check_attr_gens = {}
        # Per-repository cache: relative path -> whether the path is excluded.
        self._ignored_paths_cache = {}

        if git_version is None:
            git_version = self.get_git_version()

        if git_version is not None and git_version < (1, 6, 1):
            raise ValueError("git of version 1.6.1 and higher is required")

        self.git_version = git_version

        if main_repo_abspath is None:
            main_repo_abspath = path.abspath('')
        elif not path.isabs(main_repo_abspath):
            raise ValueError("main_repo_abspath must be an absolute path")

        self.main_repo_abspath = self.resolve_git_main_repo_abspath(main_repo_abspath)

        self.prefix = fspath(prefix)
        self.exclude = exclude
        self.extra = [fspath(e) for e in extra] if extra is not None else []
        self.force_sub = force_sub

    def create(self, output_path, dry_run=False, output_format=None, compresslevel=None):
        """
        Create the archive at output_path.

        Type of the archive is determined either by extension of output_path or by output_format.
        Supported formats are: gz, zip, bz2, xz, tar, tgz, tbz2, txz

        @param output_path: Output file path.

        @param dry_run: Determines whether create should do nothing but print what it would archive.

        @param output_format: Determines format of the output archive. If None, format is determined from extension
            of output_path.

        @param compresslevel: Optional compression level. Interpretation depends on the output format.

        @raise ValueError: If the format is unknown or the compression level cannot be applied.
        """
        output_path = fspath(output_path)

        if output_format is None:
            file_name, file_ext = path.splitext(output_path)
            output_format = file_ext[len(extsep):].lower()
            self.LOG.debug("Output format is not explicitly set, determined format is {0}.".format(output_format))

        if not dry_run:
            if output_format in self.ZIPFILE_FORMATS:
                from zipfile import ZipFile, ZipInfo, ZIP_DEFLATED

                if compresslevel is not None:
                    if sys.version_info > (3, 7):
                        archive = ZipFile(path.abspath(output_path), 'w', compresslevel=compresslevel)
                    else:
                        raise ValueError("Compression level for zip archives requires Python 3.7+")
                else:
                    archive = ZipFile(path.abspath(output_path), 'w')

                def add_file(file_path, arcname):
                    if not path.islink(file_path):
                        archive.write(file_path, arcname, ZIP_DEFLATED)
                    else:
                        # Store the symlink itself: the entry's content is the link target.
                        i = ZipInfo(arcname)
                        i.create_system = 3
                        i.external_attr = 0xA1ED0000
                        archive.writestr(i, readlink(file_path))
            elif output_format in self.TARFILE_FORMATS:
                import tarfile

                mode = self.TARFILE_FORMATS[output_format]

                if compresslevel is not None:
                    try:
                        archive = tarfile.open(path.abspath(output_path), mode, compresslevel=compresslevel)
                    except TypeError:
                        raise ValueError("{0} cannot be compressed".format(output_format))
                else:
                    archive = tarfile.open(path.abspath(output_path), mode)

                def add_file(file_path, arcname):
                    archive.add(file_path, arcname)
            else:
                raise ValueError("unknown format: {0}".format(output_format))

            def archiver(file_path, arcname):
                self.LOG.debug(fspath("{0} => {1}").format(file_path, arcname))
                add_file(file_path, arcname)
        else:
            archive = None

            def archiver(file_path, arcname):
                self.LOG.info(fspath("{0} => {1}").format(file_path, arcname))

        try:
            self.archive_all_files(archiver)
        finally:
            # Close the archive even when archiving fails so the file handle is not leaked.
            if archive is not None:
                archive.close()

    def is_file_excluded(self, repo_abspath, repo_file_path):
        """
        Checks whether file at a given path is excluded.

        A path without an explicit export-ignore value inherits the result of its parent directory.
        Results are cached per repository.

        @param repo_abspath: Absolute path to the git repository.

        @param repo_file_path: Path to a file relative to repo_abspath.

        @return: True if file should be excluded. Otherwise False.
        """
        if not self.exclude:
            return False

        cache = self._ignored_paths_cache.setdefault(repo_abspath, {})

        if repo_file_path not in cache:
            # Advance the generator to its receiving `yield`, then send the path to get its attributes.
            next(self._check_attr_gens[repo_abspath])
            attrs = self._check_attr_gens[repo_abspath].send(repo_file_path)
            export_ignore_attr = attrs['export-ignore']

            if export_ignore_attr == b'set':
                cache[repo_file_path] = True
            elif export_ignore_attr == b'unset':
                cache[repo_file_path] = False
            else:
                repo_file_dir_path = path.dirname(repo_file_path)

                if repo_file_dir_path:
                    cache[repo_file_path] = self.is_file_excluded(repo_abspath, repo_file_dir_path)
                else:
                    cache[repo_file_path] = False

        return cache[repo_file_path]

    def archive_all_files(self, archiver):
        """
        Archive all files using archiver.

        Extra files are passed first, followed by the files yielded by walk_git_files.

        @param archiver: Callable that accepts 2 arguments:
            abspath to file on the system and relative path within archive.
        """
        for file_path in self.extra:
            archiver(path.abspath(file_path), path.join(self.prefix, file_path))

        for file_path in self.walk_git_files():
            archiver(path.join(self.main_repo_abspath, file_path), path.join(self.prefix, file_path))

    def walk_git_files(self, repo_path=fspath('')):
        """
        An iterator method that yields a file path relative to main_repo_abspath
        for each file that should be included in the archive.
        Skips those that match the exclusion patterns found in
        any discovered .gitattributes files along the way.

        Recurs into submodules as well.

        @param repo_path: Path to the git submodule repository relative to main_repo_abspath.

        @return: Generator to traverse files under git control relative to main_repo_abspath.
        """
        repo_abspath = path.join(self.main_repo_abspath, fspath(repo_path))
        assert repo_abspath not in self._check_attr_gens
        self._check_attr_gens[repo_abspath] = self.check_git_attr(repo_abspath, ['export-ignore'])

        try:
            repo_file_paths = self.list_repo_files(repo_abspath)

            for repo_file_path in repo_file_paths:
                repo_file_abspath = path.join(repo_abspath, repo_file_path)  # absolute file path
                main_repo_file_path = path.join(repo_path, repo_file_path)  # relative to main_repo_abspath

                # Real directories are skipped; symlinks are kept even when they point to a directory.
                if not path.islink(repo_file_abspath) and path.isdir(repo_file_abspath):
                    continue

                if self.is_file_excluded(repo_abspath, repo_file_path):
                    continue

                yield main_repo_file_path

            if self.force_sub:
                self.run_git_shell('git submodule init', repo_abspath)
                self.run_git_shell('git submodule update', repo_abspath)

            try:
                repo_gitmodules_abspath = path.join(repo_abspath, fspath(".gitmodules"))

                with open(repo_gitmodules_abspath) as f:
                    lines = f.readlines()

                for l in lines:
                    # Non-greedy group: trailing whitespace (spaces, '\r') is not part of the path;
                    # git itself discards trailing whitespace of unquoted config values.
                    m = re.match("^\\s*path\\s*=\\s*(.*?)\\s*$", l)

                    if m:
                        repo_submodule_path = fspath(m.group(1))  # relative to repo_path
                        main_repo_submodule_path = path.join(repo_path, repo_submodule_path)  # relative to main_repo_abspath

                        if self.is_file_excluded(repo_abspath, repo_submodule_path):
                            continue

                        for main_repo_submodule_file_path in self.walk_git_files(main_repo_submodule_path):
                            repo_submodule_file_path = path.relpath(main_repo_submodule_file_path, repo_path)  # relative to repo_path
                            if self.is_file_excluded(repo_abspath, repo_submodule_file_path):
                                continue

                            yield main_repo_submodule_file_path
            except IOError:
                # A missing or unreadable .gitmodules means there are no submodules to walk.
                # NOTE(review): this also swallows IOError raised while walking submodules - confirm that is intended.
                pass
        finally:
            self._check_attr_gens[repo_abspath].close()
            del self._check_attr_gens[repo_abspath]

    def check_git_attr(self, repo_abspath, attrs):
        """
        Generator that returns git attributes for received paths relative to repo_abspath.

        >>> archiver = GitArchiver(...)
        >>> g = archiver.check_git_attr('repo_path', ['export-ignore'])
        >>> next(g)
        >>> attrs = g.send('relative_path')
        >>> print(attrs['export-ignore'])

        A single `git check-attr --stdin` process serves all queries; it is shut down
        when the generator is closed. If attrs is empty the generator finishes immediately.

        @param repo_abspath: Absolute path to a git repository.

        @param attrs: Attributes to check

        @raise RuntimeError: If check-attr exits early or produces unexpected output.
        """
        def make_process():
            # GIT_FLUSH=1 is set so answers are flushed after each query instead of sitting in git's buffer.
            env = dict(environ, GIT_FLUSH='1')
            cmd = 'git check-attr --stdin -z {0}'.format(' '.join(attrs))
            return Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, cwd=repo_abspath, env=env)

        def read_attrs(process, repo_file_path):
            process.stdin.write(repo_file_path + b'\0')
            process.stdin.flush()

            # For every attribute check-attr will output: <path> NUL <attribute> NUL <info> NUL
            attr_path, attr, info = b'', b'', b''
            nuls_count = 0
            nuls_expected = 3 * len(attrs)

            while nuls_count != nuls_expected:
                b = process.stdout.read(1)

                if b == b'' and process.poll() is not None:
                    raise RuntimeError("check-attr exited prematurely")
                elif b == b'\0':
                    nuls_count += 1

                    if nuls_count % 3 == 0:
                        yield attr_path, attr, info

                        attr_path, attr, info = b'', b'', b''
                elif nuls_count % 3 == 0:
                    attr_path += b
                elif nuls_count % 3 == 1:
                    attr += b
                elif nuls_count % 3 == 2:
                    info += b

        def read_attrs_old(process, repo_file_path):
            """
            Compatibility with versions 1.8.5 and below that do not recognize -z for output.
            """
            process.stdin.write(repo_file_path + b'\0')
            process.stdin.flush()

            # For every attribute check-attr will output: <path>: <attribute>: <info>\n
            # where <path> is c-quoted

            lines_count = 0
            lines_expected = len(attrs)

            while lines_count != lines_expected:
                line = process.stdout.readline()

                # Split from the right so a ": " inside the path does not confuse parsing.
                info_start = line.rfind(b': ')
                if info_start == -1:
                    raise RuntimeError("unexpected output of check-attr: {0}".format(line))

                attr_start = line.rfind(b': ', 0, info_start)
                if attr_start == -1:
                    raise RuntimeError("unexpected output of check-attr: {0}".format(line))

                attr_path = line[:attr_start]
                attr = line[attr_start + 2:info_start]  # trim leading ": "
                info = line[info_start + 2:len(line) - 1]  # trim leading ": " and trailing \n
                yield attr_path, attr, info

                lines_count += 1

        if not attrs:
            return

        process = make_process()

        if self.git_version is None or self.git_version > (1, 8, 5):
            reader = read_attrs
        else:
            reader = read_attrs_old

        try:
            while True:
                # Protocol: the first yield receives a path, the second hands back its attributes.
                repo_file_path = yield
                repo_file_path = git_fsencode(fspath(repo_file_path))
                repo_file_attrs = {}

                for _, attr, value in reader(process, repo_file_path):
                    attr = attr.decode('utf-8')
                    repo_file_attrs[attr] = value

                yield repo_file_attrs
        finally:
            process.stdin.close()
            process.wait()
            # Release the read end of the pipe as well so no descriptor is left open.
            process.stdout.close()

    def resolve_git_main_repo_abspath(self, abspath):
        """
        Return absolute path to the repo for a given path.

        @param abspath: Absolute path to the repository or to one of its subdirectories.

        @raise ValueError: If `git rev-parse --show-toplevel` fails for abspath.
        """
        try:
            main_repo_abspath = self.run_git_shell('git rev-parse --show-toplevel', cwd=abspath).rstrip()
            return path.abspath(git_fspath(main_repo_abspath))
        except CalledProcessError as e:
            raise ValueError("{0} is not part of a git repository ({1})".format(abspath, e.returncode))

    @classmethod
    def run_git_shell(cls, cmd, cwd=None):
        """
        Run git shell command and read its output.

        @param cmd: Command to be executed.

        @param cwd: Working directory.

        @return: Raw output of the command (bytes as read from the process; it is not decoded).

        @raise CalledProcessError: Raises exception if return code of the command is non-zero.
        """
        p = Popen(cmd, shell=True, stdout=PIPE, cwd=cwd)
        output, _ = p.communicate()

        if p.returncode:
            # The `output` argument of CalledProcessError was added in Python 2.7.
            if sys.version_info >= (2, 7):
                raise CalledProcessError(returncode=p.returncode, cmd=cmd, output=output)
            else:
                raise CalledProcessError(returncode=p.returncode, cmd=cmd)

        return output

    @classmethod
    def get_git_version(cls):
        """
        Return version of git current shell points to.

        The third whitespace-separated token of `git version` output is split on dots;
        non-numeric components are mapped to 0.

        If version cannot be parsed None is returned.

        @return: Tuple of ints, or None.
        """
        try:
            output = cls.run_git_shell('git version')
        except CalledProcessError:
            cls.LOG.warning("Unable to get Git version.")
            return None

        try:
            version = output.split()[2]
        except IndexError:
            cls.LOG.warning("Unable to parse Git version \"%s\".", output)
            return None

        try:
            return tuple(int(v) if v.isdigit() else 0 for v in version.split(b'.'))
        except ValueError:
            cls.LOG.warning("Unable to parse Git version \"%s\".", version)
            return None

    @classmethod
    def list_repo_files(cls, repo_abspath):
        """
        List files tracked by git in the repository at repo_abspath.

        @param repo_abspath: Absolute path to a git repository.

        @return: Iterable of paths converted with git_fspath; on Windows forward slashes
            are replaced with backslashes first.

        @raise CalledProcessError: If `git ls-files` fails.
        """
        repo_file_paths = cls.run_git_shell(
            'git ls-files -z --cached --full-name --no-empty-directory',
            cwd=repo_abspath
        )
        # Output is NUL-terminated, so the last element of the split is always empty.
        repo_file_paths = repo_file_paths.split(b'\0')[:-1]

        if sys.platform.startswith('win32'):
            repo_file_paths = (git_fspath(p.replace(b'/', b'\\')) for p in repo_file_paths)
        else:
            repo_file_paths = map(git_fspath, repo_file_paths)

        return repo_file_paths


def main(argv=None):
    """
    Command line entry point.

    @param argv: Full argument vector including the program name. If None, sys.argv is used.

    @return: 0 on success. Invalid usage and archiving errors terminate via the option parser with status 2.
    """
    if argv is None:
        argv = sys.argv

    from optparse import OptionParser, SUPPRESS_HELP

    parser = OptionParser(
        usage="usage: %prog [-v] [-C BASE_REPO] [--prefix PREFIX] [--no-export-ignore]"
              " [--force-submodules] [--include EXTRA1 ...] [--dry-run] [-0 | ... | -9] OUTPUT_FILE",
        version="%prog {0}".format(__version__)
    )

    parser.add_option('--prefix',
                      type='string',
                      dest='prefix',
                      default=None,
                      help="""prepend PREFIX to each filename in the archive;
                          defaults to OUTPUT_FILE name""")

    parser.add_option('-C',
                      type='string',
                      dest='base_repo',
                      default=None,
                      help="""use BASE_REPO as the main git repository to archive;
                          defaults to the current directory when empty""")

    parser.add_option('-v', '--verbose',
                      action='store_true',
                      dest='verbose',
                      help='enable verbose mode')

    parser.add_option('--no-export-ignore', '--no-exclude',
                      action='store_false',
                      dest='exclude',
                      default=True,
                      help="ignore the [-]export-ignore attribute in .gitattributes")

    parser.add_option('--force-submodules',
                      action='store_true',
                      dest='force_sub',
                      help='force `git submodule init && git submodule update` at each level before iterating submodules')

    parser.add_option('--include', '--extra',
                      action='append',
                      dest='extra',
                      default=[],
                      help="additional files to include in the archive")

    parser.add_option('--dry-run',
                      action='store_true',
                      dest='dry_run',
                      help="show files to be archived without actually creating the archive")

    # -0 ... -9 select the compression level; they are hidden from --help.
    for i in range(10):
        parser.add_option('-{0}'.format(i),
                          action='store_const',
                          const=i,
                          dest='compresslevel',
                          help=SUPPRESS_HELP)

    options, args = parser.parse_args(argv[1:])

    if len(args) != 1:
        parser.error("You must specify exactly one output file")

    output_file_path = args[0]

    if path.isdir(output_file_path):
        parser.error("You cannot use directory as output")

    # avoid tarbomb
    if options.prefix is not None:
        options.prefix = path.join(options.prefix, '')
    else:
        # Default prefix is the output file name without its archive extension.
        output_name = path.basename(output_file_path)
        output_name = re.sub(
            '(\\.zip|\\.tar|\\.tbz2|\\.tgz|\\.txz|\\.bz2|\\.gz|\\.xz|\\.tar\\.bz2|\\.tar\\.gz|\\.tar\\.xz)$',
            '',
            output_name
        ) or "Archive"
        options.prefix = path.join(output_name, '')

    try:
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter('%(message)s'))
        GitArchiver.LOG.addHandler(handler)
        GitArchiver.LOG.setLevel(logging.DEBUG if options.verbose else logging.INFO)
        archiver = GitArchiver(options.prefix,
                               options.exclude,
                               options.force_sub,
                               options.extra,
                               path.abspath(options.base_repo) if options.base_repo is not None else None
                               )
        archiver.create(output_file_path, options.dry_run, compresslevel=options.compresslevel)
    except Exception as e:
        # Report any failure as a one-line message and exit with status 2.
        parser.exit(2, "{0}\n".format(e))

    return 0


if __name__ == '__main__':
    sys.exit(main())