#!/usr/bin/env python
#
# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""This script creates a "jumbo" file which merges all incoming files
for compiling."""

from __future__ import print_function
from __future__ import unicode_literals

import argparse
import hashlib
import io
import os


def cut_ranges(boundaries):
  # Given an increasing sequence of boundary indices, generate a sequence of
  # non-overlapping ranges. The total range is inclusive of the first index
  # and exclusive of the last index from the given sequence.
  for start, stop in zip(boundaries, boundaries[1:]):
    yield range(start, stop)


def generate_chunk_stops(inputs, output_count, smart_merge=True):
  # Note: In the comments below, unique numeric labels are assigned to files.
  # Consider them as the sorted rank of the hash of each file path.

  # Simple jumbo chunking generates uniformly sized chunks with the ceiling of:
  #   (output_index + 1) * input_count / output_count
  input_count = len(inputs)
  stops = [((i + 1) * input_count + output_count - 1) // output_count
           for i in range(output_count)]
  # This is disruptive at times because file insertions and removals can
  # invalidate many chunks as all files are offset by one.
  # For example, say we have 12 files in 4 uniformly sized chunks:
  #   9, 4, 0; 7, 1, 11; 5, 10, 2; 6, 3, 8
  # If we delete the first file we get:
  #   4, 0, 7; 1, 11, 5; 10, 2, 6; 3, 8
  # All of the chunks have new sets of inputs.

  # With path-aware chunking, we start with the uniformly sized chunks:
  #   9, 4, 0; 7, 1, 11; 5, 10, 2; 6, 3, 8
  # First we find the smallest rank in each of the chunks. Their indices are
  # stored in the |centers| list and in this example the ranks would be:
  #   0, 1, 2, 3
  # Then we find the largest rank between the centers. Their indices are
  # stored in the |stops| list and in this example the ranks would be:
  #   7, 11, 6
  # These files mark the boundaries between chunks, and these boundary files
  # are often maintained even as files are added or deleted.
  # In this example, 7, 11, and 6 are the first files in each chunk:
  #   9, 4, 0; 7, 1; 11, 5, 10, 2; 6, 3, 8
  # If we delete the first file and repeat the process we get:
  #   4, 0; 7, 1; 11, 5, 10, 2; 6, 3, 8
  # Only the first chunk has a new set of inputs.
  if smart_merge:
    # Starting with the simple chunks, every file is assigned a rank.
    # This requires a hash function that is stable across runs.
    hasher = lambda n: hashlib.md5(inputs[n].encode()).hexdigest()
    # In each chunk there is a key file with lowest rank; mark them.
    # Note that they will not easily change.
    centers = [min(indices, key=hasher)
               for indices in cut_ranges([0] + stops)]
    # Between each pair of key files there is a file with highest rank.
    # Mark these to be used as border files. They also will not easily change.
    # Forget the initial chunks and create new chunks by splitting the list at
    # every border file.
    stops = [max(indices, key=hasher)
             for indices in cut_ranges(centers)]
    stops.append(input_count)
  return stops


def write_jumbo_files(inputs, outputs, written_input_set, written_output_set):
  chunk_stops = generate_chunk_stops(inputs, len(outputs))

  written_inputs = 0
  for output_index, output_file in enumerate(outputs):
    written_output_set.add(output_file)
    if os.path.isfile(output_file):
      with open(output_file, "r") as current:
        current_jumbo_file = current.read()
    else:
      current_jumbo_file = None

    out = io.StringIO()
    out.write("/* This is a Jumbo file. Don't edit. */\n\n")
    out.write("/* Generated with merge_for_jumbo.py. */\n\n")
    input_limit = chunk_stops[output_index]
    while written_inputs < input_limit:
      filename = inputs[written_inputs]
      written_inputs += 1
      out.write("#include \"%s\"\n" % filename)
      written_input_set.add(filename)
    new_jumbo_file = out.getvalue()
    out.close()

    # Only rewrite the output if its contents actually changed, so unchanged
    # chunks keep their timestamps and are not needlessly recompiled.
    if new_jumbo_file != current_jumbo_file:
      with open(output_file, "w") as out:
        out.write(new_jumbo_file)


def main():
  parser = argparse.ArgumentParser()
  parser.add_argument("--outputs", nargs="+", required=True,
                      help='List of output files to split input into')
  parser.add_argument("--file-list", required=True)
  parser.add_argument("--verbose", action="store_true")
  args = parser.parse_args()

  # If written with gn |write_file| each file is on its own line.
  with open(args.file_list) as file_list_file:
    lines = [line.strip() for line in file_list_file if line.strip()]

  # If written with gn |response_file_contents| the files are space separated.
  all_inputs = []
  for line in lines:
    all_inputs.extend(line.split())

  written_output_set = set()  # Just for double checking
  written_input_set = set()   # Just for double checking
  for language_ext in (".cc", ".c", ".mm"):
    if language_ext == ".cc":
      ext_pattern = (".cc", ".cpp")
    else:
      ext_pattern = (language_ext,)

    outputs = [x for x in args.outputs if x.endswith(ext_pattern)]
    inputs = [x for x in all_inputs if x.endswith(ext_pattern)]

    if not outputs:
      assert not inputs
      continue

    write_jumbo_files(inputs, outputs, written_input_set, written_output_set)

  assert set(args.outputs) == written_output_set, "Did not fill all outputs"
  assert set(all_inputs) == written_input_set, "Did not use all inputs"

  if args.verbose:
    print("Generated %s (%d files) based on %s" % (
        str(args.outputs), len(written_input_set), args.file_list))


if __name__ == "__main__":
  main()