diff options
author | Florian Dold <florian.dold@gmail.com> | 2019-08-07 22:45:47 +0200 |
---|---|---|
committer | Florian Dold <florian.dold@gmail.com> | 2019-08-07 22:45:47 +0200 |
commit | 65e39b7046a29aa299f06285441b62bcf1e4df01 (patch) | |
tree | 2eb012aabb59533b954aa169199733292de336cf /deps/v8/build/android/pylib/symbols/elf_symbolizer.py | |
parent | 936cd90b7def6ef7c1e0b80265a9dc77a9ad23c6 (diff) | |
download | android-node-v8-65e39b7046a29aa299f06285441b62bcf1e4df01.tar.gz android-node-v8-65e39b7046a29aa299f06285441b62bcf1e4df01.tar.bz2 android-node-v8-65e39b7046a29aa299f06285441b62bcf1e4df01.zip |
Move v8/build into this repository.
Since we need to patch some files, we don't let depot_tools
manage these files anymore.
build.git commit a0b2e3b2708bcf81ec00ac1738b586bcc5e04eea
Diffstat (limited to 'deps/v8/build/android/pylib/symbols/elf_symbolizer.py')
-rw-r--r-- | deps/v8/build/android/pylib/symbols/elf_symbolizer.py | 487 |
1 files changed, 487 insertions, 0 deletions
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import collections
import datetime
import logging
import multiprocessing
import os
import posixpath
import re
import subprocess
import sys
import threading
import time

try:
  import Queue  # Python 2 module name.
except ImportError:
  import queue as Queue  # Python 3 renamed the module to |queue|.


# addr2line builds a possibly infinite memory cache that can exhaust
# the computer's memory if allowed to grow for too long. This constant
# controls how many lookups we do before restarting the process. 4000
# gives near peak performance without extreme memory usage.
ADDR2LINE_RECYCLE_LIMIT = 4000


# Bytes literal so that it compares equal to data read from a file opened in
# binary mode. On Python 2 this is the very same str object type as before.
ELF_MAGIC = b'\x7f\x45\x4c\x46'


def ContainsElfMagic(file_path):
  """Tells whether |file_path| starts with the 4-byte ELF magic number.

  Args:
    file_path: path of the file to inspect.

  Returns:
    True if the first four bytes of the file equal |ELF_MAGIC|. False if the
    file is shorter than four bytes, starts with something else, or cannot be
    opened / read.

  Raises:
    OSError: propagated from os.path.getsize(), e.g. if |file_path| does not
        exist.
  """
  if os.path.getsize(file_path) < 4:
    return False
  try:
    # Binary mode is required: ELF files are not text, so a text-mode read
    # may fail to decode (Python 3) or mangle newline bytes (Windows).
    with open(file_path, 'rb') as f:
      return f.read(4) == ELF_MAGIC
  except IOError:
    return False


class ELFSymbolizer(object):
  """An uber-fast (multiprocessing, pipelined and asynchronous) ELF symbolizer.

  This class is a frontend for addr2line (part of GNU binutils), designed to
  symbolize batches of large numbers of symbols for a given ELF file. It
  supports sharding symbolization against many addr2line instances and
  pipelining of multiple requests per each instance (in order to hide addr2line
  internals and OS pipe latencies).

  The interface exhibited by this class is a very simple asynchronous interface,
  which is based on the following three methods:
  - SymbolizeAsync(): used to request (enqueue) resolution of a given address.
  - The |callback| method: used to communicate back the symbol information.
  - Join(): called to conclude the batch to gather the last outstanding results.
  In essence, before the Join method returns, this class will have issued as
  many callbacks as the number of SymbolizeAsync() calls. In this regard, note
  that due to multiprocess sharding, callbacks can be delivered out of order.

  Some background about addr2line:
  - it is invoked passing the elf path in the cmdline, piping the addresses in
    its stdin and getting results on its stdout.
  - it has pretty large response times for the first requests, but it
    works very well in streaming mode once it has been warmed up.
  - it doesn't scale by itself (on more cores). However, spawning multiple
    instances at the same time on the same file is pretty efficient as they
    keep hitting the pagecache and become mostly CPU bound.
  - it might hang or crash, mostly for OOM. This class deals with both of these
    problems.

  Despite the "scary" imports and the multi* words above, (almost) no multi-
  threading/processing is involved from the python viewpoint. Concurrency
  here is achieved by spawning several addr2line subprocesses and handling their
  output pipes asynchronously. Therefore, all the code here (with the exception
  of the Queue instance in Addr2Line) should be free from mind-blowing
  thread-safety concerns.

  The multiprocess sharding works as follows:
  The symbolizer tries to use the lowest number of addr2line instances as
  possible (with respect of |max_concurrent_jobs|) and enqueue all the requests
  in a single addr2line instance. For few symbols (i.e. dozens) sharding isn't
  worth the startup cost.
  The multiprocess logic kicks in as soon as the queues for the existing
  instances grow. Specifically, once all the existing instances reach the
  |max_queue_size| bound, a new addr2line instance is kicked in.
  In the case of a very eager producer (i.e. all |max_concurrent_jobs| instances
  have a backlog of |max_queue_size|), back-pressure is applied on the caller by
  blocking the SymbolizeAsync method.

  This module has been deliberately designed to be dependency free (w.r.t. of
  other modules in this project), to allow easy reuse in external projects.
  """

  def __init__(self, elf_file_path, addr2line_path, callback, inlines=False,
      max_concurrent_jobs=None, addr2line_timeout=30, max_queue_size=50,
      source_root_path=None, strip_base_path=None):
    """Args:
      elf_file_path: path of the elf file to be symbolized.
      addr2line_path: path of the toolchain's addr2line binary.
      callback: a callback which will be invoked for each resolved symbol with
          the two args (sym_info, callback_arg). The former is an instance of
          |ELFSymbolInfo| and contains the symbol information. The latter is an
          embedder-provided argument which is passed to SymbolizeAsync().
      inlines: when True, the ELFSymbolInfo will contain also the details about
          the outer inlining functions. When False, only the innermost function
          will be provided.
      max_concurrent_jobs: Max number of addr2line instances spawned.
          Parallelize responsibly, addr2line is a memory and I/O monster.
          Defaults to min(cpu count, 4).
      max_queue_size: Max number of outstanding requests per addr2line instance.
      addr2line_timeout: Max time (in seconds) to wait for a addr2line response.
          After the timeout, the instance will be considered hung and respawned.
      source_root_path: In some toolchains only the name of the source file
          is output, without any path information; disambiguation searches
          through the source directory specified by |source_root_path| argument
          for files whose name matches, adding the full path information to the
          output. For example, if the toolchain outputs "unicode.cc" and there
          is a file called "unicode.cc" located under |source_root_path|/foo,
          the tool will replace "unicode.cc" with
          "|source_root_path|/foo/unicode.cc". If there are multiple files with
          the same name, disambiguation will fail because the tool cannot
          determine which of the files was the source of the symbol.
      strip_base_path: Rebases the symbols source paths onto |source_root_path|
          (i.e. replaces a leading |strip_base_path| with |source_root_path|,
          or simply drops it when no |source_root_path| was given).
    """
    assert(os.path.isfile(addr2line_path)), 'Cannot find ' + addr2line_path
    self.elf_file_path = elf_file_path
    self.addr2line_path = addr2line_path
    self.callback = callback
    self.inlines = inlines
    self.max_concurrent_jobs = (max_concurrent_jobs or
                                min(multiprocessing.cpu_count(), 4))
    self.max_queue_size = max_queue_size
    self.addr2line_timeout = addr2line_timeout
    self.requests_counter = 0  # For generating monotonic request IDs.
    self._a2l_instances = []  # Up to |max_concurrent_jobs| _Addr2Line inst.

    # If necessary, create disambiguation lookup table.
    self.disambiguate = source_root_path is not None
    self.disambiguation_table = {}
    self.strip_base_path = strip_base_path
    # Always defined: Addr2Line._ProcessSymbolOutput reads this attribute
    # whenever |strip_base_path| is set, even if disambiguation is off.
    self.source_root_path = None
    if self.disambiguate:
      self.source_root_path = os.path.abspath(source_root_path)
      self._CreateDisambiguationTable()

    # Create one addr2line instance. More instances will be created on demand
    # (up to |max_concurrent_jobs|) depending on the rate of the requests.
    self._CreateNewA2LInstance()

  def SymbolizeAsync(self, addr, callback_arg=None):
    """Requests symbolization of a given address.

    This method is not guaranteed to return immediately. It generally does, but
    in some scenarios (e.g. all addr2line instances have full queues) it can
    block to create back-pressure.

    Args:
      addr: address to symbolize.
      callback_arg: optional argument which will be passed to the |callback|."""
    assert isinstance(addr, int)

    # Process all the symbols that have been resolved in the meanwhile.
    # Essentially, this drains all the addr2line(s) out queues.
    for a2l_to_purge in self._a2l_instances:
      a2l_to_purge.ProcessAllResolvedSymbolsInQueue()
      a2l_to_purge.RecycleIfNecessary()

    # Find the best instance according to this logic:
    # 1. Find an existing instance with the shortest queue.
    # 2. If all of instances' queues are full, but there is room in the pool,
    #    (i.e. < |max_concurrent_jobs|) create a new instance.
    # 3. If there were already |max_concurrent_jobs| instances and all of them
    #    had full queues, make back-pressure.

    # 1.
    def _SortByQueueSizeAndReqID(a2l):
      return (a2l.queue_size, a2l.first_request_id)
    a2l = min(self._a2l_instances, key=_SortByQueueSizeAndReqID)

    # 2.
    if (a2l.queue_size >= self.max_queue_size and
        len(self._a2l_instances) < self.max_concurrent_jobs):
      a2l = self._CreateNewA2LInstance()

    # 3.
    if a2l.queue_size >= self.max_queue_size:
      a2l.WaitForNextSymbolInQueue()

    a2l.EnqueueRequest(addr, callback_arg)

  def WaitForIdle(self):
    """Waits for all the outstanding requests to complete."""
    for a2l in self._a2l_instances:
      a2l.WaitForIdle()

  def Join(self):
    """Waits for all the outstanding requests to complete and terminates."""
    for a2l in self._a2l_instances:
      a2l.WaitForIdle()
      a2l.Terminate()

  def _CreateNewA2LInstance(self):
    """Spawns one more Addr2Line wrapper, adds it to the pool and returns it."""
    assert len(self._a2l_instances) < self.max_concurrent_jobs
    a2l = ELFSymbolizer.Addr2Line(self)
    self._a2l_instances.append(a2l)
    return a2l

  def _CreateDisambiguationTable(self):
    """Builds the file name -> full path table from |source_root_path|.

    Non-unique file names will result in None entries."""
    start_time = time.time()
    logging.info('Collecting information about available source files...')
    self.disambiguation_table = {}

    for root, _, filenames in os.walk(self.source_root_path):
      for f in filenames:
        if f in self.disambiguation_table:
          # Seen more than once: the bare name cannot be resolved reliably.
          self.disambiguation_table[f] = None
        else:
          self.disambiguation_table[f] = os.path.join(root, f)
    logging.info('Finished collecting information about '
                 'possible files (took %.1f s).',
                 (time.time() - start_time))


  class Addr2Line(object):
    """A python wrapper around an addr2line instance.

    The communication with the addr2line process looks as follows:
      [STDIN]         [STDOUT]  (from addr2line's viewpoint)
    > f001111
    > f002222
                    < Symbol::Name(foo, bar) for f001111
                    < /path/to/source/file.c:line_number
    > f003333
                    < Symbol::Name2() for f002222
                    < /path/to/source/file.c:line_number
                    < Symbol::Name3() for f003333
                    < /path/to/source/file.c:line_number
    """

    SYM_ADDR_RE = re.compile(r'([^:]+):(\?|\d+).*')

    def __init__(self, symbolizer):
      """Args:
        symbolizer: the owning ELFSymbolizer; provides the configuration
            (paths, inlines, timeout), the request counter and the callback.
      """
      self._symbolizer = symbolizer
      self._lib_file_name = posixpath.basename(symbolizer.elf_file_path)

      # The request queue (i.e. addresses pushed to addr2line's stdin and not
      # yet retrieved on stdout)
      self._request_queue = collections.deque()

      # This is essentially len(self._request_queue). It has been optimized to a
      # separate field because turned out to be a perf hot-spot.
      self.queue_size = 0

      # Keep track of the number of symbols a process has processed to
      # avoid a single process growing too big and using all the memory.
      self._processed_symbols_count = 0

      # Objects required to handle the addr2line subprocess.
      self._proc = None  # Subprocess.Popen(...) instance.
      self._thread = None  # Threading.thread instance.
      self._out_queue = None  # Queue.Queue instance (for buffering a2l stdout).
      self._RestartAddr2LineProcess()

    def EnqueueRequest(self, addr, callback_arg):
      """Pushes an address to addr2line's stdin (and keeps track of it)."""
      self._symbolizer.requests_counter += 1  # For global "age" of requests.
      req_idx = self._symbolizer.requests_counter
      self._request_queue.append((addr, callback_arg, req_idx))
      self.queue_size += 1
      self._WriteToA2lStdin(addr)

    def WaitForIdle(self):
      """Waits until all the pending requests have been symbolized."""
      while self.queue_size > 0:
        self.WaitForNextSymbolInQueue()

    def WaitForNextSymbolInQueue(self):
      """Waits for the next pending request to be symbolized.

      Returns immediately if nothing is pending. Respawns addr2line (replaying
      the pending requests) if it exited or produced no output for longer than
      the symbolizer's |addr2line_timeout|."""
      if not self.queue_size:
        return

      timeout = datetime.timedelta(seconds=self._symbolizer.addr2line_timeout)

      # This outer loop guards against a2l hanging (detecting stdout timeout).
      while True:
        start_time = datetime.datetime.now()

        # The inner loop guards against a2l crashing (checking if it exited).
        while datetime.datetime.now() - start_time < timeout:
          # poll() returns None while the process is alive and its return code
          # once it has exited. The code may legitimately be 0, hence the
          # explicit comparison against None. a2l should never exit.
          if self._proc.poll() is not None:
            logging.warning('addr2line crashed, respawning (lib: %s).',
                            self._lib_file_name)
            self._RestartAddr2LineProcess()
            # TODO(primiano): the best thing to do in this case would be
            # shrinking the pool size as, very likely, addr2line is crashed
            # due to low memory (and the respawned one will die again soon).

          try:
            lines = self._out_queue.get(block=True, timeout=0.25)
          except Queue.Empty:
            # On timeout (1/4 s.) repeat the inner loop and check if either the
            # addr2line process did crash or we waited its output for too long.
            continue

          # In nominal conditions, we get straight to this point.
          self._ProcessSymbolOutput(lines)
          return

        # If this point is reached, we waited more than |addr2line_timeout|.
        logging.warning('Hung addr2line process, respawning (lib: %s).',
                        self._lib_file_name)
        self._RestartAddr2LineProcess()

    def ProcessAllResolvedSymbolsInQueue(self):
      """Consumes all the addr2line output lines produced (without blocking)."""
      if not self.queue_size:
        return
      while True:
        try:
          lines = self._out_queue.get_nowait()
        except Queue.Empty:
          break
        self._ProcessSymbolOutput(lines)

    def RecycleIfNecessary(self):
      """Restarts the process if it has been used for too long.

      A long running addr2line process will consume excessive amounts
      of memory without any gain in performance."""
      if self._processed_symbols_count >= ADDR2LINE_RECYCLE_LIMIT:
        self._RestartAddr2LineProcess()

    def Terminate(self):
      """Kills the underlying addr2line process.

      The poller |_thread| will terminate as well due to the broken pipe."""
      try:
        self._proc.kill()
        self._proc.communicate()  # Essentially wait() without risking deadlock.
      except Exception:  # pylint: disable=broad-except
        # Deliberately best-effort: the process may already be gone, and a
        # failure to reap it must not abort the symbolization batch.
        pass
      self._proc = None

    def _WriteToA2lStdin(self, addr):
      """Writes |addr| (as a hex string) on a line of addr2line's stdin."""
      self._proc.stdin.write('%s\n' % hex(addr))
      if self._symbolizer.inlines:
        # In the case of inlines we output an extra blank line, which causes
        # addr2line to emit a (??,??:0) tuple that we use as a boundary marker.
        self._proc.stdin.write('\n')
      self._proc.stdin.flush()

    def _ProcessSymbolOutput(self, lines):
      """Parses an addr2line symbol output and triggers the client callback.

      Args:
        lines: list of (function_name_line, 'path:line' line) tuples for the
            oldest pending request, innermost frame first (more than one tuple
            only when inlines are enabled).
      """
      (_, callback_arg, _) = self._request_queue.popleft()
      self.queue_size -= 1

      innermost_sym_info = None
      sym_info = None
      for (line1, line2) in lines:
        prev_sym_info = sym_info
        name = line1 if not line1.startswith('?') else None
        source_path = None
        source_line = None
        m = ELFSymbolizer.Addr2Line.SYM_ADDR_RE.match(line2)
        if m:
          if not m.group(1).startswith('?'):
            source_path = m.group(1)
            if not m.group(2).startswith('?'):
              source_line = int(m.group(2))
        else:
          logging.warning('Got invalid symbol path from addr2line: %s', line2)

        # In case disambiguation is on, and needed
        was_ambiguous = False
        disambiguated = False
        if self._symbolizer.disambiguate:
          if source_path and not posixpath.isabs(source_path):
            path = self._symbolizer.disambiguation_table.get(source_path)
            was_ambiguous = True
            disambiguated = path is not None
            source_path = path if disambiguated else source_path

          # Use absolute paths (so that paths are consistent, as disambiguation
          # uses absolute paths)
          if source_path and not was_ambiguous:
            source_path = os.path.abspath(source_path)

        base_path = self._symbolizer.strip_base_path
        if source_path and base_path and source_path.startswith(base_path):
          # Rebase onto |source_root_path|. Plain prefix replacement rather
          # than re.sub(): neither path is a regular expression, so characters
          # such as '.', '+' or '\' must be taken literally.
          source_path = ((self._symbolizer.source_root_path or '') +
                         source_path[len(base_path):])

        sym_info = ELFSymbolInfo(name, source_path, source_line, was_ambiguous,
                                 disambiguated)
        if prev_sym_info:
          prev_sym_info.inlined_by = sym_info
        if not innermost_sym_info:
          innermost_sym_info = sym_info

      self._processed_symbols_count += 1
      self._symbolizer.callback(innermost_sym_info, callback_arg)

    def _RestartAddr2LineProcess(self):
      """(Re)spawns addr2line with a fresh output queue and reader thread.

      Any previous process is terminated first, and every request still in
      |_request_queue| is written again to the new process."""
      if self._proc:
        self.Terminate()

      # The only reason of existence of this Queue (and the corresponding
      # Thread below) is the lack of a subprocess.stdout.poll_avail_lines().
      # Essentially this is a pipe able to extract a couple of lines atomically.
      self._out_queue = Queue.Queue()

      # Start the underlying addr2line process in line buffered mode.

      cmd = [self._symbolizer.addr2line_path, '--functions', '--demangle',
          '--exe=' + self._symbolizer.elf_file_path]
      if self._symbolizer.inlines:
        cmd += ['--inlines']
      # universal_newlines=True makes the pipes text streams, which the str
      # based reads/writes in this class need on Python 3.
      self._proc = subprocess.Popen(cmd, bufsize=1, universal_newlines=True,
          stdout=subprocess.PIPE, stdin=subprocess.PIPE, stderr=sys.stderr,
          close_fds=True)

      # Start the poller thread, which simply moves atomically the lines read
      # from the addr2line's stdout to the |_out_queue|.
      self._thread = threading.Thread(
          target=ELFSymbolizer.Addr2Line.StdoutReaderThread,
          args=(self._proc.stdout, self._out_queue, self._symbolizer.inlines))
      self._thread.daemon = True  # Don't prevent early process exit.
      self._thread.start()

      self._processed_symbols_count = 0

      # Replay the pending requests on the new process (only for the case
      # of a hung addr2line timing out during the game).
      for (addr, _, _) in self._request_queue:
        self._WriteToA2lStdin(addr)

    @staticmethod
    def StdoutReaderThread(process_pipe, queue, inlines):
      """The poller thread fn, which moves the addr2line stdout to the |queue|.

      This is the only piece of code not running on the main thread. It merely
      writes to a Queue, which is thread-safe. In the case of inlines, it
      detects the ??,??:0 marker and sends the lines atomically, such that the
      main thread always receives all the lines corresponding to one symbol in
      one shot.

      Args:
        process_pipe: file-like object to read addr2line's stdout from.
        queue: Queue.Queue receiving one list of (line1, line2) tuples per
            symbol.
        inlines: whether addr2line was started with --inlines."""
      try:
        lines_for_one_symbol = []
        while True:
          line1 = process_pipe.readline().rstrip('\r\n')
          line2 = process_pipe.readline().rstrip('\r\n')
          if not line1 or not line2:
            break
          # NOTE(review): a pair is treated as the boundary marker as soon as
          # either line is the '??' / '??:0' placeholder, not only when both
          # are. Left as is; confirm against the toolchains in use before
          # tightening.
          inline_has_more_lines = inlines and (len(lines_for_one_symbol) == 0 or
                                  (line1 != '??' and line2 != '??:0'))
          if not inlines or inline_has_more_lines:
            lines_for_one_symbol += [(line1, line2)]
          if inline_has_more_lines:
            continue
          queue.put(lines_for_one_symbol)
          lines_for_one_symbol = []
        process_pipe.close()

      # Every addr2line processes will die at some point, please die silently.
      except (IOError, OSError):
        pass

    @property
    def first_request_id(self):
      """Returns the request_id of the oldest pending request in the queue."""
      return self._request_queue[0][2] if self._request_queue else 0


class ELFSymbolInfo(object):
  """The result of the symbolization passed as first arg. of each callback."""

  def __init__(self, name, source_path, source_line, was_ambiguous=False,
               disambiguated=False):
    """All the fields here can be None (if addr2line replies with '??')."""
    self.name = name
    self.source_path = source_path
    self.source_line = source_line
    # In the case of |inlines|=True, the |inlined_by| points to the outer
    # function inlining the current one (and so on, to form a chain).
    self.inlined_by = None
    self.disambiguated = disambiguated
    self.was_ambiguous = was_ambiguous

  def __str__(self):
    return '%s [%s:%d]' % (
        self.name or '??', self.source_path or '??', self.source_line or 0)