Diffstat (limited to 'deps/v8/tools/run_perf.py')
-rw-r--r-- [-rwxr-xr-x]  deps/v8/tools/run_perf.py  937
1 file changed, 447 insertions, 490 deletions
diff --git a/deps/v8/tools/run_perf.py b/deps/v8/tools/run_perf.py
index 9e05be99e5..419cc47847 100755..100644
--- a/deps/v8/tools/run_perf.py
+++ b/deps/v8/tools/run_perf.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
# Copyright 2014 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
@@ -108,27 +107,30 @@ from __future__ import print_function
from functools import reduce
from collections import OrderedDict
-import datetime
+import copy
import json
import logging
import math
-import optparse
+import argparse
import os
import re
import subprocess
import sys
+import time
import traceback
+import numpy
+
from testrunner.local import android
from testrunner.local import command
from testrunner.local import utils
+from testrunner.objects.output import Output, NULL_OUTPUT
try:
basestring # Python 2
except NameError: # Python 3
basestring = str
-ARCH_GUESS = utils.DefaultArch()
SUPPORTED_ARCHS = ['arm',
'ia32',
'mips',
@@ -141,6 +143,7 @@ RESULT_STDDEV_RE = re.compile(r'^\{([^\}]+)\}$')
RESULT_LIST_RE = re.compile(r'^\[([^\]]+)\]$')
TOOLS_BASE = os.path.abspath(os.path.dirname(__file__))
INFRA_FAILURE_RETCODE = 87
+MIN_RUNS_FOR_CONFIDENCE = 10
def GeometricMean(values):
@@ -149,116 +152,130 @@ def GeometricMean(values):
The mean is calculated using log to avoid overflow.
"""
values = map(float, values)
- return str(math.exp(sum(map(math.log, values)) / len(values)))
-
-
-class TestFailedError(Exception):
- """Error raised when a test has failed due to a non-infra issue."""
- pass
-
-
-class Results(object):
- """Place holder for result traces."""
- def __init__(self, traces=None, errors=None):
- self.traces = traces or []
- self.errors = errors or []
- self.timeouts = []
- self.near_timeouts = [] # > 90% of the max runtime
+ return math.exp(sum(map(math.log, values)) / len(values))
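(Illustrative sketch, not part of the patch: the log form returned above avoids overflowing a float where the plain product of many large values would. A quick equivalence check:)

    import math

    def geometric_mean(values):
        # exp(mean(log(x))) equals the n-th root of the product of x.
        values = list(map(float, values))
        return math.exp(sum(map(math.log, values)) / len(values))

    assert abs(geometric_mean([2, 8]) - 4.0) < 1e-9  # sqrt(2 * 8) == 4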
+
+
+class ResultTracker(object):
+ """Class that tracks trace/runnable results and produces script output.
+
+ The output is structured like this:
+ {
+ "traces": [
+ {
+ "graphs": ["path", "to", "trace", "config"],
+ "units": <string describing units, e.g. "ms" or "KB">,
+ "results": [<list of values measured over several runs>],
+        "stddev": <stddev of the value if measured by script or ''>
+ },
+ ...
+ ],
+ "runnables": [
+ {
+ "graphs": ["path", "to", "runnable", "config"],
+ "durations": [<list of durations of each runnable run in seconds>],
+ "timeout": <timeout configured for runnable in seconds>,
+ },
+ ...
+ ],
+ "errors": [<list of strings describing errors>],
+ }
+ """
+ def __init__(self):
+ self.traces = {}
+ self.errors = []
+ self.runnables = {}
+
+ def AddTraceResult(self, trace, result, stddev):
+ if trace.name not in self.traces:
+ self.traces[trace.name] = {
+ 'graphs': trace.graphs,
+ 'units': trace.units,
+ 'results': [result],
+ 'stddev': stddev or '',
+ }
+ else:
+ existing_entry = self.traces[trace.name]
+ assert trace.graphs == existing_entry['graphs']
+ assert trace.units == existing_entry['units']
+ if stddev:
+ existing_entry['stddev'] = stddev
+ existing_entry['results'].append(result)
+
+ def TraceHasStdDev(self, trace):
+ return trace.name in self.traces and self.traces[trace.name]['stddev'] != ''
+
+ def AddError(self, error):
+ self.errors.append(error)
+
+ def AddRunnableDuration(self, runnable, duration):
+ """Records a duration of a specific run of the runnable."""
+ if runnable.name not in self.runnables:
+ self.runnables[runnable.name] = {
+ 'graphs': runnable.graphs,
+ 'durations': [duration],
+ 'timeout': runnable.timeout,
+ }
+ else:
+ existing_entry = self.runnables[runnable.name]
+ assert runnable.timeout == existing_entry['timeout']
+ assert runnable.graphs == existing_entry['graphs']
+ existing_entry['durations'].append(duration)
def ToDict(self):
return {
- 'traces': self.traces,
+ 'traces': self.traces.values(),
'errors': self.errors,
- 'timeouts': self.timeouts,
- 'near_timeouts': self.near_timeouts,
+ 'runnables': self.runnables.values(),
}
def WriteToFile(self, file_name):
with open(file_name, 'w') as f:
f.write(json.dumps(self.ToDict()))
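For orientation, the JSON written by WriteToFile might look like this for a single runnable with one trace (all values invented for illustration):

    {
      "traces": [
        {"graphs": ["v8", "JSTests", "Total"], "units": "score",
         "results": [102.3, 101.8], "stddev": ""}
      ],
      "runnables": [
        {"graphs": ["v8", "JSTests"], "durations": [12.1, 11.9], "timeout": 120}
      ],
      "errors": []
    }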
- def __add__(self, other):
- self.traces += other.traces
- self.errors += other.errors
- self.timeouts += other.timeouts
- self.near_timeouts += other.near_timeouts
- return self
-
- def __str__(self): # pragma: no cover
- return str(self.ToDict())
-
-
-class Measurement(object):
- """Represents a series of results of one trace.
-
- The results are from repetitive runs of the same executable. They are
- gathered by repeated calls to ConsumeOutput.
- """
- def __init__(self, graphs, units, results_regexp, stddev_regexp):
- self.name = '/'.join(graphs)
- self.graphs = graphs
- self.units = units
- self.results_regexp = results_regexp
- self.stddev_regexp = stddev_regexp
- self.results = []
- self.errors = []
- self.stddev = ''
- self.process_size = False
-
- def ConsumeOutput(self, stdout):
- try:
- result = re.search(self.results_regexp, stdout, re.M).group(1)
- self.results.append(str(float(result)))
- except ValueError:
- self.errors.append('Regexp "%s" returned a non-numeric for test %s.'
- % (self.results_regexp, self.name))
- except:
- self.errors.append('Regexp "%s" did not match for test %s.'
- % (self.results_regexp, self.name))
-
- try:
- if self.stddev_regexp and self.stddev:
- self.errors.append('Test %s should only run once since a stddev '
- 'is provided by the test.' % self.name)
- if self.stddev_regexp:
- self.stddev = re.search(self.stddev_regexp, stdout, re.M).group(1)
- except:
- self.errors.append('Regexp "%s" did not match for test %s.'
- % (self.stddev_regexp, self.name))
+ def HasEnoughRuns(self, graph_config, confidence_level):
+ """Checks if the mean of the results for a given trace config is within
+ 0.1% of the true value with the specified confidence level.
- def GetResults(self):
- return Results([{
- 'graphs': self.graphs,
- 'units': self.units,
- 'results': self.results,
- 'stddev': self.stddev,
- }], self.errors)
+    This assumes a Gaussian distribution of the noise and is based on
+ https://en.wikipedia.org/wiki/68%E2%80%9395%E2%80%9399.7_rule.
+ Args:
+ graph_config: An instance of GraphConfig.
+ confidence_level: Number of standard deviations from the mean that all
+ values must lie within. Typical values are 1, 2 and 3 and correspond
+ to 68%, 95% and 99.7% probability that the measured value is within
+ 0.1% of the true value.
+
+ Returns:
+      True if the specified confidence level has been achieved.
+ """
+ if not isinstance(graph_config, TraceConfig):
+ return all(self.HasEnoughRuns(child, confidence_level)
+ for child in graph_config.children)
-class NullMeasurement(object):
- """Null object to avoid having extra logic for configurations that don't
- require secondary run, e.g. CI bots.
- """
- def ConsumeOutput(self, stdout):
- pass
+ trace = self.traces.get(graph_config.name, {})
+ results = trace.get('results', [])
+ logging.debug('HasEnoughRuns for %s', graph_config.name)
- def GetResults(self):
- return Results()
+ if len(results) < MIN_RUNS_FOR_CONFIDENCE:
+ logging.debug(' Ran %d times, need at least %d',
+ len(results), MIN_RUNS_FOR_CONFIDENCE)
+ return False
+ logging.debug(' Results: %d entries', len(results))
+ mean = numpy.mean(results)
+ mean_stderr = numpy.std(results) / numpy.sqrt(len(results))
+ logging.debug(' Mean: %.2f, mean_stderr: %.2f', mean, mean_stderr)
+ return confidence_level * mean_stderr < mean / 1000.0
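To make the stopping rule concrete, a small sketch with synthetic numbers (assumed data, confidence_level=2, i.e. ~95%): the runnable has enough runs once twice the standard error of the mean drops below 0.1% of the mean.

    import numpy

    results = [100.0, 100.2, 99.9, 100.1, 100.0,
               99.8, 100.1, 100.0, 100.2, 99.9]   # 10 >= MIN_RUNS_FOR_CONFIDENCE
    mean = numpy.mean(results)                                    # ~100.02
    mean_stderr = numpy.std(results) / numpy.sqrt(len(results))   # ~0.04
    print(2 * mean_stderr < mean / 1000.0)                        # True -> stop rerunning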
-def Unzip(iterable):
- left = []
- right = []
- for l, r in iterable:
- left.append(l)
- right.append(r)
- return lambda: iter(left), lambda: iter(right)
+ def __str__(self): # pragma: no cover
+ return json.dumps(self.ToDict(), indent=2, separators=(',', ': '))
-def RunResultsProcessor(results_processor, stdout, count):
+def RunResultsProcessor(results_processor, output, count):
# Dummy pass through for null-runs.
- if stdout is None:
- return None
+ if output.stdout is None:
+ return output
# We assume the results processor is relative to the suite.
assert os.path.exists(results_processor)
@@ -268,112 +285,10 @@ def RunResultsProcessor(results_processor, stdout, count):
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
- result, _ = p.communicate(input=stdout)
- logging.info('>>> Processed stdout (#%d):\n%s', count, result)
- return result
-
-
-def AccumulateResults(
- graph_names, trace_configs, iter_output, perform_measurement, calc_total):
- """Iterates over the output of multiple benchmark reruns and accumulates
- results for a configured list of traces.
-
- Args:
- graph_names: List of names that configure the base path of the traces. E.g.
- ['v8', 'Octane'].
- trace_configs: List of 'TraceConfig' instances. Each trace config defines
- how to perform a measurement.
- iter_output: Iterator over the standard output of each test run.
- perform_measurement: Whether to actually run tests and perform measurements.
- This is needed so that we reuse this script for both CI
- and trybot, but want to ignore second run on CI without
- having to spread this logic throughout the script.
- calc_total: Boolean flag to speficy the calculation of a summary trace.
- Returns: A 'Results' object.
- """
- measurements = [
- trace.CreateMeasurement(perform_measurement) for trace in trace_configs]
- for stdout in iter_output():
- for measurement in measurements:
- measurement.ConsumeOutput(stdout)
-
- res = reduce(lambda r, m: r + m.GetResults(), measurements, Results())
-
- if not res.traces or not calc_total:
- return res
-
- # Assume all traces have the same structure.
- if len(set(map(lambda t: len(t['results']), res.traces))) != 1:
- res.errors.append('Not all traces have the same number of results.')
- return res
-
- # Calculate the geometric means for all traces. Above we made sure that
- # there is at least one trace and that the number of results is the same
- # for each trace.
- n_results = len(res.traces[0]['results'])
- total_results = [GeometricMean(t['results'][i] for t in res.traces)
- for i in range(0, n_results)]
- res.traces.append({
- 'graphs': graph_names + ['Total'],
- 'units': res.traces[0]['units'],
- 'results': total_results,
- 'stddev': '',
- })
- return res
-
-
-def AccumulateGenericResults(graph_names, suite_units, iter_output):
- """Iterates over the output of multiple benchmark reruns and accumulates
- generic results.
-
- Args:
- graph_names: List of names that configure the base path of the traces. E.g.
- ['v8', 'Octane'].
- suite_units: Measurement default units as defined by the benchmark suite.
- iter_output: Iterator over the standard output of each test run.
- Returns: A 'Results' object.
- """
- traces = OrderedDict()
- for stdout in iter_output():
- if stdout is None:
- # The None value is used as a null object to simplify logic.
- continue
- for line in stdout.strip().splitlines():
- match = GENERIC_RESULTS_RE.match(line)
- if match:
- stddev = ''
- graph = match.group(1)
- trace = match.group(2)
- body = match.group(3)
- units = match.group(4)
- match_stddev = RESULT_STDDEV_RE.match(body)
- match_list = RESULT_LIST_RE.match(body)
- errors = []
- if match_stddev:
- result, stddev = map(str.strip, match_stddev.group(1).split(','))
- results = [result]
- elif match_list:
- results = map(str.strip, match_list.group(1).split(','))
- else:
- results = [body.strip()]
-
- try:
- results = map(lambda r: str(float(r)), results)
- except ValueError:
- results = []
- errors = ['Found non-numeric in %s' %
- '/'.join(graph_names + [graph, trace])]
-
- trace_result = traces.setdefault(trace, Results([{
- 'graphs': graph_names + [graph, trace],
- 'units': (units or suite_units).strip(),
- 'results': [],
- 'stddev': '',
- }], errors))
- trace_result.traces[0]['results'].extend(results)
- trace_result.traces[0]['stddev'] = stddev
-
- return reduce(lambda r, t: r + t, traces.itervalues(), Results())
+ new_output = copy.copy(output)
+ new_output.stdout, _ = p.communicate(input=output.stdout)
+ logging.info('>>> Processed stdout (#%d):\n%s', count, output.stdout)
+ return new_output
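The results processor named in a suite config is an arbitrary executable that receives the raw stdout on stdin and must emit lines the trace regexps can match. A hypothetical minimal processor (file name, input and output formats invented for illustration):

    #!/usr/bin/env python
    # Hypothetical process_results.py: turn raw benchmark output into
    # 'Name: value' lines that a results_regexp such as r'^Richards: (.+)$'
    # could pick up.
    import sys

    for line in sys.stdin:
        if line.startswith('raw_score='):
            print('Richards: %s' % line.split('=', 1)[1].strip())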
class Node(object):
@@ -384,6 +299,10 @@ class Node(object):
def AppendChild(self, child):
self._children.append(child)
+ @property
+ def children(self):
+ return self._children
+
class DefaultSentinel(Node):
"""Fake parent node with all default values."""
@@ -392,7 +311,7 @@ class DefaultSentinel(Node):
self.binary = binary
self.run_count = 10
self.timeout = 60
- self.retry_count = 0
+ self.retry_count = 4
self.path = []
self.graphs = []
self.flags = []
@@ -465,6 +384,10 @@ class GraphConfig(Node):
stddev_default = None
self.stddev_regexp = suite.get('stddev_regexp', stddev_default)
+ @property
+ def name(self):
+ return '/'.join(self.graphs)
+
class TraceConfig(GraphConfig):
"""Represents a leaf in the suite tree structure."""
@@ -473,16 +396,46 @@ class TraceConfig(GraphConfig):
assert self.results_regexp
assert self.owners
- def CreateMeasurement(self, perform_measurement):
- if not perform_measurement:
- return NullMeasurement()
+ def ConsumeOutput(self, output, result_tracker):
+ """Extracts trace results from the output.
+
+ Args:
+ output: Output object from the test run.
+ result_tracker: Result tracker to be updated.
+
+ Returns:
+ The raw extracted result value or None if an error occurred.
+ """
+ result = None
+ stddev = None
+
+ try:
+ result = float(
+ re.search(self.results_regexp, output.stdout, re.M).group(1))
+ except ValueError:
+ result_tracker.AddError(
+ 'Regexp "%s" returned a non-numeric for test %s.' %
+ (self.results_regexp, self.name))
+ except:
+ result_tracker.AddError(
+ 'Regexp "%s" did not match for test %s.' %
+ (self.results_regexp, self.name))
- return Measurement(
- self.graphs,
- self.units,
- self.results_regexp,
- self.stddev_regexp,
- )
+ try:
+ if self.stddev_regexp:
+ if result_tracker.TraceHasStdDev(self):
+ result_tracker.AddError(
+ 'Test %s should only run once since a stddev is provided by the '
+ 'test.' % self.name)
+ stddev = re.search(self.stddev_regexp, output.stdout, re.M).group(1)
+ except:
+ result_tracker.AddError(
+ 'Regexp "%s" did not match for test %s.' %
+ (self.stddev_regexp, self.name))
+
+ if result:
+ result_tracker.AddTraceResult(self, result, stddev)
+ return result
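To illustrate a single extraction (regexp and stdout are assumed, not taken from a real suite config): the first capture group of results_regexp is parsed as a float and handed to the result tracker; a configured stddev_regexp works the same way, but additionally flags an error if the test runs more than once.

    import re

    results_regexp = r'^Richards: (.+)$'           # assumed suite config value
    stdout = 'Richards: 1234\nDeltaBlue: 5678\n'   # assumed benchmark output
    result = float(re.search(results_regexp, stdout, re.M).group(1))
    print(result)  # 1234.0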
class RunnableConfig(GraphConfig):
@@ -490,22 +443,12 @@ class RunnableConfig(GraphConfig):
"""
def __init__(self, suite, parent, arch):
super(RunnableConfig, self).__init__(suite, parent, arch)
- self.has_timeouts = False
- self.has_near_timeouts = False
+ self.arch = arch
@property
def main(self):
return self._suite.get('main', '')
- def PostProcess(self, stdouts_iter):
- if self.results_processor:
- def it():
- for i, stdout in enumerate(stdouts_iter()):
- yield RunResultsProcessor(self.results_processor, stdout, i + 1)
- return it
- else:
- return stdouts_iter
-
def ChangeCWD(self, suite_path):
"""Changes the cwd to to path defined in the current graph.
@@ -537,25 +480,36 @@ class RunnableConfig(GraphConfig):
args=self.GetCommandFlags(extra_flags=extra_flags),
timeout=self.timeout or 60)
- def Run(self, runner, trybot):
- """Iterates over several runs and handles the output for all traces."""
- stdout, stdout_secondary = Unzip(runner())
- return (
- AccumulateResults(
- self.graphs,
- self._children,
- iter_output=self.PostProcess(stdout),
- perform_measurement=True,
- calc_total=self.total,
- ),
- AccumulateResults(
- self.graphs,
- self._children,
- iter_output=self.PostProcess(stdout_secondary),
- perform_measurement=trybot, # only run second time on trybots
- calc_total=self.total,
- ),
- )
+ def ProcessOutput(self, output, result_tracker, count):
+ """Processes test run output and updates result tracker.
+
+ Args:
+ output: Output object from the test run.
+ result_tracker: ResultTracker object to be updated.
+ count: Index of the test run (used for better logging).
+ """
+ if self.results_processor:
+ output = RunResultsProcessor(self.results_processor, output, count)
+
+ results_for_total = []
+ for trace in self.children:
+ result = trace.ConsumeOutput(output, result_tracker)
+ if result:
+ results_for_total.append(result)
+
+ if self.total:
+ # Produce total metric only when all traces have produced results.
+ if len(self.children) != len(results_for_total):
+ result_tracker.AddError(
+ 'Not all traces have produced results. Can not compute total for '
+ '%s.' % self.name)
+ return
+
+      # Calculate the total as the geometric mean of results from all traces.
+ total_trace = TraceConfig(
+ {'name': 'Total', 'units': self.children[0].units}, self, self.arch)
+ result_tracker.AddTraceResult(
+ total_trace, GeometricMean(results_for_total), '')
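For example, two traces scoring 100 and 400 in the same run produce a Total of 200 (their geometric mean) rather than the arithmetic mean of 250.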
class RunnableTraceConfig(TraceConfig, RunnableConfig):
@@ -563,30 +517,9 @@ class RunnableTraceConfig(TraceConfig, RunnableConfig):
def __init__(self, suite, parent, arch):
super(RunnableTraceConfig, self).__init__(suite, parent, arch)
- def Run(self, runner, trybot):
- """Iterates over several runs and handles the output."""
- measurement = self.CreateMeasurement(perform_measurement=True)
- measurement_secondary = self.CreateMeasurement(perform_measurement=trybot)
- for stdout, stdout_secondary in runner():
- measurement.ConsumeOutput(stdout)
- measurement_secondary.ConsumeOutput(stdout_secondary)
- return (
- measurement.GetResults(),
- measurement_secondary.GetResults(),
- )
-
-
-class RunnableGenericConfig(RunnableConfig):
- """Represents a runnable suite definition with generic traces."""
- def __init__(self, suite, parent, arch):
- super(RunnableGenericConfig, self).__init__(suite, parent, arch)
-
- def Run(self, runner, trybot):
- stdout, stdout_secondary = Unzip(runner())
- return (
- AccumulateGenericResults(self.graphs, self.units, stdout),
- AccumulateGenericResults(self.graphs, self.units, stdout_secondary),
- )
+ def ProcessOutput(self, output, result_tracker, count):
+ result_tracker.AddRunnableDuration(self, output.duration)
+ self.ConsumeOutput(output, result_tracker)
def MakeGraphConfig(suite, arch, parent):
@@ -602,10 +535,6 @@ def MakeGraphConfig(suite, arch, parent):
else:
# This graph has no subgraphs, it's a leaf.
return RunnableTraceConfig(suite, parent, arch)
- elif suite.get('generic'):
- # This is a generic suite definition. It is either a runnable executable
- # or has a main js file.
- return RunnableGenericConfig(suite, parent, arch)
elif suite.get('tests'):
# This is neither a leaf nor a runnable.
return GraphConfig(suite, parent, arch)
@@ -645,74 +574,85 @@ def FlattenRunnables(node, node_cb):
class Platform(object):
- def __init__(self, options):
- self.shell_dir = options.shell_dir
- self.shell_dir_secondary = options.shell_dir_secondary
- self.extra_flags = options.extra_flags.split()
- self.options = options
+ def __init__(self, args):
+ self.shell_dir = args.shell_dir
+ self.shell_dir_secondary = args.shell_dir_secondary
+ self.extra_flags = args.extra_flags.split()
+ self.args = args
@staticmethod
- def ReadBuildConfig(options):
- config_path = os.path.join(options.shell_dir, 'v8_build_config.json')
+ def ReadBuildConfig(args):
+ config_path = os.path.join(args.shell_dir, 'v8_build_config.json')
if not os.path.isfile(config_path):
return {}
with open(config_path) as f:
return json.load(f)
@staticmethod
- def GetPlatform(options):
- if Platform.ReadBuildConfig(options).get('is_android', False):
- return AndroidPlatform(options)
+ def GetPlatform(args):
+ if Platform.ReadBuildConfig(args).get('is_android', False):
+ return AndroidPlatform(args)
else:
- return DesktopPlatform(options)
+ return DesktopPlatform(args)
def _Run(self, runnable, count, secondary=False):
raise NotImplementedError() # pragma: no cover
- def _TimedRun(self, runnable, count, secondary=False):
- runnable_start_time = datetime.datetime.utcnow()
- stdout = self._Run(runnable, count, secondary)
- runnable_duration = datetime.datetime.utcnow() - runnable_start_time
- if runnable_duration.total_seconds() > 0.9 * runnable.timeout:
- runnable.has_near_timeouts = True
- return stdout
+ def _LoggedRun(self, runnable, count, secondary=False):
+ suffix = ' - secondary' if secondary else ''
+ title = '>>> %%s (#%d)%s:' % ((count + 1), suffix)
+ try:
+ output = self._Run(runnable, count, secondary)
+ except OSError:
+ logging.exception(title % 'OSError')
+ raise
+ if output.stdout:
+ logging.info(title % 'Stdout' + '\n%s', output.stdout)
+ if output.stderr: # pragma: no cover
+ # Print stderr for debugging.
+ logging.info(title % 'Stderr' + '\n%s', output.stderr)
+    if output.timed_out:
+      logging.warning('>>> Test timed out after %ss.', runnable.timeout)
+ if output.exit_code != 0:
+ logging.warning('>>> Test crashed with exit code %d.', output.exit_code)
+ return output
- def Run(self, runnable, count):
+ def Run(self, runnable, count, secondary):
"""Execute the benchmark's main file.
- If options.shell_dir_secondary is specified, the benchmark is run twice,
- e.g. with and without patch.
Args:
runnable: A Runnable benchmark instance.
count: The number of this (repeated) run.
- Returns: A tuple with the two benchmark outputs. The latter will be None if
- options.shell_dir_secondary was not specified.
+ secondary: True if secondary run should be executed.
+
+ Returns:
+ A tuple with the two benchmark outputs. The latter will be NULL_OUTPUT if
+ secondary is False.
"""
- stdout = self._TimedRun(runnable, count, secondary=False)
- if self.shell_dir_secondary:
- return stdout, self._TimedRun(runnable, count, secondary=True)
+ output = self._LoggedRun(runnable, count, secondary=False)
+ if secondary:
+ return output, self._LoggedRun(runnable, count, secondary=True)
else:
- return stdout, None
+ return output, NULL_OUTPUT
class DesktopPlatform(Platform):
- def __init__(self, options):
- super(DesktopPlatform, self).__init__(options)
+ def __init__(self, args):
+ super(DesktopPlatform, self).__init__(args)
self.command_prefix = []
# Setup command class to OS specific version.
- command.setup(utils.GuessOS(), options.device)
+ command.setup(utils.GuessOS(), args.device)
- if options.prioritize or options.affinitize != None:
+ if args.prioritize or args.affinitize != None:
self.command_prefix = ['schedtool']
- if options.prioritize:
+ if args.prioritize:
self.command_prefix += ['-n', '-20']
- if options.affinitize != None:
+ if args.affinitize != None:
# schedtool expects a bit pattern when setting affinity, where each
# bit set to '1' corresponds to a core where the process may run on.
# First bit corresponds to CPU 0. Since the 'affinitize' parameter is
# a core number, we need to map to said bit pattern.
- cpu = int(options.affinitize)
+ cpu = int(args.affinitize)
core = 1 << cpu
self.command_prefix += ['-a', ('0x%x' % core)]
self.command_prefix += ['-e']
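A quick illustration of the core-number to bit-mask mapping described above (example value only): --affinitize=3 pins the process to CPU 3, which schedtool expects as the mask 0x8.

    cpu = 3
    print('0x%x' % (1 << cpu))  # prints 0x8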
@@ -728,28 +668,11 @@ class DesktopPlatform(Platform):
node.ChangeCWD(path)
def _Run(self, runnable, count, secondary=False):
- suffix = ' - secondary' if secondary else ''
shell_dir = self.shell_dir_secondary if secondary else self.shell_dir
- title = '>>> %%s (#%d)%s:' % ((count + 1), suffix)
cmd = runnable.GetCommand(self.command_prefix, shell_dir, self.extra_flags)
- try:
- output = cmd.execute()
- except OSError: # pragma: no cover
- logging.exception(title % 'OSError')
- raise
+ output = cmd.execute()
- logging.info(title % 'Stdout' + '\n%s', output.stdout)
- if output.stderr: # pragma: no cover
- # Print stderr for debugging.
- logging.info(title % 'Stderr' + '\n%s', output.stderr)
- if output.timed_out:
- logging.warning('>>> Test timed out after %ss.', runnable.timeout)
- runnable.has_timeouts = True
- raise TestFailedError()
- if output.exit_code != 0:
- logging.warning('>>> Test crashed.')
- raise TestFailedError()
- if '--prof' in self.extra_flags:
+ if output.IsSuccess() and '--prof' in self.extra_flags:
os_prefix = {'linux': 'linux', 'macos': 'mac'}.get(utils.GuessOS())
if os_prefix:
tick_tools = os.path.join(TOOLS_BASE, '%s-tick-processor' % os_prefix)
@@ -758,17 +681,17 @@ class DesktopPlatform(Platform):
logging.warning(
'Profiler option currently supported on Linux and Mac OS.')
- # time outputs to stderr
+ # /usr/bin/time outputs to stderr
if runnable.process_size:
- return output.stdout + output.stderr
- return output.stdout
+ output.stdout += output.stderr
+ return output
class AndroidPlatform(Platform): # pragma: no cover
- def __init__(self, options):
- super(AndroidPlatform, self).__init__(options)
- self.driver = android.android_driver(options.device)
+ def __init__(self, args):
+ super(AndroidPlatform, self).__init__(args)
+ self.driver = android.android_driver(args.device)
def PreExecution(self):
self.driver.set_high_perf_mode()
@@ -799,9 +722,7 @@ class AndroidPlatform(Platform): # pragma: no cover
self.driver.push_file(bench_abs, resource, bench_rel)
def _Run(self, runnable, count, secondary=False):
- suffix = ' - secondary' if secondary else ''
target_dir = 'bin_secondary' if secondary else 'bin'
- title = '>>> %%s (#%d)%s:' % ((count + 1), suffix)
self.driver.drop_ram_caches()
# Relative path to benchmark directory.
@@ -811,15 +732,17 @@ class AndroidPlatform(Platform): # pragma: no cover
bench_rel = '.'
logcat_file = None
- if self.options.dump_logcats_to:
+ if self.args.dump_logcats_to:
runnable_name = '-'.join(runnable.graphs)
logcat_file = os.path.join(
- self.options.dump_logcats_to, 'logcat-%s-#%d%s.log' % (
+ self.args.dump_logcats_to, 'logcat-%s-#%d%s.log' % (
runnable_name, count + 1, '-secondary' if secondary else ''))
logging.debug('Dumping logcat into %s', logcat_file)
+ output = Output()
+ start = time.time()
try:
- stdout = self.driver.run(
+ output.stdout = self.driver.run(
target_dir=target_dir,
binary=runnable.binary,
args=runnable.GetCommandFlags(self.extra_flags),
@@ -827,20 +750,17 @@ class AndroidPlatform(Platform): # pragma: no cover
timeout=runnable.timeout,
logcat_file=logcat_file,
)
- logging.info(title % 'Stdout' + '\n%s', stdout)
except android.CommandFailedException as e:
- logging.info(title % 'Stdout' + '\n%s', e.output)
- logging.warning('>>> Test crashed.')
- raise TestFailedError()
+ output.stdout = e.output
+ output.exit_code = e.status
except android.TimeoutException as e:
- if e.output is not None:
- logging.info(title % 'Stdout' + '\n%s', e.output)
- logging.warning('>>> Test timed out after %ss.', runnable.timeout)
- runnable.has_timeouts = True
- raise TestFailedError()
+ output.stdout = e.output
+ output.timed_out = True
if runnable.process_size:
- return stdout + 'MaxMemory: Unsupported'
- return stdout
+ output.stdout += 'MaxMemory: Unsupported'
+ output.duration = time.time() - start
+ return output
+
class CustomMachineConfiguration:
def __init__(self, disable_aslr = False, governor = None):
@@ -946,146 +866,164 @@ class CustomMachineConfiguration:
raise Exception('Could not set CPU governor. Present value is %s'
% cur_value )
-def Main(args):
- parser = optparse.OptionParser()
- parser.add_option('--android-build-tools', help='Deprecated.')
- parser.add_option('--arch',
- help=('The architecture to run tests for, '
- '"auto" or "native" for auto-detect'),
- default='x64')
- parser.add_option('--buildbot',
- help='Adapt to path structure used on buildbots and adds '
- 'timestamps/level to all logged status messages',
- default=False, action='store_true')
- parser.add_option('-d', '--device',
- help='The device ID to run Android tests on. If not given '
- 'it will be autodetected.')
- parser.add_option('--extra-flags',
- help='Additional flags to pass to the test executable',
- default='')
- parser.add_option('--json-test-results',
- help='Path to a file for storing json results.')
- parser.add_option('--json-test-results-secondary',
- '--json-test-results-no-patch', # TODO(sergiyb): Deprecate.
- help='Path to a file for storing json results from run '
- 'without patch or for reference build run.')
- parser.add_option('--outdir', help='Base directory with compile output',
- default='out')
- parser.add_option('--outdir-secondary',
- '--outdir-no-patch', # TODO(sergiyb): Deprecate.
- help='Base directory with compile output without patch or '
- 'for reference build')
- parser.add_option('--binary-override-path',
- help='JavaScript engine binary. By default, d8 under '
- 'architecture-specific build dir. '
- 'Not supported in conjunction with outdir-secondary.')
- parser.add_option('--prioritize',
- help='Raise the priority to nice -20 for the benchmarking '
- 'process.Requires Linux, schedtool, and sudo privileges.',
- default=False, action='store_true')
- parser.add_option('--affinitize',
- help='Run benchmarking process on the specified core. '
- 'For example: '
- '--affinitize=0 will run the benchmark process on core 0. '
- '--affinitize=3 will run the benchmark process on core 3. '
- 'Requires Linux, schedtool, and sudo privileges.',
- default=None)
- parser.add_option('--noaslr',
- help='Disable ASLR for the duration of the benchmarked '
- 'process. Requires Linux and sudo privileges.',
- default=False, action='store_true')
- parser.add_option('--cpu-governor',
- help='Set cpu governor to specified policy for the '
- 'duration of the benchmarked process. Typical options: '
- '"powersave" for more stable results, or "performance" '
- 'for shorter completion time of suite, with potentially '
- 'more noise in results.')
- parser.add_option('--filter',
- help='Only run the benchmarks beginning with this string. '
- 'For example: '
- '--filter=JSTests/TypedArrays/ will run only TypedArray '
- 'benchmarks from the JSTests suite.',
- default='')
- parser.add_option('--run-count-multiplier', default=1, type='int',
- help='Multipled used to increase number of times each test '
- 'is retried.')
- parser.add_option('--dump-logcats-to',
- help='Writes logcat output from each test into specified '
- 'directory. Only supported for android targets.')
-
- (options, args) = parser.parse_args(args)
- logging.basicConfig(
- level=logging.INFO, format='%(asctime)s %(levelname)-8s %(message)s')
+class MaxTotalDurationReachedError(Exception):
+ """Exception used to stop running tests when max total duration is reached."""
+ pass
- if len(args) == 0: # pragma: no cover
- parser.print_help()
- return INFRA_FAILURE_RETCODE
- if options.arch in ['auto', 'native']: # pragma: no cover
- options.arch = ARCH_GUESS
+def Main(argv):
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--arch',
+ help='The architecture to run tests for. Pass "auto" '
+ 'to auto-detect.', default='x64',
+ choices=SUPPORTED_ARCHS + ['auto'])
+ parser.add_argument('--buildbot',
+ help='Adapt to path structure used on buildbots and adds '
+ 'timestamps/level to all logged status messages',
+ default=False, action='store_true')
+ parser.add_argument('-d', '--device',
+ help='The device ID to run Android tests on. If not '
+ 'given it will be autodetected.')
+ parser.add_argument('--extra-flags',
+ help='Additional flags to pass to the test executable',
+ default='')
+ parser.add_argument('--json-test-results',
+ help='Path to a file for storing json results.')
+ parser.add_argument('--json-test-results-secondary',
+ help='Path to a file for storing json results from run '
+ 'without patch or for reference build run.')
+ parser.add_argument('--outdir', help='Base directory with compile output',
+ default='out')
+ parser.add_argument('--outdir-secondary',
+ help='Base directory with compile output without patch '
+ 'or for reference build')
+ parser.add_argument('--binary-override-path',
+ help='JavaScript engine binary. By default, d8 under '
+ 'architecture-specific build dir. '
+ 'Not supported in conjunction with outdir-secondary.')
+ parser.add_argument('--prioritize',
+ help='Raise the priority to nice -20 for the '
+                      'benchmarking process. Requires Linux, schedtool, and '
+ 'sudo privileges.', default=False, action='store_true')
+ parser.add_argument('--affinitize',
+ help='Run benchmarking process on the specified core. '
+ 'For example: --affinitize=0 will run the benchmark '
+ 'process on core 0. --affinitize=3 will run the '
+ 'benchmark process on core 3. Requires Linux, schedtool, '
+ 'and sudo privileges.', default=None)
+ parser.add_argument('--noaslr',
+ help='Disable ASLR for the duration of the benchmarked '
+ 'process. Requires Linux and sudo privileges.',
+ default=False, action='store_true')
+ parser.add_argument('--cpu-governor',
+ help='Set cpu governor to specified policy for the '
+ 'duration of the benchmarked process. Typical options: '
+ '"powersave" for more stable results, or "performance" '
+ 'for shorter completion time of suite, with potentially '
+ 'more noise in results.')
+ parser.add_argument('--filter',
+ help='Only run the benchmarks beginning with this '
+ 'string. For example: '
+ '--filter=JSTests/TypedArrays/ will run only TypedArray '
+ 'benchmarks from the JSTests suite.',
+ default='')
+ parser.add_argument('--confidence-level', type=int,
+ help='Repeatedly runs each benchmark until specified '
+ 'confidence level is reached. The value is interpreted '
+ 'as the number of standard deviations from the mean that '
+ 'all values must lie within. Typical values are 1, 2 and '
+ '3 and correspond to 68%, 95% and 99.7% probability that '
+ 'the measured value is within 0.1% of the true value. '
+ 'Larger values result in more retries and thus longer '
+ 'runtime, but also provide more reliable results. Also '
+ 'see --max-total-duration flag.')
+ parser.add_argument('--max-total-duration', type=int, default=7140, # 1h 59m
+ help='Max total duration in seconds allowed for retries '
+ 'across all tests. This is especially useful in '
+ 'combination with the --confidence-level flag.')
+ parser.add_argument('--dump-logcats-to',
+ help='Writes logcat output from each test into specified '
+ 'directory. Only supported for android targets.')
+ parser.add_argument('--run-count', type=int, default=0,
+ help='Override the run count specified by the test '
+ 'suite. The default 0 uses the suite\'s config.')
+ parser.add_argument('-v', '--verbose', default=False, action='store_true',
+ help='Be verbose and print debug output.')
+ parser.add_argument('suite', nargs='+', help='Path to the suite config file.')
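A hypothetical invocation combining the new flags (paths and values invented for illustration): tools/run_perf.py --arch x64 --confidence-level 2 --max-total-duration 3600 test/js-perf-test/JSTests.json reruns each benchmark until the ~95% confidence criterion is met or the 3600-second budget is exhausted, whichever comes first.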
- if not options.arch in SUPPORTED_ARCHS: # pragma: no cover
- logging.error('Unknown architecture %s', options.arch)
+ try:
+ args = parser.parse_args(argv)
+ except SystemExit:
return INFRA_FAILURE_RETCODE
- if (options.json_test_results_secondary and
- not options.outdir_secondary): # pragma: no cover
+ logging.basicConfig(
+ level=logging.DEBUG if args.verbose else logging.INFO,
+ format='%(asctime)s %(levelname)-8s %(message)s')
+
+ if args.arch == 'auto': # pragma: no cover
+ args.arch = utils.DefaultArch()
+ if args.arch not in SUPPORTED_ARCHS:
+ logging.error(
+ 'Auto-detected architecture "%s" is not supported.', args.arch)
+ return INFRA_FAILURE_RETCODE
+
+ if (args.json_test_results_secondary and
+ not args.outdir_secondary): # pragma: no cover
logging.error('For writing secondary json test results, a secondary outdir '
'patch must be specified.')
return INFRA_FAILURE_RETCODE
workspace = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
- if options.buildbot:
+ if args.buildbot:
build_config = 'Release'
else:
- build_config = '%s.release' % options.arch
+ build_config = '%s.release' % args.arch
- if options.binary_override_path == None:
- options.shell_dir = os.path.join(workspace, options.outdir, build_config)
+ if args.binary_override_path == None:
+ args.shell_dir = os.path.join(workspace, args.outdir, build_config)
default_binary_name = 'd8'
else:
- if not os.path.isfile(options.binary_override_path):
+ if not os.path.isfile(args.binary_override_path):
logging.error('binary-override-path must be a file name')
return INFRA_FAILURE_RETCODE
- if options.outdir_secondary:
+ if args.outdir_secondary:
logging.error('specify either binary-override-path or outdir-secondary')
return INFRA_FAILURE_RETCODE
- options.shell_dir = os.path.abspath(
- os.path.dirname(options.binary_override_path))
- default_binary_name = os.path.basename(options.binary_override_path)
+ args.shell_dir = os.path.abspath(
+ os.path.dirname(args.binary_override_path))
+ default_binary_name = os.path.basename(args.binary_override_path)
- if options.outdir_secondary:
- options.shell_dir_secondary = os.path.join(
- workspace, options.outdir_secondary, build_config)
+ if args.outdir_secondary:
+ args.shell_dir_secondary = os.path.join(
+ workspace, args.outdir_secondary, build_config)
else:
- options.shell_dir_secondary = None
+ args.shell_dir_secondary = None
- if options.json_test_results:
- options.json_test_results = os.path.abspath(options.json_test_results)
+ if args.json_test_results:
+ args.json_test_results = os.path.abspath(args.json_test_results)
- if options.json_test_results_secondary:
- options.json_test_results_secondary = os.path.abspath(
- options.json_test_results_secondary)
+ if args.json_test_results_secondary:
+ args.json_test_results_secondary = os.path.abspath(
+ args.json_test_results_secondary)
# Ensure all arguments have absolute path before we start changing current
# directory.
- args = map(os.path.abspath, args)
+ args.suite = map(os.path.abspath, args.suite)
prev_aslr = None
prev_cpu_gov = None
- platform = Platform.GetPlatform(options)
-
- results = Results()
- results_secondary = Results()
- # We use list here to allow modification in nested function below.
- have_failed_tests = [False]
- with CustomMachineConfiguration(governor = options.cpu_governor,
- disable_aslr = options.noaslr) as conf:
- for path in args:
+ platform = Platform.GetPlatform(args)
+
+ result_tracker = ResultTracker()
+ result_tracker_secondary = ResultTracker()
+ have_failed_tests = False
+ with CustomMachineConfiguration(governor = args.cpu_governor,
+ disable_aslr = args.noaslr) as conf:
+ for path in args.suite:
if not os.path.exists(path): # pragma: no cover
- results.errors.append('Configuration file %s does not exist.' % path)
+ result_tracker.AddError('Configuration file %s does not exist.' % path)
continue
with open(path) as f:
@@ -1099,59 +1037,78 @@ def Main(args):
# Build the graph/trace tree structure.
default_parent = DefaultSentinel(default_binary_name)
- root = BuildGraphConfigs(suite, options.arch, default_parent)
+ root = BuildGraphConfigs(suite, args.arch, default_parent)
# Callback to be called on each node on traversal.
def NodeCB(node):
platform.PreTests(node, path)
# Traverse graph/trace tree and iterate over all runnables.
- for runnable in FlattenRunnables(root, NodeCB):
- runnable_name = '/'.join(runnable.graphs)
- if (not runnable_name.startswith(options.filter) and
- runnable_name + '/' != options.filter):
- continue
- logging.info('>>> Running suite: %s', runnable_name)
-
- def Runner():
- """Output generator that reruns several times."""
- total_runs = runnable.run_count * options.run_count_multiplier
- for i in range(0, max(1, total_runs)):
+ start = time.time()
+ try:
+ for runnable in FlattenRunnables(root, NodeCB):
+ runnable_name = '/'.join(runnable.graphs)
+ if (not runnable_name.startswith(args.filter) and
+ runnable_name + '/' != args.filter):
+ continue
+ logging.info('>>> Running suite: %s', runnable_name)
+
+ def RunGenerator(runnable):
+ if args.confidence_level:
+ counter = 0
+ while not result_tracker.HasEnoughRuns(
+ runnable, args.confidence_level):
+ yield counter
+ counter += 1
+ else:
+ for i in range(0, max(1, args.run_count or runnable.run_count)):
+ yield i
+
+ for i in RunGenerator(runnable):
attempts_left = runnable.retry_count + 1
while attempts_left:
- try:
- yield platform.Run(runnable, i)
- except TestFailedError:
- attempts_left -= 1
- if not attempts_left: # ignore failures until last attempt
- have_failed_tests[0] = True
- else:
- logging.info('>>> Retrying suite: %s', runnable_name)
- else:
+ total_duration = time.time() - start
+ if total_duration > args.max_total_duration:
+ logging.info(
+ '>>> Stopping now since running for too long (%ds > %ds)',
+ total_duration, args.max_total_duration)
+ raise MaxTotalDurationReachedError()
+
+ output, output_secondary = platform.Run(
+ runnable, i, secondary=args.shell_dir_secondary)
+ result_tracker.AddRunnableDuration(runnable, output.duration)
+ result_tracker_secondary.AddRunnableDuration(
+ runnable, output_secondary.duration)
+
+ if output.IsSuccess() and output_secondary.IsSuccess():
+ runnable.ProcessOutput(output, result_tracker, i)
+ if output_secondary is not NULL_OUTPUT:
+ runnable.ProcessOutput(
+ output_secondary, result_tracker_secondary, i)
break
- # Let runnable iterate over all runs and handle output.
- result, result_secondary = runnable.Run(
- Runner, trybot=options.shell_dir_secondary)
- results += result
- results_secondary += result_secondary
- if runnable.has_timeouts:
- results.timeouts.append(runnable_name)
- if runnable.has_near_timeouts:
- results.near_timeouts.append(runnable_name)
+          attempts_left -= 1
+          if not attempts_left:
+            have_failed_tests = True
+          else:
+            logging.info('>>> Retrying suite: %s', runnable_name)
+ except MaxTotalDurationReachedError:
+ have_failed_tests = True
+
platform.PostExecution()
- if options.json_test_results:
- results.WriteToFile(options.json_test_results)
+ if args.json_test_results:
+ result_tracker.WriteToFile(args.json_test_results)
else: # pragma: no cover
- print(results)
+ print('Primary results:', result_tracker)
- if options.json_test_results_secondary:
- results_secondary.WriteToFile(options.json_test_results_secondary)
- else: # pragma: no cover
- print(results_secondary)
+ if args.shell_dir_secondary:
+ if args.json_test_results_secondary:
+ result_tracker_secondary.WriteToFile(args.json_test_results_secondary)
+ else: # pragma: no cover
+ print('Secondary results:', result_tracker_secondary)
- if results.errors or have_failed_tests[0]:
+ if (result_tracker.errors or result_tracker_secondary.errors or
+ have_failed_tests):
return 1
return 0