tools/performance/diff-talos.py

author      Michael Schloh von Bennewitz <michael@schloh.com>
date        Tue, 06 Jan 2015 21:39:09 +0100
branch      TOR_BUG_9701
changeset   8:97036ab72558
permissions -rwxr-xr-x

Conditionally force memory storage according to privacy.thirdparty.isolate;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

#!/usr/bin/env python

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
     7 """
     8 This is a simple script that does one thing only: compare talos runs from
     9 two revisions. It is intended to check which of two try runs is best or if
    10 a try improves over the m-c or m-i revision in branches from.
    12 A big design goal is to avoid bit rot and to assert when bit rot is detected.
    13 The set of tests we run is a moving target. When possible this script
    14 should work with any test set, but in parts where it has to hard code
    15 information, it should try to assert that it is valid so that changes
    16 are detected and it is fixed earlier.
    17 """

import json
import urllib2
import math
import sys
from optparse import OptionParser

# FIXME: currently we assert that we know all the benchmarks just so
# we are sure to keep the bigger_is_better set up to date. Is there a
# better way to find/compute it?
bigger_is_better = frozenset(('v8_7', 'dromaeo_dom', 'dromaeo_css'))

smaller_is_better = frozenset(('tdhtmlr_paint', 'tp5n_main_rss_paint',
                               'ts_paint', 'tp5n_paint', 'tsvgr_opacity',
                               'a11yr_paint', 'kraken',
                               'tdhtmlr_nochrome_paint',
                               'tspaint_places_generated_med', 'tpaint',
                               'tp5n_shutdown_paint', 'tsvgr',
                               'tp5n_pbytes_paint', 'tscrollr_paint',
                               'tspaint_places_generated_max',
                               'tp5n_responsiveness_paint',
                               'sunspider', 'tp5n_xres_paint', 'num_ctors',
                               'tresize', 'trobopan', 'tcheckerboard',
                               'tcheck3', 'tcheck2', 'tprovider',
                               'tp5n_modlistbytes_paint',
                               'trace_malloc_maxheap', 'tp4m_nochrome',
                               'trace_malloc_leaks', 'tp4m_main_rss_nochrome',
                               'tp4m_shutdown_nochrome', 'tdhtml_nochrome',
                               'ts_shutdown', 'tp5n_%cpu_paint',
                               'trace_malloc_allocs', 'ts',
                               'tsvg_nochrome', 'tp5n_content_rss_paint',
                               'tp5n_main_startup_fileio_paint',
                               'tp5n_nonmain_normal_netio_paint',
                               'tp5n_nonmain_startup_fileio_paint',
                               'tp5n_main_normal_fileio_paint',
                               'tp5n_nonmain_normal_fileio_paint',
                               'tp5n_main_startup_netio_paint',
                               'tp5n_main_normal_netio_paint',
                               'tp5n_main_shutdown_netio_paint',
                               'tp5n_main_shutdown_fileio_paint'))

all_benchmarks = smaller_is_better | bigger_is_better
assert len(smaller_is_better & bigger_is_better) == 0
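
# The graph server replies with JSON of the form
# {"stat": "ok", "revisions": {<revision>: <per-benchmark data>, ...}};
# the asserts below pin this shape down so that a format change trips an
# assertion instead of silently producing bogus numbers.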
def get_raw_data_for_revisions(revisions):
    """Loads data for the revisions, returns an array with one element for
    each revision."""
    selectors = ["revision=%s" % revision for revision in revisions]
    selector = '&'.join(selectors)
    url = "http://graphs.mozilla.org/api/test/runs/revisions?%s" % selector
    url_stream = urllib2.urlopen(url)
    data = json.load(url_stream)
    assert frozenset(data.keys()) == frozenset(('stat', 'revisions'))
    assert data['stat'] == 'ok'
    rev_data = data['revisions']
    assert frozenset(rev_data.keys()) == frozenset(revisions)
    return [rev_data[r] for r in revisions]

def mean(values):
    return float(sum(values))/len(values)
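
# c4(n) is the standard correction factor that makes the sample standard
# deviation an unbiased estimator of sigma for normally distributed data:
#   c4(n) = sqrt(2/(n-1)) * gamma(n/2) / gamma((n-1)/2)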
def c4(n):
    n = float(n)
    numerator = math.gamma(n/2)*math.sqrt(2/(n-1))
    denominator = math.gamma((n-1)/2)
    return numerator/denominator
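
# Bessel-corrected sample standard deviation, further divided by c4(n) so
# the estimate is unbiased for normal data. Returns None when there is only
# one run, since no spread can be estimated from a single value.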
def unbiased_standard_deviation(values):
    n = len(values)
    if n == 1:
        return None
    acc = 0
    avg = mean(values)
    for i in values:
        dist = i - avg
        acc += dist * dist
    return math.sqrt(acc/(n-1))/c4(n)

class BenchmarkResult:
    """Stores the summary (mean and standard deviation) of a set of talos
    runs on the same revision and OS."""
    def __init__(self, avg, std):
        self.avg = avg
        self.std = std
    def __str__(self):
        t = "%s," % self.avg
        return "(%-13s %s)" % (t, self.std)

# FIXME: This function computes the statistics of multiple runs of talos on
# a single revision. Should it also support computing statistics over runs
# of different revisions, assuming the revisions are equivalent from a
# performance perspective?
def digest_revision_data(data):
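    # Result shape: {benchmark_name: {os_name: BenchmarkResult}}.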
    ret = {}
    benchmarks = frozenset(data.keys())
    # assert that all the benchmarks are known. If they are not,
    # smaller_is_better or bigger_is_better needs to be updated depending on
    # the benchmark type.
    assert all_benchmarks.issuperset(benchmarks), \
        "%s not found in all_benchmarks" % ','.join(benchmarks - all_benchmarks)
    for benchmark in benchmarks:
        benchmark_data = data[benchmark]
        expected_keys = frozenset(("test_runs", "name", "id"))
        assert frozenset(benchmark_data.keys()) == expected_keys
        test_runs = benchmark_data["test_runs"]
        operating_systems = test_runs.keys()
        results = {}
        for os in operating_systems:
            os_runs = test_runs[os]
            values = []
            for os_run in os_runs:
                # each run has 4 fields: test run id, build id, timestamp,
                # mean value
                assert len(os_run) == 4
                values.append(os_run[3])
            avg = mean(values)
            std = unbiased_standard_deviation(values)
            results[os] = BenchmarkResult(avg, std)
        ret[benchmark] = results
    return ret

def get_data_for_revisions(revisions):
    raw_data = get_raw_data_for_revisions(revisions)
    return [digest_revision_data(x) for x in raw_data]
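
# a and b are closed intervals represented as [low, high] pairs.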
def overlaps(a, b):
    return a[1] >= b[0] and b[1] >= a[0]

def is_significant(old, new):
    # conservative hack: if we don't know, say it is significant.
    if old.std is None or new.std is None:
        return True
    # Build a one-standard-deviation interval around each mean; requiring
    # the intervals not to overlap separates the means by at least the sum
    # of the two deviations, roughly a two-sigma (about 95% confidence) test.
    old_interval = [old.avg - old.std, old.avg + old.std]
    new_interval = [new.avg - new.std, new.avg + new.std]
    return not overlaps(old_interval, new_interval)
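
# Report the regression or improvement as a ratio >= 1. For example, on a
# smaller-is-better benchmark such as ts_paint, old.avg = 100 and
# new.avg = 110 prints as "1.1000x worse". For bigger-is-better benchmarks
# the operands are swapped first so the same comparison applies.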
def compute_difference(benchmark, old, new):
    if benchmark in bigger_is_better:
        new, old = old, new

    if new.avg >= old.avg:
        return "%1.4fx worse" % (new.avg/old.avg)
    else:
        return "%1.4fx better" % (old.avg/new.avg)

# FIXME: the printing could use a table class that computes the sizes of the
# cells instead of the current hard coded values.
def print_data_comparison(datav):
    assert len(datav) == 2
    old_data = datav[0]
    new_data = datav[1]
    old_benchmarks = frozenset(old_data.keys())
    new_benchmarks = frozenset(new_data.keys())
    benchmarks = old_benchmarks.intersection(new_benchmarks)
    for benchmark in sorted(benchmarks):
        print benchmark
        old_benchmark_data = old_data[benchmark]
        new_benchmark_data = new_data[benchmark]
        old_operating_systems = frozenset(old_benchmark_data.keys())
        new_operating_systems = frozenset(new_benchmark_data.keys())
        operating_systems = old_operating_systems.intersection(new_operating_systems)
        for os in sorted(operating_systems):
            old_os_data = old_benchmark_data[os]
            new_os_data = new_benchmark_data[os]
            if not is_significant(old_os_data, new_os_data):
                continue

            diff = compute_difference(benchmark, old_os_data, new_os_data)
            print '%-33s | %-30s -> %-30s %s' % \
                (os, old_os_data, new_os_data, diff)
        print

def main():
    parser = OptionParser(usage='Usage: %prog old_revision new_revision')
    options, args = parser.parse_args()
    if len(args) != 2:
        parser.print_help()
        sys.exit(1)

    print_data_comparison(get_data_for_revisions(args))

if __name__ == '__main__':
    main()
