tools/performance/diff-talos.py

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/tools/performance/diff-talos.py	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,201 @@
     1.4 +#!/usr/bin/env python
     1.5 +
     1.6 +# This Source Code Form is subject to the terms of the Mozilla Public
     1.7 +# License, v. 2.0. If a copy of the MPL was not distributed with this
     1.8 +# file, You can obtain one at http://mozilla.org/MPL/2.0/.
     1.9 +
    1.10 +"""
    1.11 +This is a simple script that does one thing only: compare talos runs from
    1.12 +two revisions. It is intended to check which of two try runs is best or if
     1.13 +a try improves over the m-c or m-i revision it branches from.
    1.14 +
    1.15 +A big design goal is to avoid bit rot and to assert when bit rot is detected.
    1.16 +The set of tests we run is a moving target. When possible this script
    1.17 +should work with any test set, but in parts where it has to hard code
    1.18 +information, it should try to assert that it is valid so that changes
    1.19 +are detected and it is fixed earlier.
    1.20 +"""
    1.21 +
    1.22 +import json
    1.23 +import urllib2
    1.24 +import math
    1.25 +import sys
    1.26 +from optparse import OptionParser
    1.27 +
    1.28 +# FIXME: currently we assert that we know all the benchmarks just so
    1.29 +# we are sure to maintain the bigger_is_better set updated. Is there a better
    1.30 +# way to find/compute it?
    1.31 +bigger_is_better = frozenset(('v8_7', 'dromaeo_dom', 'dromaeo_css'))
    1.32 +
    1.33 +smaller_is_better = frozenset(('tdhtmlr_paint', 'tp5n_main_rss_paint',
    1.34 +                               'ts_paint', 'tp5n_paint', 'tsvgr_opacity',
    1.35 +                               'a11yr_paint', 'kraken',
    1.36 +                               'tdhtmlr_nochrome_paint',
    1.37 +                               'tspaint_places_generated_med', 'tpaint',
    1.38 +                               'tp5n_shutdown_paint', 'tsvgr',
    1.39 +                               'tp5n_pbytes_paint', 'tscrollr_paint',
    1.40 +                               'tspaint_places_generated_max',
    1.41 +                               'tp5n_responsiveness_paint',
    1.42 +                               'sunspider', 'tp5n_xres_paint', 'num_ctors',
    1.43 +                               'tresize', 'trobopan', 'tcheckerboard',
    1.44 +                               'tcheck3', 'tcheck2', 'tprovider',
    1.45 +                               'tp5n_modlistbytes_paint',
    1.46 +                               'trace_malloc_maxheap', 'tp4m_nochrome',
    1.47 +                               'trace_malloc_leaks', 'tp4m_main_rss_nochrome',
    1.48 +                               'tp4m_shutdown_nochrome', 'tdhtml_nochrome',
    1.49 +                               'ts_shutdown', 'tp5n_%cpu_paint',
    1.50 +                               'trace_malloc_allocs', 'ts',
    1.51 +                               'tsvg_nochrome', 'tp5n_content_rss_paint',
    1.52 +                               'tp5n_main_startup_fileio_paint',
    1.53 +                               'tp5n_nonmain_normal_netio_paint',
    1.54 +                               'tp5n_nonmain_startup_fileio_paint',
    1.55 +                               'tp5n_main_normal_fileio_paint',
    1.56 +                               'tp5n_nonmain_normal_fileio_paint',
    1.57 +                               'tp5n_main_startup_netio_paint',
    1.58 +                               'tp5n_main_normal_netio_paint',
    1.59 +                               'tp5n_main_shutdown_netio_paint',
    1.60 +                               'tp5n_main_shutdown_fileio_paint'))
    1.61 +
    1.62 +all_benchmarks = smaller_is_better | bigger_is_better
    1.63 +assert len(smaller_is_better & bigger_is_better) == 0
    1.64 +
    1.65 +def get_raw_data_for_revisions(revisions):
    1.66 +    """Loads data for the revisions, returns an array with one element for each
    1.67 +    revision."""
    1.68 +    selectors = ["revision=%s" % revision for revision in revisions]
    1.69 +    selector = '&'.join(selectors)
    1.70 +    url = "http://graphs.mozilla.org/api/test/runs/revisions?%s" % selector
    1.71 +    url_stream = urllib2.urlopen(url)
    1.72 +    data = json.load(url_stream)
    1.73 +    assert frozenset(data.keys()) == frozenset(('stat', 'revisions'))
    1.74 +    assert data['stat'] == 'ok'
    1.75 +    rev_data = data['revisions']
    1.76 +    assert frozenset(rev_data.keys()) == frozenset(revisions)
    1.77 +    return [rev_data[r] for r in revisions]
    1.78 +
    1.79 +def mean(values):
    1.80 +    return float(sum(values))/len(values)
    1.81 +
    1.82 +def c4(n):
    1.83 +    n = float(n)
    1.84 +    numerator = math.gamma(n/2)*math.sqrt(2/(n-1))
    1.85 +    denominator = math.gamma((n-1)/2)
    1.86 +    return numerator/denominator
    1.87 +
    1.88 +def unbiased_standard_deviation(values):
    1.89 +    n = len(values)
    1.90 +    if n == 1:
    1.91 +        return None
    1.92 +    acc = 0
    1.93 +    avg = mean(values)
    1.94 +    for i in values:
    1.95 +        dist = i - avg
    1.96 +        acc += dist * dist
    1.97 +    return math.sqrt(acc/(n-1))/c4(n)
    1.98 +
    1.99 +class BenchmarkResult:
   1.100 +    """ Stores the summary (mean and standard deviation) of a set of talus
   1.101 +    runs on the same revision and OS."""
   1.102 +    def __init__(self, avg, std):
   1.103 +        self.avg = avg
   1.104 +        self.std = std
   1.105 +    def __str__(self):
   1.106 +        t = "%s," % self.avg
   1.107 +        return "(%-13s %s)" % (t, self.std)
   1.108 +
   1.109 +# FIXME: This function computes the statistics of multiple runs of talos on a
   1.110 +# single revision. Should it also support computing statistics over runs of
   1.111 +# different revisions assuming the revisions are equivalent from a performance
   1.112 +# perspective?
   1.113 +def digest_revision_data(data):
   1.114 +    ret = {}
   1.115 +    benchmarks = frozenset(data.keys())
   1.116 +    # assert that all the benchmarks are known. If they are not,
   1.117 +    # smaller_is_better or bigger_is_better needs to be updated depending on
   1.118 +    # the benchmark type.
   1.119 +    assert all_benchmarks.issuperset(benchmarks), \
   1.120 +        "%s not found in all_benchmarks" % ','.join((benchmarks - all_benchmarks))
   1.121 +    for benchmark in benchmarks:
   1.122 +        benchmark_data = data[benchmark]
   1.123 +        expected_keys = frozenset(("test_runs", "name", "id"))
   1.124 +        assert frozenset(benchmark_data.keys()) == expected_keys
   1.125 +        test_runs = benchmark_data["test_runs"]
   1.126 +        operating_systems = test_runs.keys()
   1.127 +        results = {}
   1.128 +        for os in operating_systems:
   1.129 +            os_runs = test_runs[os]
   1.130 +            values = []
   1.131 +            for os_run in os_runs:
   1.132 +                # there are 4 fields: test run id, build id, timestamp,
   1.133 +                # mean value
   1.134 +                assert len(os_run) == 4
   1.135 +                values.append(os_run[3])
   1.136 +            avg = mean(values)
   1.137 +            std = unbiased_standard_deviation(values)
   1.138 +            results[os] = BenchmarkResult(avg, std)
   1.139 +        ret[benchmark] = results
   1.140 +    return ret
   1.141 +
   1.142 +def get_data_for_revisions(revisions):
   1.143 +    raw_data = get_raw_data_for_revisions(revisions)
   1.144 +    return [digest_revision_data(x) for x in raw_data]
   1.145 +
   1.146 +def overlaps(a, b):
   1.147 +    return a[1] >= b[0] and b[1] >= a[0]
   1.148 +
   1.149 +def is_significant(old, new):
   1.150 +    # conservative hack: if we don't know, say it is significant.
   1.151 +    if old.std is None or new.std is None:
   1.152 +        return True
   1.153 +    # use a 2 standard deviation interval, which is about 95% confidence.
   1.154 +    old_interval = [old.avg - old.std, old.avg + old.std]
   1.155 +    new_interval = [new.avg - new.std, new.avg + new.std]
   1.156 +    return not overlaps(old_interval, new_interval)
   1.157 +
   1.158 +def compute_difference(benchmark, old, new):
   1.159 +    if benchmark in bigger_is_better:
   1.160 +        new, old = old, new
   1.161 +
   1.162 +    if new.avg >= old.avg:
   1.163 +        return "%1.4fx worse" % (new.avg/old.avg)
   1.164 +    else:
   1.165 +        return "%1.4fx better" % (old.avg/new.avg)
   1.166 +
   1.167 +#FIXME: the printing could use a table class that computes the sizes of the
   1.168 +# cells instead of the current hard coded values.
   1.169 +def print_data_comparison(datav):
   1.170 +    assert len(datav) == 2
   1.171 +    old_data = datav[0]
   1.172 +    new_data = datav[1]
   1.173 +    old_benchmarks = frozenset(old_data.keys())
   1.174 +    new_benchmarks = frozenset(new_data.keys())
   1.175 +    benchmarks = old_benchmarks.intersection(new_benchmarks)
   1.176 +    for benchmark in sorted(benchmarks):
   1.177 +        print benchmark
   1.178 +        old_benchmark_data = old_data[benchmark]
   1.179 +        new_benchmark_data = new_data[benchmark]
   1.180 +        old_operating_systems = frozenset(old_benchmark_data.keys())
   1.181 +        new_operating_systems = frozenset(new_benchmark_data.keys())
   1.182 +        operating_systems = old_operating_systems.intersection(new_operating_systems)
   1.183 +        for os in sorted(operating_systems):
   1.184 +            old_os_data = old_benchmark_data[os]
   1.185 +            new_os_data = new_benchmark_data[os]
   1.186 +            if not is_significant(old_os_data, new_os_data):
   1.187 +                continue
   1.188 +
   1.189 +            diff = compute_difference(benchmark, old_os_data, new_os_data)
   1.190 +            print '%-33s | %-30s -> %-30s %s' % \
   1.191 +                (os, old_os_data, new_os_data, diff)
   1.192 +        print
   1.193 +
   1.194 +def main():
   1.195 +    parser = OptionParser(usage='Usage: %prog old_revision new_revision')
   1.196 +    options, args = parser.parse_args()
   1.197 +    if len(args) != 2:
   1.198 +        parser.print_help()
   1.199 +        sys.exit(1)
   1.200 +
   1.201 +    print_data_comparison(get_data_for_revisions(args))
   1.202 +
   1.203 +if __name__ == '__main__':
   1.204 +    main()

mercurial