#!/usr/bin/env python

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""
This is a simple script that does one thing only: compare talos runs from
two revisions. It is intended to check which of two try runs is best or if
a try run improves over the m-c or m-i revision it branches from.

A big design goal is to avoid bit rot and to assert when bit rot is detected.
The set of tests we run is a moving target. When possible this script
should work with any test set, but in parts where it has to hard code
information, it should try to assert that it is valid so that changes
are detected and it is fixed earlier.
"""

import json
import math
import sys
import urllib.request
from optparse import OptionParser

# FIXME: currently we assert that we know all the benchmarks just so
# we are sure to maintain the bigger_is_better set updated. Is there a better
# way to find/compute it?
bigger_is_better = frozenset(('v8_7', 'dromaeo_dom', 'dromaeo_css'))

smaller_is_better = frozenset(('tdhtmlr_paint', 'tp5n_main_rss_paint',
                               'ts_paint', 'tp5n_paint', 'tsvgr_opacity',
                               'a11yr_paint', 'kraken',
                               'tdhtmlr_nochrome_paint',
                               'tspaint_places_generated_med', 'tpaint',
                               'tp5n_shutdown_paint', 'tsvgr',
                               'tp5n_pbytes_paint', 'tscrollr_paint',
                               'tspaint_places_generated_max',
                               'tp5n_responsiveness_paint',
                               'sunspider', 'tp5n_xres_paint', 'num_ctors',
                               'tresize', 'trobopan', 'tcheckerboard',
                               'tcheck3', 'tcheck2', 'tprovider',
                               'tp5n_modlistbytes_paint',
                               'trace_malloc_maxheap', 'tp4m_nochrome',
                               'trace_malloc_leaks', 'tp4m_main_rss_nochrome',
                               'tp4m_shutdown_nochrome', 'tdhtml_nochrome',
                               'ts_shutdown', 'tp5n_%cpu_paint',
                               'trace_malloc_allocs', 'ts',
                               'tsvg_nochrome', 'tp5n_content_rss_paint',
                               'tp5n_main_startup_fileio_paint',
                               'tp5n_nonmain_normal_netio_paint',
                               'tp5n_nonmain_startup_fileio_paint',
                               'tp5n_main_normal_fileio_paint',
                               'tp5n_nonmain_normal_fileio_paint',
                               'tp5n_main_startup_netio_paint',
                               'tp5n_main_normal_netio_paint',
                               'tp5n_main_shutdown_netio_paint',
                               'tp5n_main_shutdown_fileio_paint'))

all_benchmarks = smaller_is_better | bigger_is_better
# The two orientation sets must be disjoint, otherwise compute_difference
# would be ambiguous for the shared benchmark.
assert len(smaller_is_better & bigger_is_better) == 0


def get_raw_data_for_revisions(revisions):
    """Load talos data from graphs.mozilla.org for the given revisions.

    Returns a list with one element (the raw per-benchmark dict) for each
    revision, in the same order as *revisions*. Asserts on any unexpected
    response shape so that API drift is detected early (see module docstring).
    """
    selectors = ["revision=%s" % revision for revision in revisions]
    selector = '&'.join(selectors)
    url = "http://graphs.mozilla.org/api/test/runs/revisions?%s" % selector
    url_stream = urllib.request.urlopen(url)
    data = json.load(url_stream)
    assert frozenset(data.keys()) == frozenset(('stat', 'revisions'))
    assert data['stat'] == 'ok'
    rev_data = data['revisions']
    assert frozenset(rev_data.keys()) == frozenset(revisions)
    return [rev_data[r] for r in revisions]


def mean(values):
    """Return the arithmetic mean of *values* (must be non-empty)."""
    return float(sum(values)) / len(values)


def c4(n):
    """Return the c4(n) correction factor for the sample standard deviation.

    Dividing the sample standard deviation by c4(n) gives an unbiased
    estimator of the population standard deviation (assuming normality).
    """
    n = float(n)
    numerator = math.gamma(n / 2) * math.sqrt(2 / (n - 1))
    denominator = math.gamma((n - 1) / 2)
    return numerator / denominator


def unbiased_standard_deviation(values):
    """Return the c4-corrected standard deviation of *values*.

    Returns None when there is a single value, since no spread can be
    estimated from one observation.
    """
    n = len(values)
    if n == 1:
        return None
    acc = 0
    avg = mean(values)
    for i in values:
        dist = i - avg
        acc += dist * dist
    return math.sqrt(acc / (n - 1)) / c4(n)


class BenchmarkResult:
    """Stores the summary (mean and standard deviation) of a set of talos
    runs on the same revision and OS."""
    def __init__(self, avg, std):
        # avg: mean of the run values; std: unbiased standard deviation,
        # or None when only one run was available.
        self.avg = avg
        self.std = std

    def __str__(self):
        t = "%s," % self.avg
        return "(%-13s %s)" % (t, self.std)


# FIXME: This function computes the statistics of multiple runs of talos on a
# single revision. Should it also support computing statistics over runs of
# different revisions assuming the revisions are equivalent from a performance
# perspective?
def digest_revision_data(data):
    """Summarize raw revision data into BenchmarkResult objects.

    *data* maps benchmark name -> {"test_runs": {os: [runs]}, "name", "id"}.
    Returns a dict mapping benchmark name -> {os: BenchmarkResult}.
    """
    ret = {}
    benchmarks = frozenset(data.keys())
    # assert that all the benchmarks are known. If they are not,
    # smaller_is_better or bigger_is_better needs to be updated depending on
    # the benchmark type.
    assert all_benchmarks.issuperset(benchmarks), \
        "%s not found in all_benchmarks" % ','.join((benchmarks - all_benchmarks))
    for benchmark in benchmarks:
        benchmark_data = data[benchmark]
        expected_keys = frozenset(("test_runs", "name", "id"))
        assert frozenset(benchmark_data.keys()) == expected_keys
        test_runs = benchmark_data["test_runs"]
        operating_systems = test_runs.keys()
        results = {}
        for os in operating_systems:
            os_runs = test_runs[os]
            values = []
            for os_run in os_runs:
                # there are 4 fields: test run id, build id, timestamp,
                # mean value
                assert len(os_run) == 4
                values.append(os_run[3])
            avg = mean(values)
            std = unbiased_standard_deviation(values)
            results[os] = BenchmarkResult(avg, std)
        ret[benchmark] = results
    return ret


def get_data_for_revisions(revisions):
    """Fetch and summarize talos data for each revision in *revisions*."""
    raw_data = get_raw_data_for_revisions(revisions)
    return [digest_revision_data(x) for x in raw_data]


def overlaps(a, b):
    """Return True when the closed intervals a=[lo, hi] and b=[lo, hi]
    intersect."""
    return a[1] >= b[0] and b[1] >= a[0]


def is_significant(old, new):
    """Return True when the difference between the two BenchmarkResults
    looks statistically meaningful (their confidence intervals do not
    overlap)."""
    # conservative hack: if we don't know, say it is significant.
    if old.std is None or new.std is None:
        return True
    # use a 2 standard deviation interval, which is about 95% confidence.
    # NOTE(review): the original code built a +/-1 std interval, which
    # contradicted this comment; +/-2 std matches the stated 95% intent.
    old_interval = [old.avg - 2 * old.std, old.avg + 2 * old.std]
    new_interval = [new.avg - 2 * new.std, new.avg + 2 * new.std]
    return not overlaps(old_interval, new_interval)


def compute_difference(benchmark, old, new):
    """Return a human-readable ratio ("Nx better"/"Nx worse") describing how
    *new* compares to *old* for *benchmark*, taking the benchmark's
    orientation (bigger- vs smaller-is-better) into account."""
    if benchmark in bigger_is_better:
        # Swapping makes "smaller avg is better" hold for both orientations.
        new, old = old, new

    if new.avg >= old.avg:
        return "%1.4fx worse" % (new.avg / old.avg)
    else:
        return "%1.4fx better" % (old.avg / new.avg)


# FIXME: the printing could use a table class that computes the sizes of the
# cells instead of the current hard coded values.
def print_data_comparison(datav):
    """Print a per-benchmark, per-OS comparison of two digested revisions.

    *datav* is [old_data, new_data] as produced by get_data_for_revisions.
    Only benchmarks/OSes present in both revisions are compared, and only
    significant differences are printed.
    """
    assert len(datav) == 2
    old_data = datav[0]
    new_data = datav[1]
    old_benchmarks = frozenset(old_data.keys())
    new_benchmarks = frozenset(new_data.keys())
    benchmarks = old_benchmarks.intersection(new_benchmarks)
    for benchmark in sorted(benchmarks):
        print(benchmark)
        old_benchmark_data = old_data[benchmark]
        new_benchmark_data = new_data[benchmark]
        old_operating_systems = frozenset(old_benchmark_data.keys())
        new_operating_systems = frozenset(new_benchmark_data.keys())
        operating_systems = old_operating_systems.intersection(new_operating_systems)
        for os in sorted(operating_systems):
            old_os_data = old_benchmark_data[os]
            new_os_data = new_benchmark_data[os]
            if not is_significant(old_os_data, new_os_data):
                continue

            diff = compute_difference(benchmark, old_os_data, new_os_data)
            print('%-33s | %-30s -> %-30s %s' %
                  (os, old_os_data, new_os_data, diff))
        print()


def main():
    """Parse the two revision arguments and print their comparison."""
    parser = OptionParser(usage='Usage: %prog old_revision new_revision')
    options, args = parser.parse_args()
    if len(args) != 2:
        parser.print_help()
        sys.exit(1)

    print_data_comparison(get_data_for_revisions(args))


if __name__ == '__main__':
    main()