|
1 #!/usr/bin/env python |
|
2 |
|
3 # This Source Code Form is subject to the terms of the Mozilla Public |
|
4 # License, v. 2.0. If a copy of the MPL was not distributed with this |
|
5 # file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
6 |
|
7 """ |
|
8 This is a simple script that does one thing only: compare talos runs from |
|
9 two revisions. It is intended to check which of two try runs is best or if |
|
10 a try run improves over the m-c or m-i revision it branches from. |
|
11 |
|
12 A big design goal is to avoid bit rot and to assert when bit rot is detected. |
|
13 The set of tests we run is a moving target. When possible this script |
|
14 should work with any test set, but in parts where it has to hard code |
|
15 information, it should try to assert that it is valid so that changes |
|
16 are detected and it is fixed earlier. |
|
17 """ |
|
18 |
|
19 import json |
|
20 import urllib2 |
|
21 import math |
|
22 import sys |
|
23 from optparse import OptionParser |
|
24 |
|
# FIXME: currently we assert that we know all the benchmarks just so
# we are sure to maintain the bigger_is_better set updated. Is there a better
# way to find/compute it?
# Benchmarks where a higher score means better performance.
bigger_is_better = frozenset(('v8_7', 'dromaeo_dom', 'dromaeo_css'))

# Benchmarks where a lower score (time, memory, I/O counts, ...) means
# better performance.
smaller_is_better = frozenset(('tdhtmlr_paint', 'tp5n_main_rss_paint',
                               'ts_paint', 'tp5n_paint', 'tsvgr_opacity',
                               'a11yr_paint', 'kraken',
                               'tdhtmlr_nochrome_paint',
                               'tspaint_places_generated_med', 'tpaint',
                               'tp5n_shutdown_paint', 'tsvgr',
                               'tp5n_pbytes_paint', 'tscrollr_paint',
                               'tspaint_places_generated_max',
                               'tp5n_responsiveness_paint',
                               'sunspider', 'tp5n_xres_paint', 'num_ctors',
                               'tresize', 'trobopan', 'tcheckerboard',
                               'tcheck3', 'tcheck2', 'tprovider',
                               'tp5n_modlistbytes_paint',
                               'trace_malloc_maxheap', 'tp4m_nochrome',
                               'trace_malloc_leaks', 'tp4m_main_rss_nochrome',
                               'tp4m_shutdown_nochrome', 'tdhtml_nochrome',
                               'ts_shutdown', 'tp5n_%cpu_paint',
                               'trace_malloc_allocs', 'ts',
                               'tsvg_nochrome', 'tp5n_content_rss_paint',
                               'tp5n_main_startup_fileio_paint',
                               'tp5n_nonmain_normal_netio_paint',
                               'tp5n_nonmain_startup_fileio_paint',
                               'tp5n_main_normal_fileio_paint',
                               'tp5n_nonmain_normal_fileio_paint',
                               'tp5n_main_startup_netio_paint',
                               'tp5n_main_normal_netio_paint',
                               'tp5n_main_shutdown_netio_paint',
                               'tp5n_main_shutdown_fileio_paint'))

# Every benchmark this script knows how to interpret.
all_benchmarks = smaller_is_better | bigger_is_better
# A benchmark must have exactly one direction: the two sets are disjoint.
assert len(smaller_is_better & bigger_is_better) == 0
|
61 |
|
def get_raw_data_for_revisions(revisions):
    """Fetch the raw talos data for the given revisions from the graph
    server and return a list with one entry per revision, in the same
    order as *revisions*."""
    query = '&'.join("revision=%s" % rev for rev in revisions)
    url = "http://graphs.mozilla.org/api/test/runs/revisions?%s" % query
    response = json.load(urllib2.urlopen(url))
    # Sanity-check the response shape before trusting it.
    assert frozenset(response.keys()) == frozenset(('stat', 'revisions'))
    assert response['stat'] == 'ok'
    by_revision = response['revisions']
    assert frozenset(by_revision.keys()) == frozenset(revisions)
    return [by_revision[rev] for rev in revisions]
|
75 |
|
def mean(values):
    """Return the arithmetic mean of *values* as a float."""
    total = sum(values)
    return total / float(len(values))
|
78 |
|
79 def c4(n): |
|
80 n = float(n) |
|
81 numerator = math.gamma(n/2)*math.sqrt(2/(n-1)) |
|
82 denominator = math.gamma((n-1)/2) |
|
83 return numerator/denominator |
|
84 |
|
def unbiased_standard_deviation(values):
    """Return an unbiased estimate of the standard deviation of *values*.

    The Bessel-corrected (n-1) sample standard deviation is itself a
    biased estimator, so the result is further divided by the c4(n)
    correction factor.  Returns None when fewer than two values are
    given, since a spread cannot be estimated from a single sample.
    """
    n = len(values)
    # Fewer than two samples: spread is undefined.  The original code
    # only guarded n == 1; an empty input used to raise
    # ZeroDivisionError inside mean(), now it also returns None.
    if n < 2:
        return None
    avg = mean(values)
    acc = sum((v - avg) * (v - avg) for v in values)
    return math.sqrt(acc / (n - 1)) / c4(n)
|
95 |
|
class BenchmarkResult:
    """Summary (mean and unbiased standard deviation) of a set of talos
    runs on the same revision and OS."""

    def __init__(self, avg, std):
        # std may be None when it could not be estimated (single run).
        self.avg = avg
        self.std = std

    def __str__(self):
        avg_part = "%s," % self.avg
        return "(%-13s %s)" % (avg_part, self.std)
|
105 |
|
106 # FIXME: This function computes the statistics of multiple runs of talos on a |
|
107 # single revision. Should it also support computing statistics over runs of |
|
108 # different revisions assuming the revisions are equivalent from a performance |
|
109 # perspective? |
|
def digest_revision_data(data):
    """Summarize the talos runs of one revision.

    *data* maps benchmark names to the raw structure returned by the
    graph server.  Returns a dict mapping each benchmark name to a dict
    of OS name -> BenchmarkResult.
    """
    benchmarks = frozenset(data.keys())
    # assert that all the benchmarks are known. If they are not,
    # smaller_is_better or bigger_is_better needs to be updated depending on
    # the benchmark type.
    assert all_benchmarks.issuperset(benchmarks), \
        "%s not found in all_benchmarks" % ','.join((benchmarks - all_benchmarks))
    digested = {}
    for name in benchmarks:
        entry = data[name]
        assert frozenset(entry.keys()) == frozenset(("test_runs", "name", "id"))
        per_os = {}
        for os_name, runs in entry["test_runs"].items():
            values = []
            for run in runs:
                # there are 4 fields: test run id, build id, timestamp,
                # mean value
                assert len(run) == 4
                values.append(run[3])
            per_os[os_name] = BenchmarkResult(mean(values),
                                              unbiased_standard_deviation(values))
        digested[name] = per_os
    return digested
|
138 |
|
def get_data_for_revisions(revisions):
    """Fetch and summarize talos results for each revision, preserving
    the order of *revisions*."""
    return [digest_revision_data(raw)
            for raw in get_raw_data_for_revisions(revisions)]
|
142 |
|
def overlaps(a, b):
    """Return True if the closed intervals a = [lo, hi] and
    b = [lo, hi] intersect."""
    # They are disjoint exactly when one ends before the other begins.
    return not (a[1] < b[0] or b[1] < a[0])
|
145 |
|
def is_significant(old, new):
    """Return True when the difference between the two BenchmarkResults
    looks statistically meaningful."""
    # conservative hack: if we don't know, say it is significant.
    if old.std is None or new.std is None:
        return True
    # Build mean +/- std intervals (each 2 standard deviations wide,
    # about 95% confidence) and call the results different when the
    # intervals are disjoint.
    old_lo, old_hi = old.avg - old.std, old.avg + old.std
    new_lo, new_hi = new.avg - new.std, new.avg + new.std
    return old_hi < new_lo or new_hi < old_lo
|
154 |
|
def compute_difference(benchmark, old, new):
    """Return a human readable ratio ("Nx better"/"Nx worse") describing
    how *new* compares to *old* for the named benchmark."""
    # Normalize direction so that, below, a smaller average is always better.
    if benchmark in bigger_is_better:
        old, new = new, old

    if new.avg >= old.avg:
        return "%1.4fx worse" % (new.avg / old.avg)
    return "%1.4fx better" % (old.avg / new.avg)
|
163 |
|
164 #FIXME: the printing could use a table class that computes the sizes of the |
|
165 # cells instead of the current hard coded values. |
|
def print_data_comparison(datav):
    """Print a per-benchmark, per-OS comparison of two digested runs.

    datav is a two-element list [old, new] as produced by
    get_data_for_revisions.  Only benchmarks and operating systems
    present in both runs are shown, and rows whose difference is not
    statistically significant are skipped.
    """
    assert len(datav) == 2
    old_data = datav[0]
    new_data = datav[1]
    old_benchmarks = frozenset(old_data.keys())
    new_benchmarks = frozenset(new_data.keys())
    # Only benchmarks that ran on both revisions can be compared.
    benchmarks = old_benchmarks.intersection(new_benchmarks)
    for benchmark in sorted(benchmarks):
        print benchmark
        old_benchmark_data = old_data[benchmark]
        new_benchmark_data = new_data[benchmark]
        old_operating_systems = frozenset(old_benchmark_data.keys())
        new_operating_systems = frozenset(new_benchmark_data.keys())
        # Likewise, only operating systems covered by both runs.
        operating_systems = old_operating_systems.intersection(new_operating_systems)
        for os in sorted(operating_systems):
            old_os_data = old_benchmark_data[os]
            new_os_data = new_benchmark_data[os]
            # Skip rows where the change is within the noise.
            if not is_significant(old_os_data, new_os_data):
                continue

            diff = compute_difference(benchmark, old_os_data, new_os_data)
            print '%-33s | %-30s -> %-30s %s' % \
                (os, old_os_data, new_os_data, diff)
        print
|
190 |
|
def main():
    """Parse the two revision arguments and print their comparison."""
    parser = OptionParser(usage='Usage: %prog old_revision new_revision')
    _, args = parser.parse_args()
    # Exactly two revisions (old and new) are required.
    if len(args) != 2:
        parser.print_help()
        sys.exit(1)

    print_data_comparison(get_data_for_revisions(args))
|
199 |
|
# Run the comparison only when executed as a script, so the file can be
# imported as a module without side effects.
if __name__ == '__main__':
    main()