#!/usr/bin/env python

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""
This is a simple script that does one thing only: compare talos runs from
two revisions. It is intended to check which of two try runs is better, or
whether a try run improves over the m-c or m-i revision it branches from.

A big design goal is to avoid bit rot and to assert when bit rot is detected.
The set of tests we run is a moving target. When possible this script
should work with any test set, but where it has to hard code information it
should assert that the information is still valid, so that changes are
detected and the script is fixed early.
"""

import json
import urllib2
import math
import sys
from optparse import OptionParser

# FIXME: currently we assert that we know all the benchmarks just so
# we are sure to keep the bigger_is_better set updated. Is there a better
# way to find/compute it?
bigger_is_better = frozenset(('v8_7', 'dromaeo_dom', 'dromaeo_css'))

smaller_is_better = frozenset(('tdhtmlr_paint', 'tp5n_main_rss_paint',
                               'ts_paint', 'tp5n_paint', 'tsvgr_opacity',
                               'a11yr_paint', 'kraken',
                               'tdhtmlr_nochrome_paint',
                               'tspaint_places_generated_med', 'tpaint',
                               'tp5n_shutdown_paint', 'tsvgr',
                               'tp5n_pbytes_paint', 'tscrollr_paint',
                               'tspaint_places_generated_max',
                               'tp5n_responsiveness_paint',
                               'sunspider', 'tp5n_xres_paint', 'num_ctors',
                               'tresize', 'trobopan', 'tcheckerboard',
                               'tcheck3', 'tcheck2', 'tprovider',
                               'tp5n_modlistbytes_paint',
                               'trace_malloc_maxheap', 'tp4m_nochrome',
                               'trace_malloc_leaks', 'tp4m_main_rss_nochrome',
                               'tp4m_shutdown_nochrome', 'tdhtml_nochrome',
                               'ts_shutdown', 'tp5n_%cpu_paint',
                               'trace_malloc_allocs', 'ts',
                               'tsvg_nochrome', 'tp5n_content_rss_paint',
                               'tp5n_main_startup_fileio_paint',
                               'tp5n_nonmain_normal_netio_paint',
                               'tp5n_nonmain_startup_fileio_paint',
                               'tp5n_main_normal_fileio_paint',
                               'tp5n_nonmain_normal_fileio_paint',
                               'tp5n_main_startup_netio_paint',
                               'tp5n_main_normal_netio_paint',
                               'tp5n_main_shutdown_netio_paint',
                               'tp5n_main_shutdown_fileio_paint'))

all_benchmarks = smaller_is_better | bigger_is_better
assert len(smaller_is_better & bigger_is_better) == 0

def get_raw_data_for_revisions(revisions):
    """Loads data for the given revisions and returns a list with one element
    per revision."""
    selectors = ["revision=%s" % revision for revision in revisions]
    selector = '&'.join(selectors)
    url = "http://graphs.mozilla.org/api/test/runs/revisions?%s" % selector
    url_stream = urllib2.urlopen(url)
    data = json.load(url_stream)
    assert frozenset(data.keys()) == frozenset(('stat', 'revisions'))
    assert data['stat'] == 'ok'
    rev_data = data['revisions']
    assert frozenset(rev_data.keys()) == frozenset(revisions)
    return [rev_data[r] for r in revisions]
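
# For reference, a sketch of the graph server response shape that the asserts
# in get_raw_data_for_revisions and digest_revision_data expect. This is
# reconstructed from those asserts rather than from server documentation, so
# treat it as illustrative only:
#
#   {"stat": "ok",
#    "revisions": {
#        "<revision>": {
#            "<benchmark>": {
#                "id": ...,
#                "name": ...,
#                "test_runs": {
#                    "<os>": [[test_run_id, build_id, timestamp, mean], ...]
#                }}}}}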
def mean(values):
    return float(sum(values))/len(values)

# c4(n) is the standard correction factor that makes the sample standard
# deviation an unbiased estimator of sigma for normally distributed data.
def c4(n):
    n = float(n)
    numerator = math.gamma(n/2)*math.sqrt(2/(n-1))
    denominator = math.gamma((n-1)/2)
    return numerator/denominator

def unbiased_standard_deviation(values):
    n = len(values)
    if n == 1:
        return None
    acc = 0
    avg = mean(values)
    for i in values:
        dist = i - avg
        acc += dist * dist
    return math.sqrt(acc/(n-1))/c4(n)

class BenchmarkResult:
    """Stores the summary (mean and standard deviation) of a set of talos
    runs on the same revision and OS."""
    def __init__(self, avg, std):
        self.avg = avg
        self.std = std
    def __str__(self):
        t = "%s," % self.avg
        return "(%-13s %s)" % (t, self.std)

# FIXME: This function computes the statistics of multiple runs of talos on a
# single revision. Should it also support computing statistics over runs of
# different revisions, assuming the revisions are equivalent from a
# performance perspective?
def digest_revision_data(data):
    ret = {}
    benchmarks = frozenset(data.keys())
    # assert that all the benchmarks are known. If they are not,
    # smaller_is_better or bigger_is_better needs to be updated depending on
    # the benchmark type.
    assert all_benchmarks.issuperset(benchmarks), \
        "%s not found in all_benchmarks" % ','.join(benchmarks - all_benchmarks)
    for benchmark in benchmarks:
        benchmark_data = data[benchmark]
        expected_keys = frozenset(("test_runs", "name", "id"))
        assert frozenset(benchmark_data.keys()) == expected_keys
        test_runs = benchmark_data["test_runs"]
        operating_systems = test_runs.keys()
        results = {}
        for os in operating_systems:
            os_runs = test_runs[os]
            values = []
            for os_run in os_runs:
                # there are 4 fields: test run id, build id, timestamp,
                # mean value
                assert len(os_run) == 4
                values.append(os_run[3])
            avg = mean(values)
            std = unbiased_standard_deviation(values)
            results[os] = BenchmarkResult(avg, std)
        ret[benchmark] = results
    return ret

def get_data_for_revisions(revisions):
    raw_data = get_raw_data_for_revisions(revisions)
    return [digest_revision_data(x) for x in raw_data]

def overlaps(a, b):
    return a[1] >= b[0] and b[1] >= a[0]

def is_significant(old, new):
    # conservative hack: if we don't know, say it is significant.
    if old.std is None or new.std is None:
        return True
    # use an interval of one standard deviation on each side of the mean; if
    # the two intervals do not overlap, report the difference as significant.
    old_interval = [old.avg - old.std, old.avg + old.std]
    new_interval = [new.avg - new.std, new.avg + new.std]
    return not overlaps(old_interval, new_interval)
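
# Illustrative example with made-up numbers: old = BenchmarkResult(100.0, 3.0)
# yields the interval [97, 103] and new = BenchmarkResult(110.0, 4.0) yields
# [106, 114]; the intervals do not overlap, so the change is reported. With
# new = BenchmarkResult(104.0, 4.0) the interval [100, 108] overlaps [97, 103]
# and the change is suppressed.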
def compute_difference(benchmark, old, new):
    if benchmark in bigger_is_better:
        new, old = old, new

    if new.avg >= old.avg:
        return "%1.4fx worse" % (new.avg/old.avg)
    else:
        return "%1.4fx better" % (old.avg/new.avg)

# FIXME: the printing could use a table class that computes the sizes of the
# cells instead of the current hard coded values.
def print_data_comparison(datav):
    assert len(datav) == 2
    old_data = datav[0]
    new_data = datav[1]
    old_benchmarks = frozenset(old_data.keys())
    new_benchmarks = frozenset(new_data.keys())
    benchmarks = old_benchmarks.intersection(new_benchmarks)
    for benchmark in sorted(benchmarks):
        print benchmark
        old_benchmark_data = old_data[benchmark]
        new_benchmark_data = new_data[benchmark]
        old_operating_systems = frozenset(old_benchmark_data.keys())
        new_operating_systems = frozenset(new_benchmark_data.keys())
        operating_systems = old_operating_systems.intersection(new_operating_systems)
        for os in sorted(operating_systems):
            old_os_data = old_benchmark_data[os]
            new_os_data = new_benchmark_data[os]
            if not is_significant(old_os_data, new_os_data):
                continue

            diff = compute_difference(benchmark, old_os_data, new_os_data)
            print '%-33s | %-30s -> %-30s %s' % \
                (os, old_os_data, new_os_data, diff)
        print

def main():
    parser = OptionParser(usage='Usage: %prog old_revision new_revision')
    options, args = parser.parse_args()
    if len(args) != 2:
        parser.print_help()
        sys.exit(1)

    print_data_comparison(get_data_for_revisions(args))

if __name__ == '__main__':
    main()
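
# Example usage (the script name and revision hashes below are placeholders):
#
#   python compare-talos.py 1234567890ab ba0987654321
#
# For every benchmark and OS present in both runs whose difference is
# significant, the script prints the old and new (mean, standard deviation)
# pairs followed by the relative change, e.g. "1.0312x better".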