diff -r 000000000000 -r 6474c204b198 tools/performance/diff-talos.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/performance/diff-talos.py	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,201 @@
+#!/usr/bin/env python
+
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+"""
+This is a simple script that does one thing only: compare talos runs from
+two revisions. It is intended to check which of two try runs is best or if
+a try improves over the m-c or m-i revision it branches from.
+
+A big design goal is to avoid bit rot and to assert when bit rot is detected.
+The set of tests we run is a moving target. When possible this script
+should work with any test set, but in parts where it has to hard code
+information, it should try to assert that it is valid so that changes
+are detected early and the script can be fixed.
+"""
+
+import json
+import urllib2
+import math
+import sys
+from optparse import OptionParser
+
+# FIXME: currently we assert that we know all the benchmarks just so
+# we are sure to keep the bigger_is_better set updated. Is there a better
+# way to find/compute it?
+bigger_is_better = frozenset(('v8_7', 'dromaeo_dom', 'dromaeo_css'))
+
+smaller_is_better = frozenset(('tdhtmlr_paint', 'tp5n_main_rss_paint',
+                               'ts_paint', 'tp5n_paint', 'tsvgr_opacity',
+                               'a11yr_paint', 'kraken',
+                               'tdhtmlr_nochrome_paint',
+                               'tspaint_places_generated_med', 'tpaint',
+                               'tp5n_shutdown_paint', 'tsvgr',
+                               'tp5n_pbytes_paint', 'tscrollr_paint',
+                               'tspaint_places_generated_max',
+                               'tp5n_responsiveness_paint',
+                               'sunspider', 'tp5n_xres_paint', 'num_ctors',
+                               'tresize', 'trobopan', 'tcheckerboard',
+                               'tcheck3', 'tcheck2', 'tprovider',
+                               'tp5n_modlistbytes_paint',
+                               'trace_malloc_maxheap', 'tp4m_nochrome',
+                               'trace_malloc_leaks', 'tp4m_main_rss_nochrome',
+                               'tp4m_shutdown_nochrome', 'tdhtml_nochrome',
+                               'ts_shutdown', 'tp5n_%cpu_paint',
+                               'trace_malloc_allocs', 'ts',
+                               'tsvg_nochrome', 'tp5n_content_rss_paint',
+                               'tp5n_main_startup_fileio_paint',
+                               'tp5n_nonmain_normal_netio_paint',
+                               'tp5n_nonmain_startup_fileio_paint',
+                               'tp5n_main_normal_fileio_paint',
+                               'tp5n_nonmain_normal_fileio_paint',
+                               'tp5n_main_startup_netio_paint',
+                               'tp5n_main_normal_netio_paint',
+                               'tp5n_main_shutdown_netio_paint',
+                               'tp5n_main_shutdown_fileio_paint'))
+
+all_benchmarks = smaller_is_better | bigger_is_better
+assert len(smaller_is_better & bigger_is_better) == 0
+
+def get_raw_data_for_revisions(revisions):
+    """Loads data for the revisions, returns a list with one element for each
+    revision."""
+    selectors = ["revision=%s" % revision for revision in revisions]
+    selector = '&'.join(selectors)
+    url = "http://graphs.mozilla.org/api/test/runs/revisions?%s" % selector
+    url_stream = urllib2.urlopen(url)
+    data = json.load(url_stream)
+    assert frozenset(data.keys()) == frozenset(('stat', 'revisions'))
+    assert data['stat'] == 'ok'
+    rev_data = data['revisions']
+    assert frozenset(rev_data.keys()) == frozenset(revisions)
+    return [rev_data[r] for r in revisions]
+
+def mean(values):
+    return float(sum(values))/len(values)
+
+def c4(n):
+    n = float(n)
+    numerator = math.gamma(n/2)*math.sqrt(2/(n-1))
+    denominator = math.gamma((n-1)/2)
+    return numerator/denominator
+
+def unbiased_standard_deviation(values):
+    n = len(values)
+    if n == 1:
+        return None
+    acc = 0
+    avg = mean(values)
+    for i in values:
+        dist = i - avg
+        acc += dist * dist
+    return math.sqrt(acc/(n-1))/c4(n)
+
+class BenchmarkResult:
+    """Stores the summary (mean and standard deviation) of a set of talos
+    runs on the same revision and OS."""
+    def __init__(self, avg, std):
+        self.avg = avg
+        self.std = std
+    def __str__(self):
+        t = "%s," % self.avg
+        return "(%-13s %s)" % (t, self.std)
+
+# FIXME: This function computes the statistics of multiple runs of talos on a
+# single revision. Should it also support computing statistics over runs of
+# different revisions assuming the revisions are equivalent from a performance
+# perspective?
+def digest_revision_data(data):
+    ret = {}
+    benchmarks = frozenset(data.keys())
+    # assert that all the benchmarks are known. If they are not,
+    # smaller_is_better or bigger_is_better needs to be updated depending on
+    # the benchmark type.
+    assert all_benchmarks.issuperset(benchmarks), \
+        "%s not found in all_benchmarks" % ','.join(benchmarks - all_benchmarks)
+    for benchmark in benchmarks:
+        benchmark_data = data[benchmark]
+        expected_keys = frozenset(("test_runs", "name", "id"))
+        assert frozenset(benchmark_data.keys()) == expected_keys
+        test_runs = benchmark_data["test_runs"]
+        operating_systems = test_runs.keys()
+        results = {}
+        for os in operating_systems:
+            os_runs = test_runs[os]
+            values = []
+            for os_run in os_runs:
+                # there are 4 fields: test run id, build id, timestamp,
+                # mean value
+                assert len(os_run) == 4
+                values.append(os_run[3])
+            avg = mean(values)
+            std = unbiased_standard_deviation(values)
+            results[os] = BenchmarkResult(avg, std)
+        ret[benchmark] = results
+    return ret
+
+def get_data_for_revisions(revisions):
+    raw_data = get_raw_data_for_revisions(revisions)
+    return [digest_revision_data(x) for x in raw_data]
+
+def overlaps(a, b):
+    return a[1] >= b[0] and b[1] >= a[0]
+
+def is_significant(old, new):
+    # conservative hack: if we don't know, say it is significant.
+    if old.std is None or new.std is None:
+        return True
+    # require the (mean +/- std) intervals not to overlap (roughly two sigma).
+    old_interval = [old.avg - old.std, old.avg + old.std]
+    new_interval = [new.avg - new.std, new.avg + new.std]
+    return not overlaps(old_interval, new_interval)
+
+def compute_difference(benchmark, old, new):
+    if benchmark in bigger_is_better:
+        new, old = old, new
+
+    if new.avg >= old.avg:
+        return "%1.4fx worse" % (new.avg/old.avg)
+    else:
+        return "%1.4fx better" % (old.avg/new.avg)
+
+# FIXME: the printing could use a table class that computes the sizes of the
+# cells instead of the current hard coded values.
+def print_data_comparison(datav):
+    assert len(datav) == 2
+    old_data = datav[0]
+    new_data = datav[1]
+    old_benchmarks = frozenset(old_data.keys())
+    new_benchmarks = frozenset(new_data.keys())
+    benchmarks = old_benchmarks.intersection(new_benchmarks)
+    for benchmark in sorted(benchmarks):
+        print benchmark
+        old_benchmark_data = old_data[benchmark]
+        new_benchmark_data = new_data[benchmark]
+        old_operating_systems = frozenset(old_benchmark_data.keys())
+        new_operating_systems = frozenset(new_benchmark_data.keys())
+        operating_systems = old_operating_systems.intersection(new_operating_systems)
+        for os in sorted(operating_systems):
+            old_os_data = old_benchmark_data[os]
+            new_os_data = new_benchmark_data[os]
+            if not is_significant(old_os_data, new_os_data):
+                continue
+
+            diff = compute_difference(benchmark, old_os_data, new_os_data)
+            print '%-33s | %-30s -> %-30s %s' % \
+                (os, old_os_data, new_os_data, diff)
+        print
+
+def main():
+    parser = OptionParser(usage='Usage: %prog old_revision new_revision')
+    options, args = parser.parse_args()
+    if len(args) != 2:
+        parser.print_help()
+        sys.exit(1)
+
+    print_data_comparison(get_data_for_revisions(args))
+
+if __name__ == '__main__':
+    main()
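
A minimal usage sketch: the script takes exactly two revision hashes, old revision first (the arguments below are placeholders, not real changesets):

    $ python tools/performance/diff-talos.py OLD_REVISION NEW_REVISION   # placeholders for real changeset hashes

For every benchmark present in both runs it prints the benchmark name followed by one row per operating system whose difference passes the is_significant() check, each row showing the old and new "(mean, std)" summary and a "better"/"worse" ratio.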
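For reference, the shape of the graphs.mozilla.org payload that get_raw_data_for_revisions() and digest_revision_data() expect can be reconstructed from the assertions in the patch; the sketch below does that, with a hypothetical revision hash, ids, OS label and values:

# Payload shape inferred from the script's asserts; all concrete values are
# hypothetical and only illustrate the nesting.
example_response = {
    "stat": "ok",
    "revisions": {
        "0123456789ab": {                  # one entry per requested revision
            "tp5n_paint": {                # one entry per benchmark name
                "id": 115,                 # hypothetical benchmark id
                "name": "tp5n_paint",
                "test_runs": {
                    "WINNT 6.1": [         # one list of runs per OS
                        # [test run id, build id, timestamp, mean value]
                        [42001, 33001, 1419945600, 312.4],
                        [42002, 33002, 1419949200, 307.9],
                    ],
                },
            },
        },
    },
}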
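The unbiased_standard_deviation() helper applies the standard c4 correction for normally distributed samples, which is what the gamma functions in c4() compute; with x_1, ..., x_n the per-run mean values for one benchmark/OS pair and \bar{x} their mean:

\[
c_4(n) = \sqrt{\frac{2}{n-1}}\,\frac{\Gamma(n/2)}{\Gamma((n-1)/2)},
\qquad
\hat{\sigma} = \frac{1}{c_4(n)}\sqrt{\frac{1}{n-1}\sum_{i=1}^{n}(x_i - \bar{x})^2}.
\]

For n = 1 the estimate is undefined, which is why the function returns None and is_significant() conservatively reports such comparisons as significant.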