config/find_OOM_errors.py

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/config/find_OOM_errors.py	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,352 @@
     1.4 +#!/usr/bin/env python
     1.5 +# This Source Code Form is subject to the terms of the Mozilla Public
     1.6 +# License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 +# file, You can obtain one at http://mozilla.org/MPL/2.0/.
     1.8 +from __future__ import print_function
     1.9 +
    1.10 +usage = """%prog: A test for OOM conditions in the shell.
    1.11 +
    1.12 +%prog finds segfaults and other errors caused by incorrect handling of
    1.13 +allocation during OOM (out-of-memory) conditions.
    1.14 +"""
    1.15 +
    1.16 +help = """Check for regressions only. This runs a set of files with a known
    1.17 +number of OOM errors (specified by REGRESSION_COUNT), and exits with a non-zero
    1.18 +result if more or less errors are found. See js/src/Makefile.in for invocation.
    1.19 +"""
    1.20 +
    1.21 +
    1.22 +import hashlib
    1.23 +import re
    1.24 +import shlex
    1.25 +import subprocess
    1.26 +import sys
    1.27 +import threading
    1.28 +import time
    1.29 +
    1.30 +from optparse import OptionParser
    1.31 +
    1.32 +#####################################################################
    1.33 +# Utility functions
    1.34 +#####################################################################
    1.35 +def run(args, stdin=None):
    1.36 +  class ThreadWorker(threading.Thread):
    1.37 +    def __init__(self, pipe):
    1.38 +      super(ThreadWorker, self).__init__()
    1.39 +      self.all = ""
    1.40 +      self.pipe = pipe
    1.41 +      self.setDaemon(True)
    1.42 +
    1.43 +    def run(self):
    1.44 +      while True:
    1.45 +        line = self.pipe.readline()
    1.46 +        if line == '': break
    1.47 +        else:
    1.48 +          self.all += line
    1.49 +
    1.50 +  try:
    1.51 +    if type(args) == str:
    1.52 +      args = shlex.split(args)
    1.53 +
    1.54 +    args = [str(a) for a in args] # convert to strs
    1.55 +
    1.56 +    stdin_pipe = subprocess.PIPE if stdin else None
    1.57 +    proc = subprocess.Popen(args, stdin=stdin_pipe, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    1.58 +    if stdin_pipe:
    1.59 +      proc.stdin.write(stdin)
    1.60 +      proc.stdin.close()
    1.61 +
    1.62 +    stdout_worker = ThreadWorker(proc.stdout)
    1.63 +    stderr_worker = ThreadWorker(proc.stderr)
    1.64 +    stdout_worker.start()
    1.65 +    stderr_worker.start()
    1.66 +
    1.67 +    proc.wait()
    1.68 +    stdout_worker.join()
    1.69 +    stderr_worker.join()
    1.70 +
    1.71 +  except KeyboardInterrupt as e:
    1.72 +    sys.exit(-1)
    1.73 +
    1.74 +  stdout, stderr = stdout_worker.all, stderr_worker.all
    1.75 +  result = (stdout, stderr, proc.returncode)
    1.76 +  return result
    1.77 +
    1.78 +def get_js_files():
    1.79 +  (out, err, exit) = run('find ../jit-test/tests -name "*.js"')
    1.80 +  if (err, exit) != ("", 0):
    1.81 +    sys.exit("Wrong directory, run from an objdir")
    1.82 +  return out.split()
    1.83 +
    1.84 +
    1.85 +
    1.86 +#####################################################################
    1.87 +# Blacklisting
    1.88 +#####################################################################
    1.89 +def in_blacklist(sig):
    1.90 +  return sig in blacklist
    1.91 +
    1.92 +def add_to_blacklist(sig):
    1.93 +  blacklist[sig] = blacklist.get(sig, 0)
    1.94 +  blacklist[sig] += 1
    1.95 +
    1.96 +# How often is a particular lines important for this.
    1.97 +def count_lines():
    1.98 +  """Keep track of the amount of times individual lines occur, in order to
    1.99 +     prioritize the errors which occur most frequently."""
   1.100 +  counts = {}
   1.101 +  for string,count in blacklist.items():
   1.102 +    for line in string.split("\n"):
   1.103 +      counts[line] = counts.get(line, 0) + count
   1.104 +
   1.105 +  lines = []
   1.106 +  for k,v in counts.items():
   1.107 +    lines.append("{0:6}: {1}".format(v, k))
   1.108 +
   1.109 +  lines.sort()
   1.110 +
   1.111 +  countlog = file("../OOM_count_log", "w")
   1.112 +  countlog.write("\n".join(lines))
   1.113 +  countlog.flush()
   1.114 +  countlog.close()
   1.115 +
   1.116 +
   1.117 +#####################################################################
   1.118 +# Output cleaning
   1.119 +#####################################################################
   1.120 +def clean_voutput(err):
   1.121 +  # Skip what we can't reproduce
   1.122 +  err = re.sub(r"^--\d+-- run: /usr/bin/dsymutil \"shell/js\"$", "", err, flags=re.MULTILINE)
   1.123 +  err = re.sub(r"^==\d+==", "", err, flags=re.MULTILINE)
   1.124 +  err = re.sub(r"^\*\*\d+\*\*", "", err, flags=re.MULTILINE)
   1.125 +  err = re.sub(r"^\s+by 0x[0-9A-Fa-f]+: ", "by: ", err, flags=re.MULTILINE)
   1.126 +  err = re.sub(r"^\s+at 0x[0-9A-Fa-f]+: ", "at: ", err, flags=re.MULTILINE)
   1.127 +  err = re.sub(r"(^\s+Address 0x)[0-9A-Fa-f]+( is not stack'd)", r"\1\2", err, flags=re.MULTILINE)
   1.128 +  err = re.sub(r"(^\s+Invalid write of size )\d+", r"\1x", err, flags=re.MULTILINE)
   1.129 +  err = re.sub(r"(^\s+Invalid read of size )\d+", r"\1x", err, flags=re.MULTILINE)
   1.130 +  err = re.sub(r"(^\s+Address 0x)[0-9A-Fa-f]+( is )\d+( bytes inside a block of size )[0-9,]+( free'd)", r"\1\2\3\4", err, flags=re.MULTILINE)
   1.131 +
   1.132 +  # Skip the repeating bit due to the segfault
   1.133 +  lines = []
   1.134 +  for l in err.split('\n'):
   1.135 +    if l == " Process terminating with default action of signal 11 (SIGSEGV)":
   1.136 +      break
   1.137 +    lines.append(l)
   1.138 +  err = '\n'.join(lines)
   1.139 +
   1.140 +  return err
   1.141 +
   1.142 +def remove_failed_allocation_backtraces(err):
   1.143 +  lines = []
   1.144 +
   1.145 +  add = True
   1.146 +  for l in err.split('\n'):
   1.147 +
   1.148 +    # Set start and end conditions for including text
   1.149 +    if l == " The site of the failed allocation is:":
   1.150 +      add = False
   1.151 +    elif l[:2] not in ['by: ', 'at:']:
   1.152 +      add = True
   1.153 +
   1.154 +    if add:
   1.155 +      lines.append(l)
   1.156 +
   1.157 +
   1.158 +  err = '\n'.join(lines)
   1.159 +
   1.160 +  return err
   1.161 +
   1.162 +
   1.163 +def clean_output(err):
   1.164 +  err = re.sub(r"^js\(\d+,0x[0-9a-f]+\) malloc: \*\*\* error for object 0x[0-9a-f]+: pointer being freed was not allocated\n\*\*\* set a breakppoint in malloc_error_break to debug\n$", "pointer being freed was not allocated", err, flags=re.MULTILINE)
   1.165 +
   1.166 +  return err
   1.167 +
   1.168 +
   1.169 +#####################################################################
   1.170 +# Consts, etc
   1.171 +#####################################################################
   1.172 +
   1.173 +command_template = 'shell/js' \
   1.174 +                 + ' -m -j -p' \
   1.175 +                 + ' -e "const platform=\'darwin\'; const libdir=\'../jit-test/lib/\';"' \
   1.176 +                 + ' -f ../jit-test/lib/prolog.js' \
   1.177 +                 + ' -f {0}'
   1.178 +
   1.179 +
   1.180 +# Blacklists are things we don't want to see in our logs again (though we do
   1.181 +# want to count them when they happen). Whitelists we do want to see in our
   1.182 +# logs again, principally because the information we have isn't enough.
   1.183 +
   1.184 +blacklist = {}
   1.185 +add_to_blacklist(r"('', '', 1)") # 1 means OOM if the shell hasn't launched yet.
   1.186 +add_to_blacklist(r"('', 'out of memory\n', 1)")
   1.187 +
   1.188 +whitelist = set()
   1.189 +whitelist.add(r"('', 'out of memory\n', -11)") # -11 means OOM
   1.190 +whitelist.add(r"('', 'out of memory\nout of memory\n', -11)")
   1.191 +
   1.192 +
   1.193 +
   1.194 +#####################################################################
   1.195 +# Program
   1.196 +#####################################################################
   1.197 +
   1.198 +# Options
   1.199 +parser = OptionParser(usage=usage)
   1.200 +parser.add_option("-r", "--regression", action="store", metavar="REGRESSION_COUNT", help=help,
   1.201 +                  type="int", dest="regression", default=None)
   1.202 +                  
   1.203 +(OPTIONS, args) = parser.parse_args()
   1.204 +
   1.205 +
   1.206 +if OPTIONS.regression != None:
   1.207 +  # TODO: This should be expanded as we get a better hang of the OOM problems.
   1.208 +  # For now, we'll just check that the number of OOMs in one short file does not
   1.209 +  # increase.
   1.210 +  files = ["../jit-test/tests/arguments/args-createontrace.js"]
   1.211 +else:
   1.212 +  files = get_js_files()
   1.213 +
   1.214 +  # Use a command-line arg to reduce the set of files
   1.215 +  if len (args):
   1.216 +    files = [f for f in files if f.find(args[0]) != -1]
   1.217 +
   1.218 +
   1.219 +if OPTIONS.regression == None:
   1.220 +  # Don't use a logfile, this is automated for tinderbox.
   1.221 +  log = file("../OOM_log", "w")
   1.222 +
   1.223 +
   1.224 +num_failures = 0
   1.225 +for f in files:
   1.226 +
   1.227 +  # Run it once to establish boundaries
   1.228 +  command = (command_template + ' -O').format(f)
   1.229 +  out, err, exit = run(command)
   1.230 +  max = re.match(".*OOM max count: (\d+).*", out, flags=re.DOTALL).groups()[0]
   1.231 +  max = int(max)
   1.232 +  
   1.233 +  # OOMs don't recover well for the first 20 allocations or so.
   1.234 +  # TODO: revisit this.
   1.235 +  for i in range(20, max): 
   1.236 +
   1.237 +    if OPTIONS.regression == None:
   1.238 +      print("Testing allocation {0}/{1} in {2}".format(i,max,f))
   1.239 +    else:
   1.240 +      sys.stdout.write('.') # something short for tinderbox, no space or \n
   1.241 +
   1.242 +    command = (command_template + ' -A {0}').format(f, i)
   1.243 +    out, err, exit = run(command)
   1.244 +
   1.245 +    # Success (5 is SM's exit code for controlled errors)
   1.246 +    if exit == 5 and err.find("out of memory") != -1:
   1.247 +      continue
   1.248 +
   1.249 +    # Failure
   1.250 +    else:
   1.251 +
   1.252 +      if OPTIONS.regression != None:
   1.253 +        # Just count them
   1.254 +        num_failures += 1
   1.255 +        continue
   1.256 +
   1.257 +      #########################################################################
   1.258 +      # The regression tests ends above. The rest of this is for running  the
   1.259 +      # script manually.
   1.260 +      #########################################################################
   1.261 +
   1.262 +      problem = str((out, err, exit))
   1.263 +      if in_blacklist(problem) and problem not in whitelist:
   1.264 +        add_to_blacklist(problem)
   1.265 +        continue
   1.266 +
   1.267 +      add_to_blacklist(problem)
   1.268 +
   1.269 +
   1.270 +      # Get valgrind output for a good stack trace
   1.271 +      vcommand = "valgrind --dsymutil=yes -q --log-file=OOM_valgrind_log_file " + command
   1.272 +      run(vcommand)
   1.273 +      vout = file("OOM_valgrind_log_file").read()
   1.274 +      vout = clean_voutput(vout)
   1.275 +      sans_alloc_sites = remove_failed_allocation_backtraces(vout)
   1.276 +
   1.277 +      # Don't print duplicate information
   1.278 +      if in_blacklist(sans_alloc_sites):
   1.279 +        add_to_blacklist(sans_alloc_sites)
   1.280 +        continue
   1.281 +
   1.282 +      add_to_blacklist(sans_alloc_sites)
   1.283 +
   1.284 +      log.write ("\n")
   1.285 +      log.write ("\n")
   1.286 +      log.write ("=========================================================================")
   1.287 +      log.write ("\n")
   1.288 +      log.write ("An allocation failure at\n\tallocation {0}/{1} in {2}\n\t"
   1.289 +                 "causes problems (detected using bug 624094)"
   1.290 +                 .format(i, max, f))
   1.291 +      log.write ("\n")
   1.292 +      log.write ("\n")
   1.293 +
   1.294 +      log.write ("Command (from obj directory, using patch from bug 624094):\n  " + command)
   1.295 +      log.write ("\n")
   1.296 +      log.write ("\n")
   1.297 +      log.write ("stdout, stderr, exitcode:\n  " + problem)
   1.298 +      log.write ("\n")
   1.299 +      log.write ("\n")
   1.300 +
   1.301 +      double_free = err.find("pointer being freed was not allocated") != -1
   1.302 +      oom_detected = err.find("out of memory") != -1
   1.303 +      multiple_oom_detected = err.find("out of memory\nout of memory") != -1
   1.304 +      segfault_detected = exit == -11
   1.305 +
   1.306 +      log.write ("Diagnosis: ")
   1.307 +      log.write ("\n")
   1.308 +      if multiple_oom_detected:
   1.309 +        log.write ("  - Multiple OOMs reported")
   1.310 +        log.write ("\n")
   1.311 +      if segfault_detected:
   1.312 +        log.write ("  - segfault")
   1.313 +        log.write ("\n")
   1.314 +      if not oom_detected:
   1.315 +        log.write ("  - No OOM checking")
   1.316 +        log.write ("\n")
   1.317 +      if double_free:
   1.318 +        log.write ("  - Double free")
   1.319 +        log.write ("\n")
   1.320 +
   1.321 +      log.write ("\n")
   1.322 +
   1.323 +      log.write ("Valgrind info:\n" + vout)
   1.324 +      log.write ("\n")
   1.325 +      log.write ("\n")
   1.326 +      log.flush()
   1.327 +
   1.328 +  if OPTIONS.regression == None:
   1.329 +    count_lines()
   1.330 +
   1.331 +print()
   1.332 +
   1.333 +# Do the actual regression check
   1.334 +if OPTIONS.regression != None:
   1.335 +  expected_num_failures = OPTIONS.regression
   1.336 +
   1.337 +  if num_failures != expected_num_failures:
   1.338 +
   1.339 +    print("TEST-UNEXPECTED-FAIL |", end='')
   1.340 +    if num_failures > expected_num_failures:
   1.341 +      print("More out-of-memory errors were found ({0}) than expected ({1}). "
   1.342 +            "This probably means an allocation site has been added without a "
   1.343 +            "NULL-check. If this is unavoidable, you can account for it by "
   1.344 +            "updating Makefile.in.".format(num_failures, expected_num_failures),
   1.345 +            end='')
   1.346 +    else:
   1.347 +      print("Congratulations, you have removed {0} out-of-memory error(s) "
   1.348 +            "({1} remain)! Please account for it by updating Makefile.in." 
   1.349 +            .format(expected_num_failures - num_failures, num_failures),
   1.350 +            end='')
   1.351 +    sys.exit(-1)
   1.352 +  else:
   1.353 +    print('TEST-PASS | find_OOM_errors | Found the expected number of OOM '
   1.354 +          'errors ({0})'.format(expected_num_failures))
   1.355 +

mercurial