The Tor Browser: tools/page-loader/URLTimingDataSet.pm@97036ab72558

Conditionally force memory storage according to privacy.thirdparty.isolate;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

1 #

     2 # This Source Code Form is subject to the terms of the Mozilla Public

     3 # License, v. 2.0. If a copy of the MPL was not distributed with this

     4 # file, You can obtain one at http://mozilla.org/MPL/2.0/.

     5 package URLTimingDataSet;

     6 use DBI;

     7 use PageData;       # list of test pages, etc.

     8 use strict;

    10 sub new {

    11     my $proto = shift;

    12     my $class = ref($proto) || $proto;

    13     my $self  = {

    14         dataset   => [],

    15         results   => [],

    16         sorted    => [],

    17         average   => undef,

    18         avgmedian => undef,    # note: average of individual medians

    19         maximum   => undef,

    20         minimum   => undef,

    21     };

    22     $self->{id}    = shift || die "No id supplied";

    23     $self->{table} = shift || "t" . $self->{id};

    24     $self->{pages} = PageData->new;

    25     bless ($self, $class);

    26     $self->_grok();

    27     return $self;

    28 }

    31 sub _grok {

    32     my $self = shift;

    33     my ($res);

    35     # select the dataset from the db

    36     $self->_select();

    38     for (my $i=0; $i < $self->{pages}->length; $i++) {

    39         my $name = $self->{pages}->name($i);

    40         my $count = 0;

    41         my @times = ();

    42         my $nan = 0;

    43         foreach my $ref (@{$self->{dataset}}) {

    44             next if ($name ne $ref->{content});

    45             $count++;

    46             if ($ref->{c_part} eq "NaN") {

    47                 # we bailed out of this page load

    48                 $res = "NaN";

    49                 $nan = 1;

    50             }

    51             else {

    52                 my $s_intvl = $ref->{s_intvl};

    53                 my $c_intvl = $ref->{c_intvl};

    54                 my $errval = abs($s_intvl-$c_intvl)/(($s_intvl+$c_intvl)/2);

    55                 if ($errval > 0.08) {       # one of them went wrong and stalled out (see [1] below)

    56                     $res = ($s_intvl <= $c_intvl) ? $s_intvl : $c_intvl;

    57                 } else {

    58                     $res = int(($s_intvl + $c_intvl)/2);

    59                 }

    60             }

    61             push @times, $res;

    62         }

    64         my $avg = int(_avg(@times));

    65         my $med = _med(@times);

    66         my $max = $nan ? "NaN" : _max(@times);

    67         my $min = _min(@times);

    68         push @{$self->{results}}, [ $i, $name, $count, $avg, $med, $max, $min, @times ];

    69     }

    71     $self->_get_summary();

    72     $self->_sort_result_set();

    74 }

    76 sub _select {

    77     my $self = shift;

    79     my $dbh = DBI->connect("DBI:CSV:f_dir=./db", {RaiseError => 1, AutoCommit => 1})

    80          or die "Cannot connect: " . $DBI::errstr;

    82     my $sql = qq{

    83         SELECT INDEX, S_INTVL, C_INTVL, C_PART, CONTENT, ID

    84              FROM $self->{table}

    85                   WHERE ID = '$self->{id}'

    86                 };

    88     my $sth = $dbh->prepare($sql);

    89     $sth->execute();

    91     while (my @data = $sth->fetchrow_array()) {

    92         push @{$self->{dataset}},

    93         {index   => $data[0],

    94          s_intvl => $data[1],

    95          c_intvl => $data[2],

    96          c_part  => $data[3],

    97          content => $data[4],

    98          id      => $data[5]

    99           };

   100     }

   101     $sth->finish();

   102     $dbh->disconnect();

   103 }

   105 sub _get_summary {

   106     my $self = shift;

   107     my (@avg, @med, @max, @min);

   109     # how many pages were loaded in total ('sampled')

   110     $self->{samples} = scalar(@{$self->{dataset}});

   112     # how many cycles (should I get this from test parameters instead?)

   113     $self->{count} = int(_avg( map($_->[2],  @{$self->{results}}) ));

   114     #warn $self->{count};

   116     # calculate overall average, average median, maximum, minimum, (RMS Error?)

   117     for (@{$self->{results}}) {

   118         push @avg, $_->[3];

   119         push @med, $_->[4];

   120         push @max, $_->[5];

   121         push @min, $_->[6];

   122     }

   123     $self->{average}   = int(_avg(@avg));

   124     $self->{avgmedian} = int(_avg(@med));     # note: averaging individual medians

   125     $self->{maximum}   = _max(@max);

   126     $self->{minimum}   = _min(@min);

   127 }

   129 sub _sort_result_set {

   130     my $self = shift;

   131     # sort by median load time

   132     # @{$self->{sorted}} = sort {$a->[4] <=> $b->[4]} @{$self->{results}};

   133     # might be "NaN", but this is lame of me to be carrying around a string instead of undef

   134     @{$self->{sorted}} =

   135          sort {

   136              if ($a->[4] eq "NaN" || $b->[4] eq "NaN") {

   137                  return $a->[4] cmp $b->[4];

   138              } else {

   139                  return $a->[4] <=> $b->[4];

   140              }

   141          } @{$self->{results}};

   142 }

   144 sub as_string {

   145     my $self = shift;

   146     return $self->_as_string();

   147 }

   149 sub as_string_sorted {

   150     my $self = shift;

   151     return $self->_as_string(@{$self->{sorted}});

   152 }

   155 sub _as_string {

   156     my $self = shift;

   157     my @ary = @_ ? @_ : @{$self->{results}};

   158     my $str;

   159     for (@ary) {

   160         my ($index, $path, $count, $avg, $med, $max, $min, @times) = @$_;

   161         $str .= sprintf "%3s %-26s\t", $index, $path;

   162         if ($count > 0) {

   163             $str .= sprintf "%6s %6s %6s %6s ", $avg, $med, $max, $min;

   164             foreach my $time (@times) {

   165                 $str .= sprintf "%6s ", $time;

   166             }

   167         }

   168         $str .= "\n";

   169     }

   170     return $str;

   171 }

   173 #

   174 # package internal helper functions

   175 #

   176 sub _num {

   177     my @array = ();

   178     for (@_) { push @array, $_ if /^[+-]?\d+\.?\d*$/o; }

   179     return @array;

   180 }

   182 sub _avg {

   183     my @array = _num(@_);

   184     return "NaN" unless scalar(@array);

   185     my $sum = 0;

   186     for (@array) { $sum += $_; }

   187     return $sum/scalar(@array);

   188 }

   190 sub _max {

   191     my @array = _num(@_);

   192     return "NaN" unless scalar(@array);

   193     my $max = $array[0];

   194     for (@array) { $max = ($max > $_) ? $max : $_; }

   195     return $max;

   196 }

   198 sub _min {

   199     my @array = _num(@_);

   200     return "NaN" unless scalar(@array);

   201     my $min = $array[0];

   202     for (@array) { $min = ($min < $_) ? $min : $_; }

   203     return $min;

   204 }

   206 # returns the floor(N/2) element of a sorted ascending array

   207 sub _med {

   208     my @array = _num(@_);

   209     return "NaN" unless scalar(@array);

   210     my $index = int((scalar(@array)-1)/2);

   211     @array = sort {$a <=> $b} @array;

   212     return $array[$index];

   213 }

   215 1; # return true

   217 ################################################################################

   218 #

   219 # [1] in looking at the test results, in almost all cases, the

   220 # round-trip time measured by the server logic and the client logic

   221 # would be almost the same value (which is what one would

   222 # expect). However, on occasion, one of the them would be "out of

   223 # whack", and inconsistent with the additional "layout" measure by the

   224 # client.

   225 #

   226 #    i.e., a set of numbers like these:

   227 #      c_part     c_intvl      s_intvl

   228 #      800      1003        997

   229 #      804      1007        1005

   230 #      801      1001        1325             <--

   231 #      803      1318        998              <--

   232 #      799      1002        1007

   233 #      ...

   234 #

   235 # which looks like the server side would stall in doing the accept or

   236 # in running the mod-perl handler (possibly a GC?). (The following

   237 # c_intvl would then be out of whack by a matching amount on the next

   238 # cycle).

   239 #

   240 # At any rate, since it was clear from comparing with the 'c_part'

   241 # measure, which of the times was bogus, I just use an arbitrary error

   242 # measure to determine when to toss out the "bad" value.

   243 #

The Tor Browser / file revision

tools/page-loader/URLTimingDataSet.pm@97036ab72558

tools/page-loader/URLTimingDataSet.pm