michael@0: # 
michael@0: # This Source Code Form is subject to the terms of the Mozilla Public
michael@0: # License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0: # file, You can obtain one at http://mozilla.org/MPL/2.0/.
michael@0: package URLTimingDataSet;
michael@0: use DBI;
michael@0: use PageData;       # list of test pages, etc.
michael@0: use strict;
michael@0: 
michael@0: sub new {
michael@0:     my $proto = shift;
michael@0:     my $class = ref($proto) || $proto;
michael@0:     my $self  = {
michael@0:         dataset   => [],
michael@0:         results   => [],
michael@0:         sorted    => [],
michael@0:         average   => undef,
michael@0:         avgmedian => undef,    # note: average of individual medians
michael@0:         maximum   => undef,
michael@0:         minimum   => undef,
michael@0:     };
michael@0:     $self->{id}    = shift || die "No id supplied"; 
michael@0:     $self->{table} = shift || "t" . $self->{id};
michael@0:     $self->{pages} = PageData->new;
michael@0:     bless ($self, $class);
michael@0:     $self->_grok();
michael@0:     return $self;
michael@0: }
michael@0: 
michael@0: 
michael@0: sub _grok {    
michael@0:     my $self = shift;
michael@0:     my ($res);
michael@0: 
michael@0:     # select the dataset from the db
michael@0:     $self->_select();
michael@0: 
michael@0:     for (my $i=0; $i < $self->{pages}->length; $i++) {
michael@0:         my $name = $self->{pages}->name($i);
michael@0:         my $count = 0;
michael@0:         my @times = (); 
michael@0:         my $nan = 0;
michael@0:         foreach my $ref (@{$self->{dataset}}) {
michael@0:             next if ($name ne $ref->{content});
michael@0:             $count++;
michael@0:             if ($ref->{c_part} eq "NaN") { 
michael@0:                 # we bailed out of this page load
michael@0:                 $res = "NaN";
michael@0:                 $nan = 1;
michael@0:             }
michael@0:             else {
michael@0:                 my $s_intvl = $ref->{s_intvl};
michael@0:                 my $c_intvl = $ref->{c_intvl};
michael@0:                 my $errval = abs($s_intvl-$c_intvl)/(($s_intvl+$c_intvl)/2);
michael@0:                 if ($errval > 0.08) {       # one of them went wrong and stalled out (see [1] below)
michael@0:                     $res = ($s_intvl <= $c_intvl) ? $s_intvl : $c_intvl;
michael@0:                 } else {
michael@0:                     $res = int(($s_intvl + $c_intvl)/2);
michael@0:                 }
michael@0:             }
michael@0:             push @times, $res;
michael@0:         }
michael@0: 
michael@0:         my $avg = int(_avg(@times));
michael@0:         my $med = _med(@times);
michael@0:         my $max = $nan ? "NaN" : _max(@times);
michael@0:         my $min = _min(@times);
michael@0:         push @{$self->{results}}, [ $i, $name, $count, $avg, $med, $max, $min, @times ];
michael@0:     }
michael@0:     
michael@0:     $self->_get_summary();
michael@0:     $self->_sort_result_set();
michael@0:     
michael@0: }
michael@0: 
michael@0: sub _select {
michael@0:     my $self = shift;
michael@0: 
michael@0:     my $dbh = DBI->connect("DBI:CSV:f_dir=./db", {RaiseError => 1, AutoCommit => 1})
michael@0:          or die "Cannot connect: " . $DBI::errstr;
michael@0: 
michael@0:     my $sql = qq{
michael@0:         SELECT INDEX, S_INTVL, C_INTVL, C_PART, CONTENT, ID 
michael@0:              FROM $self->{table}
michael@0:                   WHERE ID = '$self->{id}'
michael@0:                 };
michael@0:  
michael@0:     my $sth = $dbh->prepare($sql);
michael@0:     $sth->execute();
michael@0: 
michael@0:     while (my @data = $sth->fetchrow_array()) {
michael@0:         push @{$self->{dataset}}, 
michael@0:         {index   => $data[0],
michael@0:          s_intvl => $data[1],
michael@0:          c_intvl => $data[2],
michael@0:          c_part  => $data[3],
michael@0:          content => $data[4],
michael@0:          id      => $data[5]
michael@0:           };
michael@0:     }
michael@0:     $sth->finish();
michael@0:     $dbh->disconnect();
michael@0: }
michael@0: 
michael@0: sub _get_summary {
michael@0:     my $self = shift;
michael@0:     my (@avg, @med, @max, @min);
michael@0: 
michael@0:     # how many pages were loaded in total ('sampled')
michael@0:     $self->{samples} = scalar(@{$self->{dataset}}); 
michael@0: 
michael@0:     # how many cycles (should I get this from test parameters instead?)
michael@0:     $self->{count} = int(_avg( map($_->[2],  @{$self->{results}}) ));
michael@0:     #warn $self->{count};
michael@0: 
michael@0:     # calculate overall average, average median, maximum, minimum, (RMS Error?)
michael@0:     for (@{$self->{results}}) {
michael@0:         push @avg, $_->[3];
michael@0:         push @med, $_->[4];
michael@0:         push @max, $_->[5];
michael@0:         push @min, $_->[6];
michael@0:     }
michael@0:     $self->{average}   = int(_avg(@avg));
michael@0:     $self->{avgmedian} = int(_avg(@med));     # note: averaging individual medians
michael@0:     $self->{maximum}   = _max(@max);
michael@0:     $self->{minimum}   = _min(@min);
michael@0: }
michael@0: 
michael@0: sub _sort_result_set {
michael@0:     my $self = shift;
michael@0:     # sort by median load time
michael@0:     # @{$self->{sorted}} = sort {$a->[4] <=> $b->[4]} @{$self->{results}};
michael@0:     # might be "NaN", but this is lame of me to be carrying around a string instead of undef
michael@0:     @{$self->{sorted}} = 
michael@0:          sort {
michael@0:              if ($a->[4] eq "NaN" || $b->[4] eq "NaN") {
michael@0:                  return $a->[4] cmp $b->[4]; 
michael@0:              } else {
michael@0:                  return $a->[4] <=> $b->[4];
michael@0:              }
michael@0:          } @{$self->{results}};
michael@0: }
michael@0: 
michael@0: sub as_string {
michael@0:     my $self = shift;
michael@0:     return $self->_as_string();
michael@0: }
michael@0: 
michael@0: sub as_string_sorted {
michael@0:     my $self = shift;
michael@0:     return $self->_as_string(@{$self->{sorted}});
michael@0: }
michael@0: 
michael@0: 
michael@0: sub _as_string {
michael@0:     my $self = shift;
michael@0:     my @ary = @_ ? @_ : @{$self->{results}};
michael@0:     my $str;
michael@0:     for (@ary) {
michael@0:         my ($index, $path, $count, $avg, $med, $max, $min, @times) = @$_;
michael@0:         $str .= sprintf "%3s %-26s\t", $index, $path;
michael@0:         if ($count > 0) {
michael@0:             $str .= sprintf "%6s %6s %6s %6s ", $avg, $med, $max, $min;
michael@0:             foreach my $time (@times) {
michael@0:                 $str .= sprintf "%6s ", $time;
michael@0:             }
michael@0:         }
michael@0:         $str .= "\n";
michael@0:     }
michael@0:     return $str;
michael@0: }
michael@0:     
michael@0: #
michael@0: # package internal helper functions
michael@0: #
michael@0: sub _num {
michael@0:     my @array = ();
michael@0:     for (@_) { push @array, $_ if /^[+-]?\d+\.?\d*$/o; }
michael@0:     return @array;
michael@0: }
michael@0: 
michael@0: sub _avg {
michael@0:     my @array = _num(@_);
michael@0:     return "NaN" unless scalar(@array);
michael@0:     my $sum = 0;
michael@0:     for (@array) { $sum += $_; }
michael@0:     return $sum/scalar(@array);
michael@0: }
michael@0: 
michael@0: sub _max {
michael@0:     my @array = _num(@_);
michael@0:     return "NaN" unless scalar(@array);
michael@0:     my $max = $array[0];
michael@0:     for (@array) { $max = ($max > $_) ? $max : $_; }
michael@0:     return $max;
michael@0: }
michael@0: 
michael@0: sub _min {
michael@0:     my @array = _num(@_);
michael@0:     return "NaN" unless scalar(@array);
michael@0:     my $min = $array[0];
michael@0:     for (@array) { $min = ($min < $_) ? $min : $_; }
michael@0:     return $min;
michael@0: }
michael@0: 
michael@0: # returns the floor(N/2) element of a sorted ascending array
michael@0: sub _med {
michael@0:     my @array = _num(@_);
michael@0:     return "NaN" unless scalar(@array);
michael@0:     my $index = int((scalar(@array)-1)/2);
michael@0:     @array = sort {$a <=> $b} @array;
michael@0:     return $array[$index];
michael@0: }
michael@0: 
michael@0: 1; # return true
michael@0: 
michael@0: ################################################################################
michael@0: #
michael@0: # [1] in looking at the test results, in almost all cases, the
michael@0: # round-trip time measured by the server logic and the client logic
michael@0: # would be almost the same value (which is what one would
michael@0: # expect). However, on occasion, one of the them would be "out of
michael@0: # whack", and inconsistent with the additional "layout" measure by the
michael@0: # client.
michael@0: #
michael@0: #    i.e., a set of numbers like these:
michael@0: #      c_part     c_intvl      s_intvl
michael@0: #      800      1003        997
michael@0: #      804      1007        1005
michael@0: #      801      1001        1325             <--
michael@0: #      803      1318        998              <--
michael@0: #      799      1002        1007
michael@0: #      ...
michael@0: #
michael@0: # which looks like the server side would stall in doing the accept or
michael@0: # in running the mod-perl handler (possibly a GC?). (The following
michael@0: # c_intvl would then be out of whack by a matching amount on the next
michael@0: # cycle).
michael@0: #
michael@0: # At any rate, since it was clear from comparing with the 'c_part'
michael@0: # measure, which of the times was bogus, I just use an arbitrary error
michael@0: # measure to determine when to toss out the "bad" value.
michael@0: #