openpkg/mirror.pl

changeset 428
f880f219c566
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/openpkg/mirror.pl	Tue Jul 31 12:23:42 2012 +0200
     1.3 @@ -0,0 +1,463 @@
     1.4 +##
     1.5 +##  mirror.pl -- OpenPKG Package Mirroring
     1.6 +##  Copyright (c) 2011-2012 OpenPKG GmbH <http://openpkg.com/>
     1.7 +##
     1.8 +##  This software is property of the OpenPKG GmbH, DE MUC HRB 160208.
     1.9 +##  All rights reserved. Licenses which grant limited permission to use,
    1.10 +##  copy, modify and distribute this software are available from the
    1.11 +##  OpenPKG GmbH.
    1.12 +##
    1.13 +##  THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED
    1.14 +##  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
    1.15 +##  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
    1.16 +##  IN NO EVENT SHALL THE AUTHORS AND COPYRIGHT HOLDERS AND THEIR
    1.17 +##  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    1.18 +##  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    1.19 +##  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
    1.20 +##  USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
    1.21 +##  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
    1.22 +##  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
    1.23 +##  OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
    1.24 +##  SUCH DAMAGE.
    1.25 +##
    1.26 +
    1.27 +require 5.003;
    1.28 +
    1.29 +#   OpenPKG instance prefix and tools
    1.30 +my $my_prefix = $ENV{'OPENPKG_PREFIX'};
    1.31 +my $my_rpm = "$my_prefix/bin/openpkg rpm";
    1.32 +my $my_openssl = "$my_prefix/lib/openpkg/openssl";
    1.33 +delete $ENV{'OPENPKG_PREFIX'};
    1.34 +
    1.35 +#   program identification
    1.36 +my $progname  = "search";
    1.37 +my $progvers  = "0.1.0";
    1.38 +
    1.39 +#   determine reasonable temporary directory
    1.40 +my $tmpdir = ($ENV{"TMPDIR"} || "/tmp");
    1.41 +
    1.42 +#   use unbuffered stdout
    1.43 +$|++;
    1.44 +
    1.45 +##
    1.46 +##  COMMAND-LINE INTERFACE HANDLING
    1.47 +##
    1.48 +
    1.49 +#   home-brewn getopt(3) style option parser
    1.50 +sub getopts ($$@) {
    1.51 +    my ($opt_spec, $opts, @argv_orig) = @_;
    1.52 +    my (%optf) = map { m/(\w)/; $1 => $_ } $opt_spec =~ m/(\w:|\w)/g;
    1.53 +    my (@argv, $optarg);
    1.54 +
    1.55 +    foreach (@argv_orig) {
    1.56 +        if (@argv) {
    1.57 +            push @argv, $_;
    1.58 +        } elsif (defined $optarg) {
    1.59 +            if (exists $opts->{$optarg}) {
    1.60 +                $opts->{$optarg} .= " $_";
    1.61 +            } else {
    1.62 +                $opts->{$optarg} = $_;
    1.63 +            }
    1.64 +            $optarg = undef;
    1.65 +        } elsif (!/^[-]/) {
    1.66 +            push @argv, $_;
    1.67 +        } else {
    1.68 +            while (/^\-(\w)(.*)/) {
    1.69 +                if (exists $optf{$1}) {
    1.70 +                    if (length($optf{$1}) > 1) {
    1.71 +                        if ($2 ne '') {
    1.72 +                            if (exists $opts->{$1}) {
    1.73 +                                $opts->{$1} .= " $2";
    1.74 +                            } else {
    1.75 +                                $opts->{$1} = $2;
    1.76 +                            }
    1.77 +                        } else {
    1.78 +                            $optarg = $1;
    1.79 +                        }
    1.80 +                        last;
    1.81 +                    } else {
    1.82 +                        $opts->{$1} = 1;
    1.83 +                    }
    1.84 +                } else {
    1.85 +                    warn "openpkg:$prog_name:WARNING: unknown option $_\n";
    1.86 +                }
    1.87 +                $_ = "-$2";
    1.88 +            }
    1.89 +        }
    1.90 +    }
    1.91 +    if (defined $optarg) {
    1.92 +        warn "openpkg:$prog_name:WARNING: option $optarg requires an argument\n";
    1.93 +    }
    1.94 +    foreach my $opt (keys %optf) {
    1.95 +        if (not exists $opts->{$opt}) {
    1.96 +            $opts->{$opt} = (length($optf{$opt}) > 1 ? "" : 0);
    1.97 +        }
    1.98 +    }
    1.99 +    return @argv;
   1.100 +}
   1.101 +
   1.102 +#   parse command line options
   1.103 +my $opts = {};
   1.104 +@ARGV = getopts("hfqr:p:C:", $opts, @ARGV);
   1.105 +
   1.106 +#   usage sanity check and usage help
   1.107 +sub usage {
   1.108 +    my ($rc) = @_;
   1.109 +    my $usage = "openpkg:$prog_name:USAGE: openpkg mirror [-h] [-f] [-C <cache-file>] [-r <remote-url>] [-p <remote-url-prefix>] <local-prefix-dir>\n";
   1.110 +    if ($rc == 0) {
   1.111 +        print STDOUT $usage;
   1.112 +    }
   1.113 +    else {
   1.114 +        print STDERR $usage;
   1.115 +    }
   1.116 +    exit($rc);
   1.117 +}
   1.118 +if ($opts->{"h"}) {
   1.119 +    usage(0);
   1.120 +}
   1.121 +if (@ARGV != 1) {
   1.122 +    usage(1);
   1.123 +}
   1.124 +
   1.125 +#   determine arguments
   1.126 +my ($localprefixdir) = @ARGV;
   1.127 +if (not -d $localprefixdir) {
   1.128 +    print STDERR "openpkg:$prog_name:ERROR: local directory already has to exist: $localprefixdir\n";
   1.129 +    exit(1);
   1.130 +}
   1.131 +
   1.132 +#   determine start URL
   1.133 +my $url = $opts->{"r"} || "";
   1.134 +if ($url eq "") {
   1.135 +    $url = `$my_prefix/bin/openpkg release --fmt='%u' 2>/dev/null`;
   1.136 +    $url =~ s/^\s+//s;
   1.137 +    $url =~ s/\s+$//s;
   1.138 +}
   1.139 +if ($url eq "") {
   1.140 +    print STDERR "openpkg:$prog_name:ERROR: no repository URL known\n";
   1.141 +    exit(1);
   1.142 +}
   1.143 +
   1.144 +#   determine URL prefix
   1.145 +my $remoteprefixurl = $opts->{"p"};
   1.146 +if (not defined $remoteprefixurl) {
   1.147 +    $remoteprefixurl = $url;
   1.148 +    $remoteprefixurl =~ s/\/([^\/]+)$//s;
   1.149 +}
   1.150 +
   1.151 +##
   1.152 +##  HELPER FUNCTIONS
   1.153 +##
   1.154 +
   1.155 +#   determine size and mtime of a file
   1.156 +sub sizetime ($) {
   1.157 +    my ($file) = @_;
   1.158 +    my @stat = stat($file);
   1.159 +    return ($stat[7], $stat[9]);
   1.160 +}
   1.161 +
   1.162 +#   determine MD5 of a file
   1.163 +sub digest ($) {
   1.164 +    my ($file) = @_;
   1.165 +    my $digest = "";
   1.166 +    my $output = `($my_openssl md5 -hex "$file") 2>/dev/null || true`;
   1.167 +    if ($output =~ m/MD5\(.+?\)=\s*([0-9a-fA-F]+)/s) {
   1.168 +        $digest = uc($1);
   1.169 +    }
   1.170 +    return $digest;
   1.171 +}
   1.172 +
   1.173 +#   print a byte size in human-readable form
   1.174 +sub humansize ($) {
   1.175 +    my ($bytes) = @_;
   1.176 +    my $size = "";
   1.177 +    if    ($bytes > 1024*1024*1024) { $size = sprintf("%.1fGB", $bytes / (1024*1024*1024)); }
   1.178 +    elsif ($bytes > 1024*1024)      { $size = sprintf("%.1fMB", $bytes / (1024*1024)     ); }
   1.179 +    elsif ($bytes > 1024)           { $size = sprintf("%.1fKB", $bytes / (1024)          ); }
   1.180 +    else                            { $size = sprintf("%dB",    $bytes                   ); }
   1.181 +    return $size;
   1.182 +}
   1.183 +
   1.184 +#   caching support: load cache
   1.185 +sub cache_load () {
   1.186 +    my $cache = {};
   1.187 +    if ($opts->{"C"}) {
   1.188 +        if (-f $opts->{"C"}) {
   1.189 +            #   load cache file
   1.190 +            open(FP, "<" . $opts->{"C"})
   1.191 +                or die "unable to load cache file: " . $opts->{"C"};
   1.192 +            my $txt = ""; $txt .= $_ while (<FP>);
   1.193 +            close(FP);
   1.194 +
   1.195 +            #   parse cache file
   1.196 +            $txt =~ s/^\s*(\d+)\s+(\d+)\s+([\da-fA-F]+)\s+(.+)$/
   1.197 +                $cache->{$4} = { size => $1, mtime => $2, md5 => $3 }, ''
   1.198 +            /mge;
   1.199 +        }
   1.200 +    }
   1.201 +    return $cache;
   1.202 +}
   1.203 +
   1.204 +#   caching support: save cache
   1.205 +sub cache_save ($) {
   1.206 +    my ($cache) = @_;
   1.207 +    if ($opts->{"C"}) {
   1.208 +        #   generate cache file
   1.209 +        my $txt = "";
   1.210 +        foreach my $url (sort { $a cmp $b } keys %{$cache}) {
   1.211 +            $txt .= sprintf("%10d %10d %s %s\n",
   1.212 +                $cache->{$url}->{"size"},
   1.213 +                $cache->{$url}->{"mtime"},
   1.214 +                $cache->{$url}->{"md5"},
   1.215 +                $url
   1.216 +            );
   1.217 +        }
   1.218 +
   1.219 +        #   write cache file
   1.220 +        open(FP, ">" . $opts->{"C"})
   1.221 +            or die "unable to save cache file: " . $opts->{"C"};
   1.222 +        print FP $txt;
   1.223 +        close(FP);
   1.224 +    }
   1.225 +    return;
   1.226 +}
   1.227 +
   1.228 +##
   1.229 +##   DETERMINE LOCAL AND REMOTE FILES
   1.230 +##
   1.231 +
   1.232 +#   optionally load the cache
   1.233 +my $cache = cache_load();
   1.234 +
   1.235 +#   determine local files
   1.236 +print "++ determining local files\n";
   1.237 +my @local_files = map {
   1.238 +    s/\r?\n$//s;
   1.239 +    s/\/$//s;
   1.240 +    $_;
   1.241 +} `find $localprefixdir -depth -print`;
   1.242 +print "-- local: " . @local_files . " files\n";
   1.243 +
   1.244 +#   determine remote files
   1.245 +print "++ determining remote files\n";
   1.246 +sub relurl ($$) {
   1.247 +    my ($url, $suburl) = @_;
   1.248 +    if ($suburl =~ m/^\w+:\/\//) {
   1.249 +        $url = $suburl;
   1.250 +    }
   1.251 +    elsif ($suburl =~ m/^\//) {
   1.252 +        $url = "file://$suburl";
   1.253 +    }
   1.254 +    else {
   1.255 +        $url =~ s/(\/)?\/*[^\/]*$/$1$suburl/;
   1.256 +    }
   1.257 +    1 while ($url =~ s/\/\.\//\//s);
   1.258 +    1 while ($url =~ s/\/[^\/]+\/\.\.\//\//s);
   1.259 +    return $url;
   1.260 +}
   1.261 +sub parse_attr ($) {
   1.262 +    my ($xml) = @_;
   1.263 +    my $info = { href => "", size => -1, mtime => -1, md5 => "-" };
   1.264 +    $info->{"href"}  = $1 if ($xml =~ m/\bhref="([^"]*)"/s);
   1.265 +    $info->{"size"}  = $1 if ($xml =~ m/\bsize="([^"]*)"/s);
   1.266 +    $info->{"mtime"} = $1 if ($xml =~ m/\bmtime="([^"]*)"/s);
   1.267 +    $info->{"md5"}   = $1 if ($xml =~ m/\bmd5="([^"]*)"/s);
   1.268 +    return $info;
   1.269 +}
   1.270 +sub get_urls ($) {
   1.271 +    my ($info) = @_;
   1.272 +
   1.273 +    #   determine URL
   1.274 +    my $url = $info->{"href"};
   1.275 +    if ($url =~ m/^\//) {
   1.276 +        $url = "file://$url";
   1.277 +    }
   1.278 +    if ($url =~ m/\/$/) {
   1.279 +        $url .= "00INDEX.rdf";
   1.280 +    }
   1.281 +    $info->{"href"} = $url;
   1.282 +
   1.283 +    #   fetch URL content
   1.284 +    print "-- fetching XML/RDF index: $url\n";
   1.285 +    my $tmpfile = "$tmpdir/openpkg-mirror.tmp";
   1.286 +    unlink($tmpfile);
   1.287 +    my $cmd = "$my_prefix/bin/openpkg curl -k -s -R -o \"$tmpfile\" \"$url\"";
   1.288 +    system($cmd);
   1.289 +    my $xml;
   1.290 +    if ($url =~ m/\.bz2$/) {
   1.291 +        $xml = `$my_prefix/lib/openpkg/bzip2 -d -c <"$tmpfile"`;
   1.292 +    }
   1.293 +    else {
   1.294 +        $xml = `cat "$tmpfile"`;
   1.295 +    }
   1.296 +
   1.297 +    #   provide information about this URL
   1.298 +    my @urls = ();
   1.299 +    if ($info->{"size"} == -1 or $info->{"mtime"} == -1) {
   1.300 +        my ($S, $T) = sizetime($tmpfile);
   1.301 +        $info->{"size"} = $S;
   1.302 +        $info->{"mtime"} = $T;
   1.303 +    }
   1.304 +    if ($info->{"md5"} eq "-") {
   1.305 +        $info->{"md5"} = digest($tmpfile);
   1.306 +    }
   1.307 +    push(@urls, $info);
   1.308 +    unlink($tmpfile);
   1.309 +
   1.310 +    #   parse references
   1.311 +    my @ref1 = ();
   1.312 +    my @ref2 = ();
   1.313 +    while ($xml =~ m/<Repository\s+([^>]*href=[^>]*)>/gs) {
   1.314 +        push(@ref1, $1);
   1.315 +    }
   1.316 +    while ($xml =~ m/<rdf:Description\s+([^>]*href=[^>]*)>/gs) {
   1.317 +        push(@ref2, $1);
   1.318 +    }
   1.319 +
   1.320 +    #   provide information about referenced leave URLs
   1.321 +    foreach my $ref2 (@ref2) {
   1.322 +        my $i = parse_attr($ref2);
   1.323 +        $i->{"href"} = relurl($url, $i->{"href"});
   1.324 +        push(@urls, $i);
   1.325 +    }
   1.326 +
   1.327 +    #   provide information about referenced index URLs
   1.328 +    foreach my $ref1 (@ref1) {
   1.329 +        my $i = parse_attr($ref1);
   1.330 +        $i->{"href"} = relurl($url, $i->{"href"});
   1.331 +        push(@urls, get_urls($i)); # RECURSION
   1.332 +    }
   1.333 +
   1.334 +    return @urls;
   1.335 +}
   1.336 +my @remote_urls = get_urls({ href => $url, size => -1, mtime => -1, md5 => "-" });
   1.337 +my $size = 0;
   1.338 +foreach my $url (@remote_urls) {
   1.339 +    $size += int($url->{"size"}) if ($url->{"size"} > 0);
   1.340 +}
   1.341 +print "-- remote: " . @remote_urls . " files (" . humansize($size) . ")\n";
   1.342 +
   1.343 +##
   1.344 +##   DOWNLOAD NEW OR UPDATED REMOTE FILES
   1.345 +##
   1.346 +
   1.347 +#   fetching remote files
   1.348 +print "++ fetching new/updated files\n";
   1.349 +my $local_files_keep = {};
   1.350 +sub keep ($) {
   1.351 +    my ($path) = @_;
   1.352 +    #   keep path and all its parent paths
   1.353 +    while ($path ne "") {
   1.354 +        $local_files_keep->{$path} = 1;
   1.355 +        $path =~ s/(?:^\/$|\/?[^\/]+$)//s;
   1.356 +    }
   1.357 +}
   1.358 +sub mkdir_for_file ($$) {
   1.359 +    my ($prefix, $path) = @_;
   1.360 +    $path =~ s/(?:^\/$|\/?[^\/]+$)//s;
   1.361 +    my @dirs = split("/", $path);
   1.362 +    $path = $prefix;
   1.363 +    $path =~ s/\/$//s;
   1.364 +    foreach my $dir (@dirs) {
   1.365 +        $path .= "/$dir";
   1.366 +        if (not -d $path) {
   1.367 +            print "-- creating: $path\n";
   1.368 +            system("mkdir \"$path\"");
   1.369 +        }
   1.370 +    }
   1.371 +}
   1.372 +foreach my $remote_url (@remote_urls) {
   1.373 +    my $href = $remote_url->{"href"};
   1.374 +    if ($href =~ m/^\Q$remoteprefixurl\E(.+)$/) {
   1.375 +        #   derive local path
   1.376 +        my $local_path = $1;
   1.377 +        my $local_file = $localprefixdir . $local_path;
   1.378 +        keep($local_file);
   1.379 +
   1.380 +        #   determine information about local path
   1.381 +        my $download = 1;
   1.382 +        my $cache_update = 0;
   1.383 +        my ($size, $mtime, $md5);
   1.384 +        if (-f $local_file) {
   1.385 +            my $c = $cache->{$local_file};
   1.386 +            $size  = $c->{"size"}  if (defined $c and defined $c->{"size"});
   1.387 +            $mtime = $c->{"mtime"} if (defined $c and defined $c->{"mtime"});
   1.388 +            if (not defined $size or not defined $mtime) {
   1.389 +                ($size, $mtime) = sizetime($local_file);
   1.390 +                $cache_update = 1;
   1.391 +            }
   1.392 +            if (    $size  == $remote_url->{"size"}
   1.393 +                and $mtime == $remote_url->{"mtime"}) {
   1.394 +                $md5 = $c->{"md5"} if (defined $c and defined $c->{"md5"});
   1.395 +                if (not defined $md5) {
   1.396 +                    $md5 = digest($local_file);
   1.397 +                    $cache_update = 1;
   1.398 +                }
   1.399 +                if ($md5 eq $remote_url->{"md5"}) {
   1.400 +                    $download = 0;
   1.401 +                }
   1.402 +            }
   1.403 +        }
   1.404 +
   1.405 +        #   optionally download remote file to local path
   1.406 +        if ($download) {
   1.407 +            print "-- fetching: $local_path\n";
   1.408 +            mkdir_for_file($localprefixdir, $local_path);
   1.409 +            my $cmd = "$my_prefix/bin/openpkg curl";
   1.410 +            $cmd .= " -s"     if (    $opts->{"q"});
   1.411 +            $cmd .= " \"-#\"" if (not $opts->{"q"});
   1.412 +            $cmd .= " -k -R -o \"$local_file\" \"$href\"";
   1.413 +            system($cmd);
   1.414 +            ($size, $mtime) = sizetime($local_file);
   1.415 +            $md5 = digest($local_file);
   1.416 +            $cache_update = 1;
   1.417 +        }
   1.418 +
   1.419 +        #   optionally update the size/mtime/md5 cache
   1.420 +        if ($opts->{"C"} and $cache_update) {
   1.421 +            $cache->{$local_file} = { size => $size, mtime => $mtime, md5 => $md5 };
   1.422 +            cache_save($cache);
   1.423 +        }
   1.424 +    }
   1.425 +    else {
   1.426 +        print "** WARNING: URL \"$href\" not under prefix \"$remoteprefixurl\": " .
   1.427 +            "unable to derive local path -- skipping URL\n";
   1.428 +    }
   1.429 +}
   1.430 +
   1.431 +##
   1.432 +##   REMOVE OBSOLETE LOCAL FILES
   1.433 +##
   1.434 +
   1.435 +#   remove obsolete files
   1.436 +print "++ removing obsolete files\n";
   1.437 +my $lf  = @local_files;
   1.438 +my $lfk = keys %{$local_files_keep};
   1.439 +if (not $opts->{"f"} and ($lf - $lfk) > ($lf * 0.3)) {
   1.440 +    print "** WARNING: more than 30% of your existing local files would be removed (require option -f)\n";
   1.441 +}
   1.442 +else {
   1.443 +    foreach my $local_file (@local_files) {
   1.444 +        if (not exists $local_files_keep->{$local_file}) {
   1.445 +            if (-f $local_file) {
   1.446 +                print "-- removing: $local_file (file)\n";
   1.447 +                system("rm \"$local_file\" >/dev/null 2>&1 || true");
   1.448 +            }
   1.449 +            elsif (-d $local_file) {
   1.450 +                print "-- removing: $local_file (dir)\n";
   1.451 +                system("rmdir \"$local_file\" >/dev/null 2>&1 || true");
   1.452 +            }
   1.453 +        }
   1.454 +    }
   1.455 +}
   1.456 +
   1.457 +##
   1.458 +##   CLEANUP
   1.459 +##
   1.460 +
   1.461 +#   optionally flush the cache
   1.462 +cache_save($cache);
   1.463 +
   1.464 +#   exit gracefully
   1.465 +exit(0);
   1.466 +

mercurial