diff -r 71503088f51b -r f880f219c566 openpkg/mirror.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/openpkg/mirror.pl Tue Jul 31 12:23:42 2012 +0200 @@ -0,0 +1,463 @@ +## +## mirror.pl -- OpenPKG Package Mirroring +## Copyright (c) 2011-2012 OpenPKG GmbH +## +## This software is property of the OpenPKG GmbH, DE MUC HRB 160208. +## All rights reserved. Licenses which grant limited permission to use, +## copy, modify and distribute this software are available from the +## OpenPKG GmbH. +## +## THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED +## WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +## MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +## IN NO EVENT SHALL THE AUTHORS AND COPYRIGHT HOLDERS AND THEIR +## CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +## LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +## USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +## ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +## OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +## OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +## SUCH DAMAGE. +## + +require 5.003; + +# OpenPKG instance prefix and tools +my $my_prefix = $ENV{'OPENPKG_PREFIX'}; +my $my_rpm = "$my_prefix/bin/openpkg rpm"; +my $my_openssl = "$my_prefix/lib/openpkg/openssl"; +delete $ENV{'OPENPKG_PREFIX'}; + +# program identification +my $progname = "search"; +my $progvers = "0.1.0"; + +# determine reasonable temporary directory +my $tmpdir = ($ENV{"TMPDIR"} || "/tmp"); + +# use unbuffered stdout +$|++; + +## +## COMMAND-LINE INTERFACE HANDLING +## + +# home-brewn getopt(3) style option parser +sub getopts ($$@) { + my ($opt_spec, $opts, @argv_orig) = @_; + my (%optf) = map { m/(\w)/; $1 => $_ } $opt_spec =~ m/(\w:|\w)/g; + my (@argv, $optarg); + + foreach (@argv_orig) { + if (@argv) { + push @argv, $_; + } elsif (defined $optarg) { + if (exists $opts->{$optarg}) { + $opts->{$optarg} .= " $_"; + } else { + $opts->{$optarg} = $_; + } + $optarg = undef; + } elsif (!/^[-]/) { + push @argv, $_; + } else { + while (/^\-(\w)(.*)/) { + if (exists $optf{$1}) { + if (length($optf{$1}) > 1) { + if ($2 ne '') { + if (exists $opts->{$1}) { + $opts->{$1} .= " $2"; + } else { + $opts->{$1} = $2; + } + } else { + $optarg = $1; + } + last; + } else { + $opts->{$1} = 1; + } + } else { + warn "openpkg:$prog_name:WARNING: unknown option $_\n"; + } + $_ = "-$2"; + } + } + } + if (defined $optarg) { + warn "openpkg:$prog_name:WARNING: option $optarg requires an argument\n"; + } + foreach my $opt (keys %optf) { + if (not exists $opts->{$opt}) { + $opts->{$opt} = (length($optf{$opt}) > 1 ? "" : 0); + } + } + return @argv; +} + +# parse command line options +my $opts = {}; +@ARGV = getopts("hfqr:p:C:", $opts, @ARGV); + +# usage sanity check and usage help +sub usage { + my ($rc) = @_; + my $usage = "openpkg:$prog_name:USAGE: openpkg mirror [-h] [-f] [-C ] [-r ] [-p ] \n"; + if ($rc == 0) { + print STDOUT $usage; + } + else { + print STDERR $usage; + } + exit($rc); +} +if ($opts->{"h"}) { + usage(0); +} +if (@ARGV != 1) { + usage(1); +} + +# determine arguments +my ($localprefixdir) = @ARGV; +if (not -d $localprefixdir) { + print STDERR "openpkg:$prog_name:ERROR: local directory already has to exist: $localprefixdir\n"; + exit(1); +} + +# determine start URL +my $url = $opts->{"r"} || ""; +if ($url eq "") { + $url = `$my_prefix/bin/openpkg release --fmt='%u' 2>/dev/null`; + $url =~ s/^\s+//s; + $url =~ s/\s+$//s; +} +if ($url eq "") { + print STDERR "openpkg:$prog_name:ERROR: no repository URL known\n"; + exit(1); +} + +# determine URL prefix +my $remoteprefixurl = $opts->{"p"}; +if (not defined $remoteprefixurl) { + $remoteprefixurl = $url; + $remoteprefixurl =~ s/\/([^\/]+)$//s; +} + +## +## HELPER FUNCTIONS +## + +# determine size and mtime of a file +sub sizetime ($) { + my ($file) = @_; + my @stat = stat($file); + return ($stat[7], $stat[9]); +} + +# determine MD5 of a file +sub digest ($) { + my ($file) = @_; + my $digest = ""; + my $output = `($my_openssl md5 -hex "$file") 2>/dev/null || true`; + if ($output =~ m/MD5\(.+?\)=\s*([0-9a-fA-F]+)/s) { + $digest = uc($1); + } + return $digest; +} + +# print a byte size in human-readable form +sub humansize ($) { + my ($bytes) = @_; + my $size = ""; + if ($bytes > 1024*1024*1024) { $size = sprintf("%.1fGB", $bytes / (1024*1024*1024)); } + elsif ($bytes > 1024*1024) { $size = sprintf("%.1fMB", $bytes / (1024*1024) ); } + elsif ($bytes > 1024) { $size = sprintf("%.1fKB", $bytes / (1024) ); } + else { $size = sprintf("%dB", $bytes ); } + return $size; +} + +# caching support: load cache +sub cache_load () { + my $cache = {}; + if ($opts->{"C"}) { + if (-f $opts->{"C"}) { + # load cache file + open(FP, "<" . $opts->{"C"}) + or die "unable to load cache file: " . $opts->{"C"}; + my $txt = ""; $txt .= $_ while (); + close(FP); + + # parse cache file + $txt =~ s/^\s*(\d+)\s+(\d+)\s+([\da-fA-F]+)\s+(.+)$/ + $cache->{$4} = { size => $1, mtime => $2, md5 => $3 }, '' + /mge; + } + } + return $cache; +} + +# caching support: save cache +sub cache_save ($) { + my ($cache) = @_; + if ($opts->{"C"}) { + # generate cache file + my $txt = ""; + foreach my $url (sort { $a cmp $b } keys %{$cache}) { + $txt .= sprintf("%10d %10d %s %s\n", + $cache->{$url}->{"size"}, + $cache->{$url}->{"mtime"}, + $cache->{$url}->{"md5"}, + $url + ); + } + + # write cache file + open(FP, ">" . $opts->{"C"}) + or die "unable to save cache file: " . $opts->{"C"}; + print FP $txt; + close(FP); + } + return; +} + +## +## DETERMINE LOCAL AND REMOTE FILES +## + +# optionally load the cache +my $cache = cache_load(); + +# determine local files +print "++ determining local files\n"; +my @local_files = map { + s/\r?\n$//s; + s/\/$//s; + $_; +} `find $localprefixdir -depth -print`; +print "-- local: " . @local_files . " files\n"; + +# determine remote files +print "++ determining remote files\n"; +sub relurl ($$) { + my ($url, $suburl) = @_; + if ($suburl =~ m/^\w+:\/\//) { + $url = $suburl; + } + elsif ($suburl =~ m/^\//) { + $url = "file://$suburl"; + } + else { + $url =~ s/(\/)?\/*[^\/]*$/$1$suburl/; + } + 1 while ($url =~ s/\/\.\//\//s); + 1 while ($url =~ s/\/[^\/]+\/\.\.\//\//s); + return $url; +} +sub parse_attr ($) { + my ($xml) = @_; + my $info = { href => "", size => -1, mtime => -1, md5 => "-" }; + $info->{"href"} = $1 if ($xml =~ m/\bhref="([^"]*)"/s); + $info->{"size"} = $1 if ($xml =~ m/\bsize="([^"]*)"/s); + $info->{"mtime"} = $1 if ($xml =~ m/\bmtime="([^"]*)"/s); + $info->{"md5"} = $1 if ($xml =~ m/\bmd5="([^"]*)"/s); + return $info; +} +sub get_urls ($) { + my ($info) = @_; + + # determine URL + my $url = $info->{"href"}; + if ($url =~ m/^\//) { + $url = "file://$url"; + } + if ($url =~ m/\/$/) { + $url .= "00INDEX.rdf"; + } + $info->{"href"} = $url; + + # fetch URL content + print "-- fetching XML/RDF index: $url\n"; + my $tmpfile = "$tmpdir/openpkg-mirror.tmp"; + unlink($tmpfile); + my $cmd = "$my_prefix/bin/openpkg curl -k -s -R -o \"$tmpfile\" \"$url\""; + system($cmd); + my $xml; + if ($url =~ m/\.bz2$/) { + $xml = `$my_prefix/lib/openpkg/bzip2 -d -c <"$tmpfile"`; + } + else { + $xml = `cat "$tmpfile"`; + } + + # provide information about this URL + my @urls = (); + if ($info->{"size"} == -1 or $info->{"mtime"} == -1) { + my ($S, $T) = sizetime($tmpfile); + $info->{"size"} = $S; + $info->{"mtime"} = $T; + } + if ($info->{"md5"} eq "-") { + $info->{"md5"} = digest($tmpfile); + } + push(@urls, $info); + unlink($tmpfile); + + # parse references + my @ref1 = (); + my @ref2 = (); + while ($xml =~ m/]*href=[^>]*)>/gs) { + push(@ref1, $1); + } + while ($xml =~ m/]*href=[^>]*)>/gs) { + push(@ref2, $1); + } + + # provide information about referenced leave URLs + foreach my $ref2 (@ref2) { + my $i = parse_attr($ref2); + $i->{"href"} = relurl($url, $i->{"href"}); + push(@urls, $i); + } + + # provide information about referenced index URLs + foreach my $ref1 (@ref1) { + my $i = parse_attr($ref1); + $i->{"href"} = relurl($url, $i->{"href"}); + push(@urls, get_urls($i)); # RECURSION + } + + return @urls; +} +my @remote_urls = get_urls({ href => $url, size => -1, mtime => -1, md5 => "-" }); +my $size = 0; +foreach my $url (@remote_urls) { + $size += int($url->{"size"}) if ($url->{"size"} > 0); +} +print "-- remote: " . @remote_urls . " files (" . humansize($size) . ")\n"; + +## +## DOWNLOAD NEW OR UPDATED REMOTE FILES +## + +# fetching remote files +print "++ fetching new/updated files\n"; +my $local_files_keep = {}; +sub keep ($) { + my ($path) = @_; + # keep path and all its parent paths + while ($path ne "") { + $local_files_keep->{$path} = 1; + $path =~ s/(?:^\/$|\/?[^\/]+$)//s; + } +} +sub mkdir_for_file ($$) { + my ($prefix, $path) = @_; + $path =~ s/(?:^\/$|\/?[^\/]+$)//s; + my @dirs = split("/", $path); + $path = $prefix; + $path =~ s/\/$//s; + foreach my $dir (@dirs) { + $path .= "/$dir"; + if (not -d $path) { + print "-- creating: $path\n"; + system("mkdir \"$path\""); + } + } +} +foreach my $remote_url (@remote_urls) { + my $href = $remote_url->{"href"}; + if ($href =~ m/^\Q$remoteprefixurl\E(.+)$/) { + # derive local path + my $local_path = $1; + my $local_file = $localprefixdir . $local_path; + keep($local_file); + + # determine information about local path + my $download = 1; + my $cache_update = 0; + my ($size, $mtime, $md5); + if (-f $local_file) { + my $c = $cache->{$local_file}; + $size = $c->{"size"} if (defined $c and defined $c->{"size"}); + $mtime = $c->{"mtime"} if (defined $c and defined $c->{"mtime"}); + if (not defined $size or not defined $mtime) { + ($size, $mtime) = sizetime($local_file); + $cache_update = 1; + } + if ( $size == $remote_url->{"size"} + and $mtime == $remote_url->{"mtime"}) { + $md5 = $c->{"md5"} if (defined $c and defined $c->{"md5"}); + if (not defined $md5) { + $md5 = digest($local_file); + $cache_update = 1; + } + if ($md5 eq $remote_url->{"md5"}) { + $download = 0; + } + } + } + + # optionally download remote file to local path + if ($download) { + print "-- fetching: $local_path\n"; + mkdir_for_file($localprefixdir, $local_path); + my $cmd = "$my_prefix/bin/openpkg curl"; + $cmd .= " -s" if ( $opts->{"q"}); + $cmd .= " \"-#\"" if (not $opts->{"q"}); + $cmd .= " -k -R -o \"$local_file\" \"$href\""; + system($cmd); + ($size, $mtime) = sizetime($local_file); + $md5 = digest($local_file); + $cache_update = 1; + } + + # optionally update the size/mtime/md5 cache + if ($opts->{"C"} and $cache_update) { + $cache->{$local_file} = { size => $size, mtime => $mtime, md5 => $md5 }; + cache_save($cache); + } + } + else { + print "** WARNING: URL \"$href\" not under prefix \"$remoteprefixurl\": " . + "unable to derive local path -- skipping URL\n"; + } +} + +## +## REMOVE OBSOLETE LOCAL FILES +## + +# remove obsolete files +print "++ removing obsolete files\n"; +my $lf = @local_files; +my $lfk = keys %{$local_files_keep}; +if (not $opts->{"f"} and ($lf - $lfk) > ($lf * 0.3)) { + print "** WARNING: more than 30% of your existing local files would be removed (require option -f)\n"; +} +else { + foreach my $local_file (@local_files) { + if (not exists $local_files_keep->{$local_file}) { + if (-f $local_file) { + print "-- removing: $local_file (file)\n"; + system("rm \"$local_file\" >/dev/null 2>&1 || true"); + } + elsif (-d $local_file) { + print "-- removing: $local_file (dir)\n"; + system("rmdir \"$local_file\" >/dev/null 2>&1 || true"); + } + } + } +} + +## +## CLEANUP +## + +# optionally flush the cache +cache_save($cache); + +# exit gracefully +exit(0); +