diff -r 000000000000 -r 6474c204b198 testing/tools/grabber/getpages.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/testing/tools/grabber/getpages.sh Wed Dec 31 06:09:35 2014 +0100 @@ -0,0 +1,54 @@ +#! /bin/bash +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +#original author: Alice Nodelman +#contributor: Darin Fisher +# +#takes two inputs, $1 = file containing list of web pages of form http://pagename +# $2 = output file where list of index files is dumped - useful for places list of links into scripts +# +# web pages are dropped in directories named for their urls + +if [ $# != 2 ]; then + echo 'missing command line arguments' + echo + echo 'usage: getpages.sh inputfile outputfile' + echo ' inputfile: file containing one url per line of the form http://url' + echo ' outputfile: file to be created during execution, contains a list of index files one per url' + exit +fi + +# generates the list of files to be cleansed (exclude image files) +# disables any existing call-outs to the live web +# provided by Darin Fisher +cleanse_files() { + find "$1" -type f -a -print0 ! -iname \*.jpg -a ! -iname \*.gif -a ! -iname \*.png -a ! -name \*.bak | xargs -0 perl -pi -e 's/[a-zA-Z0-9_]*.writeln/void/g;' -e 's/[a-zA-Z0-9_]*.write/void/g;' -e 's/[a-zA-Z0-9_]*.open/void/g;' -e 's/"https/"httpsdisabled/gi;' -e 's/"http/"httpdisabled/gi;' -e 's/> $2 +done +cd .. +