--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/testing/tools/grabber/getpages.sh	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,56 @@
+#!/bin/bash
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+# original author: Alice Nodelman
+# contributor: Darin Fisher
+#
+# takes two inputs, $1 = file containing a list of web pages of the form http://pagename
+#                   $2 = output file where the list of index files is dumped - useful for placing the list of links into scripts
+#
+# web pages are dropped in directories named for their urls
+
+if [ $# != 2 ]; then
+  echo 'missing command line arguments'
+  echo
+  echo 'usage: getpages.sh inputfile outputfile'
+  echo '   inputfile: file containing one url per line of the form http://url'
+  echo '   outputfile: file to be created during execution, contains a list of index files one per url'
+  exit 1
+fi
+
+# generates the list of files to be cleansed (excluding image and backup files)
+# and disables any existing call-outs to the live web
+# provided by Darin Fisher
cleanse_files() {
+  # -print0 must come after the name tests; placed before them it would print
+  # every file and the image/backup exclusions would never apply
+  find "$1" -type f ! -iname \*.jpg -a ! -iname \*.gif -a ! -iname \*.png -a ! -name \*.bak -print0 | xargs -0 perl -pi -e 's/[a-zA-Z0-9_]*\.writeln/void/g;' -e 's/[a-zA-Z0-9_]*\.write/void/g;' -e 's/[a-zA-Z0-9_]*\.open/void/g;' -e 's/"https/"httpsdisabled/gi;' -e 's/"http/"httpdisabled/gi;' -e 's/<object/<objectdisabled/gi;' -e 's/<embed/<embeddisabled/gi;' -e 's/load/loaddisabled/g;'
+}
+
+mkdir -p testpages
+cd testpages || exit 1
+for URL in $(cat ../"$1"); do
+  # strip the leading http:// from the url
+  CLEANURL=$(echo "$URL" | sed -e 's/http:\/\/\(.*\)/\1/')
+  echo "grabbing $CLEANURL"
+  # create a directory with the cleaned url as the name
+  mkdir -p "$CLEANURL"
+  cd "$CLEANURL" || continue
+  ../../wget-1.10-css-parser/src/wget -p -k -H -E -erobots=off --no-check-certificate -U "Mozilla/5.0 (firefox)" --restrict-file-names=windows "$URL" -o outputlog.txt
+  # figure out where/what the index file for this page is from the wget output log
+  FILENAME=$(grep "saved" outputlog.txt | head -1 | sed -e "s/.*\`\(.*\)\'.*/\1/")
+  rm outputlog.txt
+  cd ..
+
+  # do the final cleanup of any dangling urls
+  # with thanks to Darin Fisher for the code
+  cleanse_files "$CLEANURL"
+
+  # add the index file link to the list of index files being generated
+  # (written next to the input file, in the directory the script was run from)
+  echo "$CLEANURL/$FILENAME" >> ../"$2"
+done
+cd ..
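
A minimal usage sketch, with hypothetical file names (urls.txt, indexfiles.txt). It assumes the script is run from a directory containing the patched wget checkout, since the script invokes ../../wget-1.10-css-parser/src/wget from inside testpages/<url>/:

    $ cat urls.txt
    http://www.example.com
    $ ./getpages.sh urls.txt indexfiles.txt
    grabbing www.example.com

Each page is mirrored under testpages/www.example.com/ with its live call-outs disabled by cleanse_files, and indexfiles.txt receives one <url>/<index file> line per url, ready to be fed into other scripts.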