testing/tools/grabber/getpages.sh

changeset 0:6474c204b198
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/testing/tools/grabber/getpages.sh	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,70 @@
+#! /bin/bash
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+# original author: Alice Nodelman
+# contributor: Darin Fisher
+#
+# takes two inputs, $1 = file containing a list of web pages, one url per line, of the form http://pagename
+#                   $2 = output file listing the fetched index files, one per url - useful for placing lists of links into scripts
+#
+# web pages are dropped into directories named after their urls
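+#
+# example invocation (filenames here are illustrative, not part of the script):
+#   ./getpages.sh urls.txt indexlist.txt
+#   where urls.txt contains lines such as http://www.example.com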
+
+if [ $# -ne 2 ]; then
+	echo 'missing command line arguments'
+	echo
+	echo 'usage: getpages.sh inputfile outputfile'
+	echo '	inputfile: file containing one url per line, of the form http://url'
+	echo '	outputfile: file created during execution, listing one index file per url'
+	exit 1
+fi
+
+# generates the list of files to be cleansed (excluding image files)
+# and disables any existing call-outs to the live web
+# provided by Darin Fisher
+cleanse_files() {
+	find "$1" -type f ! -iname '*.jpg' ! -iname '*.gif' ! -iname '*.png' ! -name '*.bak' -print0 |
+		xargs -0 perl -pi -e 's/[a-zA-Z0-9_]*\.writeln/void/g; s/[a-zA-Z0-9_]*\.write/void/g; s/[a-zA-Z0-9_]*\.open/void/g; s/"https:/"httpsdisabled:/gi; s/"http:/"httpdisabled:/gi; s/<object/<objectdisabled/gi; s/<embed/<embeddisabled/gi; s/load/loaddisabled/g;'
+}
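+# illustrative effect of the substitutions above (hypothetical input):
+#   document.write("x")     ->  void("x")
+#   "http://example.com"    ->  "httpdisabled://example.com"
+#   <embed src="x">         ->  <embeddisabled src="x">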
+
+mkdir testpages
+cd testpages || exit 1
+for URL in $(cat "../$1"); do
+	# strip the leading http:// from the url
+	CLEANURL=$(echo "$URL" | sed -e 's/http:\/\/\(.*\)/\1/')
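+	# e.g. http://www.example.com -> www.example.com (illustrative url)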
+	# create a directory with the cleaned url as the name
+	echo "grabbing $CLEANURL"
+	mkdir "$CLEANURL"
+	cd "$CLEANURL" || continue
+	../../wget-1.10-css-parser/src/wget -p -k -H -E -erobots=off --no-check-certificate -U "Mozilla/5.0 (firefox)" --restrict-file-names=windows "$URL" -o outputlog.txt
+	# figure out from the wget output log where/what the index file for this page is
+	FILENAME=$(grep "saved" outputlog.txt | head -1 | sed -e "s/.*\`\([^']*\)'.*/\1/")
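+	# the sed above assumes a wget 1.10-style log line such as (illustrative):
+	#   12:00:01 (1.2 MB/s) - `www.example.com/index.html' saved [12345/12345]
+	# later wget releases quote the filename differently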
+	rm outputlog.txt
+	cd ..
+
+	# do the final cleanup of any dangling urls
+	# with thanks to Darin Fisher for the code
+	cleanse_files "$CLEANURL"
+
+	# append the index file to the list being generated; ../$2 keeps the
+	# output file beside the input file rather than inside testpages/
+	echo "$CLEANURL/$FILENAME" >> "../$2"
+done
+cd ..
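+
+# on completion, outputfile holds one relative index path per url, e.g.
+#   www.example.com/index.html (illustrative; actual names depend on wget)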
+
