testing/tools/grabber/getpages.sh

author       Michael Schloh von Bennewitz <michael@schloh.com>
date         Thu, 22 Jan 2015 13:21:57 +0100
branch       TOR_BUG_9701
changeset    15:b8a032363ba2
permissions  -rwxr-xr-x
Incorporate changes requested in Mozilla's review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

#! /bin/bash
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.


# Original author: Alice Nodelman
# Contributor: Darin Fisher
#
# Takes two inputs: $1 = file containing a list of web pages, one per line, of the form http://pagename
#                   $2 = output file where the list of index files is written (useful for placing lists of links into scripts)
#
# Web pages are saved into directories named after their URLs.

if [ $# -ne 2 ]; then
    echo 'missing command line arguments'
    echo
    echo 'usage: getpages.sh inputfile outputfile'
    echo '  inputfile: file containing one URL per line, of the form http://url'
    echo '  outputfile: file created during execution, containing a list of index files, one per URL'
    exit 1
fi
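
# Example invocation (hypothetical file names):
#   ./getpages.sh urls.txt pages.txt
# where urls.txt holds one URL per line (e.g. http://example.com) and
# pages.txt receives the path of each page's index file after the run.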

# Generates the list of files to be cleansed (excluding image files) and
# disables any existing call-outs to the live web.
# Provided by Darin Fisher.
cleanse_files() {
    find "$1" -type f ! -iname \*.jpg ! -iname \*.gif ! -iname \*.png ! -name \*.bak -print0 | xargs -0 perl -pi -e 's/[a-zA-Z0-9_]*\.writeln/void/g;' -e 's/[a-zA-Z0-9_]*\.write/void/g;' -e 's/[a-zA-Z0-9_]*\.open/void/g;' -e 's/"https/"httpsdisabled/gi;' -e 's/"http/"httpdisabled/gi;' -e 's/<object/<objectdisabled/gi;' -e 's/<embed/<embeddisabled/gi;' -e 's/load/loaddisabled/g;'
}
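
# For illustration (hypothetical markup, not from the original sources),
# the substitutions above rewrite
#   document.write('<img src="http://ads.example/a.gif">')
# into
#   void('<img src="httpdisabled://ads.example/a.gif">')
# so that replayed pages cannot call out to the live web.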

mkdir -p testpages
cd testpages || exit 1
for URL in $(cat "../$1"); do
    # strip the leading http:// from the URL (http://example.com -> example.com)
    CLEANURL=$(echo "$URL" | sed -e 's/http:\/\/\(.*\)/\1/')
    # create a directory named after the cleaned URL and fetch the page into it
    echo "grabbing $CLEANURL"
    mkdir -p "$CLEANURL"
    cd "$CLEANURL" || continue
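    # wget options: -p fetch page requisites (images, CSS, scripts),
    # -k convert links for local viewing, -H span to foreign hosts for
    # requisites, -E save HTML documents with an .html extension,
    # -erobots=off ignore robots.txt, -U set the User-Agent string,
    # --restrict-file-names=windows escape characters not allowed in
    # Windows file names, -o log messages to outputlog.txt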
    ../../wget-1.10-css-parser/src/wget -p -k -H -E -erobots=off --no-check-certificate -U "Mozilla/5.0 (firefox)" --restrict-file-names=windows "$URL" -o outputlog.txt
    # determine the index file of this page from the wget output log,
    # which quotes each saved file name in the form `filename'
    FILENAME=$(grep "saved" outputlog.txt | head -1 | sed -e "s/.*\`\(.*\)\'.*/\1/")
    rm outputlog.txt
    cd ..

    # do the final cleanup of any dangling URLs
    # with thanks to Darin Fisher for the code
    cleanse_files "$CLEANURL"

    # append the index file to the list being generated; note that the list
    # is written inside testpages/, because the working directory changed above
    echo "$CLEANURL/$FILENAME" >> "$2"
done
cd ..
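
# After a run, the output file holds one entry per URL, for example
# (illustrative file name): example.com/index.html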