Thu, 22 Jan 2015 13:21:57 +0100
Incorporate the changes requested in Mozilla's review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
#!/bin/bash
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Original author: Alice Nodelman
# Contributor: Darin Fisher
#
# Takes two inputs: $1 = file containing a list of web pages, one per line, of the form http://pagename
#                   $2 = output file where the list of index files is written - useful for placing a list of links into scripts
#
# Web pages are dropped into directories named after their urls.
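#
# For illustration, with a hypothetical inputfile containing
#   http://www.example.com
#   http://www.example.org
# each page is mirrored under testpages/www.example.com/ and
# testpages/www.example.org/, and the outputfile receives one line per url
# of the form <cleaned url>/<index file reported by wget>.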

if [ $# -ne 2 ]; then
  echo 'missing command line arguments'
  echo
  echo 'usage: getpages.sh inputfile outputfile'
  echo '  inputfile: file containing one url per line of the form http://url'
  echo '  outputfile: file to be created during execution, contains a list of index files, one per url'
  exit 1
fi

# generates the list of files to be cleansed (exclude image files)
# disables any existing call-outs to the live web
# provided by Darin Fisher
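# for illustration, the substitutions below turn e.g.
#   document.write(...)        into  void(...)
#   "http://example.com/x.js"  into  "httpdisabled://example.com/x.js"
# so the mirrored pages cannot call back out to the live web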
cleanse_files() {
  find "$1" -type f ! -iname \*.jpg -a ! -iname \*.gif -a ! -iname \*.png -a ! -iname \*.bak -print0 | xargs -0 perl -pi -e 's/[a-zA-Z0-9_]*.writeln/void/g;' -e 's/[a-zA-Z0-9_]*.write/void/g;' -e 's/[a-zA-Z0-9_]*.open/void/g;' -e 's/"https/"httpsdisabled/gi;' -e 's/"http/"httpdisabled/gi;' -e 's/<object/<objectdisabled/gi;' -e 's/<embed/<embeddisabled/gi;' -e 's/load/loaddisabled/g;'
}

mkdir testpages
cd testpages
for URL in $(cat "../$1"); do
  # strip the leading http:// from the url
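  # e.g. http://www.example.com -> www.example.com (hypothetical url, for illustration)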
  CLEANURL=$(echo "$URL" | sed -e 's/http:\/\/\(.*\)/\1/')
  # create a directory with the cleaned url as the name
  echo "grabbing $CLEANURL"
  mkdir "$CLEANURL"
  cd "$CLEANURL" || continue
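  # wget options, briefly: -p fetch page requisites (images, css, etc.),
  # -k convert links for local viewing, -H span hosts, -E save with .html
  # extensions, -erobots=off ignore robots.txt, --no-check-certificate skip
  # ssl certificate checks, -U set the user-agent string,
  # --restrict-file-names=windows keep file names portable,
  # -o write the wget log to outputlog.txt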
  ../../wget-1.10-css-parser/src/wget -p -k -H -E -erobots=off --no-check-certificate -U "Mozilla/5.0 (firefox)" --restrict-file-names=windows "$URL" -o outputlog.txt
  # figure out where/what the index file for this page is from the wget output log
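  # the sed below pulls the path out of a wget log line that looks roughly like
  #   13:21:57 (1.2 MB/s) - `www.example.com/index.html' saved [12345/12345]
  # (example line for illustration; the exact format depends on the wget build)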
  FILENAME=$(grep "saved" outputlog.txt | head -1 | sed -e "s/.*\`\([^']*\)'.*/\1/")
  rm outputlog.txt
  cd ..

  # do the final cleanup of any dangling urls
  # with thanks to Darin Fisher for the code
  cleanse_files "$CLEANURL"

  # add the index file link to the list of index files being generated
  echo "$CLEANURL/$FILENAME" >> "$2"
done
cd ..