|
1 #! /bin/bash |
|
2 # This Source Code Form is subject to the terms of the Mozilla Public |
|
3 # License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4 # file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
5 |
|
6 |
|
7 #original author: Alice Nodelman |
|
8 #contributor: Darin Fisher |
|
9 # |
|
10 #takes two inputs, $1 = file containing list of web pages of form http://pagename |
|
#         $2 = output file where the list of index files is dumped - useful for placing the list of links into scripts
|
12 # |
|
13 # web pages are dropped in directories named for their urls |
|
14 |
|
# Require exactly two arguments: the input url list and the output index list.
if [ $# -ne 2 ]; then
  # Usage errors are diagnostics — send them to stderr, not stdout.
  echo 'missing command line arguments' >&2
  echo >&2
  echo 'usage: getpages.sh inputfile outputfile' >&2
  echo '  inputfile: file containing one url per line of the form http://url' >&2
  echo '  outputfile: file to be created during execution, contains a list of index files one per url' >&2
  # Exit non-zero so callers can detect the failure; a bare "exit" would
  # propagate the status of the last echo, i.e. 0 (success).
  exit 1
fi
|
23 |
|
24 # generates the list of files to be cleansed (exclude image files) |
|
25 # disables any existing call-outs to the live web |
|
26 # provided by Darin Fisher |
|
27 cleanse_files() { |
|
28 find "$1" -type f -a -print0 ! -iname \*.jpg -a ! -iname \*.gif -a ! -iname \*.png -a ! -name \*.bak | xargs -0 perl -pi -e 's/[a-zA-Z0-9_]*.writeln/void/g;' -e 's/[a-zA-Z0-9_]*.write/void/g;' -e 's/[a-zA-Z0-9_]*.open/void/g;' -e 's/"https/"httpsdisabled/gi;' -e 's/"http/"httpdisabled/gi;' -e 's/<object/<objectdisabled/gi;' -e 's/<embed/<embeddisabled/gi;' -e 's/load/loaddisabled/g;' |
|
29 } |
|
30 |
|
# Work inside a "testpages" directory; each url gets its own subdirectory
# named after the url with the leading http:// stripped.
mkdir testpages
cd testpages || exit 1
# Read one url per line (the || handles a missing trailing newline).
# NOTE: input paths are relative to the original working directory, hence
# the ../ prefix once inside testpages.
while IFS= read -r URL || [ -n "$URL" ]; do
  # Strip the leading http:// from the url.
  CLEANURL=$(echo "$URL" | sed -e 's/http:\/\/\(.*\)/\1/')
  # Create a directory with the cleaned url as the name and fetch the full
  # page (with page requisites, converted links) into it.
  echo "grabbing $CLEANURL"
  mkdir "$CLEANURL"
  # Guard the cd: without it a failed mkdir/cd would leave every following
  # command running in the wrong directory.
  cd "$CLEANURL" || exit 1
  ../../wget-1.10-css-parser/src/wget -p -k -H -E -erobots=off --no-check-certificate -U "Mozilla/5.0 (firefox)" --restrict-file-names=windows "$URL" -o outputlog.txt
  # Figure out where/what the index file for this page is from the wget
  # output log: the first "saved" line names the file in `...' quotes.
  FILENAME=$(grep "saved" outputlog.txt | head -1 | sed -e "s/.*\`\(.*\)\'.*/\1/")
  rm outputlog.txt
  cd .. || exit 1

  # Do the final cleanup of any dangling urls
  # (with thanks to Darin Fisher for the code).
  cleanse_files "$CLEANURL"

  # Add the index file link to the list of index files being generated.
  # NOTE(review): $2 is resolved relative to testpages/ (unlike $1, which is
  # prefixed with ../) — confirm this is the intended output location.
  echo "$CLEANURL/$FILENAME" >> "$2"
done < "../$1"
cd ..
|
54 |