#! /bin/sh

#================================================================
# crawler
# Utility to collect documents on WWW
#
# Usage: crawler url_of_entry_point
#
# Recursively downloads documents starting from the given URL
# using wget, ignoring robots.txt and skipping files that already
# exist locally.  Exits with wget's status so callers can detect
# a failed crawl.
#================================================================


# set variables (C locale for reproducible, byte-wise tool behavior)
LANG=C ; export LANG
LC_ALL=C ; export LC_ALL
progname="crawler"
#proxy="proxy.hogehoge.gov:8080"     # uncomment to route requests through a proxy
useragent="Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"
extlist="html,HTML,htm,HTM,txt,TXT"  # accepted file extensions (wget -A)
timeout="60"                         # network timeout in seconds (wget -T)
waittime="1"                         # delay between retrievals in seconds (wget -w)


# check arguments: exactly one URL is required
if [ "$#" -ne 1 ]
then
  printf '%s: usage: %s url_of_entry_point\n' "$progname" "$progname" 1>&2
  exit 1
fi
# save the entry-point URL before 'set --' overwrites the positional params
url=$1


# build wget options as positional parameters so option values that
# contain whitespace (e.g. a proxy URL) survive intact, instead of
# relying on unquoted word-splitting of a scalar string
set -- -e robots=off
if [ -n "${proxy:-}" ]
then
  set -- "$@" -e "http_proxy=$proxy" -e use_proxy=on
fi


# download documents recursively:
#   -r recursive, -np never ascend to the parent directory,
#   -nc no-clobber (skip files already downloaded),
#   -A accept only the listed extensions, -T timeout, -w wait between fetches
wget "$@" -U "$useragent" -r -np -nc -A "$extlist" -T "$timeout" -w "$waittime" "$url"
status=$?


# propagate wget's exit status so a failed crawl is visible to callers
# (previously the script always exited 0, masking download errors)
exit "$status"

# END OF FILE