#! /bin/sh

#================================================================
# estdochtml
# Extract the text of an MS-Word file as HTML.
#================================================================


# fix the locale to "C" and define the names and paths used below
# (the temporary directory and file names are derived from the process ID;
# infile and outfile are the optional first and second arguments)
LANG=C
LC_ALL=C
export LANG LC_ALL
progname="estdochtml"
tmpdir="/tmp/${progname}.$$"
tmpfile="${tmpdir}/tmp-$$"
nulldev="/dev/null"
infile="$1"
outfile="$2"


# print the usage text and stop when the first argument is "--help"
case "$1" in
  --help)
    printf '%s\n' \
      'Strip a file of MS-Word and extract its text as HTML.' \
      '' \
      'Usage:' \
      "  $progname [infile] [outfile]" \
      "  estindex register -xsuf .doc application/msword $progname casket" \
      ''
    exit 0
    ;;
esac


# function to discard the temporary directory together with its contents
# (reads the global "tmpdir"; a missing directory is not an error)
tmpclean()
{
  rm -r -f "$tmpdir"
}


# function to deliver the result: copy standard input to standard output
# when no output file was given, otherwise append it to "$outfile"
output(){
  case "$outfile" in
    "")
      cat
      ;;
    *)
      cat >> "$outfile"
      ;;
  esac
}


# set the signal trap: on HUP, INT, QUIT, PIPE or TERM remove the temporary
# directory and terminate with a failure status
# (a handler that only cleans up would let the script resume after the
# signal, run the remaining steps without its temporary directory and
# finally report success)
trap 'tmpclean; exit 1' 1 2 3 13 15


# check the input file existence
# (an empty infile is skipped here; the diagnostic is written to standard
# error so that it cannot end up in the HTML written to standard output)
if [ -n "$infile" ] && [ ! -f "$infile" ]
then
  printf '%s: %s: no such file\n' "$progname" "$infile" 1>&2
  exit 1
fi


# create the temporary directory
# (the path is predictable, so it must be created fresh and private: without
# -p, mkdir fails if anything already exists at that path, including a
# directory or symbolic link planted by another user; mode 700 keeps the
# contents away from other users)
if ! mkdir -m 700 "$tmpdir"
then
  printf '%s: %s: cannot create the temporary directory\n' "$progname" "$tmpdir" 1>&2
  exit 1
fi


# when no input file was given, spool standard input into the temporary
# file and use that file as the input from here on
case "$infile" in
  "")
    cat > "$tmpfile"
    infile="$tmpfile"
    ;;
esac


# extract HTML from MS-Word:
#   1. run wvWare on the input with -d set to the temporary directory,
#      discarding its standard error
#   2. pass the text through iconv (UTF-8 to UTF-8 with -c) so that
#      characters that cannot be converted are dropped
#   3. delete everything from "[Author ID" to the end of the line and a "]"
#      that is preceded only by spaces at the start of a line
#   4. hand the result to output
wvWare -d "$tmpdir" "$infile" 2> "$nulldev" \
  | iconv -c -f UTF-8 -t UTF-8 \
  | sed -e 's/\[Author ID.*$//' -e 's/^ *\]//' \
  | output


# clean up the temporary directory
# (normal-completion path; removes the directory given to wvWare with -d,
# including the spooled copy of standard input when one was created in it)
tmpclean


# exit normally
# (the status is always 0 here; the result of the conversion pipeline is not
# inspected)
exit 0



# END OF FILE
