#! /bin/sh

#================================================================
# estpdfhtml
# Strip a file of PDF and extract its text as HTML.
#================================================================


# set variables: program name, scratch-file path, positional arguments,
# and the C locale exported to every child process
progname="estpdfhtml"
tmpfile="/tmp/${progname}.$$"
infile="${1}"
outfile="${2}"
LANG=C
LC_ALL=C
export LANG LC_ALL


# show help message
# (a first argument of exactly "--help" prints the usage text and exits 0;
# anything else falls through to the rest of the script)
case "$1" in
  --help)
    printf 'Strip a file of PDF and extract its text as HTML.\n'
    printf '\n'
    printf 'Usage:\n'
    printf '  %s [infile] [outfile]\n' "$progname"
    printf '  estindex register -xsuf .pdf application/pdf %s casket\n' "$progname"
    printf '\n'
    exit 0
    ;;
esac


# function to remove the temporary file
# Reads the global $tmpfile. The script only ever writes a plain file at
# that path, so the removal is deliberately non-recursive: a directory that
# happens to occupy the name is left untouched instead of being deleted as
# a tree. -f keeps rm silent and successful when the file does not exist,
# and -- stops option parsing before the path.
tmpclean(){
  rm -f -- "$tmpfile"
}


# function to emit the result: copy standard input to standard output, or
# append it to "$outfile" when an output path was given
output(){
  case "$outfile" in
    "") cat ;;
    *)  cat >> "$outfile" ;;
  esac
}


# set the signal trap
# On HUP, INT, QUIT, PIPE or TERM remove the temporary file and stop with a
# failure status. Without the explicit exit the shell would resume the
# script after the handler returns (possibly running pdftotext on the file
# that was just deleted) and finally report success.
trap 'tmpclean ; exit 1' 1 2 3 13 15


# check the input file existence
# An empty $infile means "read from standard input" and is accepted here.
# The diagnostic goes to standard error: standard output carries the
# generated HTML when no output file is given and must stay clean.
if [ -n "$infile" ] && [ ! -f "$infile" ]
then
  printf '%s: %s: no such file\n' "$progname" "$infile" >&2
  exit 1
fi


# create the temporary file
# When no input path was given, standard input is spooled to a file and that
# file becomes the input. mktemp creates the file itself, under a name that
# cannot be guessed in advance, so a symlink or file planted at a
# predictable /tmp path can no longer redirect or capture the write.
# $tmpfile is updated so that tmpclean removes exactly the file made here.
if [ -z "$infile" ]
then
  tmpfile=$(mktemp "${TMPDIR:-/tmp}/$progname.XXXXXX") || {
    printf '%s: cannot create a temporary file\n' "$progname" >&2
    exit 1
  }
  cat > "$tmpfile"
  infile="$tmpfile"
fi


# output the result
# Stage 1: pdftotext writes the text of "$infile" to its standard output
#          (see pdftotext(1) for -raw, -htmlmeta, -enc, -eol and -q).
# Stage 2: iconv converts UTF-8 to UTF-8 with -c, which omits characters
#          that cannot be converted.
# Stage 3: awk rewrites the stream line by line (rules documented inline).
# Stage 4: output() sends the result to stdout or appends it to "$outfile".
pdftotext -raw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
iconv -f UTF-8 -t UTF-8 -c |
awk '
# Escape the three HTML metacharacters. "&" is handled first so that the
# entities produced for "<" and ">" are not escaped a second time.
function htmlesc(s) {
  gsub(/&/, "\\&amp;", s)
  gsub(/</, "\\&lt;", s)
  gsub(/>/, "\\&gt;", s)
  return s
}

# depth  : number of currently open <pre> lines
# wide   : 1 when the last thing emitted was a byte above "~" (or a <pre>
#          was just opened), 0 after a byte up to "~" or a space
# blanks : count of empty lines seen since the last non-empty line
BEGIN {
  depth = 0
  wide = 1
  blanks = 0
}

# A one-line <title>...</title> outside <pre>: emit a charset declaration,
# then the title with its tags stripped and its text escaped.
depth < 1 && /^<title>/ && /title>$/ {
  printf("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n")
  text = $0
  gsub(/<[^>]*>/, "", text)
  printf("<title>%s</title>\n", htmlesc(text))
  next
}

# <pre> opens a paragraph.
$0 == "<pre>" {
  depth++
  printf("<p>")
  wide = 1
  next
}

# </pre> closes the paragraph.
$0 == "</pre>" {
  depth--
  printf("</p>\n")
  next
}

# A line holding only a form feed becomes a horizontal rule between
# two paragraphs.
$0 == "\f" {
  printf("</p>\n<hr>\n<p>")
  next
}

# Every other line.
{
  line = $0
  if (depth > 0) {
    # inside <pre>: escape the text and trim spaces at both ends
    line = htmlesc(line)
    gsub(/^ */, "", line)
    gsub(/ *$/, "", line)
  }

  # empty lines are only counted
  if (length(line) < 1) {
    blanks++
    next
  }

  # lines that still begin with "<" are passed through on their own line
  if (line ~ /^</) {
    printf("%s\n", line)
    wide = 0
    blanks = 0
    next
  }

  # Text line: it is joined to what was emitted before. A separating space
  # is written after two or more empty lines, or when the previous output
  # ended narrow and this line starts with a byte up to "~".
  if (blanks >= 2 || (wide == 0 && substr(line, 1, 1) <= "~")) {
    printf(" ")
  }

  # Copy the line one character at a time. A space is written only when
  # the previous character was narrow; a space directly after a wide byte
  # is dropped.
  len = length(line)
  for (pos = 1; pos <= len; pos++) {
    ch = substr(line, pos, 1)
    if (ch != " ") {
      printf("%c", ch)
      wide = (ch > "~")
    } else {
      if (wide == 0) {
        printf(" ")
      }
      wide = 0
    }
  }
  blanks = 0
}
' |
output


# remove the temporary file (tmpclean tolerates it never having been
# created), then exit normally
tmpclean
exit 0



# END OF FILE
