#!/bin/sh -e

##########################################################################
#   Description:
#       BLAST a Danio gene against other transcriptomes/genomes
#       
#   History:
#   Date        Name        Modification
#   2022-04-15  Jason Bacon Begin
##########################################################################

##########################################################################
#   Function description:
#       Print a one-line usage summary to standard output and terminate
#       the script with exit status 1
##########################################################################

usage()
{
    # $0 is passed as an argument instead of being embedded in the format
    # string, so a '%' or '\' in the script path is printed literally
    printf 'Usage: %s [gene-name BLAST-database-name]\n' "$0"
    exit 1
}


##########################################################################
#   Function description:
#       Show a prompt and block until one line has been read from
#       standard input.  The text that was read is thrown away.
##########################################################################

pause()
{
    local discard

    printf '%s' 'Press return to continue...'
    read discard
}


##########################################################################
#   Main
##########################################################################

# Show the usage message when the first argument is --help.  The expansion
# is quoted so an argument containing whitespace or glob characters is
# compared as a single word instead of being split before the test.
if [ "${1-}" = "--help" ]; then
    usage
fi

# Exported for child processes; presumably consumed by the auto-ask helper
# that is invoked further down -- confirm against auto-ask's documentation.
export AUTO_ASK_TAG_PREFIX=microsynteny-tools-

# Choose the directory holding the awk helper scripts unless the caller has
# already provided AWK_PATH in the environment.
if [ -z "$AWK_PATH" ]; then
    if [ "$0" = Scripts/ms-blast ]; then
	# Development: Running from repo dir
	export PATH="Scripts:$PATH"
	# Assign before exporting: the value is not subject to field
	# splitting and a failing pwd is not masked by export's status
	AWK_PATH="$(pwd)/Libexec"
	export AWK_PATH
	make
    elif [ "$0" = ./ms-blast ]; then
	# Development: Running from the Scripts subdir, so move up to the
	# repo dir first
	cd ..
	export PATH="Scripts:$PATH"
	AWK_PATH="$(pwd)/Libexec"
	export AWK_PATH
	make
    else
	# Installed
	export AWK_PATH=/usr/pkg/libexec
    fi
fi

# Obtain the gene name and the list of BLAST databases, either from the two
# command-line arguments or, when none are given, interactively.
if [ $# = 2 ]; then
    gene=$1
    # The search loop near the end of this script iterates over $dbs, so
    # the database argument must be stored there; storing it in $db left
    # the list empty and no search was run.  A quoted, space-separated
    # list of databases may be given as this one argument.
    dbs=$2
elif [ $# = 0 ]; then
    gene=$(auto-ask blast-gene-name "Gene name?" '')
    # List the local databases: every *.ndb file in BLAST-DB, cut down to
    # the part of its name before the first '.'
    cd BLAST-DB
    printf "\nLocal databases:\n\n"
    ls *.ndb | cut -d . -f -1
    cd ..
    default_db="Mus_musculus-transcriptome Rattus_norvegicus-transcriptome Homo_sapiens-transcriptome"
    printf "\nCopy and paste one or more databases separated by space.\n"
    dbs=$(auto-ask blast-dbs "\nDatabase(s)?" "$default_db")
else
    usage
fi

# Debug
# export PATH=~/Prog/Src/local/bin:$PATH

# Extract the gene's sequences once and cache them under Fasta/; later runs
# reuse whichever of the two files already exist.
mkdir -p Fasta
gene_fa="Fasta/$gene.fa"
cds="Fasta/$gene-CDS.fa"
if [ ! -e "$gene_fa" ]; then
    printf 'Generating %s...\n' "$gene_fa"
    # Remove the output on failure: a partial file would otherwise pass the
    # existence test above on the next run and be trusted as a valid cache
    if ! blt extract-seq GFF/Danio_rerio.GRCz11.105.chr.gff3 \
	Reference/Danio_rerio.GRCz11.dna.toplevel.fa.xz gene "Name=$gene;" \
	> "$gene_fa"; then
	rm -f "$gene_fa"
	printf 'Failed to generate %s.\n' "$gene_fa" >&2
	exit 1
    fi
    # The CDS file is derived from $gene_fa, so force it to be rebuilt
    rm -f "$cds"
fi
if [ ! -e "$cds" ]; then
    # Keep every line containing the word CDS plus the line that follows it.
    # grep -F replaces the obsolescent fgrep.
    # NOTE(review): grep -A prints a "--" line between non-adjacent groups
    # of matches; confirm that blastn tolerates such lines in the query.
    if ! grep -F -w -A 1 CDS "$gene_fa" > "$cds"; then
	rm -f "$cds"
	printf 'Could not extract CDS records from %s.\n' "$gene_fa" >&2
	exit 1
    fi
fi
# Report where the coding sequences were saved
cat << EOM

Coding sequences saved to $cds:

EOM

# Print the first four whitespace-separated fields of each FASTA header
# line (lines starting with '>') in the CDS file
awk '/^>/ { print $1, $2, $3, $4 }' "$cds"

cat << EOM

Go to https://blast.ncbi.nlm.nih.gov/Blast.cgi for web-based queries and
upload $cds.
Using the web BLAST on NT/NR databases may be preferable to downloading
them, since they are many gigabytes.

EOM
# Let the user read the messages above before the local searches begin
pause

# -task megablast (default) = highly similar
# -task dc-megablast = discontiguous megablast (pretty similar)
# -task blastn = somewhat similar
# evalue < 0.01 considered good enough for homology
# https://www.metagenomics.wiki/tools/blast/evalue
# https://www.metagenomics.wiki/tools/blast/blastn-output-format-6
# Run a local BLAST search of the CDS file against each selected database
# and page through the filtered hits.
mkdir -p BLAST-output
# $dbs is deliberately left unquoted: it is a space-separated list that
# must be split into one word per database
for db in $dbs; do
    printf '\nBLASTing %s...\n\n' "$db"
    # The text after the last '-' in the database name selects the kind
    # of search to run
    case "${db##*-}" in
    transcriptome)
	# NOTE(review): the output file names do not contain $db, so each
	# transcriptome database searched in one run overwrites the files
	# written for the previous one -- confirm this is intended.
	blastn -task blastn -db "BLAST-DB/$db" \
	    -query "$cds" -evalue 0.01 \
	    -outfmt "6 qseqid pident slen length evalue stitle" \
	    > "BLAST-output/$gene-transcriptome.out" \
	    2> "BLAST-output/$gene-transcriptome.err"
	printf "%-15s %8s %5s %4s %10s %-24s %s\n" \
	    "Query" "Identity" "Slen" "Alen" "E-value" "Location" "Gene"
	# Sort the filtered hits numerically on the fifth field
	awk -f "$AWK_PATH/blast-filter.awk" \
	    "BLAST-output/$gene-transcriptome.out" \
	    | sort -gk 5 | more
	;;
    
    genome)
	# An existing result file is reused unless the user asks for a new
	# search; the default answer is no.
	# NOTE(review): the file name does not contain $db, so results from
	# a different genome database would be reused here as well.
	if [ -e "BLAST-output/$gene-genome.out" ]; then
	    printf '%s already exists.  Rerun BLAST search? y/[n] ' \
		"$gene-genome.out"
	    read -r run
	else
	    run=y
	fi
	if [ "$run" = y ]; then
	    # More than 2 threads backfires due to I/O
	    # NOTE(review): this -outfmt has no slen field although the
	    # header below prints a Slen column and the sort key is field
	    # 5; confirm that blast-filter.awk accounts for this.
	    time blastn -task dc-megablast -db "BLAST-DB/$db" \
		-query "$cds" -evalue 0.01 \
		-outfmt "6 qseqid pident length evalue stitle" \
		-num_threads 2 \
		> "BLAST-output/$gene-genome.out"
	fi
	# blast-filter.awk will attempt to print gene name from $10 as
	# well, but it will just be blank
	printf "%-15s %8s %5s %4s %10s %-24s\n" \
	    "Query" "Identity" "Slen" "Alen" "E-value" "Location"
	awk -f "$AWK_PATH/blast-filter.awk" \
	    "BLAST-output/$gene-genome.out" \
	    | sort -gk 5 | more
	;;
    
    *)
	# Database names must end in -transcriptome or -genome
	usage
	;;
    
    esac
done
