/* ----------------------------------------------------------------------
 * Program for encoding and auto-crosscorrelating sequences
 * Copyright (C) 2000 January Weiner III
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
 * USA.
 ---------------------------------------------------------------------- */

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <time.h>

#include "genpak.h"
#include "gp_getopt.h"

/* Release number, reported by the -v option and in the help banner */
#define VERSION "0.2"
/* Fixed program name shown in the help banner */
#define PROGNAME "gp_acc"

/* Name the program was invoked with; main sets it to argv[0] */
char *progname ;

/* Run-time settings handed (by value) to every processing step */
typedef struct {
	FILE *in ;		/* stream the sequences are read from */
	FILE *out ;		/* stream the results are written to */
	long sl ;		/* sequence length for header display; 0 means no header */
	int nd ;		/* number of descriptors per residue */
	int pauto ;		/* set to TRUE by main; no visible code reads it */
	int encode ;	/* TRUE: print the encoded sequence and skip the ACC */
	long pos ;		/* lags 1 .. pos-1 are computed; 0 means leng/3 is used */
} opt_s ;

/* Forward declarations of the helpers defined below main */

/* one auto-/cross-correlation term for a descriptor pair at a given lag */
double ComputeCterm(double *encoded,int desc1, int desc2, long lag, opt_s o) ;
/* sequence -> malloc'ed array of descriptor values, element 0 holds the count */
double *EncodeSeq(sekw * inseq, opt_s options) ;
/* encoded array -> malloc'ed array of ACC terms, element 0 holds the count */
double *ACCSeq(sekw * inseq, double * encoded,opt_s options) ;
/* writes the sequence name and the values of one array as a single line */
void FormatOutput(sekw * inseq, double * acced,opt_s options) ;
/* writes the column header line for the ACC terms */
void FormatHeader(opt_s options) ;

/*
 *
 */

/*
 * Entry point: parses the command line, opens the input and output
 * streams, then encodes every sequence read from the input and prints
 * either the encoded values (-e) or the ACC terms computed from them.
 *
 * Positional arguments: [ input file ] [ output file ]; standard input
 * and standard output are used for whichever is missing.
 */
int main(int argc, char *argv[])
{
	extern int optind ;
	extern char *optarg ;
	int seqnum = 0 ;
	opt_s options ;
	sekw *inseq ;
	double * encoded ;
	double * acced ;

	int c;
	int errflg = 0 ;

	/* defaults: three descriptors, automatic max lag, no header, full ACC */
	options.pauto = TRUE ;
	options.nd = 3 ;
	options.pos = 0 ;
	options.sl = 0 ;
	options.encode = FALSE ;
	progname = argv[0] ;

	while ((c = gp_getopt(argc, argv, "el:p:qvdh")) != EOF) {
		switch(c) {
		case 'e':
			if(debug) gp_warn("Only encoding the sequence") ;
			options.encode = TRUE ;
			break ;
		case 'l':
			if(sscanf(optarg,"%li",&options.sl) != 1)
				gp_error("main: Could not parse parameter >%s< for option -l ",optarg) ;
			/* options.sl is a long, so it must be printed with %li */
			if(debug) gp_warn("main: print a header for a %li bp long sequence",options.sl) ;
			break ;
		case 'p':
			if(sscanf(optarg,"%li",&options.pos) != 1)
				gp_error("main: Could not parse parameter >%s< for option -p ",optarg) ;
			if(debug) gp_warn("main: Crossterms to position %li",options.pos) ;
			break ;
		case 'q':
			quiet = TRUE ;
			break ;
		case 'v':
			fprintf(stderr,"%s version %s\n",progname,VERSION) ;
			exit(0) ;
			break ;
		case 'd':
			debug = TRUE ;
			gp_warn("Running in debug mode") ;
			break ;
		case 'h':
			Help() ;
			break ;
		default:
			errflg++ ;
			break;
		}
	}

	if(errflg) gp_error("Type '%s -h' for help",progname) ;

	/* open the file pointer to read the sequences 
	 * from: standard input or a file provided? */
	if(optind >= argc) options.in = stdin ;
	else options.in = gp_file_open(argv[optind],"r") ;

	/* opening the file pointer to write the output: 
	 * standard output or file provided? */
	optind++ ;

	if(optind >= argc) options.out = stdout ;
	else options.out = gp_file_open(argv[optind],"wb") ;

	/* the header is printed only when a sequence length was given with -l */
	if(options.sl != 0) FormatHeader(options) ;

	while( (inseq = gp_seq_read(options.in)) != NULL) {
		seqnum++ ;
		if(debug) gp_warn("main: Processing sequence %i",seqnum) ;
		encoded = EncodeSeq(inseq,options) ;
		if(options.encode) FormatOutput(inseq,encoded,options) ;
		else {
			acced = ACCSeq(inseq,encoded,options) ;
			FormatOutput(inseq,acced,options) ;
			free(acced) ;
		}
		free(encoded) ;
		/* NOTE(review): this releases only the sekw object itself; if
		 * gp_seq_read allocates its members separately they are leaked
		 * here -- confirm against the genpak library */
		free(inseq) ;
	}
		
	if(html) gp_warn_print_all(options.out) ;
	fclose(options.out) ;
	fclose(options.in) ;
	return(0);
}


/* Encode the sequence */
/* This is temporary and works only for DNA/RNA sequences */

/*
 * Translates a nucleotide sequence into an array of descriptor values.
 *
 * inseq   - sequence to encode; its leng and sequ members are read
 * options - options.nd gives the number of array slots per residue
 *
 * Returns a malloc'ed array which the caller must free. Element 0 holds
 * the number of values that follow (leng * nd); residue i occupies the
 * elements (i*nd)+1 .. (i*nd)+nd. Only three descriptors are defined, so
 * any slot beyond the third is set to zero.
 *
 * A, C, G, T and U (in either case) are recognised. Every other
 * character is encoded as zeros and reported with a single warning.
 */
double *EncodeSeq(sekw * inseq, opt_s options) {

	/* one row of descriptor values per nucleotide, in the order A, C, G, T/U */
	static const double Conv[4][3] = {{ -1,-1,1},{1,-1,-1},{-1,1,-1},{1,1,1}} ;
	int C[128] ;
	int ch, row, d ;
	long i, unknown = 0 ;
	double * enc ;
	double * cell ;

	/* every character counts as unknown unless it is listed below */
	for(ch = 0;ch<128;ch++) C[ch] = -1 ;
	C['A'] = 0 ; C['C'] = 1 ; C['G'] = 2 ; C['T'] = 3 ;
	C['U'] = 3 ;	/* U shares the row of T: Conv has four rows only */
	
	/* Let's get some space ... */
	enc = malloc(sizeof(*enc)*(inseq->leng+1)*options.nd) ;
	
	if(enc == NULL) 
		gp_error("EncodeSeq: Ooops, could not allocate enough memory. Sorry...") ;

	enc[0] = inseq->leng*options.nd ;

	if(debug)
		gp_warn("EncodeSeq: Sequence length %li, ndescr %i, total %f",
			(long) inseq->leng,options.nd,enc[0]) ;

	for(i = 0;i<inseq->leng;i++) {
		/* toupper is only defined for unsigned char values (and EOF) */
		ch = toupper((unsigned char) inseq->sequ[i]) ;
		row = (ch >= 0 && ch < 128) ? C[ch] : -1 ;
		if(row < 0) unknown++ ;

		cell = enc + (i*options.nd) + 1 ;
		for(d = 0;d<options.nd;d++)
			cell[d] = (row >= 0 && d < 3) ? Conv[row][d] : 0.0 ;
	}

	if(unknown > 0)
		gp_warn("EncodeSeq: %li residues are not A, C, G, T or U; encoded as zeros",
			unknown) ;

	return enc ;
		
}




/*
 * Computes all auto-/cross-correlation terms of an encoded sequence.
 *
 * inseq   - the sequence; only its length is used, to derive the
 *           automatic maximum lag (leng / 3) when o.pos is 0
 * encoded - array produced by EncodeSeq
 * o       - o.nd is the number of descriptors, o.pos the upper bound on
 *           the lag (lags 1 .. pos-1 are evaluated)
 *
 * Returns a malloc'ed array which the caller must free. Element 0 holds
 * the number of terms that follow, nd * nd * (pos - 1). The terms are
 * stored in the order: first descriptor, then lag, then second
 * descriptor. If the maximum lag is below 2 there are no terms and the
 * array holds a count of zero only.
 */
double *ACCSeq(sekw * inseq, double * encoded,opt_s o) {


	double *acc ;
	long lag, total, cnum = 1,checknum,pos ;
	double cterm ; /* cross-correlation term */

	int desc1,desc2;

	if(o.pos == 0) {
		pos = inseq->leng / 3 ;
		gp_warn("ACCSeq: using automatic max lag") ;
	} else pos = o.pos ;

	/* without at least one lag the count would be zero or negative;
	 * a negative count breaks the allocation below */
	total = (pos > 1) ? o.nd * o.nd * (pos - 1) : 0 ;
	if(debug) gp_warn("ACCSeq: Total number of crossterms: %li, max lag %li",total,pos) ;

	/* Reserving space for output array */

	acc = malloc(sizeof(*acc)*(total+1)) ;

	if(acc == NULL) gp_error("ACCSeq: Could not allocate enough memory. Sorry.") ;

	acc[0] = total ;

	if(debug) gp_warn("ACCSeq: Number of ACC variables %li",total) ;

	/* each descriptor combination must be examined */
	for(desc1= 0;desc1<o.nd;desc1++) {
			/* for each description combination, each allowed lag is examined */
		for(lag = 1;lag<pos;lag++) {
			for(desc2= 0;desc2<o.nd;desc2++,cnum++) {
				cterm = ComputeCterm(encoded,desc1,desc2,lag,o) ;

				/* reported in debug mode only; this formula follows a
				 * desc1/desc2/lag ordering, not the order in which
				 * cnum advances */
				checknum = (pos-1)*desc1*o.nd + (pos-1)*desc2 + lag ;
				acc[cnum] = cterm ;
				if(debug) gp_warn("ACCSeq: z(%i)*z(%i) Check %li, cnum %li,cterm %f",
					desc1,desc2,checknum,cnum,cterm) ;

			}
		}
	}

	return acc ;

}

/* 
 * Given the lag and the number of descriptors, this function calculates the
 * auto - cross correlation term 
 */

/*
 * Computes one auto-/cross-correlation term.
 *
 * e     - encoded array: e[0] is the number of values, which start at e[1]
 * desc1 - index of the descriptor taken at each position
 * desc2 - index of the descriptor taken lag positions further on
 * lag   - distance, in residues, between the two positions
 * o     - o.nd is the number of descriptors per residue (the stride)
 *
 * Returns the mean of the products over all position pairs that fit into
 * the array, or 0 when the lag is too large for any pair to fit.
 */
double ComputeCterm(double *e,int desc1, int desc2, long lag, opt_s o) {

	double sum = 0 ;

	long i,j,n = 0 ;

	/* i walks descriptor desc1, j walks desc2 lag residues ahead; j is
	 * always the larger index, so bounding j keeps both inside the array */
	for(i = desc1 + 1,j = (1 + desc2 + lag*o.nd);j<= e[0];i+= o.nd,j+= o.nd) {
		
		sum += e[i]*e[j] ;
		n++ ;

	}

	/* no pair fitted: avoid dividing zero by zero */
	if(n == 0) return 0.0 ;

	return sum / n ;

}


/* Prints out the calculated values */

/*
 * Writes one result line to options.out: the sequence name, a tab, then
 * every value of the array with four decimals, each followed by a tab,
 * and a closing newline.
 *
 * acced[0] holds the number of values; the values start at acced[1].
 */
void FormatOutput(sekw * inseq, double * acced,opt_s options) {

	FILE *dest = options.out ;
	const double count = acced[0] ;
	long idx = 0 ;

	fprintf(dest,"%s\t",inseq->name) ;

	while(idx < count) {
		idx++ ;
		fprintf(dest,"%.4f\t",acced[idx]) ;
	}

	fputs("\n",dest) ;

}


/* Formats the header for the output file */

/*
 * Writes the column header line to o.out: "#Name" followed by one label
 * z(a)z(b)L(lag) per ACC term, tab separated, in the same order in which
 * ACCSeq stores the terms (first descriptor, lag, second descriptor).
 *
 * The maximum lag is o.pos, or o.sl / 3 when o.pos is 0.
 */
void FormatHeader(opt_s o) {

	int first, second ;	/* descriptor numbers, counted from 1 */
	long lag, pos ;
	
	if(o.pos == 0) pos = o.sl/3 ;
	else pos = o.pos ;

	fprintf(o.out,"#Name\t") ;

	for(first = 1;first<= o.nd;first++)
		for(lag = 1;lag<pos;lag++)
			for(second = 1;second<= o.nd;second++)
				/* the lag is a long and needs %li */
				fprintf(o.out,"z(%i)z(%i)L(%li)\t",first,second,lag) ;
	
	fprintf(o.out,"\n") ;

}

/* Standard help message */

/*
 * Prints the banner, the usage line and the option summary to standard
 * output, then terminates the program with exit status 0.
 */
void Help()
{
	/* option descriptions, written out verbatim and in this order */
	static const char *const option_lines[] = {
		"     -e         : encode only, do not compute the ACC\n",
		"     -l [value] : print a header with variable descriptions for sequence\n",
		"                  length equal to [value]\n",
		"     -p [value] : crossterm to position [value]\n",
		"     -v         : print version information & exit\n",
		"     -h         : print this help screen & exit\n",
		"     -q         : quiet, suppress error messages\n"
	} ;
	const size_t line_count = sizeof(option_lines) / sizeof(option_lines[0]) ;
	size_t k ;

	fputs("\n",stdout) ;
	printf("%s, v. %s - Encoding and autocrosscorrelating sequences",PROGNAME,VERSION) ;
	fputs("\n",stdout) ;

	fputs("  Usage:\n",stdout) ;
	printf("     %s [options] [ input file ] [ output file ]\n",progname) ;
	fputs("\n",stdout) ;

	fputs("  Options:\n",stdout) ;
	for(k = 0;k < line_count;k++)
		fputs(option_lines[k],stdout) ;
	fputs("\n",stdout) ;

	exit(0) ;
}


			
