#!/usr/bin/perl

####################################################################################
# run of single knetfold command. 
# usage: knetfoldsingle.pl -i fastafile [-s secondary] [-c constraintfile] [-v forbiddenid]
# change log: 
# added option -f for running also NL-RNAfold (May 27, 2006)
# Eckart Bindewald
#####################################################################################

use Env;
use File::Basename;
use Getopt::Std;

# Command-line handling: refuse to run without any arguments, then parse the
# single-letter switches into %opts. Every letter listed in the getopt() call
# is a switch that takes a value (e.g. "-i file").
if (scalar(@ARGV) < 1) {
    # The alignment file is taken from -i, so the usage text has to say -i.
    die "Usage: knetfoldsingle.pl -i fastafile -s secondary -c constraintfile -v forbiddenid\n";
}

getopt('icfhprsvw', \%opts);

# Locate the KNetFold installation directory: option -h takes precedence,
# otherwise the KNETFOLD_HOME environment variable is used.
my $KNETFOLD_HOME = $opts{"h"};
if (!defined($KNETFOLD_HOME) || length($KNETFOLD_HOME) == 0) {
    $KNETFOLD_HOME = $ENV{"KNETFOLD_HOME"};
}

if (!defined($KNETFOLD_HOME) || length($KNETFOLD_HOME) == 0) {
    die "Environment variable KNETFOLD_HOME has to be specified!\n";
}

if (substr($KNETFOLD_HOME, 0, 1) ne "/") {
    # workaround for relative file names: convert to absolute file names.
    # "&&" makes sure pwd only runs after a successful cd; with ";" a failed
    # cd would silently turn KNETFOLD_HOME into the current directory.
    my $absolute_home = `cd "$KNETFOLD_HOME" && pwd`;
    chomp($absolute_home) if defined($absolute_home);
    if (!defined($absolute_home) || length($absolute_home) == 0) {
        die "Could not change into KNETFOLD_HOME directory $KNETFOLD_HOME!\n";
    }
    $KNETFOLD_HOME = $absolute_home;
}

# Scratch directory: taken from the environment, "/tmp" when unset or empty.
$KNETFOLD_TMP = $ENV{"KNETFOLD_TMP"};
if (!defined($KNETFOLD_TMP) || length($KNETFOLD_TMP) == 0) {
    $KNETFOLD_TMP = "/tmp"; # default tmp directory
}

print "KNETFOLD_HOME: $KNETFOLD_HOME KNETFOLD_TMP: $KNETFOLD_TMP\n";


# Copy the parsed command-line switches out of %opts.

# -i: fasta alignment file to process
$fastafile = $opts{"i"};

# -s: reference stem file (passed to compass as --stem-file if it exists)
$reffile = $opts{"s"};

# -c: constraint file appended to the probability-matrix generation command
$constraintfile = $opts{"c"};

# -f: fold mode. 0 skips the probability-matrix step, 2 copies the
# probability matrix instead of running compass; anything else runs both.
$foldModeOption = $opts{"f"};
$foldMode = "1"; # per default run RNAfold
if (length($foldModeOption) > 0) {
    $foldMode = $foldModeOption;
}

# -p: explicit probability matrix file (overrides the default location)
$probMatrixOption = $opts{"p"};

# -v: name that must not be listed in the chosen model's forbidden.names file
$forbidden = $opts{"v"};

# -r: reference alignment (passed to compass as --ali-ref).
# The key is the literal string "r"; indexing with the undefined variable $r
# always yielded undef, so the option was silently ignored.
$refali = $opts{"r"};

# -w: override for the compass --winner setting.
# Same fix as for -r: use the literal key "w" rather than the undefined $w.
$winnerOption = $opts{"w"};

# Sequence weight files are switched off: this name is a placeholder that
# the later -e tests are not expected to find on disk.
$weightfile = "weightfilesareswitchedoff"; # dummy file

# Locations of the external programs, all below the installation's bin dir.
$BIN     = $KNETFOLD_HOME . "/bin";
$goodies = $BIN;
$COMPASS = join(" ", "$BIN/compass", "-p", "$BIN/compass.prm");
$PROBGEN = $goodies . "/fasta2rnaprob.pl";
$SKIP    = $goodies . "/skiplines.pl";

# Model selection defaults: start with model 0 out of 10 candidates and
# assume no overlap between testing and training ($self = 0).
($modelid, $maxmodel, $self) = (0, 10, 0);

# Output of the fasta2sizes.pl helper for the input alignment, newline removed.
chomp($size = `$BIN/fasta2sizes.pl < $fastafile`);

# Directory holding the part_<n> model subdirectories.
$modelbasedir = $KNETFOLD_HOME . "/prm/bestmodel140";

# Choose the model directory to use.
#
# Without -v, model 0 is used. With -v, the part_<n>/forbidden.names files are
# scanned in order and the first model whose file does NOT list $forbidden
# is chosen. If the last file that could be read still lists the name, the
# script falls back to model 0 and sets $self = 1.
if (length($forbidden) > 0) {
    $found = 0;
    # look for model id, in which "forbidden" is not part of training set:
    for my $candidate (0 .. $maxmodel - 1) {
        my $names_file = "$modelbasedir/part_$candidate/forbidden.names";
        if (! -e $names_file) {
            # A missing file leaves $found at its previous value.
            print "Warning: Could not find forbidden name file $names_file!\n";
            next;
        }
        # Three-argument open with a lexical handle: the file name can never
        # be misread as an open mode, and the handle is closed explicitly.
        open(my $names_fh, '<', $names_file)
            or die "Could not open forbidden name file $names_file: $!\n";
        my @names = <$names_fh>;
        close($names_fh);
        chomp(@names);

        # Exact, case-sensitive match of a whole line.
        $found = (grep { $_ eq $forbidden } @names) ? 1 : 0;
        if ($found == 0) {
            $modelid = $candidate;
            last; # no conflict found
        }
    }

    if ($found == 1) {
        $modelid = 0;
        print "Warning: Could not find suitable model!\n";
        $self = 1; # switch to avoiding overlapping test and training
    }
}

$train = "$modelbasedir/part_${modelid}";

print "Using model $modelid from directory $train\n";

# Rescale parameter file of the chosen model.
$rescale = "$train/rescale.prm";

# The input alignment is mandatory; stop early if it is not there.
if (! -e $fastafile) {
    die "Could not find fasta alignment file $fastafile!\n";
}

# Base name of the alignment without directory and ".fasta" suffix. All
# output names below are derived from it, so they are relative to the
# current working directory.
# Note: $a and $b are the package variables Perl's sort() uses; the names are
# kept because the rest of the script refers to them.
$a = basename($fastafile, ".fasta");

# Directory for the probability matrix; defaults to "<base>.dir".
if (length($readdir) < 1) {
    $readdir = "$a.dir";
}

$b = "$a.test";
$outfiletmp = "$a.test.tmp.stdout";
$outfile = "$a.test.stdout";
$listname = "$b.list";
$loopfile = "$b.loop.dat";

# Remove the log of a previous run. unlink() avoids passing the file name
# through a shell, so names with spaces or shell metacharacters work too.
if (-e $outfile) {
    unlink($outfile)
        or warn "Could not remove old output file $outfile: $!\n";
}

# Numeric settings that end up on the compass command line.
# coordinate $patchsize with patchsize variable in nn_common.R !
# $alg 26: algorithm for mutual information, not using correction
# ($alg 14 would be: algorithm for mutual information, use correct correction)
($patchsize, $alg, $neuralmode) = (2, 26, 10);

# Length weighting; $lengthweight carries the original note
# 'changed by EB on Oct 25, 2005 "0.1"'.
($lengthmean, $lengthweight) = ("120.0", "0.0");

# all positions with a matching fraction lower than this threshold are
# ignored by the classifier
$matchthresh = "0.5";

# File names derived from the "<base>.test" prefix held in $b.
$header1        = $b . ".nn.head.tmp";
$header2        = $b . ".nn.head.tmp2";
$datfile        = $b . ".nn.dat";
$tmpfile        = $b . ".tmp.dat";
$tmpfile2       = $b . ".tmp2.dat";
$tmpfile3       = $b . ".nn.pred.matrix.tmp";
$matrixfile     = $b . ".nn.pred.matrix";
$nnresult       = $b . ".nn.result";
$alioutfile     = $b . ".tmp.fasta";
$refstemoutfile = $b . ".tmp.reg";

# Probability matrix: option -p wins, otherwise "<readdir>/<base>.prob.matrix".
$probmatrixdir = "$readdir";
$probmatrix = length($probMatrixOption) > 0
    ? $probMatrixOption
    : "$readdir/$a.prob.matrix";

# Output file names derived from the alignment base name held in $a.
$probmatrixoutfile    = $a . ".test.prob.matrix";
$stemoutfile2         = $a . ".test.allstems.reg";
$entropymatrixoutfile = $a . ".test.entropy.matrix";

# k-nearest-neighbour settings; both parameter files live in the model dir.
$knncutoff = "0.000001";
$knnnet    = $train . "/compass.knn.optimized.prm";
$knnnet2   = $train . "/post.prm"; # for postprocessing
$knndir    = "$train";
$knndir2   = "$train";

# careful: switch off conflicting stems if "1"! Option -w overrides it.
$winner = length($winnerOption) > 0 ? $winnerOption : "1";

# The primary parameter file is mandatory.
die "Could not find $knnnet\n" unless -e $knnnet;

# Create the probability-matrix directory (and any missing parents) when it
# does not exist yet; the command is echoed before it is run.
unless (-e $readdir) {
    my $mkdir_command = "mkdir -p $readdir";
    print "$mkdir_command\n";
    `$mkdir_command`;
}

# Base option string for compass, one option per line and joined with single
# blanks.
$combase = join(" ",
    "-i $fastafile",
    "--thresh 0.5",
    "--norm 0",
    "--algorithm $alg",
    "--energy-min 0.75",
    "--neural-patch $patchsize",
    "--overwrite 1",
    "--neural-mode $neuralmode",
    "--neural-energy-mode 0",
    "--neural-all 1",
    "--knn-data $knnnet",
    "--knn-dir $knndir",
    "--list $listname",
    "-o $b",
    "--knn-gauss 0.2",
    "--neural-mode3 2",
    "--filter 0",
    "--entropy-matrix-out $entropymatrixoutfile",
    "--stem-outfile2 $stemoutfile2",
    "--stem-outfile2-weight 1",
    "--stem-outfile2-limit -10",
    "--neural-conserve 1",
    "--neural-rename-mode 1",
    "--loop $loopfile",
    "--neural-self-mode $self",
    "--knn-cutoff2 $knncutoff",
    "--winner $winner",
    "--length-mean $lengthmean",
    "--length-weight $lengthweight",
    "--grow-match 0.5",
    "--grow-limit 0.2",
    "--thresh-individual 0.05",
    "--stem-min 1",
    "--stem-min-anyways 0.0",
    "--char-min-filter 1",
    "--filter-bad 4",
    "--collapse 0",
    "--match-thresh $matchthresh",
);
# currently unused options: --ali-out $alioutfile --stem-outfile $refstemoutfile

# if RNAfold should be invoked (any fold mode other than 0):
if ($foldMode != 0) {
    # generate probability matrix if not existent!
    unless (-e $probmatrix) {
        my $generate_command = "$PROBGEN $fastafile $probmatrix $probmatrixdir";
        # added constraint to folding. Might come from mutual information
        $generate_command .= " $constraintfile" if -e $constraintfile;
        print "$generate_command\n";
        `$generate_command`;
    }
    die "Could not find probability matrix $probmatrix even after trying to generate it!\n"
        unless -e $probmatrix;

    # add options related to secondary structure prediction from RNAfold:
    $combase .= " --prob-matrix $probmatrix --knn-data2 $knnnet2 --knn-dir2 $knndir2 ";
}

# collapse with respect to reference alignment, if one was given
$combase .= " --ali-ref $refali" if length($refali) > 0;

# not passed at present: --rescale $rescale

# Final argument string for compass, depending on which optional input files
# exist: a reference stem file and/or a sequence weight file.
if (-e $reffile) {
    # Reference stems and sequence weights cannot be combined.
    die "Sequence weight files are not supported anymore!\n" if -e $weightfile;
    $comuse = "$combase --stem-file $reffile --stem-format 2";
}
elsif (-e $weightfile) {
    $comuse = "$combase --sequence-weight-mode 3 --sequence-weight-file $weightfile";
}
else {
    $comuse = "$combase";
}

# Run the prediction and report whether the result matrix was produced.
$resultmatrixname = "$b.matrix";

if ($foldMode != 2) {
    # full command for running compass; stdout and stderr both go to $outfile.
    # "> file 2>&1" is used instead of ">& file" because backticks run the
    # command through /bin/sh, and ">& file" is a csh/bash extension that
    # strict POSIX shells (e.g. dash) reject.
    $com1 = "$COMPASS $comuse > $outfile 2>&1";
}
else {
    # do not run compass, equivalent to "NL-RNAfold" method
    $com1 = "cp $probmatrix $b.matrix";
}

print "$com1\n";
# Afterwards remove anything matching core* from the current directory.
`$com1; rm -f core*`;

# NOTE(review): the reason for this fixed 10 second pause is not evident
# from this file - confirm whether it is still needed.
sleep(10);

if (! -e $resultmatrixname) {
    print "Oops! Output matrix $resultmatrixname still does not exist.\n";
}
else {
    print "Run appears to be successful, result matrix exists: $resultmatrixname\n";
}

