#!/usr/bin/perl

use File::Basename;

# Runs the RNAfold program on every sequence of an alignment and combines
# the resulting base pair probabilities into one probability matrix.
# fasta2rnaprob2.pl: cleaned up version of fasta2rnaprob.pl
# author: Eckart Bindewald

# Installation directory of KNetFold; all helper programs are addressed
# relative to it, so the script cannot run without it.
$KNETFOLD_HOME = $ENV{"KNETFOLD_HOME"};
if (!defined($KNETFOLD_HOME) || length($KNETFOLD_HOME) == 0) {
    die "Environment variable KNETFOLD_HOME has to be specified!\n";
}

if (substr($KNETFOLD_HOME, 0, 1) ne "/") {
    # A relative directory has to be made absolute, because the helper
    # programs are later started after a "cd" into the work directory.
    # Cwd is used instead of `cd ...; pwd`: the shell version silently
    # returned the current directory when the cd failed and could not
    # cope with blanks in the path.
    require Cwd;
    my $absoluteHome = Cwd::abs_path($KNETFOLD_HOME);
    if (!defined($absoluteHome) || ! -d $absoluteHome) {
        die "KNETFOLD_HOME is not an accessible directory: $KNETFOLD_HOME\n";
    }
    $KNETFOLD_HOME = $absoluteHome;
}

# Optional directory for temporary files, taken from the environment.
$KNETFOLD_TMP = $ENV{"KNETFOLD_TMP"};
if (!defined($KNETFOLD_TMP) || length($KNETFOLD_TMP) == 0) {
    $KNETFOLD_TMP = "/tmp"; # default tmp directory
}

# Command line arguments:
#   alignmentfile   input alignment (mandatory)
#   outputfile      file receiving the probability matrix (mandatory)
#   workdirectory   directory for intermediate files (optional)
#   constraintfile  folding constraints, adjusted per sequence (optional)
if (scalar(@ARGV) < 2) {
    # the usage text names the running script and lists all four arguments
    die "Usage: " . basename($0)
        . " alignmentfile outputfile [workdirectory [constraintfile]]\n";
}

($alifileOrig, $probfile, $writedir, $constraintfile) = @ARGV;

# Final result file (second command line argument).
$outputfile = $probfile;
# Names of all RNAfold PostScript files, collected so they can be removed
# at the end of the run.
@allpsnames = ();

# External programs: RNAfold is expected in the PATH, everything else is
# taken from the bin directory of the KNetFold installation.
$RNAFOLD = "RNAfold";
$BIN = "$KNETFOLD_HOME/bin";
$ALIGNEDIT = "$BIN/alignedit2";
$goodies = "$BIN";
$FASTA2FILES = "$goodies/fasta2files.pl";
$FASTA2LENGTH = "$goodies/fasta2length.pl";
$STEMCONVERT = "$BIN/stemconvert";
$COMPASS = "$BIN/compass -p $BIN/compass.prm";
$dna2rna = "$goodies/fastadna2rna.pl";

# Default work directory: "<alignment file name>.dir" in the current
# directory. (The original code used the misspelled, always empty variable
# $alifilebase here, so the default was just ".dir".)
$alifileBase = basename($alifileOrig);
$workdir = $alifileBase . ".dir";
# $SECOMB = "$home/project/supermol3/bin/secomb --root $home/project/supermol3/resources/consensus";

# RNAfold does the actual folding; stop right away if it is not installed.
if (!checkExistence($RNAFOLD)) {
    die "Could not find binary of $RNAFOLD ! Please install RNAfold from Vienna package.\n";
}

# A work directory given on the command line overrides the default.
if (defined($writedir) && length($writedir) > 0) {
    $workdir = $writedir;
}

# Create the work directory (including missing parents) without going
# through a shell, and verify that it really exists afterwards.
print "mkdir -p $workdir\n";
if (! -d $workdir) {
    require File::Path;
    eval { File::Path::mkpath($workdir); };
    if (! -d $workdir) {
        die "Could not create work directory: $workdir\n";
    }
}

# Copy of the alignment with cleaned up sequence names.
$alifile = "$workdir/$alifileBase.tmp";

# List of "sequence name / RNAfold plot" pairs; it is filled while the
# sequences are folded and handed to compass afterwards.
$outputfiletmp = "$outputfile.tmp";

open(OUTFILE, '>', $outputfiletmp)
    or die "Could not open temp output file: $outputfiletmp: $!\n";

# chomp(@names= <STDIN>);

print "Writing to directory $workdir\n";

# clean up sequence names:
$editcom = "$ALIGNEDIT -i $alifileOrig -o $alifile --names-edit 1";
print "$editcom\n";
`$editcom`;
# The command is asked to write $alifile (-o); without that file none of
# the following steps can work, so stop with a clear message.
if (! -e $alifile) {
    die "Cleaning up the sequence names failed, $alifile was not written by: $editcom\n";
}

# $listfiletmp: names of all sequences found in the alignment
# $listfile:    names of the sequences that are actually folded
my $listfiletmp = "$workdir/fastafiles.tmp.names";
my $listfile = "$workdir/fastafiles.names";

# generate separate fasta files:
# presumably one "<name>.fasta" per sequence in $workdir plus the list of
# names in $listfiletmp -- the folding loop below relies on exactly that
my $fastacom = "$FASTA2FILES d $workdir $listfiletmp < $alifile";
print "$fastacom\n";
`$fastacom`;

# Read the list of sequence names, one name per line. The file is read
# directly (no "cat" in a sub shell) so a missing list is reported.
{
    open(my $namesHandle, '<', $listfiletmp)
        or die "Could not read list of sequence names: $listfiletmp: $!\n";
    chomp(@names = <$namesHandle>);
    close($namesHandle);
}

# Length of the first sequence; it serves as reference for the length filter.
$reflength = 0;

# ignore sequences, that are shorter than this fraction from the first sequence:
# $lenfrac = 0.66;
$lenfrac = 0.5;

open(LISTFILE, '>', $listfile)
    or die "Could not open sequence list file: $listfile: $!\n";

# The constraint file is optional (fourth command line argument). The
# per-sequence copies are written into the work directory under the plain
# file name, so a constraint file given with a directory part works too.
my $useConstraints = 0;
my $constraintBase = "";
if (defined($constraintfile) && length($constraintfile) > 0 && -e $constraintfile) {
    $useConstraints = 1;
    $constraintBase = basename($constraintfile);
}

# Fold every sequence with RNAfold and record the resulting plot in OUTFILE.
# only compare with first sequence:
# for ($i = 0; $i < 1; $i++) {
for my $i (0 .. $#names) {
    if (length($names[$i]) < 1) {
        die "Bad filename: $names[$i]\n";
    }
    $filename1 = "$names[$i]" . ".fasta";
#    if (length($readdir) > 0) {
#	$filename1 = $readdir . "/" . $filename1;
#    }
#    $filename1 = "../" . $filename1;

    # determine the sequence length with the helper script
    $comlength = "$FASTA2LENGTH < $workdir/$filename1";
    print "$comlength\n";
    $length = `$comlength`;
    $length = "" unless defined($length);
    chomp($length); # strip the newline of the helper output for clean messages

    if ($i == 0) {
        # the first sequence defines the reference length and is always kept
        $reflength = $length;
        print "Reference length: $length\n";
        print LISTFILE "$names[$i]\n";
    }
    else {
        if ($length < ($lenfrac * $reflength)) {
            print STDERR "Ignoring $names[$i], because it is too short: $length $reflength\n";
            next; # ignore too small sequences
        }
        else {
            print LISTFILE "$names[$i]\n";
        }
    }

    # File names of this sequence. The plots are looked up under the first
    # 12 characters of the sequence name.
    # NOTE(review): two names sharing their first 12 characters map to the
    # same plot files -- confirm the naming of the installed RNAfold version.
    $resultfilebase = $names[$i] . ".rnafold.out";
    $resultps = substr($names[$i],0,12) . "_dp.ps";
    $resultps2 = substr($names[$i],0,12) . "_ss.ps";
    $resultfilename = "$resultfilebase";
    $comboutfile = "$names[$i]" . ".comb";
    $totout = "$workdir/$resultfilename";
    $totfilein = "$workdir/$filename1";
    $totfileps = "$workdir/$resultps";
    $totfileps2 = "$workdir/$resultps2";
    # remember both plots for the cleanup at the end of the script
    push(@allpsnames, $totfileps);
    push(@allpsnames, $totfileps2);
#    if (-e $totfileps) {
#	print "Skipping $names[$i] because $resultps already exists!\n";
#	print OUTFILE "$names[$i] $resultps\n";
#	next; # skip if result file already exists
#    }
#    if (! -e $totfilein) {
#	print "Skipping because $totfilein does not exist!\n";
#	next; # skip if result file already exists
#    }

    # run RNAfold only if the dot plot is not there already
    if (! -e $totfileps) {
        if ($useConstraints) {
            # adjust constraint file to current sequence:
            # plus one because external counting starts at 1:
            $numSeqCurrent = $i + 1;
            $currentconstraintfile = "$constraintBase.$i";
            $constraintcom = "$STEMCONVERT -i $constraintfile --if 2 --of 7 -a $alifile --collapse $numSeqCurrent --wc 1 --verbose 0 > $workdir/$currentconstraintfile";
            print "$constraintcom\n";
            `$constraintcom`;
            # sequence and constraint are fed to RNAfold together (-C)
            $command = "cd $workdir; cat $filename1 $currentconstraintfile | $dna2rna d | $RNAFOLD -p -C > $resultfilename";
        }
        else {
            $command = "cd $workdir; cat $filename1 | $dna2rna d | $RNAFOLD -p > $resultfilename";
        }
#  -d3 -noLP
        print "$command\n";
        `$command`;
    }
#     $command2 = "cd $workdir; $SECOMB -i $resultfilename --format 2 > $comboutfile";
#     print "$command2\n";
#     `$command2`;
#    if (length($combali) > 0) {
#	$command2 = "cd $workdir; cat $comboutfile >> $combali";
#	print "$command2\n";
#	`$command2`;
#    }
    # only write to list, if postscript file was written successfully
    if (-e $totfileps) {
        print OUTFILE "$names[$i] $resultps\n";
    }
    else {
        print "Strange: Could not find output file: $totfileps\n";
    }
}


# Both lists have to be complete on disk before they are used any further;
# write errors of buffered output only show up when closing.
close(OUTFILE)
    or die "Could not finish writing temp output file: $!\n";
close(LISTFILE)
    or die "Could not finish writing sequence list file: $!\n";

# convert generated postscript files to one probability matrix:
# now also use weighted sequences!
# exit code 10 : premature end of compass program, right after saving matrix
my $compasscom = "$COMPASS -i $alifile --prob-file $outputfiletmp --prob-dir $workdir --prob-matrix-out $outputfile --prob-consensus 3 --sequence-weight-mode 1 --exit 10 --ali-length 10000";
print "$compasscom\n";
`$compasscom`;

#clean up:
# Remove the RNAfold plots directly with unlink instead of one "rm" sub
# shell per file. Plots that were never written (RNAfold failed for that
# sequence) and names listed more than once are skipped, so no misleading
# error messages appear.
my %cleanupDone;
foreach my $psfile (@allpsnames)
{
    next if $cleanupDone{$psfile}++;
    next unless -e $psfile;
    print "rm $psfile\n";
    unlink($psfile)
        or print STDERR "Could not remove $psfile: $!\n";
}


############################################################
# Subroutines
############################################################

# Returns 1 if the given program can be executed, 0 otherwise.
#
# Parameter: name of the program. A plain name is searched in all
# directories of the PATH environment variable; a name containing a slash
# is tested as given. An empty or undefined name yields 0.
#
# The search is done in Perl instead of counting the words printed by
# "which": that approach reported programs installed below a directory with
# a blank in its name as missing and passed the name through a shell.
sub checkExistence
{
    my ($name) = @_;
    if (!defined($name) || length($name) < 1) {
        return 0;
    }
    # a name with a directory part is not subject to the PATH search
    if ($name =~ m{/}) {
        return (-f $name && -x $name) ? 1 : 0;
    }
    require File::Spec;
    foreach my $dir (File::Spec->path()) {
        my $candidate = File::Spec->catfile($dir, $name);
        # directories carry the x flag as well, hence the additional -f
        if (-f $candidate && -x $candidate) {
            return 1;
        }
    }
    return 0;
}
