#!/usr/bin/perl -w
#usage: plantTUCMemAlign.pl AS.alignment > TUC_EST.txt
#output is TUC_EST.txt for loading into MySQL

use strict;

# Refuse to run without an input file.  The help text goes to STDERR and the
# exit status is non-zero so that the documented invocation
# (script AS.alignment > TUC_EST.txt) never ends up with usage text inside
# the file that is later loaded into MySQL, and so that calling shells can
# detect the failure.  The test is on "missing or empty" rather than on
# truthiness, which would wrongly reject a file literally named "0".
if (!defined($ARGV[0]) || $ARGV[0] eq '') {
    print STDERR "This program extract the alignment data from the alignment of a TUC with its member ESTs. The results are numbers representing the start and end positions of each EST compared to the consensus sequence.\n\nUsage: TUCMemAlign AlignmentFile\n";
    exit(1);
}

# Width, in alignment columns, of one full row of the alignment file.  The
# start/end arithmetic in the record loop below multiplies by this value.
my $LINE_LENGTH = 60;

# One element per '>'-delimited record of the input file.  Each element still
# carries its trailing '>' (the separator is part of what readline returns).
my @TUC;
{
    # Records are separated by '>'.  The separator is changed with local()
    # inside this bare block so that $/ is restored as soon as loading ends
    # instead of staying altered for the rest of the program.
    local $/ = ">";

    # Three-argument open with an explicit read mode: a file name that
    # starts with '>', '|' or similar can no longer be interpreted as an
    # open mode.  $! is reported so the reason for a failure is visible.
    open(my $fh, '<', $ARGV[0])
        or die("Can not open the file $ARGV[0]: $!\n");
    while (my $record = <$fh>) {
        push(@TUC, $record);
    }
    close($fh);
}

#my $countGap = 0; #in consensus seq, there are - gap, need to deduct it from Start, and End position of member ESTs
# For every record, print one tab-separated line per member sequence:
#
#     TUC name, member name, sign (+ or -), start, end
#
# Start and end are first computed in columns of the gapped consensus and
# then reduced by the number of '-' characters that precede that column in
# the consensus, i.e. they are reported in ungapped consensus coordinates.
# Members are printed in sorted name order so that the output is identical
# from run to run (hash order is not).
foreach my $TUC (@TUC) {
    next unless ($TUC =~ /\w+/);    # nothing but separator/whitespace
    $TUC =~ s/>$//;                 # get rid of the trailing >

    # Only records whose first token looks like a TUC name produce output,
    # so anything else is skipped before doing any further work.
    my ($name) = $TUC =~ /^(\w+tuc\S+)/;
    next unless ($name);

    my @lines = split(/\n/, $TUC);

    # Pass 1: concatenate the consensus pieces of every row, keeping '-'.
    my $consensus = '';
    foreach my $line (@lines) {
        if ($line =~ /consensus\s+(\S+)\s*/) {
            $consensus .= $1;
        }
    }

    # $gaps_before[$p] is the number of '-' in consensus columns 1 .. $p-1.
    # Index 0 is a placeholder so that the array can be indexed by 1-based
    # column number directly.
    my @gaps_before = (0);
    my $total_gaps  = 0;
    foreach my $column (split(//, $consensus)) {
        push(@gaps_before, $total_gaps);
        $total_gaps++ if ($column eq '-');
    }

    # Pass 2: derive the gapped start/end column of every member.
    # $rows_done counts the consensus lines seen so far; it is incremented
    # only when a consensus line is reached, so member lines are evaluated
    # with the number of rows that were complete before their own row.
    # NOTE(review): the start formula assumes that the first segment seen
    # for a member runs up to the last column of a full $LINE_LENGTH-wide
    # row -- confirm against the alignment format, in particular for
    # members that begin on a short final row.
    my %start;
    my %end;
    my %sign;
    my $rows_done = 0;
    foreach my $line (@lines) {
        if ($line =~ /(\S+)\s*([+-])\s+(\S+)/) {
            my ($member, $strand, $segment) = ($1, $2, $3);
            my $segment_length = length($segment);
            if (!exists($start{$member})) {
                $start{$member}
                    = $LINE_LENGTH * ($rows_done + 1) - $segment_length + 1;
            }
            # Overwritten on every row, so the last row of a member wins.
            $end{$member}  = $LINE_LENGTH * $rows_done + $segment_length;
            $sign{$member} = $strand;
        }
        if ($line =~ /consensus\s+(\S+)\s*/) {
            $rows_done++;
        }
    }

    foreach my $member (sort keys %start) {
        my $readStart = $start{$member}
            - gaps_before_column(\@gaps_before, $total_gaps, $start{$member});
        my $readEnd = $end{$member}
            - gaps_before_column(\@gaps_before, $total_gaps, $end{$member});

        # Member names ending in "XYtu" are printed with a 'c' appended.
        my $label = $member;
        $label .= 'c' if ($member =~ /XYtu$/);

        print "$name\t$label\t$sign{$member}\t$readStart\t$readEnd\n";
    }
}

# Return the number of consensus gap characters located before a 1-based
# gapped column.
#   $gaps_before - array ref, element [$p] = gaps in columns 1 .. $p-1
#   $total_gaps  - number of gaps in the whole consensus
#   $column      - 1-based gapped column to look up
# Columns outside the consensus are clamped (0 before the first column, the
# total gap count past the last one) instead of yielding undef, which used
# to trigger "uninitialized value" warnings and leave the coordinate
# unadjusted.
sub gaps_before_column {
    my ($gaps_before, $total_gaps, $column) = @_;
    return 0           if ($column < 1);
    return $total_gaps if ($column > $#{$gaps_before});
    return $gaps_before->[$column];
}
	

