#!/usr/bin/perl -w
#extract RescueMu FASTA seq from PaCE cluster results
#useage: ./Cap3FromPaCE.pl RescueMuGSS.fsa SelectRescueMuForAssembly.fsa est.clusterXXXPaCE 

system("rm -r -f CapAlign") if (-e "CapAlign");
system("mkdir CapAlign");

my $cap3 = './cap3Dong';


my %stanfordID_GI;
open(FILE, "grep '>' $ARGV[0] |") || die("Cannot open");
while(<FILE>){
	my ($gi, $sid) = /gi\|(\d+)\S+\s+(\S+)/;
	$stanfordID_GI{$sid} = $gi;
}
close(FILE);

my %ID_Seq;
$/='>';
open(FILE, "$ARGV[1]") || die("Cannot open $ARGV[1]");
while(<FILE>){
        next unless /\w/;
        my $block = $_; chomp($block);
        my @lines = split(/\n/, $block);
        my ($ID) = $lines[0] =~ /(\S+)/;
        shift(@lines); #get rid of title
        my $Seq = join('', @lines);
        $Seq =~ s/\W+//g;
        my $SeqLen = length($Seq);
        $ID_Seq{$ID} = $Seq;
}
close(FILE);
$/ = "\n";

print STDERR "Extract PaCE clusters sequences ...\n";
$/ = 'Cluster';
open(FILE, "$ARGV[2]") || die ("Cannot open $ARGV[2]");
while(<FILE>){
        my $block = $_; chomp($block);
	next unless ($block =~ /\w/);
        my @lines = split(/\n/, $block);
        my ($clusterNum) = $lines[0] =~ /(\d+)/; #get the cluster number
        open(OUT, ">cluster$clusterNum.memberfasta") || die ("Cannot open Cluster$clusterNum");
        foreach my $line (@lines){
                next unless ($line =~ /Member\S+\s+>(\S+)/);
                my $memberID = $1;
		my $memberSeq = $ID_Seq{$memberID};
		if(exists $stanfordID_GI{$memberID}){
			$memberID = $stanfordID_GI{$memberID}; #repace StanfordID with gi
		}
if(!defined $memberSeq){
	print STDERR "ID = $memberID; \n";
	exit;
}
                print OUT ">$memberID\n$memberSeq\n";
        }
        close(OUT);
	#print STDERR "clusternum = $clusterNum\n";
        system("$cap3 cluster$clusterNum.memberfasta -p 90 -s 700 > cluster$clusterNum.memberfasta.Cap3Align");
        system("mv cluster$clusterNum.memberfasta* CapAlign");

}
close(FILE);
$/ = "\n";


