#!/usr/bin/perl -w
#Author: Qunfeng Dong

#usage: ./RescueMuPlasmidAssembly.pl AllGridparentalPlasmid.txt RescueMuGSSraw.fsa RescueMuGSS.trim
#AllGridparentalPlasmid.txt (output of ParentalVmatchCluster.pl) are the putative parental plasmids ID - exclude them from assembly
#RescueMuGSSraw.fsa is from Stanford after they marked TIR as X
#RescueMuGSS.trim (output of VectorTrim.pl) is the vector-trim sequences from RescueMuGSS.fsa (downloaded from GenBank)


#####################what this script does?#############################
# Perform a plasmid assembly: assemble sequences from the same plasmids
# step 1: put X, and Y together -> xxxXY
# step 2: if XY, put ELX and ELY onto it; otherwise try to put ELX and Y; ELY and X together
#    (1). PlasmidAssembly result
	open(PlasmidAssembly, '>RescueMuPlasmidAssembly.fsa') || die("Cannot open output");
#    (2). RescueMuXYtuc_memberGSS.txt and EtucMember
	open(TUCmember, ">RescueMuXYtuc_memberGSS.txt") || die("Cannot open output");
	open(Emember, ">RescueMuEtuc_memberGSS.txt");

#    (3). Coordinates9bpr_inXY.txt
	open(TUC9pbr, ">Coordinates9bpr_inXY.txt") || die("Cannot open output"); 

#it also throw away EL sequence with multiple EL sites
	open(EL, '>MultipleEL.txt') || die("Cannot open");

#(D). output for bad GenBank sequencs (GenBank seq is not a substring of it corresponding ScreenX seq)
	open(Screen_GenBank, '>Screen_GenBank.inconsistent'); #also include the derivatives of ELs

#(E)  some additional txt file for number game
	open(VALID, ">AllValidGSS.txt"); #all valid RescueMu GSS
	open(MAX, '>AllMax_member.txt');
########################################################################

use strict;

my $CAP = './cap3Dong'; #this cap3Dong was modified from cap3 by me at grinch4:/usr/local/src/cap3 to change 14 to 0 from one variable
###gobal data and structures
my $BamHI  = 'GGATCC';
my $BglII  = 'AGATCT';
my $Mix5   = 'GGATCT';
my $Mix3   = 'AGATCC';
my %StanfordID_ScreenSeq_Hash; #StanfordID vs. Screened Seq (no Tir but with 9 bp
my %StanfordID_GenBankSeq_Hash; #StanfordID vs. GenBank Se
my %maxXYEL;       #has to record (key) longest X, Y, ELX, ELY from each plasmid
		   #              (value) all the X, Y, ELX, ELY from that plasmid
		   #used for build RescueMuXYtuc_memberGSS.txt
		   #the difference between this hash and file XYmax is that it records ALL plasmids.
my %AllValidGSS;
my %XYtucSeq; #key XYtucID, value: its seq
my %AllGridparentalPlasmid; #all parental plasmids that should be excluded from the assembly


system("rm -r -f ELalnDIR") if(-e 'ELalnDIR');
system("mkdir ELalnDIR");


print STDERR "Build parental filter ...\n";
open(FILE, "$ARGV[0]") || die("Cannot open $ARGV[0]");
while(<FILE>){
	my ($plasmid) = /^\d+\s+(\S+)/;
	$AllGridparentalPlasmid{$plasmid} = 1;
}
close(FILE);

print STDERR "Building Hash for sequences...\n";
&GenbankSeq;#build %StanfordID_GenBankSeq_Hash  
&ScreenSeq; #build %StanfordID_ScreenSeq_Hash

print STDERR "Identify valid member GSSs from each plasmid ...\n";
open(XYELmax, '>Plasmid_maxXYELmemberGSS.txt') || die ("Cannot output"); #longest X, Y, ELX, ELY for each plasmid
&PlasmidMember; #PlasmidName -  X:LongestXmember (all Xs); Y:LongestYmember (all Ys); ELX:LongestELXmember( all ELXs); ELY:longestELYmember(all ELY)
close(XYELmax);

print STDERR "Assemble each plasmid ...\n";
&Plasmid_Assembly;


print STDERR "Assemble is finished!\n";

#functions -----------------------------------------------------------------------------------------
sub PlasmidMember{
	#this function sort out all member sequences generated from each plasmids
	#
	my %Plasmid_stanfordIDHash;
	open(FILE, "grep 'Rescue' $ARGV[2] |") || die ("Cannot grep");
	while(<FILE>){
   		my $line = $_; chomp($line);
    		if($line = /^>gi\|(\d+)\S+\s+(\S+\.\S+)\s+/){
			my $gi = $1;
		        my $stanfordID = $2;
			#this has to be a (1) consistent sequences and (2) NOT a multile EL seq
			next unless (exists $StanfordID_GenBankSeq_Hash{$stanfordID});
			$AllValidGSS{$stanfordID} = 1;
        		my ($plasmid) = $stanfordID =~ /(\w+)\.\S+/;
			next if(exists $AllGridparentalPlasmid{$plasmid}); #######filter parental plasmids######
		        if(!exists($Plasmid_stanfordIDHash{$plasmid})){
            			$Plasmid_stanfordIDHash{$plasmid} = $stanfordID;
        		}
        		else{
            			$Plasmid_stanfordIDHash{$plasmid} .= "\t$stanfordID";
        		}
    		}
	}
	close(FILE);
	
	#output all valid GSS
	foreach my $ValidGSS (sort keys %AllValidGSS){
		print VALID "$ValidGSS\n";
	}

	foreach my $Plasmid (keys %Plasmid_stanfordIDHash){
	    my $members = $Plasmid_stanfordIDHash{$Plasmid};
	    my ($maxX, $maxY, $maxELX, $maxELY);
	    my @X = $members =~ /(\S+\.x\S+)/g;
            if(@X){
                $maxX = &LongestSeq(@X);
                foreach my $mem (@X){
			if(exists $maxXYEL{$maxX}){
				$maxXYEL{$maxX} .= ":$mem";
			}else{
				$maxXYEL{$maxX} = $mem;
			}
                }
            }
	    $maxX = 0 if(!defined $maxX);
	    my @Y = $members =~ /(\S+\.y\S+)/g;
            if(@Y){
                $maxY = &LongestSeq(@Y);
                foreach my $mem (@Y){
                        if(exists $maxXYEL{$maxY}){
                                $maxXYEL{$maxY} .= ":$mem";
                        }else{
                                $maxXYEL{$maxY} = $mem;
                        }
                }
            }
	    $maxY = 0 if(!defined $maxY);
#            if(defined $maxX && defined $maxY){
#                        print XYmax "$maxX\t$maxY\n";
#            }
	    my @ELX = $members =~ /(\S+EL_x\S+)/g;
            if(@ELX){
                $maxELX = &LongestSeq(@ELX);
                foreach my $mem (@ELX){
                        if(exists $maxXYEL{$maxELX}){
                                $maxXYEL{$maxELX} .= ":$mem";
                        }else{
                                $maxXYEL{$maxELX} = $mem;
                        }
                }
            }
	    $maxELX = 0 if(!defined $maxELX);
            my @ELY = $members =~ /(\S+EL_y\S+)/g;
            if(@ELY){
                $maxELY = &LongestSeq(@ELY);
                foreach my $mem (@ELY){
                        if(exists $maxXYEL{$maxELY}){
                                $maxXYEL{$maxELY} .= ":$mem";
                        }else{
                                $maxXYEL{$maxELY} = $mem;
                        }
                }
            }
	    $maxELY = 0 if(!defined $maxELY);
	    print XYELmax "maxX:$maxX\tmaxY:$maxY\tmaxELX:$maxELX\tmaxELY:$maxELY\n";
	    ######################################################################################################
	    #only those LONG sequences recorded in %maxXYEL are going to the subsequent assembly!!!!!!!!!!!!!!!! #
	    ######################################################################################################
	}
	foreach my $max (keys %maxXYEL){
		my @mems = split(/:/, $maxXYEL{$max});
		foreach my $mem (@mems){
			print MAX "$max\t$mem\n";
		}
	}
}

sub LongestSeq{
        my (@SeqIDArray) = @_; #input is a list of StanfordIDs
        my $max = 0; my $maxSeqID;
        foreach my $SeqID (@SeqIDArray){
                my $Seq = $StanfordID_GenBankSeq_Hash{$SeqID};
                my $len = length($Seq);
                if($len > $max){
                        $max = $len;
                        $maxSeqID = $SeqID;
                }
        }
	return $maxSeqID;
}

sub Plasmid_Assembly{
	open(MEMBER, 'Plasmid_maxXYELmemberGSS.txt') || die ("Plasmid_maxXYELmemberGSS.txt");
	while(<MEMBER>){
		my ($maxX, $maxY, $maxELX, $maxELY) = /maxX:(\S+)\s+maxY:(\S+)\s+maxELX:(\S+)\s+maxELY:(\S+)/;
		my @ELassembly; #IDs for participting in EL_assembly
		my $XYtucID; 
		if($maxX && $maxY){
			#print STDERR "this plasmid has both the X and Y sequences\n";
			$XYtucID  = &Assemble_X_Y($maxX, $maxY);
			push(@ELassembly, $XYtucID) if($XYtucID);
		}
		unless(defined $XYtucID && $XYtucID){
			push(@ELassembly, $maxX) if($maxX);
			push(@ELassembly, $maxY) if($maxY);
		}
		if($maxELX || $maxELY){
                        push(@ELassembly, $maxELX) if($maxELX);
                        push(@ELassembly, $maxELY) if($maxELY);
			&Assemble_EL(@ELassembly);
		}else{
			#no EL seq
			if(defined $XYtucID && $XYtucID){
                                print PlasmidAssembly ">$XYtucID\n$XYtucSeq{$XYtucID}\n";
                        }else{
                                print PlasmidAssembly ">$maxX\n$StanfordID_GenBankSeq_Hash{$maxX}\n" if($maxX);
                                print PlasmidAssembly ">$maxY\n$StanfordID_GenBankSeq_Hash{$maxY}\n" if($maxY);
                        }
                }
        }#end while
	close(MEMBER);
}

sub Assemble_EL{
        my @SeqIDs = @_;
        my $plasmid;
        open(CAP, '>TempCap3Input') || die("Cannot open Cap3Input");
                foreach my $ID (@SeqIDs){
			print STDERR "$ID\n";
                        my $Seq;
                        if($ID =~ /(\S+)XY$/){
                                $Seq = $XYtucSeq{$ID};
                                $plasmid = $1;
                        }else{
                                $Seq = $StanfordID_GenBankSeq_Hash{$ID};
                                ($plasmid) = $ID =~ /(\S+)\./;
                        }
                        print CAP ">$ID\n$Seq\n";
                }
        close(CAP);
        #then we ran cap3, if any contig can be generated, I call it xxxxEtuc1, xxxxxEtuc2
        #for now I will just save the alignment
        my $CapInputFileName = $plasmid.'ELaln.fsa';
	system("mv TempCap3Input $CapInputFileName");
        system("$CAP $CapInputFileName -o 21 -p 95 -s 401 > $CapInputFileName.Cap3Align");
        my $CapContigFileName = $CapInputFileName.'.cap.contigs';
        my $CapSingletFileName = $CapInputFileName.'.cap.singlets';
        my $ELtucName = $plasmid.'E';
        $/ = '>';
        open(FILE, "$CapContigFileName") || die("Cannot open cap contig");
        while(<FILE>){
                next unless (/\w/);
                my $block = $_;
                $block =~ s/>$//;
                $block =~ s/Contig/$ELtucName/;
                print PlasmidAssembly ">$block";

        }
        close(FILE);
        $/ = "\n";
        $/ = '>';
        open(FILE, "$CapSingletFileName") || die("Cannot open cap singlet");
        while(<FILE>){
                next unless (/\w/);
                my $block = $_;
                $block =~ s/>$//;
                print PlasmidAssembly ">$block";

        }
        close(FILE);
        $/ = "\n";

        $/ = "Contig";
        my $NumContig = qx(grep '>' $CapContigFileName | wc -l);
        my $NumSinglet = qx(grep '>' $CapSingletFileName | wc -l);
        my $lastFlag = 0;
        open(CAPOUT, "$CapInputFileName.Cap3Align") || die("Cannot open");
        while(<CAPOUT>){
                next if /^Number/;
                my $members;
                my $CapOutBlock = $_; chomp($CapOutBlock);
                $lastFlag = 1 if($CapOutBlock =~ /DETAILED DISPLAY OF CONTIGS/);
                my @lineArray = split(/\n/, $CapOutBlock);
		my $EtucID = $ELtucName;
                foreach my $line (@lineArray){
                        $line =~ s/^\s+//g; #delete preceding space
                        if($line =~ /^(\d+)\s+\*\*\*/s){
				$EtucID = $ELtucName.$1;	
                        }
                        if($line =~ /^(\S+)([+|-])/){
				print Emember "$EtucID\t$1\n";
                        }
                }
                last if ($lastFlag);
        }#end of while #only need finish checing the CapOutput
        close(CAPOUT);
        $/ = "\n";







	system("mv $CapInputFileName* ELalnDIR");
}



sub Assemble_X_Y{
	my ($Xid, $Yid) = @_;
	my ($plasmid) = $Xid =~ /(\S+)\./;
	my $XscreenSeq = $StanfordID_ScreenSeq_Hash{$Xid};
 	my $YscreenSeq = $StanfordID_ScreenSeq_Hash{$Yid};
	if(defined $XscreenSeq && defined $YscreenSeq){
	         my ($CombineSeq, $start_9bpr, $contigName);
		 my $CombineID = $Xid.':'.$Yid;
		($CombineSeq, $start_9bpr) = &Combine_X_Y($Xid, $XscreenSeq, $Yid, $YscreenSeq);
		if($CombineSeq){
			#one of the few case print output here, since Combine_X_Y is called elsewhere
			#print STDERR "One GSS contig is assembled from (X, Y)\n";
        		$contigName = "$plasmid".'XY';
			#record the corridnates of the 9bpr in CombineSeq
			$start_9bpr = $start_9bpr + 1;
			print TUC9pbr "$CombineID\t$start_9bpr\n";
	                my @member = split(/:/,$CombineID);
           		foreach my $member (@member){
				my @allMember = split(/:/, $maxXYEL{$member}); #get also the shorter member
				foreach my $allmember (@allMember){
	                 		print TUCmember "$contigName\t$allmember\n";
					delete $AllValidGSS{$allmember};
				}
           		}
           		#print TUC ">$contigName\n$CombineSeq\n";
			$XYtucSeq{$contigName} = $CombineSeq;
			return $contigName;
		}
	}
	return 0;
}#end function

sub ScreenSeq{
        #Screened Seq: Raw Seq - TIR (still have the 9 bp
        #read screenx file and build a hash StanfordID vs. screened Seq
        #Be careful, cannot assume Stanford has every seq containingXXXXX as it should
	#need to trim off the extra edge sequneces from X, Y that are NOT submitted into GenBank
        $/ = '>';
        open(FILE, $ARGV[1]) || die ("Cannot open screenx");
        while(<FILE>){
                next unless /\w+/;
                my @lines = split(/\n/, $_);
                my $firstLine = shift(@lines);
                my ($stanfordID) = $firstLine =~ /^(\S+)\s+/;
                my $Seq = join('', @lines);
                $Seq =~ s/\W+//g; #delete blank and >
                $Seq = &Screen_TIR($stanfordID, $Seq);
                if($Seq){
                        $StanfordID_ScreenSeq_Hash{$stanfordID} = $Seq;
                }
        }
        close(FILE);
        $/ = "\n";
} #end function

sub GenbankSeq{
        #get Stanford vs. its genbank deposited Seq
        #change it to gi number from Acc#
        $/ = '>';
        open(FILE, $ARGV[2]) || die ("Cannot open AllGSS.fasta");
        while(<FILE>){
                next unless /RescueMu/;
                my @lines = split(/\n/, $_);
                my $firstLine = shift(@lines);
                my ($gi, $sid) = $firstLine =~ /^gi\|(\d+)\S+\|\w+\s+(\S+)/;
                my $Seq = join('', @lines);
                $Seq =~ s/\W+//g; #delete blank and >
                if($sid =~ /EL_/){
                	#an EL seq, check if it contains multipleEL site
                        my @count = $Seq =~ /$BamHI|$BglII|$Mix5|$Mix3/g;
                        my $size = @count;
                        if($size>1){
                                print EL "Multiple EL sites: $sid - excluded from subsequenct assembly\n";
                                next;
                        }
                }
                $StanfordID_GenBankSeq_Hash{$sid} = $Seq;
        }
        close(FILE);
        $/ = "\n";
} #end function

sub Screen_TIR{
        #This function returns realGSS (plus the 9 bpr) from the raw sequence whose TIR was masked as X by crossmatch program
        my ($stanfordID, $tirSeq) = @_;
	my $geneBankSeq = $StanfordID_GenBankSeq_Hash{$stanfordID};
	return 0 if (!defined $geneBankSeq);
        #my @cleanSeqArray = $tirSeq =~ /X+([^X]+)/g; #get seq after XXXXXXX, but sometime we have XXXseq1XXXseq2
        if($tirSeq !~ /$geneBankSeq/){
		#check if GenBank and Screen consistent or not
		#Screen should contain GenBank!!!
                #print STDERR "$stanfordID is NOT Consistent with ScreenX and GenBank!\n";
        	#exit;
		delete $StanfordID_GenBankSeq_Hash{$stanfordID};
		print Screen_GenBank "$stanfordID\n";
		#also delete any possible EL1 or EL2 seqs derived from this seq
		my ($plasmid, $xy) = $stanfordID =~ /(\S+)\.(\S+)/;
		my $EL1 = $plasmid.'.1EL_'.$xy;
		my $EL2 = $plasmid.'.2EL_'.$xy;
		if(exists $StanfordID_GenBankSeq_Hash{$EL1}){
			print Screen_GenBank "$EL1\n";
			delete $StanfordID_GenBankSeq_Hash{$EL1};
		}
                if(exists $StanfordID_GenBankSeq_Hash{$EL2}){
                        print Screen_GenBank "$EL2\n";
                        delete $StanfordID_GenBankSeq_Hash{$EL2};
                }
		return 0;
	}
	#This TIRseq must CONTAIN the seq submitted to the GenBank
	my ($TIRseq) = $tirSeq =~ /X+([^X]*$geneBankSeq)/g; #get seq after XXXXXXX,        
	if(defined $TIRseq){
		#if($TIRseq ne $geneBankSeq){
		#print ">found good TIR that are longer than GenBank Seq\n\n";
		#print "tir:$tirSeq\n\nTIR:$TIRseq\n\nGenBank:$geneBankSeq\n\n\n";
		#}
		return $TIRseq;
	}else{
	        return 0;
	}
} #end function



sub Combine_X_Y{ 
	#print STDERR "Combining (X, Y) ...\n";
	#this function combines X, Y
	my ($Xid, $X_seq, $Yid, $Y_seq) = @_; #get input
	my $CombineSeq;
	my $start_9bpr; #starting pos of the 9bpr #5' to the start of 9bpr
	#my $Xshift; #the number of bp to skip to find 9bpr on the X
	#($CombineSeq,$start_9bpr, $Xshift) = &find9bpr($X_seq, $Y_seq);
	($CombineSeq,$start_9bpr) = &find9bpr($Xid, $X_seq, $Yid, $Y_seq);
	return ($CombineSeq, $start_9bpr);
}#end function

sub RevCompl{
        #this function returns reverse_complement seq
        my ($Seq) = @_;
        $Seq =~ tr/gatcGATC/ctagCTAG/; # complement
        $Seq =~ tr/rykmRYKM/yrmkYRMK/; # complement for the ambiguity letter
                                     # R - purine (G, A)
                                     # Y - pyrimidine (T, C)
                                     # K - Keto (G, T)
                                     # M - Amino (A, C)
        $Seq = reverse $Seq;           # and reverse
        return $Seq;
}#end function

sub find9bpr{
	#input SeqX and SeqY, find 9bpr from SeqX and RevComp(SeqY) and return a combined seq
	#otherwise, return 0
	#note, the searching for 9 bpr has three shifts in case the crossmatch is wrong
	#x: 123456789abcde
	#y: 123456789xecef
	#y: 0123456789....
	#y: 00123456789...
	
	
       	my ($Xid, $SeqX, $Yid, $SeqY) = @_;
        my ($seq1, $seq2, $len);
	my $start_9bpr = '\N';
	my $minMatch = 8; #minimum matching in the 9bp searching window

        #print STDERR "shift SeqY\n";
        $seq1 = substr($SeqX, 0, 9);
        $seq2 = &RevCompl(substr($SeqY, 0, 9));
        #$seq2 = substr($SeqY, 0, 12);
        $len = length($seq1);
        my $same = 0;
        for(my $i=0; $i<$len; $i++){
            my $j = $i;
            my $Seq1_char = substr($seq1, $i, 1);
            my $Seq2_char = substr($seq2, $j, 1);
            $same++ if($Seq1_char eq $Seq2_char);
        }
        if($same >= $minMatch){
            #print STDERR "find 9bpr\n";
	    #now need to choose which 9bpr is the final 9bpr
	    my $rest_SeqX = substr($SeqX, 9);
	    my $rest_SeqX_len = length($rest_SeqX);
	    my $rest_SeqY = substr($SeqY, 9);
	    my $rest_SeqY_len = length($rest_SeqY);
	    $start_9bpr = $rest_SeqY_len; #Y is the left sequence
	    if($same == 9){
		    #print STDERR "they are identical, just choose any one of it!!!\n";
        	    my $combineSeq = &RevCompl($rest_SeqY).$SeqX;
            	    return ($combineSeq, $start_9bpr);
	    }else{
		#There is one base difference between 9bpr from rawX and rawY
		#need to see if genbank X and genbank Y can be used to decide which one to choose 
		my $X_genBank = $StanfordID_GenBankSeq_Hash{$Xid};
		my $X_genBank_len = length($X_genBank);
		my $Y_genBank = $StanfordID_GenBankSeq_Hash{$Yid};
		my $Y_genBank_len = length($Y_genBank);
		#let's find out the extra overhang of X or Y's GenBank sequence vs. $rest_SeqX or $rest_SeqY.
		#the one that has longer overhang (longer high-quality sequences) will be used		
		my $X_overhang = $X_genBank_len - $rest_SeqX_len;
		my $Y_overhang = $Y_genBank_len - $rest_SeqY_len;

if($X_overhang>0 && $Y_overhang>0){
	#print "$Xid\t$Yid\n";

}

		#if both the above overhang are negative, that means none of them has quality character in the 9 pbr region
		if($X_overhang >= $Y_overhang){
			#print STDERR "X might has more high score character in 9 bpr region\n";
			my $combineSeq = &RevCompl($rest_SeqY).$SeqX;
			return ($combineSeq, $start_9bpr);
		}else{
			#print STDERR "Y might has more high score character in 9 bpr region\n";
			my $combineSeq = &RevCompl($SeqY).$rest_SeqX;
			return ($combineSeq, $start_9bpr);
		}
	    }
        }
        return 0;
}

