#!/usr/bin/perl -w
#input 	
#	 	(1): RescueMuGSS.fsa #for gi, acc, plasmid info
#               (2). ZM_RM_GSS.FinalTUG.fasta (extract singlets and record non-parentals)
#		(3). ZM_RM_GSS.TUC_GSS #output of  RescueMuTUG.pl; note: it does not recursively include members from XY or Etuc; but it's still Ok to use it

#usage: ./UniqueRMLoci.pl ../TIGRrepeatTrim_RescueMuGSS ZM_RM_GSS.TUG.fasta ZM_RM_GSS.TUC_GSS  > UniqueRMLoci.txt
use strict;

open(MERGE, ">MergeProcess.txt") || die("Cannot open MergeProcess.txt");

print STDERR "\nRetrieving Plasmid Seq info  ...\n";
my %Plasmid_memberGSS;
my %memberGSS_Plasmid;
open(FILE, "grep '>' $ARGV[0] | ") || die ("Cannot grep $ARGV[0]");
while(<FILE>){
        my ($gi, $plasmid) = /gi\|(\d+)\S+\s+(\S+)\./;
        if(exists $Plasmid_memberGSS{$plasmid}){
		$Plasmid_memberGSS{$plasmid} .= ":$gi";
        }else{
		$Plasmid_memberGSS{$plasmid} = $gi;
        }
	$memberGSS_Plasmid{$gi} = $plasmid;
}
close(FILE);


my %Plasmid_Assembly;
my %FinalTUG;
#get singlet info
open(FILE, "grep '>' $ARGV[1] | ") || die ("Cannot open $ARGV[1]");
while(<FILE>){
	my ($tug) = />(\S+)/;
	if($tug !~ /tuc/){ 
		my $singlet = $tug;
		my $plasmid = $memberGSS_Plasmid{$singlet};
        	if(exists $Plasmid_Assembly{$plasmid}){
                	$Plasmid_Assembly{$plasmid} .= ":$singlet";
        	}else{
                	$Plasmid_Assembly{$plasmid} = $singlet;
        	}
	}
	$FinalTUG{$tug} = 1;
}
close(FILE);


#get contig info
open(FILE, "$ARGV[2]") || die ("Cannot open $ARGV[2]");
while(<FILE>){
        my ($contig, $GSS) = /^(\S+)\s+(\S+)/;
	next unless (exists $FinalTUG{$contig});
        my $plasmid;
        if($GSS =~ /(\S+)XY$/ || $GSS =~ /(\S+)E\d$/){
            #the member is a XYtuc or Etuc
                $plasmid = $1;
        }else{
                $plasmid = $memberGSS_Plasmid{$GSS};
        }
        if(exists $Plasmid_Assembly{$plasmid}){
                $Plasmid_Assembly{$plasmid} .= ":$contig";
        }else{
                $Plasmid_Assembly{$plasmid} = $contig;
        }
}
close(FILE);

foreach my $Plasmid (keys %Plasmid_Assembly){
        $Plasmid_Assembly{$Plasmid} = UniqName($Plasmid_Assembly{$Plasmid});
}



print STDERR "\nMerge assembly if they share the same plasmid\n";
my %Unique_Plasmid_Assembly;
if(&Merge()){
	my $UniqueRegionNum = 0;
	foreach my $key (keys (%Unique_Plasmid_Assembly)){
    		$UniqueRegionNum++;
		print "$UniqueRegionNum\t$key\t$Unique_Plasmid_Assembly{$key}\n";    
}

print STDERR "Done!\n";


}#end Merge())


sub UniqName{
        #for example, X:Y:X should be reduced to X:Y
        my ($SeqName) = shift;
        my %NameHash;
        my @NameArray = split(/:/, $SeqName);
        foreach my $name (@NameArray){
                if($name =~ /\w+/ && !exists $NameHash{$name}){
                        $NameHash{$name} = 1;
                }
        }
        my $NewName;
        foreach my $name (sort (keys %NameHash)){
                if(defined $NewName){
                        $NewName .= $name.':';
                }else{
                        $NewName = $name.':';
                }
        }
        $NewName =~ s/^\s+//;
        $NewName =~ s/:$//;
        return $NewName;
}

sub Overlap{
        #check if A and B intersect
        #A = a1:a2:a3...
        #B = b1:b2:b3...
        my ($A, $B) = @_;
        my $overlap = 0;
        my @Aarray = split(/:/, $A);
        my @Barray = split(/:/, $B);
        foreach my $a (@Aarray){
                foreach my $b (@Barray){
                        if($a eq $b){
                                $overlap = 1;
                                last;
                        }
                }
                last if ($overlap);
        }
        return $overlap;
}

sub Merge{
  while(%Plasmid_Assembly){
        my  @PlasmidsArray = keys (%Plasmid_Assembly);
        my $size = @PlasmidsArray;
        print STDERR "merging size is $size\n";
        my $mergeFlag = 0;
        my $MergePlasmid = $PlasmidsArray[0];
        my $MergeAssembly = $Plasmid_Assembly{$PlasmidsArray[0]};
        my @delete;
        for(my $i=1; $i<$size; $i++){
                my $first_assembly = $Plasmid_Assembly{$PlasmidsArray[0]};
                my $I_assembly = $Plasmid_Assembly{$PlasmidsArray[$i]};
                if(&Overlap($first_assembly, $I_assembly)){
                        $MergePlasmid .= ":$PlasmidsArray[$i]";
                        $MergeAssembly .= ":$I_assembly";
                        $mergeFlag = 1;
                        push(@delete, $PlasmidsArray[$i]);
			print MERGE "Plasmid: $PlasmidsArray[0]\t$PlasmidsArray[$i]\nAssembly: $first_assembly\t$I_assembly\n\n";
                }
        }
        $MergePlasmid =~ s/\s+//g;
        $MergePlasmid =~ s/^://;
        $MergeAssembly =~ s/\s+//g;
        $MergeAssembly =~ s/^://;
        $MergePlasmid = &UniqName($MergePlasmid);
        $MergeAssembly = &UniqName($MergeAssembly);
        if($mergeFlag){
                push(@delete, $PlasmidsArray[0]);
                foreach my $delete (@delete){
                	delete $Plasmid_Assembly{$delete};
                }
                $Plasmid_Assembly{$MergePlasmid} = $MergeAssembly;
        }else{
                $Unique_Plasmid_Assembly{$MergePlasmid} = $MergeAssembly;
                delete $Plasmid_Assembly{$MergePlasmid};
        }
  }#end while
  return 1
}


