#!/usr/bin/perl -w #input # (1): RescueMuGSS.fsa #for gi, acc, plasmid info # (2). ZM_RM_GSS.FinalTUG.fasta (extract singlets and record non-parentals) # (3). ZM_RM_GSS.TUC_GSS #output of RescueMuTUG.pl; note: it does not recursively include members from XY or Etuc; but it's still Ok to use it #usage: ./UniqueRMLoci.pl ../TIGRrepeatTrim_RescueMuGSS ZM_RM_GSS.TUG.fasta ZM_RM_GSS.TUC_GSS > UniqueRMLoci.txt use strict; open(MERGE, ">MergeProcess.txt") || die("Cannot open MergeProcess.txt"); print STDERR "\nRetrieving Plasmid Seq info ...\n"; my %Plasmid_memberGSS; my %memberGSS_Plasmid; open(FILE, "grep '>' $ARGV[0] | ") || die ("Cannot grep $ARGV[0]"); while(){ my ($gi, $plasmid) = /gi\|(\d+)\S+\s+(\S+)\./; if(exists $Plasmid_memberGSS{$plasmid}){ $Plasmid_memberGSS{$plasmid} .= ":$gi"; }else{ $Plasmid_memberGSS{$plasmid} = $gi; } $memberGSS_Plasmid{$gi} = $plasmid; } close(FILE); my %Plasmid_Assembly; my %FinalTUG; #get singlet info open(FILE, "grep '>' $ARGV[1] | ") || die ("Cannot open $ARGV[1]"); while(){ my ($tug) = />(\S+)/; if($tug !~ /tuc/){ my $singlet = $tug; my $plasmid = $memberGSS_Plasmid{$singlet}; if(exists $Plasmid_Assembly{$plasmid}){ $Plasmid_Assembly{$plasmid} .= ":$singlet"; }else{ $Plasmid_Assembly{$plasmid} = $singlet; } } $FinalTUG{$tug} = 1; } close(FILE); #get contig info open(FILE, "$ARGV[2]") || die ("Cannot open $ARGV[2]"); while(){ my ($contig, $GSS) = /^(\S+)\s+(\S+)/; next unless (exists $FinalTUG{$contig}); my $plasmid; if($GSS =~ /(\S+)XY$/ || $GSS =~ /(\S+)E\d$/){ #the member is a XYtuc or Etuc $plasmid = $1; }else{ $plasmid = $memberGSS_Plasmid{$GSS}; } if(exists $Plasmid_Assembly{$plasmid}){ $Plasmid_Assembly{$plasmid} .= ":$contig"; }else{ $Plasmid_Assembly{$plasmid} = $contig; } } close(FILE); foreach my $Plasmid (keys %Plasmid_Assembly){ $Plasmid_Assembly{$Plasmid} = UniqName($Plasmid_Assembly{$Plasmid}); } print STDERR "\nMerge assembly if they share the same plasmid\n"; my %Unique_Plasmid_Assembly; if(&Merge()){ my $UniqueRegionNum = 0; foreach my $key (keys (%Unique_Plasmid_Assembly)){ $UniqueRegionNum++; print "$UniqueRegionNum\t$key\t$Unique_Plasmid_Assembly{$key}\n"; } print STDERR "Done!\n"; }#end Merge()) sub UniqName{ #for example, X:Y:X should be reduced to X:Y my ($SeqName) = shift; my %NameHash; my @NameArray = split(/:/, $SeqName); foreach my $name (@NameArray){ if($name =~ /\w+/ && !exists $NameHash{$name}){ $NameHash{$name} = 1; } } my $NewName; foreach my $name (sort (keys %NameHash)){ if(defined $NewName){ $NewName .= $name.':'; }else{ $NewName = $name.':'; } } $NewName =~ s/^\s+//; $NewName =~ s/:$//; return $NewName; } sub Overlap{ #check if A and B intersect #A = a1:a2:a3... #B = b1:b2:b3... my ($A, $B) = @_; my $overlap = 0; my @Aarray = split(/:/, $A); my @Barray = split(/:/, $B); foreach my $a (@Aarray){ foreach my $b (@Barray){ if($a eq $b){ $overlap = 1; last; } } last if ($overlap); } return $overlap; } sub Merge{ while(%Plasmid_Assembly){ my @PlasmidsArray = keys (%Plasmid_Assembly); my $size = @PlasmidsArray; print STDERR "merging size is $size\n"; my $mergeFlag = 0; my $MergePlasmid = $PlasmidsArray[0]; my $MergeAssembly = $Plasmid_Assembly{$PlasmidsArray[0]}; my @delete; for(my $i=1; $i<$size; $i++){ my $first_assembly = $Plasmid_Assembly{$PlasmidsArray[0]}; my $I_assembly = $Plasmid_Assembly{$PlasmidsArray[$i]}; if(&Overlap($first_assembly, $I_assembly)){ $MergePlasmid .= ":$PlasmidsArray[$i]"; $MergeAssembly .= ":$I_assembly"; $mergeFlag = 1; push(@delete, $PlasmidsArray[$i]); print MERGE "Plasmid: $PlasmidsArray[0]\t$PlasmidsArray[$i]\nAssembly: $first_assembly\t$I_assembly\n\n"; } } $MergePlasmid =~ s/\s+//g; $MergePlasmid =~ s/^://; $MergeAssembly =~ s/\s+//g; $MergeAssembly =~ s/^://; $MergePlasmid = &UniqName($MergePlasmid); $MergeAssembly = &UniqName($MergeAssembly); if($mergeFlag){ push(@delete, $PlasmidsArray[0]); foreach my $delete (@delete){ delete $Plasmid_Assembly{$delete}; } $Plasmid_Assembly{$MergePlasmid} = $MergeAssembly; }else{ $Unique_Plasmid_Assembly{$MergePlasmid} = $MergeAssembly; delete $Plasmid_Assembly{$MergePlasmid}; } }#end while return 1 }