#!/usr/bin/perl -w #Author: Qunfeng Dong #usage: ./RescueMuPlasmidAssembly.pl AllGridparentalPlasmid.txt RescueMuGSSraw.fsa RescueMuGSS.trim #AllGridparentalPlasmid.txt (output of ParentalVmatchCluster.pl) are the putative parental plasmids ID - exclude them from assembly #RescueMuGSSraw.fsa is from Stanford after they marked TIR as X #RescueMuGSS.trim (output of VectorTrim.pl) is the vector-trim sequences from RescueMuGSS.fsa (downloaded from GenBank) #####################what this script does?############################# # Perform a plasmid assembly: assemble sequences from the same plasmids # step 1: put X, and Y together -> xxxXY # step 2: if XY, put ELX and ELY onto it; otherwise try to put ELX and Y; ELY and X together # (1). PlasmidAssembly result open(PlasmidAssembly, '>RescueMuPlasmidAssembly.fsa') || die("Cannot open output"); # (2). RescueMuXYtuc_memberGSS.txt and EtucMember open(TUCmember, ">RescueMuXYtuc_memberGSS.txt") || die("Cannot open output"); open(Emember, ">RescueMuEtuc_memberGSS.txt"); # (3). Coordinates9bpr_inXY.txt open(TUC9pbr, ">Coordinates9bpr_inXY.txt") || die("Cannot open output"); #it also throw away EL sequence with multiple EL sites open(EL, '>MultipleEL.txt') || die("Cannot open"); #(D). output for bad GenBank sequencs (GenBank seq is not a substring of it corresponding ScreenX seq) open(Screen_GenBank, '>Screen_GenBank.inconsistent'); #also include the derivatives of ELs #(E) some additional txt file for number game open(VALID, ">AllValidGSS.txt"); #all valid RescueMu GSS open(MAX, '>AllMax_member.txt'); ######################################################################## use strict; my $CAP = './cap3Dong'; #this cap3Dong was modified from cap3 by me at grinch4:/usr/local/src/cap3 to change 14 to 0 from one variable ###gobal data and structures my $BamHI = 'GGATCC'; my $BglII = 'AGATCT'; my $Mix5 = 'GGATCT'; my $Mix3 = 'AGATCC'; my %StanfordID_ScreenSeq_Hash; #StanfordID vs. Screened Seq (no Tir but with 9 bp my %StanfordID_GenBankSeq_Hash; #StanfordID vs. GenBank Se my %maxXYEL; #has to record (key) longest X, Y, ELX, ELY from each plasmid # (value) all the X, Y, ELX, ELY from that plasmid #used for build RescueMuXYtuc_memberGSS.txt #the difference between this hash and file XYmax is that it records ALL plasmids. my %AllValidGSS; my %XYtucSeq; #key XYtucID, value: its seq my %AllGridparentalPlasmid; #all parental plasmids that should be excluded from the assembly system("rm -r -f ELalnDIR") if(-e 'ELalnDIR'); system("mkdir ELalnDIR"); print STDERR "Build parental filter ...\n"; open(FILE, "$ARGV[0]") || die("Cannot open $ARGV[0]"); while(){ my ($plasmid) = /^\d+\s+(\S+)/; $AllGridparentalPlasmid{$plasmid} = 1; } close(FILE); print STDERR "Building Hash for sequences...\n"; &GenbankSeq;#build %StanfordID_GenBankSeq_Hash &ScreenSeq; #build %StanfordID_ScreenSeq_Hash print STDERR "Identify valid member GSSs from each plasmid ...\n"; open(XYELmax, '>Plasmid_maxXYELmemberGSS.txt') || die ("Cannot output"); #longest X, Y, ELX, ELY for each plasmid &PlasmidMember; #PlasmidName - X:LongestXmember (all Xs); Y:LongestYmember (all Ys); ELX:LongestELXmember( all ELXs); ELY:longestELYmember(all ELY) close(XYELmax); print STDERR "Assemble each plasmid ...\n"; &Plasmid_Assembly; print STDERR "Assemble is finished!\n"; #functions ----------------------------------------------------------------------------------------- sub PlasmidMember{ #this function sort out all member sequences generated from each plasmids # my %Plasmid_stanfordIDHash; open(FILE, "grep 'Rescue' $ARGV[2] |") || die ("Cannot grep"); while(){ my $line = $_; chomp($line); if($line = /^>gi\|(\d+)\S+\s+(\S+\.\S+)\s+/){ my $gi = $1; my $stanfordID = $2; #this has to be a (1) consistent sequences and (2) NOT a multile EL seq next unless (exists $StanfordID_GenBankSeq_Hash{$stanfordID}); $AllValidGSS{$stanfordID} = 1; my ($plasmid) = $stanfordID =~ /(\w+)\.\S+/; next if(exists $AllGridparentalPlasmid{$plasmid}); #######filter parental plasmids###### if(!exists($Plasmid_stanfordIDHash{$plasmid})){ $Plasmid_stanfordIDHash{$plasmid} = $stanfordID; } else{ $Plasmid_stanfordIDHash{$plasmid} .= "\t$stanfordID"; } } } close(FILE); #output all valid GSS foreach my $ValidGSS (sort keys %AllValidGSS){ print VALID "$ValidGSS\n"; } foreach my $Plasmid (keys %Plasmid_stanfordIDHash){ my $members = $Plasmid_stanfordIDHash{$Plasmid}; my ($maxX, $maxY, $maxELX, $maxELY); my @X = $members =~ /(\S+\.x\S+)/g; if(@X){ $maxX = &LongestSeq(@X); foreach my $mem (@X){ if(exists $maxXYEL{$maxX}){ $maxXYEL{$maxX} .= ":$mem"; }else{ $maxXYEL{$maxX} = $mem; } } } $maxX = 0 if(!defined $maxX); my @Y = $members =~ /(\S+\.y\S+)/g; if(@Y){ $maxY = &LongestSeq(@Y); foreach my $mem (@Y){ if(exists $maxXYEL{$maxY}){ $maxXYEL{$maxY} .= ":$mem"; }else{ $maxXYEL{$maxY} = $mem; } } } $maxY = 0 if(!defined $maxY); # if(defined $maxX && defined $maxY){ # print XYmax "$maxX\t$maxY\n"; # } my @ELX = $members =~ /(\S+EL_x\S+)/g; if(@ELX){ $maxELX = &LongestSeq(@ELX); foreach my $mem (@ELX){ if(exists $maxXYEL{$maxELX}){ $maxXYEL{$maxELX} .= ":$mem"; }else{ $maxXYEL{$maxELX} = $mem; } } } $maxELX = 0 if(!defined $maxELX); my @ELY = $members =~ /(\S+EL_y\S+)/g; if(@ELY){ $maxELY = &LongestSeq(@ELY); foreach my $mem (@ELY){ if(exists $maxXYEL{$maxELY}){ $maxXYEL{$maxELY} .= ":$mem"; }else{ $maxXYEL{$maxELY} = $mem; } } } $maxELY = 0 if(!defined $maxELY); print XYELmax "maxX:$maxX\tmaxY:$maxY\tmaxELX:$maxELX\tmaxELY:$maxELY\n"; ###################################################################################################### #only those LONG sequences recorded in %maxXYEL are going to the subsequent assembly!!!!!!!!!!!!!!!! # ###################################################################################################### } foreach my $max (keys %maxXYEL){ my @mems = split(/:/, $maxXYEL{$max}); foreach my $mem (@mems){ print MAX "$max\t$mem\n"; } } } sub LongestSeq{ my (@SeqIDArray) = @_; #input is a list of StanfordIDs my $max = 0; my $maxSeqID; foreach my $SeqID (@SeqIDArray){ my $Seq = $StanfordID_GenBankSeq_Hash{$SeqID}; my $len = length($Seq); if($len > $max){ $max = $len; $maxSeqID = $SeqID; } } return $maxSeqID; } sub Plasmid_Assembly{ open(MEMBER, 'Plasmid_maxXYELmemberGSS.txt') || die ("Plasmid_maxXYELmemberGSS.txt"); while(){ my ($maxX, $maxY, $maxELX, $maxELY) = /maxX:(\S+)\s+maxY:(\S+)\s+maxELX:(\S+)\s+maxELY:(\S+)/; my @ELassembly; #IDs for participting in EL_assembly my $XYtucID; if($maxX && $maxY){ #print STDERR "this plasmid has both the X and Y sequences\n"; $XYtucID = &Assemble_X_Y($maxX, $maxY); push(@ELassembly, $XYtucID) if($XYtucID); } unless(defined $XYtucID && $XYtucID){ push(@ELassembly, $maxX) if($maxX); push(@ELassembly, $maxY) if($maxY); } if($maxELX || $maxELY){ push(@ELassembly, $maxELX) if($maxELX); push(@ELassembly, $maxELY) if($maxELY); &Assemble_EL(@ELassembly); }else{ #no EL seq if(defined $XYtucID && $XYtucID){ print PlasmidAssembly ">$XYtucID\n$XYtucSeq{$XYtucID}\n"; }else{ print PlasmidAssembly ">$maxX\n$StanfordID_GenBankSeq_Hash{$maxX}\n" if($maxX); print PlasmidAssembly ">$maxY\n$StanfordID_GenBankSeq_Hash{$maxY}\n" if($maxY); } } }#end while close(MEMBER); } sub Assemble_EL{ my @SeqIDs = @_; my $plasmid; open(CAP, '>TempCap3Input') || die("Cannot open Cap3Input"); foreach my $ID (@SeqIDs){ print STDERR "$ID\n"; my $Seq; if($ID =~ /(\S+)XY$/){ $Seq = $XYtucSeq{$ID}; $plasmid = $1; }else{ $Seq = $StanfordID_GenBankSeq_Hash{$ID}; ($plasmid) = $ID =~ /(\S+)\./; } print CAP ">$ID\n$Seq\n"; } close(CAP); #then we ran cap3, if any contig can be generated, I call it xxxxEtuc1, xxxxxEtuc2 #for now I will just save the alignment my $CapInputFileName = $plasmid.'ELaln.fsa'; system("mv TempCap3Input $CapInputFileName"); system("$CAP $CapInputFileName -o 21 -p 95 -s 401 > $CapInputFileName.Cap3Align"); my $CapContigFileName = $CapInputFileName.'.cap.contigs'; my $CapSingletFileName = $CapInputFileName.'.cap.singlets'; my $ELtucName = $plasmid.'E'; $/ = '>'; open(FILE, "$CapContigFileName") || die("Cannot open cap contig"); while(){ next unless (/\w/); my $block = $_; $block =~ s/>$//; $block =~ s/Contig/$ELtucName/; print PlasmidAssembly ">$block"; } close(FILE); $/ = "\n"; $/ = '>'; open(FILE, "$CapSingletFileName") || die("Cannot open cap singlet"); while(){ next unless (/\w/); my $block = $_; $block =~ s/>$//; print PlasmidAssembly ">$block"; } close(FILE); $/ = "\n"; $/ = "Contig"; my $NumContig = qx(grep '>' $CapContigFileName | wc -l); my $NumSinglet = qx(grep '>' $CapSingletFileName | wc -l); my $lastFlag = 0; open(CAPOUT, "$CapInputFileName.Cap3Align") || die("Cannot open"); while(){ next if /^Number/; my $members; my $CapOutBlock = $_; chomp($CapOutBlock); $lastFlag = 1 if($CapOutBlock =~ /DETAILED DISPLAY OF CONTIGS/); my @lineArray = split(/\n/, $CapOutBlock); my $EtucID = $ELtucName; foreach my $line (@lineArray){ $line =~ s/^\s+//g; #delete preceding space if($line =~ /^(\d+)\s+\*\*\*/s){ $EtucID = $ELtucName.$1; } if($line =~ /^(\S+)([+|-])/){ print Emember "$EtucID\t$1\n"; } } last if ($lastFlag); }#end of while #only need finish checing the CapOutput close(CAPOUT); $/ = "\n"; system("mv $CapInputFileName* ELalnDIR"); } sub Assemble_X_Y{ my ($Xid, $Yid) = @_; my ($plasmid) = $Xid =~ /(\S+)\./; my $XscreenSeq = $StanfordID_ScreenSeq_Hash{$Xid}; my $YscreenSeq = $StanfordID_ScreenSeq_Hash{$Yid}; if(defined $XscreenSeq && defined $YscreenSeq){ my ($CombineSeq, $start_9bpr, $contigName); my $CombineID = $Xid.':'.$Yid; ($CombineSeq, $start_9bpr) = &Combine_X_Y($Xid, $XscreenSeq, $Yid, $YscreenSeq); if($CombineSeq){ #one of the few case print output here, since Combine_X_Y is called elsewhere #print STDERR "One GSS contig is assembled from (X, Y)\n"; $contigName = "$plasmid".'XY'; #record the corridnates of the 9bpr in CombineSeq $start_9bpr = $start_9bpr + 1; print TUC9pbr "$CombineID\t$start_9bpr\n"; my @member = split(/:/,$CombineID); foreach my $member (@member){ my @allMember = split(/:/, $maxXYEL{$member}); #get also the shorter member foreach my $allmember (@allMember){ print TUCmember "$contigName\t$allmember\n"; delete $AllValidGSS{$allmember}; } } #print TUC ">$contigName\n$CombineSeq\n"; $XYtucSeq{$contigName} = $CombineSeq; return $contigName; } } return 0; }#end function sub ScreenSeq{ #Screened Seq: Raw Seq - TIR (still have the 9 bp #read screenx file and build a hash StanfordID vs. screened Seq #Be careful, cannot assume Stanford has every seq containingXXXXX as it should #need to trim off the extra edge sequneces from X, Y that are NOT submitted into GenBank $/ = '>'; open(FILE, $ARGV[1]) || die ("Cannot open screenx"); while(){ next unless /\w+/; my @lines = split(/\n/, $_); my $firstLine = shift(@lines); my ($stanfordID) = $firstLine =~ /^(\S+)\s+/; my $Seq = join('', @lines); $Seq =~ s/\W+//g; #delete blank and > $Seq = &Screen_TIR($stanfordID, $Seq); if($Seq){ $StanfordID_ScreenSeq_Hash{$stanfordID} = $Seq; } } close(FILE); $/ = "\n"; } #end function sub GenbankSeq{ #get Stanford vs. its genbank deposited Seq #change it to gi number from Acc# $/ = '>'; open(FILE, $ARGV[2]) || die ("Cannot open AllGSS.fasta"); while(){ next unless /RescueMu/; my @lines = split(/\n/, $_); my $firstLine = shift(@lines); my ($gi, $sid) = $firstLine =~ /^gi\|(\d+)\S+\|\w+\s+(\S+)/; my $Seq = join('', @lines); $Seq =~ s/\W+//g; #delete blank and > if($sid =~ /EL_/){ #an EL seq, check if it contains multipleEL site my @count = $Seq =~ /$BamHI|$BglII|$Mix5|$Mix3/g; my $size = @count; if($size>1){ print EL "Multiple EL sites: $sid - excluded from subsequenct assembly\n"; next; } } $StanfordID_GenBankSeq_Hash{$sid} = $Seq; } close(FILE); $/ = "\n"; } #end function sub Screen_TIR{ #This function returns realGSS (plus the 9 bpr) from the raw sequence whose TIR was masked as X by crossmatch program my ($stanfordID, $tirSeq) = @_; my $geneBankSeq = $StanfordID_GenBankSeq_Hash{$stanfordID}; return 0 if (!defined $geneBankSeq); #my @cleanSeqArray = $tirSeq =~ /X+([^X]+)/g; #get seq after XXXXXXX, but sometime we have XXXseq1XXXseq2 if($tirSeq !~ /$geneBankSeq/){ #check if GenBank and Screen consistent or not #Screen should contain GenBank!!! #print STDERR "$stanfordID is NOT Consistent with ScreenX and GenBank!\n"; #exit; delete $StanfordID_GenBankSeq_Hash{$stanfordID}; print Screen_GenBank "$stanfordID\n"; #also delete any possible EL1 or EL2 seqs derived from this seq my ($plasmid, $xy) = $stanfordID =~ /(\S+)\.(\S+)/; my $EL1 = $plasmid.'.1EL_'.$xy; my $EL2 = $plasmid.'.2EL_'.$xy; if(exists $StanfordID_GenBankSeq_Hash{$EL1}){ print Screen_GenBank "$EL1\n"; delete $StanfordID_GenBankSeq_Hash{$EL1}; } if(exists $StanfordID_GenBankSeq_Hash{$EL2}){ print Screen_GenBank "$EL2\n"; delete $StanfordID_GenBankSeq_Hash{$EL2}; } return 0; } #This TIRseq must CONTAIN the seq submitted to the GenBank my ($TIRseq) = $tirSeq =~ /X+([^X]*$geneBankSeq)/g; #get seq after XXXXXXX, if(defined $TIRseq){ #if($TIRseq ne $geneBankSeq){ #print ">found good TIR that are longer than GenBank Seq\n\n"; #print "tir:$tirSeq\n\nTIR:$TIRseq\n\nGenBank:$geneBankSeq\n\n\n"; #} return $TIRseq; }else{ return 0; } } #end function sub Combine_X_Y{ #print STDERR "Combining (X, Y) ...\n"; #this function combines X, Y my ($Xid, $X_seq, $Yid, $Y_seq) = @_; #get input my $CombineSeq; my $start_9bpr; #starting pos of the 9bpr #5' to the start of 9bpr #my $Xshift; #the number of bp to skip to find 9bpr on the X #($CombineSeq,$start_9bpr, $Xshift) = &find9bpr($X_seq, $Y_seq); ($CombineSeq,$start_9bpr) = &find9bpr($Xid, $X_seq, $Yid, $Y_seq); return ($CombineSeq, $start_9bpr); }#end function sub RevCompl{ #this function returns reverse_complement seq my ($Seq) = @_; $Seq =~ tr/gatcGATC/ctagCTAG/; # complement $Seq =~ tr/rykmRYKM/yrmkYRMK/; # complement for the ambiguity letter # R - purine (G, A) # Y - pyrimidine (T, C) # K - Keto (G, T) # M - Amino (A, C) $Seq = reverse $Seq; # and reverse return $Seq; }#end function sub find9bpr{ #input SeqX and SeqY, find 9bpr from SeqX and RevComp(SeqY) and return a combined seq #otherwise, return 0 #note, the searching for 9 bpr has three shifts in case the crossmatch is wrong #x: 123456789abcde #y: 123456789xecef #y: 0123456789.... #y: 00123456789... my ($Xid, $SeqX, $Yid, $SeqY) = @_; my ($seq1, $seq2, $len); my $start_9bpr = '\N'; my $minMatch = 8; #minimum matching in the 9bp searching window #print STDERR "shift SeqY\n"; $seq1 = substr($SeqX, 0, 9); $seq2 = &RevCompl(substr($SeqY, 0, 9)); #$seq2 = substr($SeqY, 0, 12); $len = length($seq1); my $same = 0; for(my $i=0; $i<$len; $i++){ my $j = $i; my $Seq1_char = substr($seq1, $i, 1); my $Seq2_char = substr($seq2, $j, 1); $same++ if($Seq1_char eq $Seq2_char); } if($same >= $minMatch){ #print STDERR "find 9bpr\n"; #now need to choose which 9bpr is the final 9bpr my $rest_SeqX = substr($SeqX, 9); my $rest_SeqX_len = length($rest_SeqX); my $rest_SeqY = substr($SeqY, 9); my $rest_SeqY_len = length($rest_SeqY); $start_9bpr = $rest_SeqY_len; #Y is the left sequence if($same == 9){ #print STDERR "they are identical, just choose any one of it!!!\n"; my $combineSeq = &RevCompl($rest_SeqY).$SeqX; return ($combineSeq, $start_9bpr); }else{ #There is one base difference between 9bpr from rawX and rawY #need to see if genbank X and genbank Y can be used to decide which one to choose my $X_genBank = $StanfordID_GenBankSeq_Hash{$Xid}; my $X_genBank_len = length($X_genBank); my $Y_genBank = $StanfordID_GenBankSeq_Hash{$Yid}; my $Y_genBank_len = length($Y_genBank); #let's find out the extra overhang of X or Y's GenBank sequence vs. $rest_SeqX or $rest_SeqY. #the one that has longer overhang (longer high-quality sequences) will be used my $X_overhang = $X_genBank_len - $rest_SeqX_len; my $Y_overhang = $Y_genBank_len - $rest_SeqY_len; if($X_overhang>0 && $Y_overhang>0){ #print "$Xid\t$Yid\n"; } #if both the above overhang are negative, that means none of them has quality character in the 9 pbr region if($X_overhang >= $Y_overhang){ #print STDERR "X might has more high score character in 9 bpr region\n"; my $combineSeq = &RevCompl($rest_SeqY).$SeqX; return ($combineSeq, $start_9bpr); }else{ #print STDERR "Y might has more high score character in 9 bpr region\n"; my $combineSeq = &RevCompl($SeqY).$rest_SeqX; return ($combineSeq, $start_9bpr); } } } return 0; }