#!/usr/bin/perl -w #Author: Qunfeng Dong #after crossmath or vmatch to mask vector seq, trim them off #usage: ./VectorTrim.pl RescueMuGSS RescueMuGSS.mask > RescueMuGSS.trim use strict; #After trimming, if valid seq is less than 30 bp, consider the sequences as vector and throw them away open(VECTOR, ">VectorSeq.fsa") || die("cannot output"); my %SEQ; #original unmask seq $/ = '>'; open(FILE, "$ARGV[0]") || die("Cannot open"); while(){ next unless /\w/; my @lines = split(/\n/, $_); my $title = shift(@lines); my ($gi) = $title =~ /gi\|(\d+)/; my $Seq = join('', @lines); $Seq =~ s/\W+//g; $SEQ{$gi} = $Seq; } close(FILE); $/ = "\n"; $/ = '>'; open(FILE, "$ARGV[1]") || die("Cannot open $ARGV[1]"); while(){ next unless /\w/; my @lines = split(/\n/, $_); my $title = shift(@lines); $title =~ s/\s+$//g; my $Seq = join('', @lines); $Seq =~ s/\W+//g; my $seq = $Seq; $Seq =~ s/X+/X/g; #replace XXXXXXX with just one X my @segments = split(/X/, $Seq); my $LongSegCount = 0; #how many long seg after split; ideally just 1 my $LongSegSeq; foreach my $segment (@segments){ next unless ($segment =~ /\w/); my $len = length($segment); if($len >=15){ $LongSegCount++; #check if the segment length is longer than 15 $LongSegSeq = $segment; } } if($LongSegCount == 1){ #ideal case but still need to check if the trim seq is greater than 30 my $len = length($LongSegSeq); if($len < 30){ #not enough print VECTOR ">$title\n$seq\n"; }else{ print ">$title\n$LongSegSeq\n"; } }elsif($LongSegCount > 1){ #need to manual review such masking case #randomly checked on NCBI uniq-vect database, doesn't look like vector #print DOUBT ">$title\n$seq\n"; my ($gi) = $title =~ /gi\|(\d+)/; print ">$title\n$SEQ{$gi}\n"; }elsif($LongSegCount == 0){ #this seq is vector seq print VECTOR ">$title\n$seq\n"; }else{ print STDERR "Something is not right with LongSeqCount!\n"; exit; } } close(FILE);