#!/usr/bin/perl -w #Author: Qunfeng Dong #after crossmath or vmatch to mask vector seq, trim them off #usage: ./RepeatTrim.pl RescueMuPlasmidAssembly.fsa TIGRrepeatMask_RescueMuPlasmidAssembly.fsa > TIGRrepeatTrim_RescueMuPlasmidAssembly.fsa use strict; #After trimming, if valid seq is less than 30 bp, consider the sequences as vector and throw them away open(REPEAT, ">RepeatSeq.fsa") || die("cannot output"); open(DOUBT, '>doubt.txt'); my %SEQ; #original unmask seq $/ = '>'; open(FILE, "$ARGV[0]") || die("Cannot open"); while(){ next unless /\w/; my @lines = split(/\n/, $_); my $title = shift(@lines); my ($ID) = $title =~ /(\S+)/; my $Seq = join('', @lines); $Seq =~ s/\W+//g; $SEQ{$ID} = $Seq; } close(FILE); $/ = "\n"; $/ = '>'; open(FILE, "$ARGV[1]") || die("Cannot open $ARGV[1]"); while(){ next unless /\w/; my @lines = split(/\n/, $_); my $title = shift(@lines); my ($ID) = $title =~ /(\S+)/; my $Seq = join('', @lines); $Seq =~ s/\W+//g; my $seq = $Seq; $Seq =~ s/X+/X/g; #replace XXXXXXX with just one X my @segments = split(/X/, $Seq); my $LongSegCount = 0; #how many long seg after split; ideally just 1 my $LongSegSeq; foreach my $segment (@segments){ next unless ($segment =~ /\w/); my $len = length($segment); if($len >=30){ $LongSegCount++; #check if the segment length is longer than 100 $LongSegSeq = $segment; } } if($LongSegCount == 1){ #ideal case print ">$title\n$LongSegSeq\n"; }elsif($LongSegCount > 1){ #need to manual review such masking case!!!!!! #randomly checked on NCBI uniq-vect database, doesn't look like vector print DOUBT ">$title\n$seq\n"; #my ($gi) = $title =~ /gi\|(\d+)/; #print ">$title\n$SEQ{$gi}\n"; }elsif($LongSegCount == 0){ #this seq is vector seq print REPEAT ">$title\n$seq\n"; }else{ print STDERR "Something is not right with LongSeqCount!\n"; exit; } } close(FILE);