#!/usr/bin/perl -w
#Author: Qunfeng Dong
#after crossmath or vmatch to mask vector seq, trim them off
#usage: ./RepeatTrim.pl RescueMuPlasmidAssembly.fsa TIGRrepeatMask_RescueMuPlasmidAssembly.fsa  > TIGRrepeatTrim_RescueMuPlasmidAssembly.fsa

use strict;

#After trimming, if valid seq is less than 30 bp, consider the sequences as vector and throw them away
open(REPEAT, ">RepeatSeq.fsa") || die("cannot output");
open(DOUBT, '>doubt.txt');

my %SEQ; #original unmask seq
$/ = '>';
open(FILE, "$ARGV[0]") || die("Cannot open");
while(<FILE>){
        next unless /\w/;
        my @lines = split(/\n/, $_);
        my $title = shift(@lines);
	my ($ID) = $title =~ /(\S+)/;
        my $Seq = join('', @lines);
        $Seq =~ s/\W+//g;
        $SEQ{$ID} = $Seq;
}
close(FILE);
$/ = "\n";
$/ = '>';
open(FILE, "$ARGV[1]") || die("Cannot open $ARGV[1]"); 
while(<FILE>){
	next unless /\w/;
	my @lines = split(/\n/, $_);
	my $title = shift(@lines);
	my ($ID) = $title =~ /(\S+)/;
	my $Seq = join('', @lines);
	$Seq =~ s/\W+//g;
	my $seq = $Seq;
	$Seq =~ s/X+/X/g; #replace XXXXXXX with just one X
	my @segments = split(/X/, $Seq);
	my $LongSegCount = 0; #how many long seg after split; ideally just 1
	my $LongSegSeq;
	foreach my $segment (@segments){
		next unless ($segment =~ /\w/);
		my $len = length($segment);
		if($len >=30){
			$LongSegCount++; #check if the segment length is longer than 100
			$LongSegSeq = $segment;
		}
	}
	if($LongSegCount == 1){
		#ideal case
		print ">$title\n$LongSegSeq\n";
	}elsif($LongSegCount > 1){
		#need to manual review such masking case!!!!!!
		#randomly checked on NCBI uniq-vect database, doesn't look like vector
		print DOUBT ">$title\n$seq\n";
		#my ($gi) = $title =~ /gi\|(\d+)/;
		#print ">$title\n$SEQ{$gi}\n";
	}elsif($LongSegCount == 0){
		#this seq is vector seq
		print REPEAT ">$title\n$seq\n";

	}else{
		print STDERR "Something is not right with LongSeqCount!\n";
		exit;
	}

}
close(FILE);

