#!/usr/bin/perl -w
#./RandomSeqCluster.pl GridGparental.txt RescueMuXYPlasmidAssembly.fsa TIGRrepeatTrim_RescueMuGSS match.GridG_XYseq.fsa > GridGparental_randomSeq.fsa
#Author: Qunfeng Dong

use strict;

my %ParentalCluster; #vmatch cluster ID that are parentals
open(FILE, "$ARGV[0]") || die("Cannot open");
while(<FILE>){
	my ($clusterID) = /^(\d+)/;
	$ParentalCluster{$clusterID} = 1;
}
close(FILE);

my %XY_SEQ;
$/ = ">";
open(FILE, "$ARGV[1]") || die ("Cannot open");
while(<FILE>){
        next unless /\w/;
        my @lines = split(/\n/, $_);
        my ($ID) = $lines[0] =~ /(\S+)/;
        shift @lines;
        my $Seq = join('', @lines);
        $Seq =~ s/\W+//g;
	$XY_SEQ{$ID} = $Seq;
}
close(FILE);
$/ = "\n";

my %XorY_SEQ;
$/ = ">";
open(FILE, "$ARGV[2]") || die ("Cannot open");
while(<FILE>){
        next unless /\w/;
        my @lines = split(/\n/, $_);
        my ($ID) = $lines[0] =~ /gi\S+\s+(\S+)/;
        shift @lines;
        my $Seq = join('', @lines);
        $Seq =~ s/\W+//g;
        $XorY_SEQ{$ID} = $Seq;
}
close(FILE);
$/ = "\n";


my %UniquePlasmid; #don't include seq from the same plasmid more than once
$/ = ":\n";
my $clusterID;
open(FILE, "grep -v '^#' $ARGV[3] |") || die("Cannot open $ARGV[2]");
while(<FILE>){
        my $block = $_;
	my ($cluster) = $block =~ /(\d+):/;
	if(defined $cluster){
		$clusterID = $cluster;
		$clusterID--;
	}else{
		$clusterID++;
	}
	next unless (exists $ParentalCluster{$clusterID});	
	$block =~ s/\d+://;
        my @IDs = $block =~ /gi\S+\s+(\S+)/g;
	my (@XYs, @XorY);
	foreach my $ID (@IDs){
                my ($Plasmid) = $ID =~ /(\S+)\./;
                my $XY = $Plasmid.'XY';
                my ($SeqID, $Seq);
		#check if this plasmid has a Etuc that has member of XYtuc
		#then check if this plasmid has a XYtuc
		#otherwise next
		#print STDERR "Plasmid = $randomPlasmid\n";
		if(exists $XY_SEQ{$XY}){
			$SeqID = $XY;
			$Seq = $XY_SEQ{$SeqID}; 
			my $len = length($Seq);
			push(@XYs, $SeqID);
		}else{
			$SeqID = $ID;
			$Seq = $XorY_SEQ{$SeqID};
			my $len = length($Seq);
			push(@XorY, $SeqID);
		}
	}
	my $sizeOfXY = @XYs;
	print STDERR "cluster $clusterID has $sizeOfXY XYs\n";
        my $sizeOfXorY = @XorY;
        print STDERR "cluster $clusterID has $sizeOfXorY X or Y\n";
	my @seqArray;
	my $totalRetrieval = 0;
	my $totalRandom = 50; #want total 50 random number
	if($sizeOfXY<$totalRandom){
		#not enough good XYs; then we will take all the XYs plus some X or Y
		foreach my $XY (@XYs){
			print ">$XY\n$XY_SEQ{$XY}\n";
			$totalRetrieval++;
		}
		$totalRandom = $totalRandom - $sizeOfXY;
		@seqArray = @XorY;
		if($sizeOfXorY<$totalRandom){
			#not even enough good X or Ys; then we will take them all
	                foreach my $XorY (@XorY){
        	                print ">$XorY\n$XorY_SEQ{$XorY}\n";
                	        $totalRetrieval++;
                	}
		}
	}else{
		@seqArray = @XYs;
	}
	my $size = @seqArray;
	my $count = 0;
	my %UniqueRandom; #don't include the same random more than once
	while($count<$totalRandom){
		my $randomIndex = int(rand($size));
		my $randomID = $seqArray[$randomIndex];
		next if(exists $UniqueRandom{$randomIndex});	
		next if(exists $UniquePlasmid{$randomID}); #don't include seq from the SAME plasmid more than once
		print ">$randomID\n$XY_SEQ{$randomID}\n" if(exists $XY_SEQ{$randomID});
		print ">$randomID\n$XorY_SEQ{$randomID}\n" if(exists $XorY_SEQ{$randomID});
		$UniqueRandom{$randomIndex} = 1;
		$UniquePlasmid{$randomID} = 1;
		$count++;
		$totalRetrieval++;
	}
	print STDERR "cluster $clusterID retrieves $totalRetrieval sequences\n";
	
}
$/ = "\n";


