#!/usr/bin/perl -w
#Author: Qunfeng Dong
#check each vmatch cluster and identify parental and pre-meotic sequences based on grid location
#usage: ./ParentalVmatchCluster.pl RescueMuGSS_Location.txt match.GridG_XYseq.fsa > GridGparental.txt
#GridGparental.txt four column: clusterID, stanfordID, plasmidID, sizeofcluster, row/colum location



use strict;

my %Grid; #key StanfordID; value Grid
my (%RowLocation, %ColLocation); #$key StanfordID; value row or column number


my %GRID;
my %GRID_ROW = ('G'=>0,
		'H'=>0,
		'I'=>0,
		'K'=>0,
		'M'=>0,
		'P'=>0); #number of rows from this grid
my %GRID_COL = ('G'=>0,
                'H'=>0,
                'I'=>0,
                'K'=>0,
                'M'=>0,
                'P'=>0); #number of cols from this grid


open(FILE, "$ARGV[0]") || die("Cannot open $ARGV[0]");
while(<FILE>){
	my ($grid, $row, $col, $sid) = /\d+\s+(\w)\s+(\S+)\s+(\S+)\s+(\S+)/;
	my $grid_row = $grid.'row'.$row;
	my $grid_col = $grid.'col'.$col;
	my ($plasmid) = $sid =~ /(\S+)\./;
	if($row =~ /\d+/){
		$RowLocation{$sid} = $row;
		$RowLocation{$plasmid} = $row;
		if(!exists $GRID{$grid_row}){
			$GRID{$grid_row} = 1;
			$GRID_ROW{$grid}++;
		}
	}elsif($col =~ /\d+/){
		$ColLocation{$sid} = $col;
		$ColLocation{$plasmid} = $col;
                if(!exists $GRID{$grid_col}){
                        $GRID{$grid_col} = 1;
                        $GRID_COL{$grid}++;
                }
	}else{
		print STDERR "Somethin is not right with location!\n";
		exit;
	}
	$Grid{$sid} = $grid;
	
}
close(FILE);

#print "Grid\tRowNum\tColNum\n";
#foreach my $grid (keys %GRID_ROW){
#	print "$grid\t$GRID_ROW{$grid}\t$GRID_COL{$grid}\n";
#}
#exit;

my %BigCluster; #key:ClusterID; value:Plasmid
#############################################################################################
#my $parentalSize = 1000; #400; 
###########for Grid M (less seq), the cutoff is 400; for other grid the cutoff is 1000#######
$/ = ":\n";
my $clusterID;
open(FILE, "grep -v '^#' $ARGV[1] |") || die("Cannot open $ARGV[1]");
while(<FILE>){
        my $block = $_;
	my ($cluster) = $block =~ /(\d+):/;
	if(defined $cluster){
		$clusterID = $cluster;
		$clusterID--;
	}else{
		$clusterID++;
	}
	$block =~ s/\d+://;
        my @IDs = $block =~ /gi\S+\s+(\S+)\s+/g;
	my $size = @IDs;
	next unless ($size>=1);
	my $grid = $Grid{$IDs[0]}; #get to know which grid we are dealing with here
	my $rowNum = $GRID_ROW{$grid}; #number of row sequenced from this grid
	my $colNum = $GRID_COL{$grid}; #number of col sequenced from this grid
	#next unless ($size>=$parentalSize); 
	#now, where did the following plasmids coming from?
	my %LocationHash;
	my (%P_LowLocationHash, %P_HighLocationHash); 

	my $ParentalCutOff; #how many row/column if same sequences can be recoverd
        foreach my $ID (@IDs){
		my $location;
        	if($rowNum >= $colNum){
                	$location = $RowLocation{$ID} if (exists $RowLocation{$ID});
			#$ParentalCutOff = $rowNum*0.28;
			if($grid eq 'P'){
				$P_LowLocationHash{$RowLocation{$ID}} = 1 if(exists $RowLocation{$ID} && $RowLocation{$ID}<=37);
				$P_HighLocationHash{$RowLocation{$ID}} = 1 if(exists $RowLocation{$ID} && $RowLocation{$ID}>37);
			}

        	}else{
                	$location = $ColLocation{$ID} if (exists $ColLocation{$ID});
			#$ParentalCutOff = $colNum*0.28;
        	}
		$LocationHash{$location} = 1 if(defined $location);
        }
##########################Defined arbitrarily#########################
	$ParentalCutOff = 46 if($grid eq 'G');
	$ParentalCutOff = 36 if($grid eq 'H');
	$ParentalCutOff = 38 if($grid eq 'I');
	$ParentalCutOff = 30 if($grid eq 'K');
	$ParentalCutOff = 30 if($grid eq 'M');
	#$ParentalCutOff = 37 if($grid eq 'P');
#####################################################################
	if($grid eq 'P'){
		my $sizeofLow = keys %P_LowLocationHash;
		#my $sizeofHigh = keys %P_HighLocationHash;
		next unless($sizeofLow == 37); #####################special case of grid P######################################### 
	}else{
		my $sizeofLocationHash = keys %LocationHash;
		next if($sizeofLocationHash < $ParentalCutOff);
	}
	my %plasmidHash;
	foreach my $ID (@IDs){
		my ($plasmid) = $ID =~ /(\S+)\./;
		next if(exists $plasmidHash{$plasmid});
		$plasmidHash{$plasmid} = 1;
	        if(exists $BigCluster{$clusterID}){
                        $BigCluster{$clusterID} .= ":$plasmid";
        	}else{
                        $BigCluster{$clusterID} = $plasmid;
        	}
	}
}
$/ = "\n";

#sort numerically of the 
foreach my $cluster (sort {$a <=> $b} keys %BigCluster){
	my @IDs = split(/:/, $BigCluster{$cluster});
	my $size = @IDs;
	foreach my $ID (@IDs){
#		print "$cluster\t$ID\t$size\n";
		print "$cluster\t$ID\t$size\tROW: $RowLocation{$ID}\n" if(exists $RowLocation{$ID});
		print "$cluster\t$ID\t$size\tCOL: $ColLocation{$ID}\n" if(exists $ColLocation{$ID});
	}
}

