#!/usr/bin/perl -w #Author: Qunfeng Dong #check each vmatch cluster and identify parental and pre-meotic sequences based on grid location #usage: ./ParentalVmatchCluster.pl RescueMuGSS_Location.txt match.GridG_XYseq.fsa > GridGparental.txt #GridGparental.txt four column: clusterID, stanfordID, plasmidID, sizeofcluster, row/colum location use strict; my %Grid; #key StanfordID; value Grid my (%RowLocation, %ColLocation); #$key StanfordID; value row or column number my %GRID; my %GRID_ROW = ('G'=>0, 'H'=>0, 'I'=>0, 'K'=>0, 'M'=>0, 'P'=>0); #number of rows from this grid my %GRID_COL = ('G'=>0, 'H'=>0, 'I'=>0, 'K'=>0, 'M'=>0, 'P'=>0); #number of cols from this grid open(FILE, "$ARGV[0]") || die("Cannot open $ARGV[0]"); while(){ my ($grid, $row, $col, $sid) = /\d+\s+(\w)\s+(\S+)\s+(\S+)\s+(\S+)/; my $grid_row = $grid.'row'.$row; my $grid_col = $grid.'col'.$col; my ($plasmid) = $sid =~ /(\S+)\./; if($row =~ /\d+/){ $RowLocation{$sid} = $row; $RowLocation{$plasmid} = $row; if(!exists $GRID{$grid_row}){ $GRID{$grid_row} = 1; $GRID_ROW{$grid}++; } }elsif($col =~ /\d+/){ $ColLocation{$sid} = $col; $ColLocation{$plasmid} = $col; if(!exists $GRID{$grid_col}){ $GRID{$grid_col} = 1; $GRID_COL{$grid}++; } }else{ print STDERR "Somethin is not right with location!\n"; exit; } $Grid{$sid} = $grid; } close(FILE); #print "Grid\tRowNum\tColNum\n"; #foreach my $grid (keys %GRID_ROW){ # print "$grid\t$GRID_ROW{$grid}\t$GRID_COL{$grid}\n"; #} #exit; my %BigCluster; #key:ClusterID; value:Plasmid ############################################################################################# #my $parentalSize = 1000; #400; ###########for Grid M (less seq), the cutoff is 400; for other grid the cutoff is 1000####### $/ = ":\n"; my $clusterID; open(FILE, "grep -v '^#' $ARGV[1] |") || die("Cannot open $ARGV[1]"); while(){ my $block = $_; my ($cluster) = $block =~ /(\d+):/; if(defined $cluster){ $clusterID = $cluster; $clusterID--; }else{ $clusterID++; } $block =~ s/\d+://; my @IDs = $block =~ /gi\S+\s+(\S+)\s+/g; my $size = @IDs; next unless ($size>=1); my $grid = $Grid{$IDs[0]}; #get to know which grid we are dealing with here my $rowNum = $GRID_ROW{$grid}; #number of row sequenced from this grid my $colNum = $GRID_COL{$grid}; #number of col sequenced from this grid #next unless ($size>=$parentalSize); #now, where did the following plasmids coming from? my %LocationHash; my (%P_LowLocationHash, %P_HighLocationHash); my $ParentalCutOff; #how many row/column if same sequences can be recoverd foreach my $ID (@IDs){ my $location; if($rowNum >= $colNum){ $location = $RowLocation{$ID} if (exists $RowLocation{$ID}); #$ParentalCutOff = $rowNum*0.28; if($grid eq 'P'){ $P_LowLocationHash{$RowLocation{$ID}} = 1 if(exists $RowLocation{$ID} && $RowLocation{$ID}<=37); $P_HighLocationHash{$RowLocation{$ID}} = 1 if(exists $RowLocation{$ID} && $RowLocation{$ID}>37); } }else{ $location = $ColLocation{$ID} if (exists $ColLocation{$ID}); #$ParentalCutOff = $colNum*0.28; } $LocationHash{$location} = 1 if(defined $location); } ##########################Defined arbitrarily######################### $ParentalCutOff = 46 if($grid eq 'G'); $ParentalCutOff = 36 if($grid eq 'H'); $ParentalCutOff = 38 if($grid eq 'I'); $ParentalCutOff = 30 if($grid eq 'K'); $ParentalCutOff = 30 if($grid eq 'M'); #$ParentalCutOff = 37 if($grid eq 'P'); ##################################################################### if($grid eq 'P'){ my $sizeofLow = keys %P_LowLocationHash; #my $sizeofHigh = keys %P_HighLocationHash; next unless($sizeofLow == 37); #####################special case of grid P######################################### }else{ my $sizeofLocationHash = keys %LocationHash; next if($sizeofLocationHash < $ParentalCutOff); } my %plasmidHash; foreach my $ID (@IDs){ my ($plasmid) = $ID =~ /(\S+)\./; next if(exists $plasmidHash{$plasmid}); $plasmidHash{$plasmid} = 1; if(exists $BigCluster{$clusterID}){ $BigCluster{$clusterID} .= ":$plasmid"; }else{ $BigCluster{$clusterID} = $plasmid; } } } $/ = "\n"; #sort numerically of the foreach my $cluster (sort {$a <=> $b} keys %BigCluster){ my @IDs = split(/:/, $BigCluster{$cluster}); my $size = @IDs; foreach my $ID (@IDs){ # print "$cluster\t$ID\t$size\n"; print "$cluster\t$ID\t$size\tROW: $RowLocation{$ID}\n" if(exists $RowLocation{$ID}); print "$cluster\t$ID\t$size\tCOL: $ColLocation{$ID}\n" if(exists $ColLocation{$ID}); } }