#!/usr/bin/perl
## REMOVE SINGLETONS OR NON-SNPS FROM PHASE DATA
use strict;
use warnings;

# Print the usage banner on standard output and then abort the script.
# This sub never returns: it always finishes by calling die with a bare
# newline, so no file/line suffix is appended to the (empty) message.
sub help {
    my @usage_lines = (
        "REMOVE SINGLETONS OR NON-SNPS FROM PHASE DATA\n",
        "usage:   perl phasesscreen.pl <phasefile> <outputphasefile>\n",
    );
    for my $usage_line (@usage_lines) {
        print($usage_line);
    }
    die "\n";
}

# Return a copy of the argument with leading and trailing whitespace
# removed.  The argument is copied first, so the caller's variable is
# never modified.  The ($) prototype is kept so existing call sites parse
# exactly as before.
sub trim($){
        my ($text) = @_;
        for ($text) {       # topicalise the copy so both substitutions act on it
                s/^\s+//;   # strip leading whitespace
                s/\s+$//;   # strip trailing whitespace (including a newline)
        }
        return $text;
}

###############################
## ARGUMENT PROCESSING

if (@ARGV != 2) { help(); }

my ($infile, $outfile) = @ARGV;

####################################
## File IO

## Open both files up front so an unusable path is reported before any work
## is done.  The three-argument form of open takes the file name literally,
## so a name beginning with characters such as '>' or '|' cannot change the
## open mode.
open(my $in_fh, '<', $infile)
    or die "Could not open input file $infile: $!\n";
open(my $out_fh, '>', $outfile)
    or die "Could not create output file $outfile: $!\n";

## Read the header
##
## Two layouts are accepted:
##   <ninds> / <nsnps> / "P pos..." / S line
##   <initial line> / <ninds> / <nsnps> / "P pos..." / S line
## They are told apart by whether the third line starts with the token "P".

my $initialline;            # stays undef when the file has no initial line
my $ninds;
my $nsnps;
my $line0 = <$in_fh>;       # initial line? or number of individuals?
my $line1 = <$in_fh>;       # number of individuals? or number of SNPs?
my $line2 = <$in_fh>;       # number of SNPs? or positions line?
defined $line2
    or die "Input file $infile is too short to contain a PHASE header\n";

# split ' ' ignores leading whitespace, so an indented "P" is still seen
# as the first token.
my @posvec = split(' ', $line2);
if (@posvec && $posvec[0] eq "P") {     # there was no initial line
    $ninds = trim($line0);
    $nsnps = trim($line1);
}
else {
    $initialline = trim($line0);
    $ninds       = trim($line1);
    $nsnps       = trim($line2);
    my $posline = <$in_fh>;             # positions line
    defined $posline
        or die "Input file $infile ends before the positions (P) line\n";
    @posvec = split(' ', $posline);
}
shift @posvec;                          # drop the leading "P" token

# The SNP count drives every loop below, so reject anything that is not a
# plain non-negative integer instead of letting it numify to 0.
$nsnps =~ /\A\d+\z/
    or die "Invalid SNP count '$nsnps' in header of $infile\n";
@posvec == $nsnps
    or die "Header of $infile declares $nsnps SNPs but the P line lists "
         . scalar(@posvec) . " positions\n";

my $sline = <$in_fh>;   # S line: read and discarded, regenerated on output

################
## Read the data
##
## Each remaining non-blank line is one haplotype with one character per
## SNP.  Blank lines (for example a trailing newline at end of file) are
## skipped rather than being counted as haplotypes.
my @genomat = ();
while (my $row = <$in_fh>) {
    $row = trim($row);
    next if $row eq '';
    length($row) >= $nsnps
        or die "Line $. of $infile has fewer than $nsnps alleles\n";
    # Keep only the first $nsnps characters, as the header dictates.
    push @genomat, [ (split(//, $row))[0 .. $nsnps - 1] ];
}
close($in_fh);
my $nhaps = scalar @genomat;

################
## Remove invalid SNPs
##
## A SNP is kept only when both alleles are carried by at least two
## haplotypes.  Alleles are summed numerically, so they are presumably
## coded 0/1 -- TODO confirm against the data format.  Collecting the
## surviving column indices once avoids re-splicing every row for every
## rejected SNP, and also works when no haplotype rows were read.
my @keep = ();
for my $snp (0 .. $nsnps - 1) {
    my $snpcount = 0;
    $snpcount += $_->[$snp] for @genomat;
    if ($snpcount >= 2 && $nhaps - $snpcount >= 2) {
        push @keep, $snp;
    }
}
@posvec = @posvec[@keep];
my $nkept = scalar @keep;

#################
## Print PHASE format

# The initial line is echoed whenever the input had one, whatever its value.
if (defined $initialline) { print {$out_fh} "$initialline\n"; }
print {$out_fh} "$ninds\n";
print {$out_fh} "$nkept\n";
print {$out_fh} "P @posvec\n";
print {$out_fh} "S" x $nkept, "\n";

for my $hap (@genomat) {
    print {$out_fh} join('', @{$hap}[@keep]), "\n";
}

# Buffered write errors (e.g. a full disk) only surface at close time.
close($out_fh)
    or die "Could not finish writing output file $outfile: $!\n";
