#!/usr/bin/perl
## CONVERTS PHASE (CHROMOPAINTER) FORMAT TO BEAGLE FORMAT
use strict;
use Switch;
use POSIX;

sub help {
print("CONVERTS PHASE (CHROMOPAINTER) FORMAT TO BEAGLE FORMAT\n");

print("usage:   perl phase2beagle.pl <options> <phasefile> <recmapfile> <output_filename_prefix>\n");


print("where:\n");
print("<phasefile>:		ChromoPainter/PHASE style (http://www.hapmap.org/downloads/phasing/2007-08_rel22/phased/00README.txt) SNP file\n");
print("<recmapfile>:	Recombination file in the format <SNPid> <recrateperbpinmorgans> as described in the ChromoPainter manual\n");
print("<output_filename_prefix>: Two files will be generated:\n		<x>.bgl which will be a (phased) beagle data file for use with phased= or unphased=.\n		<x>.markers for use with beagles markers=.\n\n");

print("<options>:\n");
print("-n <namefile>	Use the first column in namefile as names for the individuals\n");
print("-N <val>		Number of header lines in the name file (0 is the default)\n");
print("-i		DO NOT create beagle ibdpairs file\n");
print("-v               Verbose mode\n");
print("Beagle usage for fastIBD will be:\njava -jar beagle.jar missing=? unphased=<x>.bgl fastibd=true out=<output> markers=<x>.markers ibdpairs=<x>.ibdpairs\n");
die "\n";
}

###############################
## ARGUMENT PROCESSING

my $createibd=1; ## create pairwise IBD file
my $usenamefile=0; ## whether to read individual names from a file
my $namefileheaderlines=0; ## how many header lines to ignore of the name file
my $verbose=0;

my $phasefile="";
my $recfile="";
my $namefile="";

my $outfilePRE="";
my $beaglesnps="";
my $beaglemarkers="";
my $beagleibd="";
my $indnamefile="";


my $argon=0;
for (my $i = 0; $i < scalar(@ARGV); ++$i){
	if($ARGV[$i] eq "-i"){
		$createibd=0;
	}elsif($ARGV[$i] eq "-n"){
		$usenamefile=1;
		$namefile= $ARGV[++$i];
	}elsif($ARGV[$i] eq "-N"){
		$namefileheaderlines= $ARGV[++$i];
	}elsif($ARGV[$i] eq "-v"){
		$verbose=1;
	}else{
		switch($argon){
			case 0 {$phasefile="$ARGV[$i]";}
			case 1 {$recfile="$ARGV[$i]";}
			case 2 {$outfilePRE="$ARGV[$i]";}
			else {
				help();
			}
		}
		$argon++;
	}
}

if($outfilePRE eq "" || $argon != 3) {help();}

$beaglesnps="$outfilePRE.bgl";
$beaglemarkers="$outfilePRE.markers";
$beagleibd="$outfilePRE.ibdpairs";

####################################
## Define global variables
my @indnames; # names of the individuals
my @snplocs; # location of the SNPS
my @snpvals; # value of the SNPs (each value here is an array)
my @marker0; # value of the markers when the SNP is 0
my @marker1; # value of the markers when the SNP is 1

my $numsnps=0; # number of SNPS defined in the file
my $numinds=0; # number of individuals defined in the file
my $numhaps=0; # number of haplotypes observed
my $ploidy=-1; # number of haps per ind

####################################
## File IO

## Check we can read the input files
if($verbose){print "Checking files\n";}
open PHASEFILE, $phasefile or die $!;
open RECFILE, $recfile or die $!;
if( $usenamefile ){
	open NAMEFILE, $namefile or die $!;
}

## Create output files
open BEAGLESNP, ">", $beaglesnps or die $!;
open BEAGLEMARK, ">", $beaglemarkers or die $!;
if( $createibd ){
	open BEAGLEIBD, ">", $beagleibd or die $!;
}
if($verbose){print "Files are present and readable\n";}

####################################
## Functions we need
sub trim($){  # remove whitespace from beginning and end of the argument
	my $string = shift;
	$string =~ s/^\s+//;
	$string =~ s/\s+$//;
	return $string;
}

sub snp2marker($){ # convert (0,1) to (C,T) 
	my $tval=@_[0];
	if($tval==0){
		return("A");
	}elsif($tval==1){
		return("C");
	}elsif($tval==2){
		return("G");
	}elsif($tval==3){
		return("T");
	}
	return($tval);
}

####################################
## Read the phasefile 

## read the PHASEFILE header
if($verbose){print "Reading $phasefile header\n";}

my $skip=1;
my @tmarr;
while ($skip) {
	my $tmp=<PHASEFILE>;
	my @tmpvals = split(/ /, trim($tmp));
	if($tmpvals[0] eq "P"){ # found the line with all the SNP locations
		shift @tmpvals;
		@snplocs= @tmpvals;#split(/ /, $tmp);
		$tmp=<PHASEFILE>; # read the line of S's
		if (eof ){
		    die "Error reading $phasefile - did not find line beginning P listing the SNPs!\n";
		}
		$numsnps=trim(pop @tmarr);
		$numinds=trim(pop @tmarr);
		$skip=0;
	}else {
		push @tmarr, $tmpvals[0];
	}
}
print "Detected $numinds individuals\n";
# remaining lines are SNPs
while (my $tmp=<PHASEFILE>) {
	print "Reading haplotype $numhaps\n";
	my @tarr=split(//,trim($tmp));
	$snpvals[$numhaps++] = [ @tarr ]; # note: $snpvals is an array of arrays, but we actually store a reference to the *new* array [ @tarr ]
	if(scalar(@tarr)!=$numsnps){
		my $tmp=scalar(@tarr);
		die "Expected $numsnps SNPs on haplotype $numhaps, but received $tmp\n";
	}
}
$ploidy=$numhaps/$numinds;
print "Ploidy = $ploidy\n";
if($ploidy != ceil($numhaps/$numinds)) {
		die "Invalid number of haplotypes ($numhaps) per individual ($numinds)\n";
}
close PHASEFILE;

####################################
## make the individual names

if( $usenamefile ){
	for(my $i=0;$i < $namefileheaderlines;++$i){
		my $tmp=<NAMEFILE>;
	}
	while (my $tmp=<NAMEFILE>) {
		my @tarr=split(/ /,trim($tmp));
		push(@indnames, $tarr[0]);
	}
	close NAMEFILE;
	# check we have enough names
	my $tn=scalar(@indnames);
	if($tn < $numinds) {
		die "Error:Insufficient names in $namefile (found $tn, but have $numinds individuals)\n";
	}elsif($tn > $numinds) {
		print "WARNING: More names found in $namefile than expected (found $tn, but have $numinds individuals). These will be used but check they are correct.\n";
	}
}else{ # name them 1..N
	for (my $i = 1; $i <= $numinds; ++$i){
		push(@indnames, sprintf("ind%i",$i));
	}
}

####################################
## Figure out what the markers are (i.e. do we have 0's and 1's, 1's and 2's, or ACGTs)?

for (my $snp = 0; $snp < $numsnps; ++$snp){
	my @tmpfound; # which values we've found at this SNP
	for (my $hap = 0; $hap < $numhaps; ++$hap){
		my $tval=@snpvals->[$hap][$snp];
		my $found=0;
		for(my $tc = 0; $tc < scalar(@tmpfound) ; ++$tc){
			if($tmpfound[$tc] eq $tval) {$found=1;}
		}
		if(!$found){
			push(@tmpfound,$tval);
		}
	}
	$marker0[$snp]=snp2marker($tmpfound[0]);
	if(scalar(@tmpfound)>=2) {
		$marker1[$snp]=snp2marker($tmpfound[1]);
	}else{
		$marker1[$snp]="?";	
	}
}

####################################
## Write the bgl file
print "Writing Beagle file .bgl $beaglesnps\n";
print BEAGLESNP "I id";
for (my $i = 0; $i < $numinds; ++$i){
	for (my $j = 0; $j < $ploidy; ++$j){
		print BEAGLESNP " $indnames[$i]";
	}
}
print BEAGLESNP "\n";

for (my $snp = 0; $snp < $numsnps; ++$snp){
#	print "$snp $marker1[$snp]\n";
	if($marker1[$snp] ne "?") { # exclude non varying SNPS
		print BEAGLESNP "M $snplocs[$snp]";
		for (my $hap = 0; $hap < $numhaps; ++$hap){
			my $tval=snp2marker(@snpvals->[$hap][$snp]);
			print BEAGLESNP " $tval";
		}
		print BEAGLESNP "\n";
	}
}
close BEAGLESNP;# close snp file

####################################
## Read and process the recombination file

print "Writing Beagle file .markers $beaglemarkers\n";
my $snpon=0;
my $lastpos=-1;
my $lastdist=0;
while (my $tmp=<RECFILE>) {
	my @tarr=split(/ /,trim($tmp));
	my @tsnp=$tarr[0];
	if ($tsnp[0] =~ /^[+-]?\d+$/ ) {
	    if($marker1[$snpon] ne "?"){ # exclude non varying SNPS
		## Compute the culmulative recombination distance in centimorgans
		if($lastpos<0) { # first snp is at 0
			push(@tsnp, 0);
		}else{
			push(@tsnp, ($tarr[0]-$lastpos)*$tarr[1]*100+$lastdist);
		}
		$lastpos=$tsnp[0];
		$lastdist=$tsnp[1];
		push(@tsnp, $marker0[$snpon]);
		push(@tsnp, $marker1[$snpon]);
		print BEAGLEMARK "@tsnp\n";
	    }
	    ++$snpon;
	} else {
	    print "Warning: ignoring line: $tmp";
	}
}
# close rec and mark files
close RECFILE;
close BEAGLEMARK;

####################################
## Process the ibdpairs file
if( $createibd ){
	print "Writing Beagle file .ibdpairs $beagleibd\n";
	for (my $ind1 = 0; $ind1 < $numinds-1; ++$ind1){
		for (my $ind2 = $ind1+1; $ind2 < $numinds; ++$ind2){
			print BEAGLEIBD "$indnames[$ind1] $indnames[$ind2]\n";
		}
	}
	close BEAGLEIBD;
}
