## THIS IS FOR THE WHOLE WORLD DATA SEGREGATED INTO SUBPOPULATIONS
## IT MAKES *.force AND *.xml FILES SUITABLE FOR PCA
##
## Assumes that the individuals are named in the format <poplabel><number> e.g. BantuKenya1,BantuKenya2 will both match "BantuKenya"
## The names are taken from the "chunkcounts.out" file produced by chromopainter
##
## YOU ARE FREE TO USE AND MODIFY THIS FILE 
## THIS FILE MAY NOT BE FIT FOR ANY PURPOSE, USE AT YOUR OWN RISK (ETC)
##
## AUTHOR: DANIEL LAWSON (dan.lawson@bristol.ac.uk)
## DATE: 2/3/2012
#####################################################
## USAGE: Edit the inputfile and outputfileroot names. If you use a different naming system, also edit the "popIn" function to capture this...
## You also have to edit the contlist and collist to match the continents and populations within each continent respectively.
##
## If you want to use the "force" files for finestructure YOU MUST RERUN CHROMOCOMBINE FOR EACH CONTINENT.  The "-f" flag can be used and it
## will by default insert the correct value of "c" into the continent file. See the chromocombine help for details.
##
## You then need to create the finestructure tree from the .xml file either usiong the gui or the "-m T -T 1" option
## e.g. finestructure -m T -T 1 -t 20000 -F AllHGDPContPopEurope.force AllHGDPexample.chunkcounts.out AllHGDPContPopEurope.xml AllHGDPContPopEurope.tree.xml
## then you can load it into the GUI
## e.g. finegui -F AllHGDPContPopEurope.force -c AllHGDPexample.chunkcounts.out -m AllHGDPContPopEurope.xml -t AllHGDPContPopEurope.tree.xml
## Go to "File->Manage Files" and load the "Raw data file", the "MCMC output file", and the "Processed Tree file"
## (tip: change the view to "aggregated (alternative)" is the GUI is slow to respond...)
## then label the populations by going to "Organise->Edit Populations" and click "Guess" (WORKS ONLY IF YOUR NAMING SCHEME IS THE SAME AS OURS!)
## Finally, go to "Plot->Principal Components Analysis"


inputfile<-"AllHGDPexample.chunkcounts.out" # the chromopainter input file

## Outputfile names are constructed using <outputfileroot><Continentname><outputfiletail>
## where <outputfiletail> is ".force" for continent forcing files, and ".xml" for fake population files
outputfileroot<-"AllHGDPContPop"

popIn<-function(x){
## This function defines how a "population" is defined
## NOTE: if your individuals are named some other way, you can still use this approach.  You just have to construct 
## the "popnames" vector some other way, either by hand just by simply listing all individuals, or by writing your own "popIn" function
## to match your naming system
        tlab2<-gsub("[0-9]","",x)
        unique(tlab2)
}

# These are the "names" of the continents - you can make them whatever you like
contlist<-c("Africa","America","CentralSouthAsia","EastAsia","Europe","MiddleEast","Oceania")
  
# For each "continent", define a list of those individual labels
# The order should match that above (the list can be given continent names as below, but these are ignored)
collist<-list(Africa=c("BantuKenya","BantuSouthAfrica","BiakaPygmy","Mandenka","MbutiPygmy","Mozabite","San","Yoruba"),
    America=c("Colombian","Karitiana","Maya","Pima","Surui"),
    CentralSouthAsia=c("Balochi","Brahui","Burusho","Hazara","Kalash","Makrani","Pathan","Sindhi"),
    EastAsia=c("Cambodian","Dai","Daur","Han","Han.NChina","Hezhen","Japanese","Lahu","Miao","Mongola","Naxi","Oroqen","She","Tu","Tujia","Uygur","Xibo","Yakut","Yi"),
    Europe=c("Adygei","Basque","French","Italian","Orcadian","Russian","Sardinian","Tuscan"),
    MiddleEast=c("Bedouin","Palestinian","Druze"),
    Oceania=c("Melanesian","Papuan"))

## Read in the data
tdat<-as.matrix(read.csv(inputfile,row.names=1,skip=1)) # skip the first line, which is the value of "c"
tnames<-dimnames(tdat)[[1]] # names of the individuals

## Construct the labelled population that each individual is in
popnames<-sapply(tnames,popIn) # *population* of each individual


## Construct a list of the individuals found within each population
contpops<-list()
for(i in 1:length(contlist)){
  index<-as.vector(unlist(sapply(collist[[i]],function(x){which(popnames %in% x)}))) # index is the individuals that match any of the labels within the population
  contpops[[i]]<-paste(tnames[index],collapse=",") # extract them into comma separated list
}

#####################################
## Now we can construct the fake population xml file, and the continent force file, for each
for(i in 1:length(contlist)){
  sampleon<-contlist[i] # the name of the continent we are processing
  index<-sapply(collist[[i]],function(x){which(popnames %in% x)})
######################
## Make a valid population list, by removing the current continent and keeping all the individuals within only that continent
  interpop<-paste(as.vector(c(contlist[-i],sapply(index,function(x){paste(tnames[x],collapse=",")}))),collapse=")(")
  tpop<-paste("(",interpop,")",sep="")
## tpop now takes the form of a valid finestructure state. we just need to create a fake mcmc xml file wrapper
  outputfile<-paste(outputfileroot,sampleon,".xml",sep="")
  cat("<?xml version = '1.0' encoding = 'UTF-8'?>\n<outputFile>\n<header>\n<comment>Generated File</comment>\n",file=outputfile)
  header<-paste("<inflation>1</inflation>\n<burnin>0</burnin>\n<mcmclength>1</mcmclength>\n<skip>1</skip>\n<datafilename>",inputfile,"</datafilename>\n",sep="")
  cat(header,file=outputfile,append=TRUE)
  cat("</header>\n<Iteration>\n<Pop>",file=outputfile,append=TRUE)
  cat(tpop,file=outputfile,append=TRUE)
  cat("</Pop>\n</Iteration>\n</outputFile>\n",file=outputfile,append=TRUE)

#######################
## Now we construct the continent file. This lists all the individuals that should just be merged into a single continent by fineSTRUCTURE
  nameforce<-paste(outputfileroot,sampleon,".force",sep="")
  for(j in 1:length(contlist)){
## we use the strict type of continents, started with a "*", so that they are not used for tree construction
      contdesc<-paste("*",contlist[j],"(",contpops[[j]],")\n",sep="")
## we comment out the continent of interest
      if(i==j)contdesc<-paste("#",contdesc,sep="")
## and put in all the others
      if(j==1){cat(contdesc,file=nameforce)
      }else cat(contdesc,file=nameforce,append=TRUE)
  }
}


