hierfstat: Estimation and Tests of Hierarchical F-Statistics

Documented in ms2bed ms2dos read.ms

#################################
#' Read data generated by Hudson ms program
#'  
#'  Read data generated by 
#' \href{http://home.uchicago.edu/rhudson1/source/mksamples.html}{Hudson ms} 
#' program, either as Haplotypes or as SNPs.  
#' 
#' With argument what="SNP", each site is read as a SNP, with the ancestral allele encoded as 0 and the alternate 
#' allele encoded as 1.  If the ms output file contains several replicates, 
#' the different replicates will be collated together.  Hence, the number of loci is the sum of all
#' sites from all replicates.  
#' 
#' With argument what="Haplotype", each different sequence from a replicate 
#' is read as a haplotype, 
#' by converting it first to a factor, and then to an integer. There will be as many loci 
#' as there are replicates, and the number of alleles per locus will be the number of different
#' haplotypes in the corresponding replicate. 
#'  
#' @usage read.ms(fname,what=c("SNP","Haplotype"))
#' @param fname file name containing ms output
#' @param what whether to read ms output as SNPs or haplotypes
#' @return alldat  a data frame with nloc+1 columns, the first being the population
#'  to which the individual belongs and the next being the genotypes, one column per locus; 
#'  and one row per (haploid) individual.
#'  
#' 
#' @author Jerome Goudet \email{jerome.goudet@@unil.ch}
#' @references \href{https://academic.oup.com/bioinformatics/article-abstract/18/2/337/225783}{Hudson, R. R. (2002) Generating samples under a Wright-Fisher neutral model of genetic variation}. Bioinformatics 18 : 337-338.
#'
#' @examples 
#' \dontrun{
#'   datH<-read.ms(system.file("extdata","2pops_asspop.txt",package="hierfstat"),what="Haplotype")
#'   dim(datH)
#'   head(datH[,1:10]
#'   datS<-read.ms(system.file("extdata","2pops_asspop.txt",package="hierfstat"),what="SNP")
#'   dim(datS)
#'   head(datS[,1:10])
#'   }
#' @export
#####################################################################################
read.ms<-function(fname,what=c("SNP","Haplotype")){
  cl<-match.call()
  if(!is.na(pmatch(what,"SNP")))
    what<-"SNP"
  WHATS<-c("SNP","Haplotype")
  what<-pmatch(what,WHATS)
  if (is.na(what)) stop("Invalid data type")
  if (what==-1) stop("Ambiguous data type")
  dat<-readLines(fname)
  tmp<-strsplit(dat[1],split=" ")[[1]]
  pos.i<-which(tmp=="-I")
  if (length(pos.i)>0) {
    np<-as.integer(tmp[pos.i+1])
    ni<-as.integer(tmp[(pos.i+2):(pos.i+1+np)])
  }
  else ni<-as.integer(tmp[2])
  nrep<-as.integer(tmp[3])
  dat1<-grep("^[0,1]",dat[-c(1:4)],value=TRUE)
  nt<-sum(ni)
  dats<-split(dat1,rep(1:nrep,each=nt))
  if (what==2){
    datia<-lapply(dats,function(x)as.integer(as.factor(x)))
    datm<-matrix(unlist(datia),nrow=nt,byrow=FALSE)
    alldat<-data.frame(Pop=rep(1:np,ni),datm)
    names(alldat)<-c("Pop",paste("loc",1:dim(datm)[2],sep=""))
    return(alldat)
  }
  if (what==1){
    nsnps<-as.integer(unlist(lapply(strsplit(grep("segsites:",dat,value=TRUE),split=":"),function(x) x[[2]])))
    dats1<-lapply(dats,strsplit,"")
    dats2<-lapply(dats1,function(x) matrix(as.integer(unlist(x)),nrow=nt,byrow=TRUE))
    alldat<-matrix(numeric(nt*sum(nsnps)),nrow=nt)
    cnsnps<-cumsum(nsnps)
    alldat[,1:cnsnps[1]]<-dats2[[1]]
    if (nrep>1) 
      for (i in 2:nrep)
      alldat[,(cnsnps[i-1]+1):cnsnps[i]]<-dats2[[i]]
    popid<-rep(1:length(ni),ni)
    alldat.names<-c("Pop",paste("loc",1:dim(alldat)[2],sep=""))
    alldat<-data.frame(Pop=popid,alldat)
    names(alldat)<-alldat.names
    return(alldat)
  }
}

####
#' Import \code{ms} output
#' 
#' Import the output of the \code{ms} program into suitable format for further manipulations
#' 
#' @usage ms2dos(fname,chrom=NULL)
#' 
#' @param fname a text file containing the output of the \code{ms program}
#' @param chrom  the chromosomes (replicates) to be imported. default to all
#' 
#' @return alldat a matrix with as many row as (haploid) individuals and as many columns as SNPs
#' @pop a vector containing individuals population identifier
#' @return bim a data frame with two components
#'  chr contains the chromosome (replicate) id; 
#'  pos contains the SNPs positions on the chromosome
#'  
#'       
#' @export
####


ms2dos<-function(fname,chrom=NULL){
  #take an ms file as input
  #return a matrix of haploid gametes
  #for loop rather than lapply for speeding up if nrep too large
  
  char2int<-function(datas){
    tmp<-strsplit(datas,"")
    matrix(as.integer(unlist(tmp)),nrow=length(datas),byrow=TRUE)
  }
  
  dat<-readLines(fname)
  tmp<-strsplit(dat[1],split=" ")[[1]]
  pos.i<-which(tmp=="-I")
  if (length(pos.i)>0) {
    np<-as.integer(tmp[pos.i+1])
    ni<-as.integer(tmp[(pos.i+2):(pos.i+1+np)])
  }
  else {
    np<-1
    ni<-as.integer(tmp[2])
  }
  pos.i<-which(tmp=="-r")
  if (length(pos.i)>0){
    nbp<-as.integer(tmp[pos.i+2])
  }	
  else nbp<-NULL
  nrep<-as.integer(tmp[3])
  dat1<-grep("^[0,1]",dat[-c(1:4)],value=TRUE)
  nt<-sum(ni)
  dats<-split(dat1,rep(1:nrep,each=nt))
  
  if (!is.null(chrom)) dats<-dats[chrom] else chrom<-1:nrep
  
  nsnps<-as.integer(unlist(lapply(strsplit(grep("segsites:",dat,value=TRUE),split=":"),function(x) x[[2]])))[chrom]
  if (!is.null(nbp)){ 
    pos<-round(unlist(lapply(lapply(strsplit(grep("positions",dat,value=T),split=":"),
            function(x) strsplit(x[[2]],split=" ")),function(y) as.numeric(y[[1]][-1]))[chrom])*nbp,0)
    } else {
    pos<-unlist(lapply(nsnps,function(x) 1:x))
    }
  
   if((1.0*nsnps[1]*nt) <1.0e+8){
     dats2<-lapply(dats,char2int)
   } else{
    nblocks<-ceiling(1.0*max(nsnps)*nt /1.0e+8)
    indgroups<-split(1:nt,cut(1:nt,nblocks))
    dats2<-list(length=length(chrom))
    for (ic in 1:length(chrom)){
      dats1<-NULL
      dum<-lapply(indgroups,function(ii) char2int(dats[[ic]][ii])) 
      for (ib in 1:nblocks) dats1<-rbind(dats1,dum[[ib]])
      dats2[[ic]]<-dats1
      gc()
    }
    
  }
  alldat<-NULL
  for (i in 1:length(dats2))
    alldat<-cbind(alldat,dats2[[i]])
  gc()
  return(list(alldat=alldat,pop=rep(1:np,ni),bim=data.frame(chr=rep(chrom,nsnps),pos=pos)))
}

####
#' Import the output of the \code{ms} program in a \code{BED} object
#' 
#' Import the output of the \code{ms} program into a \code{BED} object, as defined in the 
#' \href{https://cran.r-project.org/package=gaston}{gaston} package
#' 
#' @usage ms2bed(fname,chrom=NULL)
#' 
#' @param  fname the name of the text file containing \code{ms} output
#' @param chrom  the number of chromosomes (replicates) to import (default to all)
#' 
#' 
#' @return a bed object
#' 
#' @details \code{ms2bed} relies on \code{\link{ms2dos}}. 
#' The population identifier is in bed@@ped$famid
#' 
#' @export
####
ms2bed<-function(fname,chrom=NULL){
  dos<-ms2dos(fname,chrom)
  x<-0:(dim(dos$alldat)[1]%/%2-1)
  dos2<-dos$alldat[x*2+1,]+dos$alldat[x*2+2,]
  bed<-gaston::as.bed.matrix(dos2)
  bed@ped$famid<-dos$pop[x*2+1]
  bed@snps$pos<-dos$bim$pos
  bed@snps$chr<-dos$bim$chr
  gc()
  return(bed)
}