R/readmsb.R

Defines functions ms2bed ms2dos read.ms

Documented in ms2bed ms2dos read.ms

#################################
#' Read data generated by Hudson ms program
#'  
#'  Read data generated by 
#' \href{http://home.uchicago.edu/rhudson1/source/mksamples.html}{Hudson ms} 
#' program, either as Haplotypes or as SNPs.  
#' 
#' With argument what="SNP", each site is read as a SNP, with the ancestral allele encoded as 0 and the alternate 
#' allele encoded as 1.  If the ms output file contains several replicates, 
#' the different replicates will be collated together.  Hence, the number of loci is the sum of all
#' sites from all replicates.  
#' 
#' With argument what="Haplotype", each different sequence from a replicate 
#' is read as a haplotype, 
#' by converting it first to a factor, and then to an integer. There will be as many loci 
#' as there are replicates, and the number of alleles per locus will be the number of different
#' haplotypes in the corresponding replicate. 
#'  
#' @usage read.ms(fname,what=c("SNP","Haplotype"))
#' @param fname file name containing ms output
#' @param what whether to read ms output as SNPs or haplotypes
#' @return alldat  a data frame with nloc+1 columns, the first being the population
#'  to which the individual belongs and the next being the genotypes, one column per locus; 
#'  and one row per (haploid) individual.
#'  
#' 
#' @author Jerome Goudet \email{jerome.goudet@@unil.ch}
#' @references \href{https://academic.oup.com/bioinformatics/article-abstract/18/2/337/225783}{Hudson, R. R. (2002) Generating samples under a Wright-Fisher neutral model of genetic variation}. Bioinformatics 18 : 337-338.
#'
#' @examples 
#' \dontrun{
#'   datH<-read.ms(system.file("extdata","2pops_asspop.txt",package="hierfstat"),what="Haplotype")
#'   dim(datH)
#'   head(datH[,1:10]
#'   datS<-read.ms(system.file("extdata","2pops_asspop.txt",package="hierfstat"),what="SNP")
#'   dim(datS)
#'   head(datS[,1:10])
#'   }
#' @export
#####################################################################################
read.ms<-function(fname,what=c("SNP","Haplotype")){
  cl<-match.call()
  if(!is.na(pmatch(what,"SNP")))
    what<-"SNP"
  WHATS<-c("SNP","Haplotype")
  what<-pmatch(what,WHATS)
  if (is.na(what)) stop("Invalid data type")
  if (what==-1) stop("Ambiguous data type")
  dat<-readLines(fname)
  tmp<-strsplit(dat[1],split=" ")[[1]]
  pos.i<-which(tmp=="-I")
  if (length(pos.i)>0) {
    np<-as.integer(tmp[pos.i+1])
    ni<-as.integer(tmp[(pos.i+2):(pos.i+1+np)])
  }
  else ni<-as.integer(tmp[2])
  nrep<-as.integer(tmp[3])
  dat1<-grep("^[0,1]",dat[-c(1:4)],value=TRUE)
  nt<-sum(ni)
  dats<-split(dat1,rep(1:nrep,each=nt))
  if (what==2){
    datia<-lapply(dats,function(x)as.integer(as.factor(x)))
    datm<-matrix(unlist(datia),nrow=nt,byrow=FALSE)
    alldat<-data.frame(Pop=rep(1:np,ni),datm)
    names(alldat)<-c("Pop",paste("loc",1:dim(datm)[2],sep=""))
    return(alldat)
  }
  if (what==1){
    nsnps<-as.integer(unlist(lapply(strsplit(grep("segsites:",dat,value=TRUE),split=":"),function(x) x[[2]])))
    dats1<-lapply(dats,strsplit,"")
    dats2<-lapply(dats1,function(x) matrix(as.integer(unlist(x)),nrow=nt,byrow=TRUE))
    alldat<-matrix(numeric(nt*sum(nsnps)),nrow=nt)
    cnsnps<-cumsum(nsnps)
    alldat[,1:cnsnps[1]]<-dats2[[1]]
    if (nrep>1) 
      for (i in 2:nrep)
      alldat[,(cnsnps[i-1]+1):cnsnps[i]]<-dats2[[i]]
    popid<-rep(1:length(ni),ni)
    alldat.names<-c("Pop",paste("loc",1:dim(alldat)[2],sep=""))
    alldat<-data.frame(Pop=popid,alldat)
    names(alldat)<-alldat.names
    return(alldat)
  }
}

####
#' Import \code{ms} output
#' 
#' Import the output of the \code{ms} program into suitable format for further manipulation
#' 
#' @usage ms2dos(fname)
#' 
#' @param fname a text file containing the output of the \code{ms program}
#' 
#' @return alldat a matrix with as many row as (haploid) individuals and as many columns as SNPs
#' @return bim a data frame with two components
#'  chr contains the chromosome (replicate) id; pos contains the SNPs posoition on the chromosome
#'   
#'       
#' @export
####


ms2dos<-function(fname){
  #take an ms file as input
  #return a matrix of haploid gametes
  #for loop rather than lapply for speeding up if nrep too large
  
  dat<-readLines(fname)
  tmp<-strsplit(dat[1],split=" ")[[1]]
  pos.i<-which(tmp=="-I")
  if (length(pos.i)>0) {
    np<-as.integer(tmp[pos.i+1])
    ni<-as.integer(tmp[(pos.i+2):(pos.i+1+np)])
  }
  else ni<-as.integer(tmp[2])
  pos.i<-which(tmp=="-r")
  if (length(pos.i)>0){
    nbp<-as.integer(tmp[pos.i+2])
  }	
  else nbp<-NULL
  nrep<-as.integer(tmp[3])
  dat1<-grep("^[0,1]",dat[-c(1:4)],value=TRUE)
  nt<-sum(ni)
  dats<-split(dat1,rep(1:nrep,each=nt))
  nsnps<-as.integer(unlist(lapply(strsplit(grep("segsites:",dat,value=TRUE),split=":"),function(x) x[[2]])))
  if (!is.null(nbp)) 
    pos<-round(unlist(lapply(lapply(strsplit(grep("positions",dat,value=T),split=":"),function(x) strsplit(x[[2]],split=" ")),function(y) as.numeric(y[[1]][-1])))*nbp,0)
  else pos<-unlist(lapply(nsnps,function(x) 1:x))
    dats1<-lapply(dats,strsplit,"")
  dats2<-lapply(dats1,function(x) matrix(as.integer(unlist(x)),nrow=nt,byrow=TRUE))
  alldat<-NULL
  for (i in 1:nrep)
    alldat<-cbind(alldat,dats2[[i]])

      return(list(alldat=alldat,bim=data.frame(chr=rep(1:nrep,nsnps),pos=pos)))
}

####
#' Import the output of the \code{ms} program in a \code{BED} object
#' 
#' Import the output of the \code{ms} program into a \code{BED} object, as defined in the 
#' \href{https://cran.r-project.org/package=gaston}{gaston} package
#' 
#' @param  fname the name of the text file containing \code{ms} output
#' 
#' @return a bed object
#' 
#' @export
####
ms2bed<-function(fname){
  dos<-ms2dos(fname)
  x<-0:(dim(dos$alldat)[1]%/%2-1)
  dos2<-dos$alldat[x*2+1,]+dos$alldat[x*2+2,]
  bed<-gaston::as.bed.matrix(dos2)
  bed@snps$pos<-dos$bim$pos
  bed@snps$chr<-dos$bim$chr
  return(bed)
}

Try the hierfstat package in your browser

Any scripts or data that you put into this service are public.

hierfstat documentation built on May 6, 2022, 1:05 a.m.