R/ethseq_Analysis.R
In EthSEQ: Ethnicity Annotation from Whole-Exome and Targeted Sequencing Data

Documented in ethseq.Analysis

#' Ancestry analysis from whole-exome and targeted sequencing data
#'
#' This function performs ancestry analysis of a set of samples ad reports the results.
#'
#' @param target.vcf Path to the sample's genotypes in VCF format
#' @param target.gds Path to the sample's genotypes in GDS format
#' @param bam.list Path to a file containing a list of BAM files paths
#' @param out.dir Path to the folder where the output of the analysis is saved
#' @param model.gds Path to a GDS file specifying the reference model
#' @param model.available String specifying the pre-computed reference model to use
#' @param model.assembly String value indicating the assembly version to download for the pre-build models
#' @param model.pop String value indicating the population to download for the pre-build models
#' @param model.folder Path to the folder where reference models are already present or downloaded when needed
#' @param run.genotype Logical values indicating whether the ASEQ genotype should be run
#' @param aseq.path Path to the folder where ASEQ binary is available or is downloaded when needed
#' @param mbq Minmum base quality used in the pileup by ASEQ
#' @param mrq Minimum read quality used in the piluep by ASEQ
#' @param mdc Minimum read count acceptable  for genotype inference by ASEQ
#' @param cores Number of parallel cores used for the analysis
#' @param verbose Print detailed information
#' @param composite.model.call.rate SNP call rate used to run Principal Component Analysis (PCA)
#' @param refinement.analysis Matrix specifying a tree of ancestry sets
#' @param space Dimensions of PCA space used to infer ancestry (2D or 3D)
#' @param bam.chr.encoding Logical value indicating whether input BAM files have chromosomes encoded with "chr" prefix
#' @return Logical value indicating the success of the analysis
#' @export

ethseq.Analysis <- function(
  target.vcf = NA,
  target.gds = NA,
  bam.list = NA,
  out.dir = tempdir(),
  model.gds = NA,
  model.available = NA,
  model.assembly = 'hg38',
  model.pop = 'All',
  model.folder = tempdir(),
  run.genotype = FALSE,
  aseq.path = tempdir(),
  mbq=20,
  mrq=20,
  mdc=10,
  cores=1,
  verbose=TRUE,
  composite.model.call.rate = 1,
  refinement.analysis = NA,
  space = "2D",
  bam.chr.encoding = FALSE)
{
  
  ## Version
  .message.Date("Running EthSEQ")
  .message.Date(paste("Working directory: ",out.dir,sep=""))
  
  if(!file.exists(out.dir))
  {
    .message.Date(paste("Create ",out.dir," folder",sep=""))
    dir.create(out.dir)
  }
  
  ## Select Model when model.gds is not specified
  if(is.na(model.gds))
  {
    if(is.na(model.available))
    {
      .message.Date("ERROR: model.gds or model.available variables should be specified.")
      return(FALSE)
    }
    model.path = .get.Model(model.available,model.folder, model.assembly, model.pop)
    if(is.na(model.path)){
      .message.Date(paste("Model ",model.available," (using assembly ",model.assembly," and/or poopulation ",model.pop,") not available.\nPlease run getModelsList() to obtain the list of available models, assembly and populations.",sep=""))
      return(FALSE)
    }
  } else
  {
    model.path = model.gds
  }
  
  # Run ASEQ when target VCF and GDS are not provided
  if(is.na(target.vcf)&is.na(target.gds))
  {
    if(is.na(bam.list))
    {
      .message.Date("ERROR: target.vcf or bam.list variables should be specified.")
      return(FALSE)
    }
    
    ### Check if all BAM files exists
    if(run.genotype&any(!file.exists(unique(readLines(bam.list)))))
    {
      .message.Date("ERROR: one or more BAM files do not exist.")
      return(FALSE)
    }
    bam.files = unique(readLines(bam.list))
    sample.names = gsub(".bam","",basename(bam.files))
    
    ## Execute ASEQ to generate genotypes
    table = data.frame(sample=gsub(".bam","",basename(bam.files)),bam=bam.files)
    genotype.dir = file.path(out.dir,"ASEQGenotypes","")
    if(!file.exists(genotype.dir))
    {
      .message.Date(paste("Create ",genotype.dir," folder",sep=""))
      dir.create(genotype.dir)
    }
    
    .message.Date(paste("Run ASEQ to generate genotypes for ",length(bam.files)," samples",sep=""))
    if(run.genotype)
    {
      res = .aseq.Run(bam.files,aseq.path,genotype.dir,out.dir,mbq,mrq,mdc,model.path,cores,bam.chr.encoding)
      if(!res)
      {
        .message.Date("ERROR: Error while executing ASEQ.")
        return(FALSE)
      }
    }
    
    ## Create Target Model GDS file
    .message.Date("Create target model")
    .create.Target.Model(sample.names,genotype.dir,out.dir,cores,bam.chr.encoding)
    
    target.model = file.path(out.dir,"Target.gds")
  } else if(is.na(target.gds))
  {
    .message.Date("Create target model from VCF")
    .create.Target.Model.From.VCF(target.vcf,out.dir,cores)

    target.model = file.path(out.dir,"Target.gds")
  } else
  {
    target.model = target.gds
  }
  
  ### Create Composite model - Changes with SNPRelate function
  .message.Date("Create aggregated model")
  res = .combine.Models(model.path,target.model,out.dir,verbose)
  # snpgdsCombineGeno(c(target.model,model.path),file.path(out.dir,"Aggregated.gds"),
  #                   method = 'position',snpfirstdim = TRUE,verbose=verbose)
  # res = file.exists(file.path(out.dir,"Aggregated.gds"))
  # genofile <- snpgdsOpen(file.path(out.dir,"Aggregated.gds"),readonly = F)
  # target.sample.annot = read.gdsn(index.gdsn(snpgdsOpen(target.model),'sample.annot'))
  # reference.sample.annot = read.gdsn(index.gdsn(snpgdsOpen(model.path),'sample.annot'))
  # sample.annot <- rbind(target.sample.annot,reference.sample.annot)
  # add.gdsn(genofile,"sample.annot",sample.annot)
  # snpgdsClose(genofile)
  
  if(!res)
  {
    .message.Date("ERROR: Target and reference models are not compatible.")
    return(FALSE)
  }
  
  ### Perform PCA
  .message.Date("Perform PCA on aggregated model")

  model <- snpgdsOpen(file.path(out.dir,"Aggregated.gds"),readonly = F)
  pca <- snpgdsPCA(model,num.thread = cores,eigen.method = "DSPEVX",verbose = verbose,missing.rate = 1-composite.model.call.rate,eigen.cnt=5)
  sample.id <- read.gdsn(index.gdsn(model, "sample.id"))
  pop_code <- read.gdsn(index.gdsn(model, "sample.annot/pop.group"))
  tab <- data.frame(sample.id = pca$sample.id,
                    pop = factor(pop_code)[match(pca$sample.id, sample.id)],
                    EV1 = pca$eigenvect[,1],
                    EV2 = pca$eigenvect[,2],
                    EV3 = pca$eigenvect[,3],
                    EV4 = pca$eigenvect[,4],
                    EV5 = pca$eigenvect[,5],
                    stringsAsFactors = FALSE)
  snpgdsClose(model)
  
  ### Infer ethnicity
  .message.Date("Infer ethnicities")
  annotations = .get.Ethnicity(tab,space)
  
  if(!all(is.na(refinement.analysis)))
  {
    res = .check.Matrix(refinement.analysis,names(annotations[[2]]))
    if(res)
    {
      .message.Date("ERROR: Refinement analysis matrix is wrongly formatted.")
      return(FALSE)
    }
    refinement.index = 1
    refinement.leafs = .get.Position.Leafs(refinement.analysis)
    refinement.position = .get.Position.Matrix(refinement.analysis)
    refinement.position = unlist(apply(refinement.position,1,function(x) x[which(x!="")]))
    refinement.position = as.numeric(refinement.position)
    refinement.analysis = unlist(apply(refinement.analysis,1,function(x) x[which(x!="")]))
    refinements = list()
    refinements[[1]] = annotations
    out = refinements[[1]][[1]][,c(1,2,8)]
    
    ### Plot visual PDF report
    .message.Date("Plot visual report")
    coord = annotations[[1]]
    n.dim = as.numeric(gsub("D","",space))-1
    coord = coord[,c(1,3:(3+n.dim))]
    coord[,1] = gsub("target.","",coord[,1])
    write.table(coord,file.path(out.dir,"Report_Ref0.PCAcoord"),sep="\t",row.names=F,quote=F)
    .plot.Report(tab,annotations[[1]],length(pca$snp.id),annotations[[2]],annotations[[3]],out.dir,label="_Ref0",space = space)
    
    for(s in 1:length(refinement.analysis))
    {
      .message.Date(paste("Refinement step ",s," on populations (",paste(refinement.analysis[[s]],collapse=","),")",sep=""))
      model <- snpgdsOpen(file.path(out.dir,"Aggregated.gds"),readonly = F)
      samples <- refinements[[refinement.position[s]]][[1]]
      samples <- samples$sample.id[which(sapply(samples$pop,function(x) length(intersect(strsplit(x,"\\|")[[1]],strsplit(refinement.analysis[[s]],"\\|")[[1]]))>0))]
      sample.id <- read.gdsn(index.gdsn(model, "sample.id"))
      pop_code <- read.gdsn(index.gdsn(model, "sample.annot/pop.group"))
      samples = c(samples,sample.id[which(as.character(pop_code)%in%strsplit(refinement.analysis[[s]],"\\|")[[1]])])
      pca <- snpgdsPCA(model,num.thread = cores,eigen.method = "DSPEVX",verbose = verbose,sample.id = samples, missing.rate=1-composite.model.call.rate,eigen.cnt =5)
      
      idx = which(as.character(pop_code)%in%c("ND",strsplit(refinement.analysis[[s]],"\\|")[[1]]))
      tab <- data.frame(sample.id = pca$sample.id,
                        pop = factor(pop_code)[match(pca$sample.id, sample.id)],
                        EV1 = pca$eigenvect[,1],
                        EV2 = pca$eigenvect[,2],
                        EV3 = pca$eigenvect[,3],
                        EV4 = pca$eigenvect[,4],
                        EV5 = pca$eigenvect[,5],
                        stringsAsFactors = FALSE)
      snpgdsClose(model)
      
      annotations = .get.Ethnicity(tab,space)
      if(annotations[[4]])
      {
        coord = annotations[[1]]
        n.dim = as.numeric(gsub("D","",space))-1
        coord = coord[,c(1,3:(3+n.dim))]
        coord[,1] = gsub("target.","",coord[,1])
        write.table(coord,file.path(out.dir,paste("Report_Ref",s,".PCAcoord",sep="")),sep="\t",row.names=F,quote=F)
        .plot.Report(tab,annotations[[1]],length(pca$snp.id),annotations[[2]],annotations[[3]],out.dir,label=paste("_Ref",s,sep=""),space = space)
      }
      if(!refinement.leafs[s])
      {
        refinements[[refinement.index+1]] = annotations
        refinement.index = refinement.index+1
      }
      
      if(annotations[[4]])
      {
        for(i in 1:nrow(out))
        {
          id = which(annotations[[1]]$sample.id==out$sample.id[i])
          if(length(id)>0)
          {
            out$pop[i] = annotations[[1]]$pop[id]
            out$type[i] = annotations[[1]]$type[id]
          }
        }
      }
    }
    
    .message.Date("Print annotations")
    out[,1] = gsub("target.","",out[,1])
    write.table(out,file.path(out.dir,"Report.txt"),sep="\t",row.names=F,quote=F)
    
  } else
  {
    ### Print ethnicity annotations on tex tab-delimited file
    .message.Date("Print annotations")
    out = annotations[[1]]
    out = out[,c(1,2,8,9)]
    out[,1] = gsub("target.","",out[,1])
    write.table(out,file.path(out.dir,"Report.txt"),sep="\t",row.names=F,quote=F)
    out = annotations[[1]]
    n.dim = as.numeric(gsub("D","",space))-1
    out = out[,c(1,3:(3+n.dim))]
    out[,1] = gsub("target.","",out[,1])
    write.table(out,file.path(out.dir,"Report.PCAcoord"),sep="\t",row.names=F,quote=F)
    
    ### Plot visual PDF report
    .message.Date("Plot visual report")
    .plot.Report(tab,annotations[[1]],length(pca$snp.id),annotations[[2]],annotations[[3]],out.dir,space=space)
  }
  
  .message.Date("Computation end")
  return(TRUE)
}