Archive/NIMBioS.code/analysis_funcs.R

analysis_funcs <- function(params){

  stopifnot(require(strataG))
  stopifnot(require(pegas))
  stopifnot(require(hierfstat))

  # saving global variables
  curr_scn<-params@current.scenario
  curr_rep<-params@current.replicate
  num_loci<-params@scenarios[[curr_scn]]@num.loci
  num_reps<-params@num.reps
  num_pops<-params@scenarios[[curr_scn]]@num.pops
  params@analyses.requested <- analyses.check(params@analyses.requested)

  #params@rep.sample is either a genind or a list of DNAbin objects
  if(inherits(params@rep.sample, "genind")){
    results_gtype<-genind2gtypes(params@rep.sample)
    } else if (inherits(params@rep.sample, "gtypes")){
      results_gtype <- params@rep.sample
      } else {
        # Convert the list of DNAbin objects to gtypes
        genes <- params@rep.sample
        results_gtype <- sequence2gtypes(genes$dna.seqs, strata = genes$strata)
        results_gtype <- labelHaplotypes(results_gtype)$gtype
        }

  loc_names <- locNames(results_gtype)
  strata_names <- strataNames(results_gtype)

  # If analysis results is empty, the first analysis done creates the list to hold the data
  if(is.null(params@analysis.results)){
    params@analysis.results <- sapply(c("Global","Locus","Pairwise"), function(x) NULL)
  }

        ######################### Global ##########################

  if(params@analyses.requested["Global"]){

    # multiDNA
    if(inherits(params@rep.sample,c("multidna","gtypes","list"))){

    #overall_stats() is from skeleSim.funcs.R
    #run by locus analysis across all populations
    r.m <- lapply(locNames(results_gtype), function (l){
      gtypes_this_loc<-results_gtype[,l,]
      overall_stats(gtypes_this_loc)
      })
    #find complete list of column names
    analysis.names <- unique(do.call(c, lapply(r.m, function(x) names(x))))
    r.m <- lapply(r.m, function(x){
      missing <- setdiff(analysis.names, names(x))
      x[missing] <- NA
      names(x) <- analysis.names
      x
    })
    results.matrix.l <- do.call(rbind, r.m)
    results.matrix <- rbind(overall_stats(results_gtype),results.matrix.l)
    analyses <- colnames(results.matrix)
    num_analyses <- length(analyses)
    #rownames(results.matrix) <- locNames(results_gtype)

    # We are printing by gene, not overall gene analysis. This differs from the genind code below.
    # The first row will hold summary statistics over all loci regardless of population structure.
    # The remaining rows will hold summary statistics per locus
    if(is.null(params@analysis.results[["Global"]][[curr_scn]])){
      params@analysis.results[["Global"]][[curr_scn]] <- array(0,dim=c(num_loci+1,
                                                                       num_analyses,
                                                                       num_reps),
                                                               dimnames = list(c(1:(num_loci+1)),
                                                                               analyses,
                                                                               1:num_reps))
    }

    params@analysis.results[["Global"]][[curr_scn]][,,curr_rep] <- results.matrix
    }

    if(inherits(params@rep.sample, "genind")){

      # Eric will improve overallTest {strataG} to deal with invariant loci
      # all statistics crash
      ovl.global <- overallTest(results_gtype, nrep=5, stat.list=statList("chi2"), quietly=TRUE)$result
      ovl.all.global <- as.data.frame(as.table(ovl.global))[,3]

      ovl <- lapply(locNames(results_gtype), function(l){
        overallTest(results_gtype[,l,], nrep = 5, stat.list = statList("chi2"), quietly = TRUE)$result
      })

      ovl.all <- lapply(ovl, function(x){
        as.data.frame(as.table(x))
        })

      ovl.all.values <- lapply(ovl.all, function(x){
        x[,-c(1:2)]
        })
      ovl.all.names <- paste(ovl.all[[1]]$Var1, ovl.all[[1]]$Var2,sep="_")
      ovl.all.out <- do.call(rbind, ovl.all.values)
      ovl.all.results <- rbind(ovl.all.global,ovl.all.out)

      if(is.null(params@analysis.results[["Global"]][[curr_scn]])){
        params@analysis.results[["Global"]][[curr_scn]] <- array(0,dim=c(num_loci+1,
                                                                         length(ovl.all.names),
                                                                         num_reps),
                                                                 dimnames = list(c(1:(num_loci+1)),
                                                                                 ovl.all.names,
                                                                                 1:num_reps))
      }

      params@analysis.results[["Global"]][[curr_scn]][,,curr_rep] <- ovl.all.results
    }

    }


        ######################### LOCUS ############################

  if(params@analyses.requested["Locus"]){

# genind
        if(inherits(params@rep.sample,"genind")){

          #Hardy Weinberg, per locus over populations, per locus per population
          locus <- hweTest(results_gtype)
          locus.pop <- lapply(strataSplit(results_gtype), function(s){
            hw <- hweTest(s)
            missingloc <- setdiff(loc_names, names(hw))
            hw[missingloc] <- NA
            hw
            })
          hwe.pval <- do.call(rbind,lapply(locus.pop, function(x) x[match(names(locus.pop[[1]]), names(x))]))
          hwe.pval.pop <- as.data.frame(as.table(hwe.pval))
          locus.t <- cbind(NA, names(locus), data.frame(locus))
          names(locus.t) <- c("Pop","Locus","HWE.pval")
          names(hwe.pval.pop) <- c("Pop","Locus","HWE.pval")
          hwe <- rbind(locus.t,hwe.pval.pop)

          #mratio on gtypes object, function needs genetic data as a gtype
          mrat_results_all<-calc.mratio(results_gtype)

          #by locus, all the other stats (num alleles etc) pulled from summarizeLoci
          smryLoci <- cbind(Locus = 1:num_loci,Pop = NA,summarizeLoci(results_gtype))
          #by population
          locus_pop <- as.matrix(expand.grid(1:num_loci,1:num_pops))  #cycles through the loci for each pop
          smryPop.1 <- cbind(locus_pop,do.call(rbind,summarizeLoci(results_gtype, by.strata = TRUE)))
          #sort by each population per locus
          smryPop <- smryPop.1[order(rownames(smryPop.1)),order(colnames(smryPop.1))]

          smry <- rbind(smryLoci[,-c(1:2)],smryPop[,-c(10,11)])

          #Number of private alleles by locus
          alleleFreqs <- alleleFreqs(results_gtype, by.strata = TRUE)
          by.loc <- sapply(alleleFreqs, function(loc) {
            mat <- loc[, "freq", ]
            rowSums(apply(mat, 1, function(r) {
              result <- rep(FALSE, length(r))
              if(sum(r > 0) == 1) result[r > 0] <- TRUE
              result
            }))
          })
          rownames(by.loc) <- strataNames(results_gtype)
          perLocus <- colSums(by.loc) #this has the number of alleles that are private per locus
          #the rows will be have the private alleles for each population by locus
          num.priv.allele <- c(perLocus, as.data.frame(as.table(by.loc))[,3])

          # Convert from genind to loci (package pegas))
          #Fis estimation:   Fis = 1-(Ho/He)
          results_loci<-genind2loci(params@rep.sample)
          #for loci
          FSTloci<-Fst(results_loci)

          #for pops
          # pop.1/locus.1:num_loci - pop.num_pops/locus.1:num_loci..
          FSTpop<-lapply(levels(results_loci$population), function(x){
            fst <- Fst(results_loci[results_loci$population == x,], pop=results_loci$population)
          })
          FSTpop2sort <- mapply(function(mat,pn){
            x <- data.frame(mat)
            x$Locus <- row.names(x)
            x$Pop <- pn
            x
          }, mat = FSTpop, pn = 1:length(FSTpop), SIMPLIFY = FALSE)
          FSTpop.1 <- do.call(rbind,FSTpop2sort)
          FSTpop.2 <- FSTpop.1[order(FSTpop.1$Locus,FSTpop.1$Pop),]
          FSTpop.all <- rbind(FSTloci,FSTpop.2[,-c(4:5)])


          #all the analyses get bound here
          # sorted by Loci across all populaitons,
          #   then Locus.1/Pop.1:Pop.num_pops ... Locus.num_loci/Pop.1:Pop.num_pops
          locus.final <- cbind(HWE.pval = hwe[,-c(1:2)],mrat_results_all,smry,num.priv.allele,FSTpop.all, row.names=NULL)
          analysis_names <- colnames(locus.final)
          locus.final <- as.matrix(locus.final)


          # Create the data array first time through
          if(is.null(params@analysis.results[["Locus"]][[curr_scn]])){
            params@analysis.results[["Locus"]][[curr_scn]] <- array(0, dim=c(num_loci*(num_pops+1),
                                                                             length(analysis_names),
                                                                             num_reps),
                                                                    dimnames = list(1:(num_loci*(num_pops+1)),
                                                                                    analysis_names,
                                                                                    1:num_reps))
          }

          params@analysis.results[["Locus"]][[curr_scn]][,,curr_rep] <-  locus.final

        }

    # multiDNA
    # Per gene (ignoring population structure, and per gene by population)

      if(inherits(params@rep.sample,c("multidna","gtypes","list"))){

        #Nucleotide diversity
        # by gene, across populations
          r.m.gene <- lapply(locNames(results_gtype), function(l){
            mean(nucleotideDiversity(results_gtype[,l,]@sequences),na.rm=TRUE)
            })
          nD <- do.call(rbind, r.m.gene)

          # by gene per popualation "strata" - pop1:gene1, pop1:gene2, pop2....
          r.m <- lapply(strataNames(results_gtype), function(s){
            lapply(locNames(results_gtype), function(l){
            mean(nucleotideDiversity(results_gtype[,l,s]@sequences),na.rm=TRUE)
            })
          })
          r.m.bind <- do.call(c, lapply(r.m, function(x){
            do.call(c,x)
          }))
          nD.all <- c(nD, r.m.bind)

        # Fu's Fs
          fu.fs.results <- fusFs(results_gtype)

          #by population for each strataNames(results_gtype) and results_gtype[,,pops]
          fu.fs.pop <- lapply(strataNames(results_gtype), function(s){
            lapply(locNames(results_gtype), function(l){
              fusFs(results_gtype[,l,s])
              })
            })
          fu.fs.results.pop <- do.call(rbind, lapply(fu.fs.pop, function(x){
            do.call(rbind,x)
          }))

          fu.fs.all <- c(fu.fs.results, fu.fs.results.pop)

        # Tajimas D
          # by gene
          t.d.results <-  tajimasD(params@rep.sample$dna.seqs)

        # Num samples, num missing, num alleles, percent unique alleles, heterozygosity
          # by gene per population
          t.d.pop <- lapply(strataNames(results_gtype), function(s){
            lapply(locNames(results_gtype), function(l){
              tajimasD(results_gtype[,l,s])
            })
          })
          t.d.pop.bind <- do.call(rbind, lapply(t.d.pop, function(x){
            do.call(rbind,x)
          }))
          t.d.all <- rbind(t.d.results, t.d.pop.bind)

        # Summary for loci and populations
          unstrat <- results_gtype
          strata(unstrat) <- "Default"
          smryLoci.gene <- lapply(locNames(unstrat), function(l){
            summary(unstrat[,l,])$strata.smry
          })
          smryLoci <- do.call(rbind,smryLoci.gene)

          smryPop <- lapply(locNames(results_gtype), function(l){
            summary(results_gtype[,l,], by.strata = TRUE)$strata.smry
          })
          smryPop.all <- do.call(rbind, smryPop)
          summary.analyses <- dimnames(smryPop.all)[[2]]

          #Loci over all populations, locus 1 per population, locus 2 per population...
          smryLP <- rbind(smryLoci,smryPop.all)

        # Nucleotide and percent within strata divergence, mean percent within
          # gene by population
          #mean.pct.within
#          dA <- nucleotideDivergence(results_gtype)
#          dA.names <- colnames(dA[[1]]$within)
#          dA.pop <- do.call(rbind, lapply(1:length(dA), function(i){
#            rbind(dA[[i]]$within)
#          }))

          # do we want dA in locus, doesn't make sense.
#          colnames(dA[[2]]$between)

          # nucleotide divergence is pairwise between and within strata. Cannot use unstratified.
#          dA.genes <- nucleotideDivergence(unstrat)[[1]]$within
#          geneNAs <- matrix(NA, num_loci, 6)  # no data over strata for each gene
#          dA.all <- rbind(geneNAs, dA.pop)

#          dA.all <- rbind(geneNAs, dA.pop)
#          dA.analyses <- dimnames(dA.all)[[2]]


        # num.private.alleles
          hapFreqs <- lapply(strataNames(results_gtype), function(s){
            lapply(locNames(results_gtype), function(l){
            hapFreqs <- alleleFreqs(results_gtype[,l,], by.strata = TRUE)
            by.loc <- sapply(hapFreqs, function(loc) {
              mat <- loc[, "freq", ]
              rowSums(apply(mat, 1, function(r) {
                result <- rep(FALSE, length(r))
                if(sum(r > 0) == 1) result[r > 0] <- TRUE
                result
                }))
              })
            colSums(by.loc)
            })
          })
          hapFreqs.pop <- do.call(c,do.call(c,hapFreqs))

          #by gene
          hapFreqs.gene <- lapply(locNames(results_gtype), function(l){
            hapFreqs <- alleleFreqs(results_gtype[,l,], by.strata = FALSE)
            by.loc <- sapply(hapFreqs, function(loc) {
              mat <- loc[,"freq"] #no strata
              mat[mat>0]<-1
              sum(mat)
            })
            sum(by.loc)
          })
          num.pri.haps <- c(do.call(c,hapFreqs.gene), hapFreqs.pop)

          #Ne placeholder

          #how to deal with nD?
          # make sure nucleotide Divergence is right and add or keep names below
          # to do start here
          locus.final <- cbind(nD.all,fu.fs.all,t.d.all,smryLP,#dA.all[,1],
                                    num.pri.haps)
          analysis_names <- c("nucloetide.diversity", "Fu.F",colnames(t.d.all),summary.analyses,
                              #"nucleotide.divergence",
                              "num.private.haps")

          # Create the data array first time through
          # gene.1...gene.num_loci, pop.1/gene.1:gene.num_loci...pop.num.pops/gene.1:gene.num_loci
          if(is.null(params@analysis.results[["Locus"]][[curr_scn]])){
            params@analysis.results[["Locus"]][[curr_scn]] <- array(0, dim=c(num_loci*(num_pops+1),
                                                                             length(analysis_names),
                                                                             num_reps),
                                                                    dimnames = list(1:(num_loci*(num_pops+1)),
                                                                                    analysis_names,
                                                                                    1:num_reps))
          }

          params@analysis.results[["Locus"]][[curr_scn]][,,curr_rep] <-  locus.final
      }
  }


        ######################### Pairwise ##########################

  if(params@analyses.requested["Pairwise"]){

    if(inherits(params@rep.sample, c("multidna","gtypes","list"))){

      #nucleotide Divergence and mean.pct.between
      dA <- nucleotideDivergence(results_gtype)
      dA.between <- lapply(dA, function(x){
        x$between
      })
      dA.all <- do.call(rbind,dA.between)[,grep("pct",names(dA.between[[1]]),invert = TRUE)]

      #Chi2, Fst, PHist
      psw <- lapply(locNames(results_gtype), function(l){
        pairwiseTest(results_gtype[,l,],nrep = 5,
                     stat.list = list(statGst),
                     quietly = TRUE)$result[-c(1:6,8:9,12:21)]
      })
      psw.all <- do.call(rbind,psw)

      # shared haps
      sA <- sharedAlleles(results_gtype)
      sA <- reshape(sA, direction = "long", varying=list(loc_names), v.names="sA",
                    timevar = "gene",idvar=c("strata.1","strata.2"),
                    new.row.names = 1:(num_loci*choose(num_pops,2)))[,"sA"]

      pairwise.final <- cbind(dA.all,psw.all,sA)
      analysis_names <- names(pairwise.final)[-c(1:2)]

      #Data.frame of summary data into simulation replicate
      # Create the data array first time through
      if(is.null(params@analysis.results[["Pairwise"]][[curr_scn]])){
        params@analysis.results[["Pairwise"]][[curr_scn]] <- array(0, dim=c(num_loci*choose(num_pops,2),
                                                                            length(analysis_names),
                                                                            num_reps),
                                                                   dimnames = list(apply(pairwise.final[,1:2],1,paste,collapse="."),
                                                                                   analysis_names,
                                                                                   1:num_reps))
      }

      params@analysis.results[["Pairwise"]][[curr_scn]][,,curr_rep] <-  data.matrix(pairwise.final[,-c(1:2)])

    }

    if(inherits(params@rep.sample, "genind")){

      #Pairwise Chi2, D, F..., G...
      pws <- pairwiseTest(results_gtype, nrep =5, keep.null=TRUE, quietly = TRUE)[1]$result
      #pairwise by locus
      pws.loc <- lapply(loc_names, function(l){
        pairwiseTest(results_gtype[,l,], nrep = 5, keep.null=TRUE, quietly=TRUE)[1]$result
        })
      pws.loc.all <- do.call(rbind,pws.loc)
      pws.all <- rbind(pws,pws.loc.all)

      sA <- sharedAlleles(results_gtype)
      sA.long <- reshape(sA, idvar = c("strata.1","strata.2"),
                         varying = names(sA[,-c(1:2)]),
                         timevar = "Locus",
                         v.names = "sA",
                         direction = "long")
      #shared alleles sumed over loci
      sA.sum <- rowSums(sA[,-c(1:2)])
      sA.all <- c(sA.sum,sA.long$sA)

      #chord.dist
      results_hierfstat <- genind2hierfstat(params@rep.sample)
      chord.dist <- genet.dist(results_hierfstat, diploid = TRUE, method = "Dch")
      ch.dist <- as.data.frame(as.table(chord.dist))
      # chord.dist by locus
      chord.dist.locus <- lapply(loc_names, function(l){
        as.data.frame(as.table(genet.dist(results_hierfstat[,c("pop",l)], diploid = TRUE, method = "Dch")))
      })
      chord.dist.l <- do.call(rbind,chord.dist.locus)
      chord.dist.all <- rbind(ch.dist,chord.dist.l)

      locus.final <- cbind(pws.all[,-c(1:5)],sA = sA.all,chord_distance = chord.dist.all[,2])
      #locus.final <- locus.final.names[,sapply(locus.final.names,is.numeric)]
      analysis_names <- names(locus.final)

      #Data.frame of summary data into simulation replicate
      # Create the data array first time through
      if(is.null(params@analysis.results[["Pairwise"]][[curr_scn]])){
        params@analysis.results[["Pairwise"]][[curr_scn]] <- array(0, dim=c((num_loci*choose(num_pops,2))+choose(num_pops,2),
                                                                            length(analysis_names),
                                                                            num_reps),
                                                                   dimnames = list(1:(num_loci*choose(num_pops,2)+choose(num_pops,2)),
                                                                                   analysis_names,
                                                                                   1:num_reps))
        }

        params@analysis.results[["Pairwise"]][[curr_scn]][,,curr_rep] <-  as.matrix(as.matrix(locus.final))

      }


  }
  params
}
christianparobek/skeleSim documentation built on Feb. 29, 2020, 6:58 p.m.