R/plot.pcoa.aggregate.R

Defines functions plot_pcoa_aggregate

Documented in plot_pcoa_aggregate

##### Plot the ordination of different aggregates

plot_pcoa_aggregate=function(f,ncode,level,label = TRUE) {

  diff1dis=function(f,ncode){
  #diveRsity::readGenepop
    arp2gen=function (infile)
    {
      flForm <- strsplit(infile, split = "\\.")[[1]]
      if (substr(infile, 1, 2) == "./") {
        flForm <- flForm[-1]
      }
      else if (substr(infile, 1, 3) == "../") {
        flForm <- flForm[-(1:2)]
      }
      if (length(flForm) > 3) {
        stop("There were multiple '.' characters in your file name!")
      }
      tstfile <- paste(flForm[1], ".gen", sep = "")
      if (!file.exists(tstfile)) {
        fastScan <- function(fname) {
          s <- file.info(fname)$size
          buf <- readChar(fname, s, useBytes = TRUE)
          if (length(grep("\r", buf)) != 0L) {
            buf <- gsub("\r", "\n", buf)
            buf <- gsub("\n\n", "\n", buf)
          }
          return(strsplit(buf, "\n", fixed = TRUE, useBytes = TRUE)[[1]])
        }
        dat <- fastScan(infile)
        dat <- gsub("^\\s+|\\s+$", "", dat)
        dataType <- grep("*datatype=*", tolower(dat))
        if (strsplit(dat[dataType], "=")[[1]][2] != "MICROSAT") {
          stop("Data are not in 'MICROSAT' format!")
        }
        missDataLine <- grep("*missingdata=*", tolower(dat))
        missData <- noquote(substr(dat[missDataLine], nchar(dat[missDataLine]) -
                                     1, nchar(dat[missDataLine]) - 1))
        sampSizeLine <- grep("*samplesize=*", tolower(dat))
        if (length(sampSizeLine) > 1) {
          sampNpos <- sapply(sampSizeLine, function(i) {
            return(regexpr("=", dat[i])[1])
          })
        }
        popSizes <- as.numeric(substr(dat[sampSizeLine], start = sampNpos +
                                        1, stop = nchar(dat[sampSizeLine])))
        npops <- length(popSizes)
        sampStrt <- grep("*sampledata=*", tolower(dat))
        strts <- sapply(sampStrt, function(x) {
          if (dat[(x + 1)] == "") {
            return(x + 2)
          }
          else {
            return(x + 1)
          }
        })
        ends <- strts + ((popSizes * 2) - 1)
        nloci <- length(strsplit(dat[strts[1]], split = "\\s+")[[1]]) -
          2
        popGeno <- lapply(seq_along(strts), function(i) {
          return(dat[strts[i]:ends[i]])
        })
        popSzcheck <- sapply(popGeno, function(x) length(x)/2)
        if (!all(identical(popSzcheck, popSizes))) {
          stop("Failed! Please make sure that your file is formatted correctly.")
        }
        popIdx <- lapply(popGeno, function(x) {
          return(seq(1, length(x), 2))
        })
        popList <- lapply(seq_along(popGeno), function(i) {
          al1 <- matrix(unlist(strsplit(popGeno[[i]][popIdx[[i]]],
                                        split = "\\s+")), nrow = popSizes[i], byrow = TRUE)[,
                                                                                            -(1:2)]
          al2 <- matrix(unlist(strsplit(popGeno[[i]][(popIdx[[i]] +
                                                        1)], split = "\\s+")), nrow = popSizes[i], byrow = TRUE)
          tst <- matrix(paste(al1, al2, sep = ""), nrow = popSizes[i])
          tst <- cbind(paste(rep("pop", nrow(tst)), i, " ,",
                             sep = ""), tst)
          rm(al1, al2)
          z <- gc()
          rm(z)
          if (nchar(tst[1, 2]) == 4) {
            tst[tst == paste(missData, missData, sep = "")] <- "0000"
          }
          else {
            tst[tst == paste(missData, missData, sep = "")] <- "000000"
          }
          out <- apply(tst, 1, function(x) {
            return(paste(x, collapse = "\t"))
          })
          out <- c("POP", out)
          rm(tst)
          z <- gc()
          rm(z)
          return(out)
        })
        outfile <- strsplit(infile, "\\.")[[1]]
        if (length(outfile) >= 2) {
          outfile <- paste(outfile[-length(outfile)], collapse = ".")
        }
        else {
          outfile <- outfile[1]
        }
        loci <- paste("locus", 1:nloci, sep = "")
        loci <- c(paste(outfile, "_gen_converted", sep = ""),
                  loci)
        of <- c(loci, unlist(popList))
        out <- file(paste(outfile, ".gen", sep = ""), "w")
        for (i in 1:length(of)) {
          cat(of[i], "\n", file = out, sep = "")
        }
        close(out)
        return(TRUE)
      }
      else {
        return(NULL)
      }
    }
    fileReader=function (infile)
    {
      if (typeof(infile) == "list") {
        return(infile)
      }
      else if (typeof(infile) == "character") {
        flForm <- strsplit(infile, split = "\\.")[[1]]
        ext <- flForm[[length(flForm)]]
        if (ext == "arp") {
          convRes <- arp2gen(infile)
          if (!is.null(convRes)) {
            cat("Arlequin file converted to genepop format! \n")
            infile <- paste(flForm[1], ".gen", sep = "")
          }
          else {
            infile <- paste(flForm[1], ".gen", sep = "")
          }
        }
        dat <- scan(infile, sep = "\n", what = "character", quiet = TRUE)
        if (length(strsplit(dat[4], split = "\\s+")[[1]][-1]) >
            1) {
          locs <- strsplit(dat[2], split = "\\s+")[[1]]
          if (length(locs != 1)) {
            locs <- strsplit(dat[2], split = ",")[[1]]
          }
          locs <- as.character(sapply(locs, function(x) {
            x <- strsplit(x, split = "")[[1]]
            if (is.element(",", x)) {
              x <- x[-(which(x == ","))]
            }
            return(paste(x, collapse = ""))
          }))
          dat <- c(dat[1], locs, dat[-(1:2)])
        }
        popLoc <- grep("^([[:space:]]*)pop([[:space:]]*)$", tolower(dat))
        no_col <- popLoc[1] - 1
        if (popLoc[1] == 3) {
          locs <- unlist(strsplit(dat[2], split = c("\\,",
                                                    "\\s+")))
          dat <- c(dat[1], locs, dat[3:length(dat)])
        }
        popLoc <- grep("^([[:space:]]*)pop([[:space:]]*)$", tolower(dat))
        no_col <- popLoc[1] - 1
        dat1 <- sapply(dat, function(x) {
          x <- unlist(strsplit(x, split = "\\s+"))
          if (is.element("", x)) {
            x <- x[-(which(x == ""))]
          }
          if (is.element(",", x)) {
            x <- x[-(which(x == ","))]
          }
          if (length(x) != 1 && length(x) != no_col) {
            x <- paste(x, collapse = "")
          }
          if (length(x) < no_col) {
            tabs <- paste(rep(NA, (no_col - length(x))),
                          sep = "\t", collapse = "\t")
            line <- paste(x, tabs, sep = "\t")
            line <- unlist(strsplit(line, split = "\t"))
            return(line)
          }
          else {
            return(x)
          }
        })
      }
      out <- as.data.frame(t(dat1))
      rownames(out) <- NULL
      return(out)
    }
    readGenepop=function (infile = NULL, gp = 3, bootstrap = FALSE)
    {
      gp = gp
      infile = infile
      bootstrap = bootstrap
      data1 <- fileReader(infile)
      if (gp == 3) {
        data1[data1 == 0] <- NA
        data1[data1 == "999999"] <- NA
        data1[data1 == "000000"] <- NA
        data1[data1 == "NANA"] <- NA
      }
      else if (gp == 2) {
        data1[data1 == 0] <- NA
        data1[data1 == "9999"] <- NA
        data1[data1 == "0000"] <- NA
        data1[data1 == "NA"] <- NA
      }
      raw_data <- data1
      npops <- length(which(toupper(data1[, 1]) == "POP"))
      pop_pos <- c(which(toupper(data1[, 1]) == "POP"), (nrow(data1) +
                                                           1))
      pop_sizes <- vector()
      for (i in 1:npops) {
        pop_sizes[i] <- pop_pos[(i + 1)] - pop_pos[i] - 1
      }
      pop_names <- as.character(data1[(pop_pos[1:npops] + 1), 1])
      pop_weights <- 1/pop_sizes
      n_harmonic <- npops/sum(pop_weights)
      N <- pop_sizes
      nloci <- (pop_pos[1] - 2)
      if (nloci != (ncol(raw_data) - 1)) {
        stop("Check your input file for formatting errors!")
      }
      loci_names <- as.vector(data1[2:(pop_pos[1] - 1), 1])
      pop_list <- list()
      for (i in 1:npops) {
        pop_list[[i]] <- as.matrix(data1[(pop_pos[i] + 1):(pop_pos[(i +
                                                                      1)] - 1), 2:(nloci + 1)])
      }
      extCheck <- sapply(1:length(pop_list), function(i) {
        sum(is.na(pop_list[[i]])) == nloci * pop_sizes[i]
      })
      if (sum(extCheck) > 0) {
        npops <- npops - sum(extCheck)
        pop_list <- pop_list[-(which(extCheck == TRUE))]
        pop_sizes <- pop_sizes[-(which(extCheck == TRUE))]
        pop_names <- pop_names[-(which(extCheck == TRUE))]
        pop_weights <- pop_weights[-(which(extCheck == TRUE))]
        N <- N[-(which(extCheck == TRUE))]
        noPop <- which(extCheck == TRUE)
        indexer <- lapply(noPop, function(i) {
          (pop_pos[i] + 1):(pop_pos[(i + 1)])
        })
        indexer <- unlist(indexer)
        raw_data <- raw_data[-(indexer), ]
      }
      if (gp == 3) {
        plMake <- function(x) {
          out <- matrix(sprintf("%06g", as.numeric(x)), nrow = nrow(x),
                        ncol = ncol(x))
          if (Sys.info()["sysname"] == "Darwin") {
            out[out == "0000NA"] <- "    NA"
          }
          return(out)
        }
      }
      else if (gp == 2) {
        plMake <- function(x) {
          out <- matrix(sprintf("%04g", as.numeric(x)), nrow = nrow(x),
                        ncol = ncol(x))
          if (Sys.info()["sysname"] == "Darwin") {
            out[out == "00NA"] <- "  NA"
          }
          return(out)
        }
      }
      suppressWarnings(pop_list <- lapply(pop_list, plMake))
      if (gp == 3) {
        for (i in 1:npops) {
          pop_list[[i]][pop_list[[i]] == "    NA"] <- NA
        }
      }
      else if (gp == 2) {
        for (i in 1:npops) {
          pop_list[[i]][pop_list[[i]] == "  NA"] <- NA
        }
      }
      if (bootstrap == T) {
        bs <- function(x) {
          return(matrix(x[sample(nrow(x), replace = TRUE),
                          ], ncol = ncol(x)))
        }
        pop_list <- lapply(pop_list, bs)
      }
      lps <- function(x) {
        lsp_count <- as.vector(colSums(!is.na(x)))
        return(lsp_count)
      }
      pre_loci_pop_sizes <- lapply(pop_list, lps)
      pls <- matrix(ncol = nloci, nrow = npops)
      for (i in 1:length(pre_loci_pop_sizes)) {
        pls[i, ] <- pre_loci_pop_sizes[[i]]
      }
      loci_pop_sizes <- split(pls, col(pls))
      pre_loc_weights <- 1/pls
      loci_pop_weights1 <- split(pre_loc_weights, col(pre_loc_weights))
      loci_harm_N <- npops/colSums(pre_loc_weights)
      if (gp == 3) {
        pl_ss <- function(x) {
          pl <- list()
          pl[[1]] <- matrix(substr(x, 1, 3), ncol = nloci)
          pl[[2]] <- matrix(substr(x, 4, 6), ncol = nloci)
          return(pl)
        }
      }
      else {
        pl_ss <- function(x) {
          pl <- list()
          pl[[1]] <- matrix(substr(x, 1, 2), ncol = nloci)
          pl[[2]] <- matrix(substr(x, 3, 4), ncol = nloci)
          return(pl)
        }
      }
      pop_alleles <- lapply(pop_list, pl_ss)
      alln <- function(x) {
        res <- list()
        for (i in 1:ncol(x[[1]])) {
          res[i] <- list(sort(unique(c(x[[1]][, i], x[[2]][,
                                                           i])), decreasing = F))
        }
        return(res)
      }
      allele_names <- lapply(pop_alleles, alln)
      loci_combi <- allele_names[[1]]
      for (j in 1:nloci) {
        for (i in 2:npops) {
          loci_combi[[j]] <- c(loci_combi[[j]], allele_names[[i]][[j]])
        }
      }
      aaList <- function(x) {
        return(sort(unique(x, decreasing = FALSE)))
      }
      all_alleles <- lapply(loci_combi, aaList)
      aa <- all_alleles
      aa <- lapply(aa, FUN = list, npops)
      afMatrix <- function(x) {
        np <- x[[2]]
        z <- matrix(rep(0, (np * length(x[[1]]))), ncol = np,
                    nrow = length(x[[1]]))
        rownames(z) <- x[[1]]
        return(z)
      }
      allele_freq <- lapply(aa, afMatrix)
      parbind <- function(x) {
        rbind(x[[1]], x[[2]])
      }
      pa1 <- lapply(pop_alleles, parbind)
      afTab <- function(x) {
        lapply(1:ncol(x), function(i) {
          return(table(x[, i]))
        })
      }
      actab <- lapply(pa1, afTab)
      afs <- function(x) {
        afsint <- function(y) {
          length(na.omit(y))/2
        }
        apply(x, 2, afsint)
      }
      indtyppop <- lapply(pa1, afs)
      afCalcpop <- lapply(1:length(actab), function(x) {
        lapply(1:length(actab[[x]]), function(y) {
          actab[[x]][[y]]/(indtyppop[[x]][y] * 2)
        })
      })
      obs_count <- allele_freq
      for (i in 1:npops) {
        for (j in 1:nloci) {
          allele_freq[[j]][names(afCalcpop[[i]][[j]]), i] <- afCalcpop[[i]][[j]]
          obs_count[[j]][names(actab[[i]][[j]]), i] <- actab[[i]][[j]]
        }
      }
      indtyp <- list()
      for (i in 1:nloci) {
        indtyp[[i]] <- vector()
      }
      for (i in 1:npops) {
        for (j in 1:nloci) {
          indtyp[[j]][i] <- indtyppop[[i]][j]
        }
      }
      if (bootstrap == T) {
        ind_vectors <- list()
        for (i in 1:npops) {
          ind_vectors[[i]] <- noquote(paste(rep(i, pop_sizes[i]),
                                            ",", sep = ""))
        }
        pre_data <- matrix(rep("\t", ((nloci + 1) * (nloci +
                                                       1))), ncol = (nloci + 1))
        pre_data[1, ] <- c("Title", rep("\t", nloci))
        for (i in 2:(nloci + 1)) {
          pre_data[i, 1] <- loci_names[(i - 1)]
        }
        pop_data <- list()
        for (i in 1:npops) {
          pop_data[[i]] <- matrix(rbind(c("POP", as.vector(rep("\t",
                                                               nloci))), cbind(ind_vectors[[i]], pop_list[[i]])),
                                  ncol = (nloci + 1))
        }
        bs_data_file <- matrix(rbind(pre_data, pop_data[[1]]),
                               ncol = (nloci + 1))
        for (i in 2:npops) {
          bs_data_file <- matrix(rbind(bs_data_file, pop_data[[i]]),
                                 ncol = (nloci + 1))
        }
        bs_data_file <- data.frame(bs_data_file)
      }
      nalleles <- vector()
      for (i in 1:nloci) {
        nalleles[i] <- nrow(allele_freq[[i]])
      }
      if (bootstrap == T) {
        list(npops = npops, nloci = nloci, pop_alleles = pop_alleles,
             pop_list = pop_list, loci_names = loci_names, pop_pos = pop_pos,
             pop_sizes = pop_sizes, allele_names = allele_names,
             all_alleles = all_alleles, allele_freq = allele_freq,
             raw_data = raw_data, loci_harm_N = loci_harm_N, n_harmonic = n_harmonic,
             pop_names = pop_names, indtyp = indtyp, nalleles = nalleles,
             bs_file = bs_data_file, obs_allele_num = obs_count)
      }
      else if (bootstrap == F) {
        list(npops = npops, nloci = nloci, pop_alleles = pop_alleles,
             pop_list = pop_list, loci_names = loci_names, pop_pos = pop_pos,
             pop_sizes = pop_sizes, allele_names = allele_names,
             all_alleles = all_alleles, allele_freq = allele_freq,
             raw_data = raw_data, loci_harm_N = loci_harm_N, n_harmonic = n_harmonic,
             pop_names = pop_names, indtyp = indtyp, nalleles = nalleles,
             obs_allele_num = obs_count)
      }
    }
  gp = ncode
  fr = readGenepop(f, gp, bootstrap = FALSE)
  af = fr$allele_freq

  DeltaD = function(abun, struc) {
    ## Chao et al, 2017
    n = sum(abun)
    N = ncol(abun)
    ga = rowSums(abun)
    gp = ga[ga > 0]/n
    G = sum(-gp * log(gp))
    H = nrow(struc)
    A = numeric(H - 1)
    W = numeric(H - 1)
    Diff = numeric(H - 1)
    wi = colSums(abun)/n
    W[H - 1] = -sum(wi[wi > 0] * log(wi[wi > 0]))
    pi = sapply(1:N, function(k) abun[, k]/sum(abun[, k]))
    Ai = sapply(1:N, function(k) -sum(pi[, k][pi[, k] > 0] * log(pi[, k][pi[, k] > 0])))
    A[H - 1] = sum(wi * Ai)
    if (H > 2) {
      for (i in 2:(H - 1)) {
        I = unique(struc[i, ])
        NN = length(I)
        ai = matrix(0, ncol = NN, nrow = nrow(abun))
        c
        for (j in 1:NN) {
          II = which(struc[i, ] == I[j])
          if (length(II) == 1) {
            ai[, j] = abun[, II]
          } else {
            ai[, j] = rowSums(abun[, II])
          }
        }
        pi = sapply(1:NN, function(k) ai[, k]/sum(ai[, k]))
        wi = colSums(ai)/sum(ai)
        W[i - 1] = -sum(wi * log(wi))
        Ai = sapply(1:NN, function(k) -sum(pi[, k][pi[, k] > 0] * log(pi[, k][pi[, k] > 0])))
        A[i - 1] = sum(wi * Ai)
      }
    }
    Diff[1] = (G - A[1])/W[1]
    if (H > 2) {
      for (i in 2:(H - 1)) {
        Diff[i] = (A[i - 1] - A[i])/(W[i] - W[i - 1])
      }
    }
    Diff = Diff
    out = matrix(c(Diff), ncol = 1)
    return(out)
  }

  v1 = c("ecosystem", "region1", "pop1")
  v2 = c("ecosystem", "region1", "pop2")
  str = data.frame(v1, v2)
  str = as.matrix(str)
  npops = fr$npops
  nloci = fr$nloci
  Dmat = list()
  for (l in 1:nloci) {
    Dmat[[l]] = matrix(data = 0, nrow = npops, ncol = npops)
    for (i in 1:npops) {
      for (j in 1:npops) {
        Dmat[[l]][i, j] = DeltaD((af[[l]][, c(i, j)]), str)[2]  ### select two pops from allelefrequency
      }
    }
  }
  pairwiseDav = Reduce("+", Dmat)/length(Dmat)
  colnames(pairwiseDav) = fr$pop_names
  rownames(pairwiseDav) = fr$pop_names
  # library(popbio)
  DeltaDmat = as.dist(pairwiseDav)
return(list(pairwiseDav=pairwiseDav,DeltaDmat=DeltaDmat))
}

diff1=diff1dis(f,ncode)
ape::pcoa
dd2=pcoa(diff1$pairwiseDav, correction="none", rn=rownames(diff1$pairwiseDav))

biplot(dd2,  plot.axes = c(1,2), col=1: length(levels(as.factor(level))), dir.axis1=1, dir.axis2=1, cex=NULL, main="Aggregation plot",xlabs = "PCoA1", ylabs = "PCoA2")
#ordihull(dd2$vectors, level, col=1:4, lwd=3)
ordiellipse(dd2$vectors, level, col=1: length(levels(as.factor(level))), kind = "ehull", lwd=3)
#ordiellipse(dd2$vectors, level, col=1:4, draw="polygon")
ordispider(dd2$vectors, groups =level, display = "sites", col=1: length(levels(as.factor(level))), label=label)
points(dd2$vectors, pch=21, col="purple", bg=c("white"), cex=1.3)

}
xinghuq/HierDpart documentation built on March 21, 2023, 6:43 p.m.