vaxpack: Provide Fast Population Genetic Analysis For A Single Gene

vaxpack_input <- function () {

  cat ("For this function to work you will need:\n")
  cat ("1 - A folder containing all files to be analysed\n")
  cat ("     These need to be aligned, the same length, and with no gaps! (e.g. with MEGA)\n")
  cat ("     Only accepts bases 'A/a', 'T/t', 'C/c', 'G/g' \n")
  cat ("     Gaps, or bases that are not AaTtCcGg, are called as reference, so accuracy is lost\n")
  cat ("     These replacements will be counted in the results table as \"invalid sites\"\n")
  cat ("\n")
  cat ("     Make a different file for different populations to compare them\n")
  cat ("     e.g. 'Asymptomatic.fasta, Symptomatic.fasta', or 'Brazil.fasta, Peru.fasta'\n")
  cat ("\n")
  cat ("2 - A reference file for the gene you are analysing\n")
  cat ("     (This cannot be in the same folder as the other files!)\n")
  cat ("\n")
  cat ("Accepted extensions are \".fasta\", \".fas\", \".fa\", and \".seq\"\n")
  cat ("\n")
  cat ("Enter the file path to your folder with a \"/\" at the end \n")
  cat ("e.g. \"/users/me/documents/r files/my fasta folder/\"\n")
  path <- readline (" My folder path <- ")
  path <- gsub("\"", "", path)

  if (substr(path, nchar(path), nchar(path)) != "/")
    stop("There needs to be a backslash at the end!")

  cat (" \n")
  cat (" Enter the file path to your reference file\n")
  cat (" e.g. \"/users/me/documents/r files/reference.fasta\"\n")
  ref <- readline (" My ref .fasta file path <- ")
  ref <- gsub("\"", "", ref)

  if (grepl(".fasta", ref, ignore.case = TRUE) == FALSE)
    if (grepl(".fas", ref, ignore.case = TRUE) == FALSE)
      if (grepl(".fa", ref, ignore.case = TRUE) == FALSE)
        if (grepl(".seq", ref, ignore.case = TRUE) == FALSE)
          stop("The reference file extension must be \".fasta\", \".fas\", \".fa\", or \".seq\"")

  cat (" \n")
  genename <- readline (" What is the name of the gene you're analysing? <- ")
  cat (" \n")

  cat (" \n")
  cat (" What is the ploidy of your gene?\n")
  cat (" 1 - Haploid\n")
  cat (" 2 - Diploid\n")
  cat (" 3 - Triploid\n")
  cat (" 4 - Tetraploid\n")
  cat (" 5 - Pentapoid\n")

  ploidycount <- readline (" My gene's ploidy <- ")
  ploidycount <- as.numeric(ploidycount)
  if (ploidycount == 1) ploidycount <- 1
  if (ploidycount == 2) ploidycount <- 4
  if (ploidycount == 3) ploidycount <- 8
  if (ploidycount == 4) ploidycount <- 16
  if (ploidycount == 5) ploidycount <- 32


  #vaxpack
  #Starting the timer---------------------------------------------------------------------------------
  st <- Sys.time()
  #Counting the number of files, setting up results table---------------------------------------------
  fasta.locs <- list.files(path, full.names = TRUE)
  fasta.names <- gsub("\\.[aA-zZ]*", "", list.files(path))
  fasta.num <- length(fasta.locs)

  cat (" Completing Step: 1 of", (fasta.num * 7) + 16, "\r")

  ndrt <- matrix(nrow = fasta.num + 1, ncol = 11)
  rownames(ndrt) <- c(fasta.names, "Total")
  colnames(ndrt) <- c("n", "Seq. Length", "Invalid Sites", "Seg. sites", "SNPS", "\u03a0 x 10^3", "TD",
                      "NS", "SP", "nuc. h", "nuc. Hd")

  #Setting up reference as a matrix-------------------------------------------------------------------
  cat (" Completing Step: 2 of", (fasta.num * 7) + 16, "\r")
  ref.string <- toupper(as.matrix(paste(readLines(ref)[-1], collapse = "")))
  seq.length <- sum(nchar(ref.string))
  ref.matrix <- matrix(nrow = 1, ncol = seq.length)
  totalpop <- matrix(ncol = seq.length)
  for (i in 1:seq.length){ref.matrix[1,i] <- substr(ref.string, i, i)}
  #Making amino acid converter------------------------------------------------------------------------
  cat (" Completing Step: 3 of", (fasta.num * 7) + 16, "\r")
  aalookup <-matrix(c("ATT", "I", "ATC", "I", "ATA", "I",
                      "CTT", "L", "CTC", "L", "CTA", "L", "CTG", "L", "TTA", "L", "TTG","L",
                      "GTT", "V", "GTC", "V", "GTA", "V", "GTG", "V",
                      "TTT", "F", "TTC", "F",
                      "ATG", "M",
                      "TGT", "C", "TGC", "C",
                      "GCT", "A", "GCC", "A", "GCA", "A", "GCG", "A",
                      "GGT", "G", "GGC", "G", "GGA", "G", "GGG", "G",
                      "CCT", "P", "CCC", "P", "CCA", "P", "CCG", "P",
                      "ACT", "T", "ACC", "T", "ACA", "T", "ACG", "T",
                      "TCT", "S", "TCC", "S", "TCA", "S", "TCG", "S", "AGT", "S", "AGC", "S",
                      "TAT", "Y", "TAC", "Y",
                      "TGG", "W",
                      "CAA", "Q", "CAG", "Q",
                      "AAT", "N", "AAC", "N",
                      "CAT", "H", "CAC", "H",
                      "GAA", "E", "GAG", "E",
                      "GAT", "D", "GAC", "D",
                      "AAA", "K", "AAG", "K",
                      "CGT", "R", "CGC", "R", "CGA", "R", "CGG", "R", "AGA", "R", "AGG", "R",
                      "TAA", "STOP", "TAG", "STOP", "TGA", "STOP"), nrow = 64, ncol = 2, byrow = TRUE)
  #Starting loop / checking length of sequences---------------------------------------------------------
  loop.count <- 1
  invalid.site.counter.total <- 0
  for (i in 1:fasta.num){
    cat (" Completing Step:", loop.count + 3, "of", (fasta.num * 7) + 16, "\r")
    onepop <- toupper(as.matrix(readLines(fasta.locs[i])[c(FALSE, TRUE)]))
    rownames(onepop) <- readLines(fasta.locs[i])[c(TRUE, FALSE)]
    seq.count <- nrow(onepop)
    if (length(unique(nchar(onepop))) != 1)
      stop("All sequences must be the same length!")
    if ((unique(nchar(onepop))) != seq.length)
      stop("Reference is not the same length as your sequences!")
    #Splitting sequences from each file into matrix format-----------------------------------------------
    cat (" Completing Step:", loop.count + 4, "of", (fasta.num * 7) + 16, "\r")
    invalid.site.counter <- 0
    if (seq.count > 1){
      wholegene <- matrix(nrow = seq.count, ncol = seq.length)
      n.change.matrix <- wholegene
      for (j in 1:seq.count){
        splitgene <- strsplit(onepop[j,1], split = "")[[1]]
        for (k in 1:seq.length){
          if (splitgene[k] %in% c("A", "T", "C", "G", "a", "t", "c", "g") == TRUE)
            wholegene[j,k] <- splitgene[k]
          if (splitgene[k] %in% c("A", "T", "C", "G", "a", "t", "c", "g") == FALSE)
            {wholegene[j,k] <- ref.matrix[1,k]
            invalid.site.counter <- invalid.site.counter + 1}
        }
      }
    #Comparing all rows for SNPs---------------------------------------------------------------------
      cat (" Completing Step:", loop.count + 5, "of", (fasta.num * 7) + 16, "\r")
      for (j in 1:(seq.count-1)){
        n.change.matrix[j,] <- colSums(sweep(wholegene[(j):(seq.count),], 2, wholegene[(j),], FUN = `!=`,
                                             check.margin = FALSE) + 0)}
      n.change.matrix[seq.count,] <- ( wholegene[seq.count,] != wholegene[seq.count-1,] ) + 0
      n.change.matrix <- n.change.matrix*2
      snps <- colSums(n.change.matrix)
      seg.sites <- matrix()
      for (j in 1:seq.length){
        if (snps[j] != 0) {seg.sites <- cbind(seg.sites, as.matrix(paste(j)))}
      }
      seg.sites <- as.numeric(seg.sites[,-1])
      seg.sites.num <- length(seg.sites)
  #Counting synonomous and non-synonomous snps-----------------------------------------------------
      cat (" Completing Step:", loop.count + 6, "of", (fasta.num * 7) + 16, "\r")
      single.snp.count <- 0
      single.syn.count <- 0
      single.nsyn.count <- 0
      for (j in 1:seg.sites.num){
        snp.site <- seg.sites[j]
        
        if (snp.site %% 3 == 1){
          refAA <- aalookup[match(paste(ref.matrix[ ,snp.site],
                                            ref.matrix[ ,snp.site+1],
                                            ref.matrix[ ,snp.site+2], sep = ""), aalookup),2]
          
          possible.AA.number <- length(
            unique(c(aalookup[match(unique(paste(
              wholegene[ ,snp.site],
              ref.matrix[ ,snp.site+1],
              ref.matrix[ ,snp.site+2], sep = "")), aalookup),2],
              refAA)))
          }

        if (snp.site %% 3 == 2){
          refAA <- aalookup[match(paste(ref.matrix[ ,snp.site-1],
                                            ref.matrix[ ,snp.site],
                                            ref.matrix[ ,snp.site+1], sep = ""), aalookup),2]
          
          possible.AA.number <- length(
            unique(c(aalookup[match(unique(paste(
              ref.matrix[ ,snp.site-1],
              wholegene[ ,snp.site],
              ref.matrix[ ,snp.site+1], sep = "")), aalookup),2],
              refAA)))
          }

        if (snp.site %% 3 == 0){
          refAA <- aalookup[match(paste(ref.matrix[ ,snp.site-2],
                                            ref.matrix[ ,snp.site-1],
                                            ref.matrix[ ,snp.site], sep = ""), aalookup),2]
          
          possible.AA.number <- length(
            unique(c(aalookup[match(unique(paste(
              ref.matrix[ ,snp.site-2],
              ref.matrix[ ,snp.site-1],
              wholegene[ ,snp.site], sep = "")), aalookup),2],
              refAA)))
        }
        
        possible.nuc.number <- length(unique(c(wholegene[,snp.site], ref.matrix[,snp.site])))
        
        single.snp.count  <- single.snp.count  + (possible.nuc.number - 1)
        single.nsyn.count <- single.nsyn.count + (possible.AA.number  - 1)
        single.syn.count  <- single.syn.count  + (possible.nuc.number - possible.AA.number)

      }
    #Counting haplotypes--------------------------------------------------------------------------------
      cat (" Completing Step:", loop.count + 7, "of", (fasta.num * 7) + 16, "\r")
      hap.nucs <- unique(onepop[ ,1])
      hap.count <- length(hap.nucs)
      hap.lookup <- cbind(hap.nucs, c(1:hap.count))
      hap.table <- match(onepop[ ,1], hap.lookup)
      hap.freq <- matrix(ncol = 2, nrow = hap.count)
      hap.freq[ ,1] <- c(1:hap.count)
      for (j in 1:hap.count){
        n.hap.counter <- 0
        for (k in 1:seq.count){
          if (hap.table[k] == j) {n.hap.counter <- n.hap.counter + 1}
        }
        hap.freq[j,2] <- n.hap.counter
      }
     Hd <- (seq.count/(seq.count-1)) * (1-(sum(((hap.freq[,2])/seq.count)^2)))
    #Calculating nucleotide diversity, Tajima's D--------------------------------------------------------
      cat (" Completing Step:", loop.count + 8, "of", (fasta.num * 7) + 16, "\r")
      pi.per.nuc <- ((colSums(n.change.matrix) /  (seq.count)^2) / seq.length) * (seq.count/(seq.count-1))
      tpi <- sum(colSums(n.change.matrix)) /  (seq.count) / seq.count  * (seq.count/(seq.count-1))
      pi <- sum(pi.per.nuc)
      aone <- 0
      for (j in 1:(seq.count-1)){aone <- aone + 1/j}
      atwo <- 0
      for (j in 1:(seq.count-1)){atwo <- atwo + 1/(j^2)}
      bone <- (seq.count + 1) / (3*(seq.count - 1))
      btwo <- (2*(seq.count^2 + seq.count + 3)) / (9*seq.count*(seq.count-1))
      cone <- bone - (1/aone)
      ctwo <- btwo - ((seq.count+2)/(aone*seq.count)) + (atwo/(aone^2))
      eone <- cone / aone
      etwo <- ctwo / ((aone^2) + atwo)
      td <- ((tpi) -  (seg.sites.num / aone)) /
        (sqrt(  (eone*seg.sites.num)  + ( (etwo*seg.sites.num) * (seg.sites.num-1) ) ))
    #Entering Table--------------------------------------------------------------------------------------
      cat (" Completing Step:", loop.count + 9, "of", (fasta.num * 7) + 16, "\r")
      ndrt[i,1]  <- seq.count
      ndrt[i,2]  <- seq.length
      ndrt[i,3]  <- invalid.site.counter
      ndrt[i,4]  <- seg.sites.num
      ndrt[i,5]  <- single.snp.count
      ndrt[i,6]  <- round(pi*1000, 3)
      ndrt[i,7]  <- round(td, 3)
      ndrt[i,8]  <- single.nsyn.count
      ndrt[i,9]  <- single.syn.count
      ndrt[i,10]  <- hap.count
      ndrt[i,11] <- round(Hd, 3)}
    #Table in case of only one sequence (no comparisons)------------------------------------------------
    if (seq.count == 1){
      ndrt[i,1]  <- seq.count
      ndrt[i,2]  <- seq.length
      ndrt[i,3]  <- invalid.site.counter
      ndrt[i,4]  <- 0
      ndrt[i,5]  <- 0
      ndrt[i,6]  <- 0
      ndrt[i,7]  <- 0
      ndrt[i,8]  <- 0
      ndrt[i,9]  <- 0
      ndrt[i,10] <- 1
      ndrt[i,11] <- 0
    }
    
    #Still inside loop, gathering sequences from each loop as totalpop---------------------------------
    totalpop <- rbind(totalpop, wholegene)
    loop.count <- loop.count + 7
    invalid.site.counter.total <- invalid.site.counter.total + invalid.site.counter
  }
  #Outside loop now, removing reference from top row--------------------------------------------------
  totalpop <- totalpop[-1,]
  seq.count <- dim(totalpop)[1]
  seq.length <- dim(totalpop)[2]
  #Recalculating SNPs from whole data set-----------------------------------------------------------
  cat (" Completing Step:", (fasta.num * 7) + 4, "of", (fasta.num * 7) + 16, "\r")
  n.change.matrix <- matrix(nrow = seq.count, ncol = seq.length)
  for (j in 1:(seq.count-1)){
    n.change.matrix[j,] <- colSums(sweep(totalpop[(j):(seq.count),], 2, totalpop[(j),], FUN = `!=`,
                                         check.margin = FALSE) + 0)}
  n.change.matrix[seq.count,] <- ( totalpop[seq.count,] != totalpop[seq.count-1,] ) + 0
  n.change.matrix <- n.change.matrix*2
  snps <- colSums(n.change.matrix)
  seg.sites <- matrix()
  for (j in 1:seq.length){
    if (snps[j] != 0) {seg.sites <- cbind(seg.sites, as.matrix(paste(j)))}
  }
  seg.sites <- as.numeric(seg.sites[,-1])
  seg.sites.num <- length(seg.sites)
  seg.codons <- unique(ceiling(seg.sites/3))
  seg.codons.num <- length(seg.codons)
  globalsnps <- snps
  #SNPs for all sequences-------------------------------------------------------------------------
  cat (" Completing Step:", (fasta.num * 7) + 5, "of", (fasta.num * 7) + 16, "\r")
  total.snp.count <- 0
  total.syn.count <- 0
  total.nsyn.count <- 0
  for (j in 1:seg.sites.num){
    snp.site <- seg.sites[j]
    
    if (snp.site %% 3 == 1){
      refAA <- aalookup[match(paste(ref.matrix[ ,snp.site],
                                    ref.matrix[ ,snp.site+1],
                                    ref.matrix[ ,snp.site+2], sep = ""), aalookup),2]
      
      possible.AA.number <- length(
        unique(c(aalookup[match(unique(paste(
          totalpop[ ,snp.site],
          ref.matrix[ ,snp.site+1],
          ref.matrix[ ,snp.site+2], sep = "")), aalookup),2],
          refAA)))
    }
    
    if (snp.site %% 3 == 2){
      refAA <- aalookup[match(paste(ref.matrix[ ,snp.site-1],
                                    ref.matrix[ ,snp.site],
                                    ref.matrix[ ,snp.site+1], sep = ""), aalookup),2]
      
      possible.AA.number <- length(
        unique(c(aalookup[match(unique(paste(
          ref.matrix[ ,snp.site-1],
          totalpop[ ,snp.site],
          ref.matrix[ ,snp.site+1], sep = "")), aalookup),2],
          refAA)))
    }
    
    if (snp.site %% 3 == 0){
      refAA <- aalookup[match(paste(ref.matrix[ ,snp.site-2],
                                    ref.matrix[ ,snp.site-1],
                                    ref.matrix[ ,snp.site], sep = ""), aalookup),2]
      
      possible.AA.number <- length(
        unique(c(aalookup[match(unique(paste(
          ref.matrix[ ,snp.site-2],
          ref.matrix[ ,snp.site-1],
          totalpop[ ,snp.site], sep = "")), aalookup),2],
          refAA)))
    }
    
    possible.nuc.number <- length(unique(c(totalpop[,snp.site], ref.matrix[,snp.site])))
    
    total.snp.count  <- total.snp.count  + (possible.nuc.number - 1)
    total.nsyn.count <- total.nsyn.count + (possible.AA.number  - 1)
    total.syn.count  <- total.syn.count  + (possible.nuc.number - possible.AA.number)
  }
  #Total haplotypes--------------------------------------------------------------------------------
  cat (" Completing Step:", (fasta.num * 7) + 6, "of", (fasta.num * 7) + 16, "\r")
  tot.pop.string <- matrix(nrow = seq.count)
  for (i in 1:seq.count){tot.pop.string[i,1] <- paste(totalpop[i,], collapse = "")}
  hap.nucs <- unique(tot.pop.string[ ,1])
  hap.count <- length(hap.nucs)
  #Total Haplotype diversity-------------------------------------------------------------------------------
  cat (" Completing Step:", (fasta.num * 7) + 7, "of", (fasta.num * 7) + 16, "\r")
  hap.lookup <- cbind(hap.nucs, c(1:hap.count))
  hap.table <- match(tot.pop.string[ ,1], hap.lookup)
  hap.freq <- matrix(ncol = 2, nrow = hap.count)
  hap.freq[ ,1] <- c(1:hap.count)
  for (j in 1:hap.count){
    n.hap.counter <- 0
    for (k in 1:seq.count){
      if (hap.table[k] == j) {n.hap.counter <- n.hap.counter + 1}}
    hap.freq[j,2] <- n.hap.counter}
  Hd <- (seq.count/(seq.count-1)) * (1-(sum(((hap.freq[,2])/seq.count)^2)))
  #Total nucleotide diversity and Tajima's D-----------------------------------------------------------
  cat (" Completing Step:", (fasta.num * 7) + 8, "of", (fasta.num * 7) + 16, "\r")
  pi.per.nuc <- ((colSums(n.change.matrix) /  (seq.count)^2) / seq.length) * (seq.count/(seq.count-1))
  tpi <- sum(colSums(n.change.matrix)) /  (seq.count) / seq.count  * (seq.count/(seq.count-1))
  pi <- sum(pi.per.nuc)
  aone <- 0
  for (j in 1:(seq.count-1)){aone <- aone + 1/j}
  atwo <- 0
  for (j in 1:(seq.count-1)){atwo <- atwo + 1/(j^2)}
  bone <- (seq.count + 1) / (3*(seq.count - 1))
  btwo <- (2*(seq.count^2 + seq.count + 3)) / (9*seq.count*(seq.count-1))
  cone <- bone - (1/aone)
  ctwo <- btwo - ((seq.count+2)/(aone*seq.count)) + (atwo/(aone^2))
  eone <- cone / aone
  etwo <- ctwo / ((aone^2) + atwo)
  td <- ((tpi) -  (seg.sites.num / aone)) /
    (sqrt(  (eone*seg.sites.num)  + ( (etwo*seg.sites.num) * (seg.sites.num-1) ) ))
  #Adding to total row of results table-----------------------------------------------------------------
  cat (" Completing Step:", (fasta.num * 7) + 9, "of", (fasta.num * 7) + 16, "\r")
  ndrt[fasta.num + 1, 1]  <- seq.count
  ndrt[fasta.num + 1, 2]  <- seq.length
  ndrt[fasta.num + 1, 3]  <- invalid.site.counter.total
  ndrt[fasta.num + 1, 4]  <- seg.sites.num
  ndrt[fasta.num + 1, 5]  <- total.snp.count
  ndrt[fasta.num + 1, 6]  <- round(pi*1000, 3)
  ndrt[fasta.num + 1, 7]  <- round(td, 3)
  ndrt[fasta.num + 1, 8]  <- total.nsyn.count
  ndrt[fasta.num + 1, 9]  <- total.syn.count
  ndrt[fasta.num + 1, 10] <- hap.count
  ndrt[fasta.num + 1, 11] <- round(Hd, 3)
  #Minor alleles freq for amino acids and then nucleotides----------------------------------------------
  cat (" Completing Step:", (fasta.num * 7) + 10, "of", (fasta.num * 7) + 16, "\r")
  AAs.each.variable.codon <- matrix(nrow = seq.count, ncol = seg.codons.num)
  colnames(AAs.each.variable.codon) <- seg.codons
  ref.each.variable.codon <- matrix(nrow = 1, ncol = seg.codons.num)
  colnames(ref.each.variable.codon) <- seg.codons
  max.AA.variants <- 0
  for (i in 1:seg.codons.num){
    codon.num <- seg.codons[i]
    refAA <- aalookup[match(paste(ref.matrix[ ,(codon.num*3)-2],
                                  ref.matrix[ ,(codon.num*3)-1],
                                  ref.matrix[ ,(codon.num*3)], sep = ""), aalookup),2]
    ref.each.variable.codon[ ,i] <- refAA
    possible.AA <- aalookup[match(paste(totalpop[ ,(codon.num*3)-2],
                                        totalpop[ ,(codon.num*3)-1],
                                        totalpop[ ,(codon.num*3)], sep = ""), aalookup),2]
    AAs.each.variable.codon[ ,i] <- possible.AA
    possible.AA.number <- length(unique(c(refAA, possible.AA)))
    if (possible.AA.number > max.AA.variants) max.AA.variants <- possible.AA.number}
  #---------------------------------------------------------------------------------------------------
  cat (" Completing Step:", (fasta.num * 7) + 11, "of", (fasta.num * 7) + 16, "\r")
  did.AA.change <- colSums(sweep(AAs.each.variable.codon, 2, ref.each.variable.codon, FUN = `!=`) + 0) / seq.count
  aa.variant.table <- matrix(nrow = max.AA.variants + 1, ncol = seg.codons.num)
  colnames(aa.variant.table) <- seg.codons
  aa.variant.table[1, ] <- did.AA.change
  #---------------------------------------------------------------------------------------------------
  cat (" Completing Step:", (fasta.num * 7) + 12, "of", (fasta.num * 7) + 16, "\r")
  maf.AA.table <- matrix(nrow = 22, ncol = dim(aa.variant.table)[2])
  aalist <- c("I", "L", "V", "F", "M", "C", "A", "G", "P", "T", "S", "Y", "W", "Q", "N", "H", "E", "D", "K", "R", "STOP")
  rownames(maf.AA.table) <- c(aalist, "REF")
  colnames(maf.AA.table) <- seg.codons
  #---------------------------------------------------------------------------------------------------
  cat (" Completing Step:", (fasta.num * 7) + 13, "of", (fasta.num * 7) + 16, "\r")
  seg.codons.ref.variants.matrix <- rbind(ref.each.variable.codon, AAs.each.variable.codon)
  for (i in 1:seg.codons.num){
    for (j in 1:max.AA.variants){
      possible.AA <- unique(seg.codons.ref.variants.matrix[ ,i])
      length(possible.AA) <- max.AA.variants
      aa.variant.table[j+1,i] <- sum(sweep(as.matrix(AAs.each.variable.codon[ ,i]), 2, possible.AA[j], FUN = `==`) + 0)
      maf.AA.table[match(possible.AA[j], aalist),i] <- round(((aa.variant.table[j+1,i]) * 100 / seq.count), 3)
    }
    maf.AA.table[22,i] <- possible.AA[1]
  }
  maf.AA.table[is.na(maf.AA.table)] <- 0
  #---------------------------------------------------------------------------------------------------
  cat (" Completing Step:", (fasta.num * 7) + 14, "of", (fasta.num * 7) + 16, "\r")
  maf.nuc.table <- matrix(nrow = 5, ncol = seg.sites.num)
  possible.nucs <- c("A", "T", "G", "C")
  for (i in 1:seg.sites.num){
    for (j in 1:4){
      nuc.counter <- sum(sweep(
        as.matrix(totalpop[ ,seg.sites[i]]), 2, possible.nucs[j], FUN = `==`) + 0)
      maf.nuc.table[j,i] <- round(((nuc.counter * 100) / seq.count), 3)
    }
    maf.nuc.table[5,i] <- ref.matrix[seg.sites[i]]
  }
  maf.nuc.table[is.na(maf.nuc.table)] <- 0
  colnames(maf.nuc.table) <- seg.sites
  rownames(maf.nuc.table) <- c(possible.nucs, "REF")

#Tajima's D values for graphing (significance thresholds-----------------------------------------------
  cat (" Completing Step:", (fasta.num * 7) + 15, "of", (fasta.num * 7) + 16, "\r")
td.sim <- matrix(ncol = 9, nrow = 73)

td.sim[ ,1] <- c(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
                 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
                 49, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 110, 120, 130, 140, 150, 175, 200, 250,
                 300, 350, 400, 450, 500, 600, 800, 1000)
td.sim[ ,2] <- c(-0.876, -1.255, -1.405, -1.498, -1.522, -1.553, -1.559, -1.572, -1.573, -1.58, -1.58,
                 -1.584, -1.583, -1.585, -1.584, -1.585, -1.584, -1.585, -1.584, -1.584, -1.583, -1.583,
                 -1.582, -1.582, -1.581, -1.581, -1.58, -1.58, -1.579, -1.579, -1.578, -1.578, -1.577,
                 -1.577, -1.576, -1.576, -1.575, -1.575, -1.574, -1.574, -1.573, -1.573, -1.572, -1.572,
                 -1.571, -1.571, -1.57, -1.568, -1.566, -1.565, -1.563, -1.561, -1.56, -1.559, -1.557,
                 -1.556, -1.555, -1.552, -1.55, -1.549, -1.547, -1.545, -1.542, -1.539, -1.534, -1.53,
                 -1.526, -1.523, -1.521, -1.519, -1.515, -1.51, -1.505)
td.sim[ ,3] <- c(2.081, 1.737, 1.786, 1.728, 1.736, 1.715, 1.719, 1.71, 1.713, 1.708, 1.71, 1.708, 1.709,
                 1.708, 1.709, 1.708, 1.71, 1.709, 1.711, 1.71, 1.712, 1.712, 1.712, 1.712, 1.713, 1.714,
                 1.714, 1.714, 1.715, 1.716, 1.716, 1.717, 1.717, 1.717, 1.718, 1.718, 1.719, 1.719, 1.72,
                 1.72, 1.721, 1.721, 1.721, 1.722, 1.722, 1.722, 1.723, 1.724, 1.726, 1.727, 1.729, 1.73,
                 1.731, 1.732, 1.733, 1.734, 1.735, 1.737, 1.739, 1.74, 1.741, 1.743, 1.746, 1.748, 1.752,
                 1.755, 1.757, 1.759, 1.761, 1.763, 1.765, 1.769, 1.772)
td.sim[ ,4] <- c(-0.876, -1.269, -1.478, -1.608, -1.663, -1.713, -1.733, -1.757, -1.765, -1.779, -1.783,
                 -1.791, -1.793, -1.798, -1.799, -1.802, -1.803, -1.805, -1.804, -1.806, -1.806, -1.807,
                 -1.807, -1.807, -1.807, -1.807, -1.807, -1.807, -1.806, -1.806, -1.806, -1.806, -1.805,
                 -1.805, -1.804, -1.804, -1.804, -1.803, -1.803, -1.803, -1.802, -1.802, -1.801, -1.801,
                 -1.8, -1.8, -1.8, -1.797, -1.795, -1.793, -1.791, -1.79, -1.788, -1.786, -1.784, -1.783,
                 -1.781, -1.779, -1.776, -1.774, -1.771, -1.769, -1.765, -1.76,-1.754, -1.748, -1.744,
                 -1.74, -1.737, -1.734, -1.728, -1.721, -1.715)
td.sim[ ,5] <- c(2.232, 1.834, 1.999, 1.932, 1.975, 1.954, 1.975, 1.966, 1.979, 1.976, 1.985, 1.984, 1.99,
                 1.99, 1.996, 1.996, 2.001, 2.001, 2.005, 2.006, 2.009, 2.01, 2.013, 2.014, 2.017, 2.018,
                 2.02, 2.021, 2.023, 2.024, 2.026, 2.027, 2.029, 2.03, 2.031, 2.032, 2.033, 2.034, 2.036,
                 2.037, 2.038, 2.039, 2.04, 2.041, 2.042, 2.042, 2.044, 2.048, 2.052, 2.055, 2.058, 2.061,
                 2.064, 2.066, 2.069, 2.071, 2.073, 2.077, 2.08, 2.084, 2.086, 2.089, 2.095, 2.1, 2.107,
                 2.114, 2.119, 2.123, 2.127, 2.13, 2.135, 2.143, 2.15)
td.sim[ ,6] <- c(-0.876, -1.275, -1.54, -1.721, -1.83, -1.916, -1.967, -2.014, -2.041, -2.069, -2.085, -
                   2.103, -2.113, -2.126, -2.132, -2.141, -2.146, -2.152, -2.153, -2.16, -2.162, -2.165,
                 -2.167, -2.17, -2.171, -2.173, -2.173, -2.175, -2.175, -2.177, -2.177, -2.178, -2.178,
                 -2.179, -2.178, -2.179, -2.179, -2.179, -2.179, -2.179, -2.179, -2.179, -2.179, -2.179,
                 -2.178, -2.178, -2.178, -2.177, -2.175, -2.173, -2.171, -2.17, -2.168, -2.166, -2.164,
                 -2.162, -2.16, -2.157, -2.153, -2.15, -2.147, -2.144, -2.138, -2.132, -2.122, -2.114,
                 -2.107, -2.101, -2.096, -2.092, -2.084, -2.072, -2.062)
td.sim[ ,7] <- c(2.324, 1.901, 2.255, 2.185, 2.313, 2.296, 2.362, 2.359, 2.401, 2.403, 2.432, 2.436,
                 2.457, 2.461, 2.478, 2.483, 2.496, 2.501, 2.512, 2.516, 2.526, 2.53, 2.538, 2.542, 2.549,
                 2.553, 2.559, 2.563, 2.569, 2.572, 2.577, 2.58, 2.585, 2.588, 2.592, 2.595, 2.599, 2.601,
                 2.605, 2.608, 2.611, 2.613, 2.617, 2.619, 2.622, 2.624, 2.627, 2.638, 2.649, 2.658, 2.666,
                 2.673, 2.681, 2.687, 2.693, 2.699, 2.704, 2.713, 2.722, 2.73, 2.736, 2.743, 2.757, 2.768,
                 2.787, 2.802, 2.814, 2.824, 2.833, 2.84, 2.853, 2.873, 2.887)
td.sim[ ,8] <- c(-0.876, -1.276, -1.556, -1.761, -1.909, -2.023, -2.105, -2.174, -2.223, -2.267, -2.299,
                 -2.329, -2.35, -2.372, -2.387, -2.403, -2.414, -2.426, -2.434, -2.443, -2.449, -2.457,
                 -2.461, -2.467, -2.471, -2.475, -2.478, -2.482, -2.484, -2.487, -2.489, -2.492, -2.493,
                 -2.495, -2.496, -2.498, -2.499, -2.5, -2.501, -2.502, -2.502, -2.503, -2.504, -2.504,
                 -2.505, -2.505, -2.505, -2.506, -2.506, -2.506, -2.505, -2.504, -2.502, -2.5, -2.499,
                 -2.497, -2.495, -2.492, -2.488, -2.484, -2.481, -2.477, -2.47, -2.462, -2.449, -2.439,
                 -2.43, -2.422, -2.415, -2.409, -2.398, -2.382, -2.369)
td.sim[ ,9] <- c(2.336, 1.913, 2.373, 2.311, 2.524, 2.519, 2.64, 2.649, 2.729, 2.741, 2.798, 2.811,
                 2.854, 2.866, 2.9, 2.911, 2.939, 2.95, 2.973, 2.983, 3.002, 3.011, 3.029, 3.037, 3.052,
                 3.06, 3.073, 3.08, 3.092, 3.099, 3.11, 3.116, 3.126, 3.132, 3.141, 3.147, 3.155, 3.16,
                 3.168, 3.173, 3.18, 3.185, 3.191, 3.196, 3.202, 3.207, 3.212, 3.235, 3.256, 3.274, 3.291,
                 3.306, 3.32, 3.333, 3.345, 3.355, 3.366, 3.385, 3.401, 3.416, 3.43, 3.443, 3.47, 3.492,
                 3.529, 3.558, 3.581, 3.6, 3.617, 3.632, 3.657, 3.694, 3.722)
colnames(td.sim) <- c("n", 0.1, 0.1, 0.05, 0.05, 0.01, 0.01, 0.001, 0.001)

#Gathering data into tables---------------------------------------------------------------------
cat (" Completing Step:", (fasta.num * 7) + 16, "of", (fasta.num * 7) + 16, "\r")
  vp.AA.VARIANT.TABLE <- aa.variant.table
  vp.AAs.EACH.VARIABLE.CODON <- AAs.each.variable.codon
  vp.REF.EACH.VARIABLE.CODON <- ref.each.variable.codon
  vp.BASIC.RESULTS <- ndrt
  vp.MAF.AA.TABLE <- maf.AA.table
  vp.MAF.NUC.TABLE <- maf.nuc.table
  vp.GLOBAL.SNPS <- globalsnps
  vp.SEQ.NUM <- seq.count
  vp.SEQ.LENGTH <- seq.length
  vp.GENE.NAME <- genename
  vp.SEG.CODONS <- seg.codons
  vp.POP.NUM <- fasta.num
  vp.MAX.AA.VARIANTS <- max.AA.variants
  vp.A1 <- aone
  vp.E1 <- eone
  vp.E2 <- etwo
  vp.TD.SIM <- td.sim
  vp.ALL.SEQUENCES.MATRIX <- totalpop
  vp.REF.MATRIX <- ref.matrix
  vp.SEG.SITES <- seg.sites

  vp.data <- list()
  vp.data <<- list(vp.AA.VARIANT.TABLE, vp.AAs.EACH.VARIABLE.CODON, vp.REF.EACH.VARIABLE.CODON,
                   vp.BASIC.RESULTS, vp.MAF.NUC.TABLE, vp.MAF.AA.TABLE, vp.GLOBAL.SNPS, vp.SEQ.NUM, vp.SEQ.LENGTH,
                   vp.GENE.NAME, vp.SEG.CODONS, vp.POP.NUM, vp.MAX.AA.VARIANTS, vp.A1, vp.E1, vp.E2, vp.TD.SIM,
                   vp.ALL.SEQUENCES.MATRIX, vp.REF.MATRIX, vp.SEG.SITES)

  cat ("Completed! Step:", (fasta.num * 7) + 16, "of", (fasta.num * 7) + 16, "\r")
  cat ("\n")
  et <- Sys.time()
  cat("vaxpack_input() took", round(et - st, digits = 2), units.difftime(et - st), "\n")
  cat ("Now use vaxpack_output() to get your results! \n")
}
BarryLab01/vaxpack documentation built on Feb. 8, 2021, 3:19 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
BarryLab01/vaxpack
Provide Fast Population Genetic Analysis For A Single Gene

R/vaxpack_input.R
In BarryLab01/vaxpack: Provide Fast Population Genetic Analysis For A Single Gene

R Package Documentation

Browse R Packages

We want your feedback!

BarryLab01/vaxpack Provide Fast Population Genetic Analysis For A Single Gene

R/vaxpack_input.R In BarryLab01/vaxpack: Provide Fast Population Genetic Analysis For A Single Gene

R Package Documentation

Browse R Packages

We want your feedback!

BarryLab01/vaxpack
Provide Fast Population Genetic Analysis For A Single Gene

R/vaxpack_input.R
In BarryLab01/vaxpack: Provide Fast Population Genetic Analysis For A Single Gene