R/dataexport.R

Defines functions gendata.to.genind write.POPDIST write.ATetra write.Tetrasat write.GeneMapper write.SPAGeDi write.GenoDive write.Structure

Documented in gendata.to.genind write.ATetra write.GeneMapper write.GenoDive write.POPDIST write.SPAGeDi write.Structure write.Tetrasat

write.Structure <- function(object, ploidy, file = "",
                            samples = Samples(object), loci = Loci(object),
                            writepopinfo = TRUE, extracols = NULL,
                            missingout = -9){
  if(!all(!is.na(Ploidies(object, samples = samples, loci = loci))))
    stop("Ploidies needed.")
  
  if(missing(ploidy))
    stop("Ploidy of file must be specified seperately from ploidies of dataset.")
  
  structdata <- data.frame(rowlabel=c("missing",rep(samples, each = ploidy)))
  thiscol <- 1 # which column we are working with
  
  # Fill in PopInfo if that option is selected
  if(writepopinfo){
    if(!all(!is.na(PopInfo(object)[samples])))
      stop("PopInfo needed, or set writepopinfo to FALSE")
    thiscol <- thiscol+1
    structdata[[thiscol]] <- c(0,rep(PopInfo(object)[samples], each = ploidy))
    names(structdata)[thiscol] <- "PopInfo"
  }
  
  # Fill in extra columns.  Ignored if extracols=NULL.
  for(excol in dimnames(extracols)[[2]]){
    thiscol <- thiscol+1
    structdata[[thiscol]] <- c(0,rep(extracols[samples, excol], each = ploidy))
    names(structdata)[thiscol] <- excol
  }
  
  # Fill in the genotypes
  for(L in loci){
    thiscol <- thiscol + 1
    alleles <- missingout #set up the vector of alleles
    for(s in samples){
      # Is genotype missing?
      if(isMissing(object,s,L)){
        thesealleles <- rep(missingout, ploidy)
      } else {
        #If missing data must be inserted to reflect a lower ploidy level:
        if(Ploidies(object, s, L) < ploidy){
          if(length(Genotype(object,s,L)) == Ploidies(object, s, L)){
            #fully heterozygous genotype
            thesealleles <- c(Genotype(object, s, L),
                              rep(missingout,
                                  times = ploidy - Ploidies(object, s, L)))
          } else {
            if(length(Genotype(object, s, L)) < Ploidies(object, s, L)){
              #duplicate the first allele to get to the right ploidy
              thesealleles <- c(Genotype(object, s, L),
                                rep(Genotype(object, s, L)[1],
                                    times = Ploidies(object, s, L) -
                                      length(Genotype(object, s, L))),
                                rep(missingout,
                                    times = ploidy - Ploidies(object, s, L)))
            } else {
              #randomly choose alleles to use if there are too many
              thesealleles <- c(sample(Genotype(object, s, L),
                                       Ploidies(object, s, L),replace = FALSE),
                                rep(missingout,
                                    times = ploidy - Ploidies(object, s, L)))
              cat(paste("Randomly removed alleles:", s, L), sep = "\n")
            }
          }
          #If the individual has equal or greater ploidy to that
          #being used in the file:
        } else {
          if(length(Genotype(object, s, L)) == ploidy){
            #genotype fills available spaces
            thesealleles <- Genotype(object, s, L)
          } else {
            if(length(Genotype(object, s, L)) < ploidy){
              #duplicate the first allele to get to the right ploidy
              thesealleles <- c(Genotype(object, s, L),
                                rep(Genotype(object, s, L)[1],
                                    times = ploidy -
                                      length(Genotype(object, s, L))))
            } else {
              #randomly choose alleles to use if there are too many
              thesealleles <- sample(Genotype(object, s, L), ploidy,
                                     replace = FALSE)
              cat(paste("Randomly removed alleles:", s, L), sep = "\n")
            }
          }
        }
      }
      alleles <- c(alleles, thesealleles)
    }
    structdata[[thiscol]] <- alleles
    names(structdata)[thiscol] <- L
  }
  
  # write the two header lines
  if(writepopinfo){
    colstoskip <- 2
  } else {
    colstoskip <- 1
  }
  if(!is.null(extracols)){
    colstoskip <- colstoskip + dim(extracols)[2]
  }
  cat(c(paste(c(rep("", colstoskip), names(structdata)[-(1:colstoskip)]),
              collapse = "\t"),
        paste(c(rep("", colstoskip),
                rep(as.character(missingout), ncol(structdata) - colstoskip)),
              collapse = "\t")),
      sep = "\n", file = file)
  # write the bulk of the data
  write.table(structdata[-1,], file = file, sep = "\t", row.names = FALSE,
              quote = FALSE, col.names = FALSE, append = TRUE)
}

write.GenoDive<-function(object,
                         digits = 2, file = "", samples = Samples(object),
                         loci = Loci(object)){
  if(!all(!is.na(Ploidies(object, samples = samples, loci = loci))))
    stop("Ploidies needed.
              Use estimatePloidy to get max number of alleles.")
  if(!all(!is.na(PopInfo(object)[samples])))
    stop("PopInfo needed.")
  if(!all(!is.na(Usatnts(object)[loci])))
    stop("Usatnts needed.")
  # get some info for the second line of the file
  numsam <- length(samples)
  numloc <- length(loci)
  numpop <- length(unique(PopInfo(object)[samples]))
  maxploidy <- max(Ploidies(object, samples, loci))
  
  # start a character vector to contain all the lines of the file
  lines <- c(Description(object),
             paste(numsam, numpop, numloc, maxploidy, digits, sep = "\t"))
  lines[3:(numpop+2)] <-
    PopNames(object)[sort(unique(PopInfo(object)[samples]))]
  lines[numpop+3]<-paste("Population\tIndividual",
                         paste(loci, sep = "", collapse = "\t"), sep = "\t")
  
  # enter population and sample names
  for(s in 1:numsam){
    lines[numpop+3+s] <- paste(PopInfo(object)[samples[s]],
                               samples[s], sep = "\t")
  }
  
  # process alleles and write them to lines
  for(L in loci){
    # replace missing data with zeros
    repmiss<-function(value){
      if(value[1] == Missing(object)){ 0 } else {value}
    }
    convertedalleles <- mapply(repmiss, Genotypes(object, samples, L),
                               SIMPLIFY = FALSE)
    # convert alleles to repeat number
    divallele <- function(value) floor(value/Usatnts(object)[L])
    convertedalleles <- mapply(divallele, convertedalleles, SIMPLIFY = FALSE)
    # subtract if necessary to get them to the right number of digits
    suballele <- function(value){
      if(value[1] != 0){
        value-(10^(digits-1))
      } else{
        0
      }
    }
    while(max(mapply(max, convertedalleles)) >= 10 ^ digits){
      convertedalleles <- mapply(suballele, convertedalleles,
                                 SIMPLIFY = FALSE)
    }
    # For each sample, concatenate into strings and add to lines
    for(s in 1:numsam){
      # Convert alleles to character and set up string for concatenation
      charalleles <- as.character(convertedalleles[[s]])
      allelestring <- ""
      # For each allele:
      for(ca in charalleles){
        allele <- ca
        # Add zeros if necessary
        while(nchar(allele) < digits){
          allele <- paste("0", allele, sep = "")
        }
        # add this allele to the string
        allelestring <- paste(allelestring, allele, sep="")
      }
      # Add the allele string to the line for that sample.
      lines[numpop+3+s] <- paste(lines[numpop+3+s], allelestring, sep = "\t")
    }
  }
  
  # write the file
  cat(lines, file = file, sep = "\n")
}

write.SPAGeDi<-function(object, samples = Samples(object),
                        loci = Loci(object),
                        allelesep = "/", digits = 2, file = "",
                        spatcoord = data.frame(X = rep(1, length(samples)),
                                               Y = rep(1, length(samples)),
                                               row.names = samples)){
  if(!all(!is.na(Ploidies(object, samples, loci))))
    stop("Ploidies needed.")
  if(!all(!is.na(PopInfo(object)[samples])))
    stop("PopInfo needed.")
  if(!all(!is.na(Usatnts(object)[loci])))
    stop("Usatnts needed.")
  # name the rows of the spatial coordinates frame if not already done
  if(identical(row.names(spatcoord), as.character(1:dim(spatcoord)[1])))
    row.names(spatcoord) < -samples
  
  # set up data frame to contain genotypes
  gentable <- data.frame(Ind = samples,
                         Cat = PopNames(object)[PopInfo(object)[samples]],
                         spatcoord[samples,])
  
  # get genotype data by locus (column)
  for(L in loci){
    genotypesL <- Genotypes(object, samples, L)
    # replace missing data with zeros
    genotypesL[isMissing(object, samples, L)] <- 0
    # divide and subtract to convert to repeats
    divallele <- function(value) floor(value/Usatnts(object)[L])
    genotypesL <- mapply(divallele, genotypesL, SIMPLIFY = FALSE)
    suballele <- function(value){
      if(value[1] != 0){
        value - (10 ^ (digits - 1))
      } else {
        0
      }
    }
    while(max(mapply(max, genotypesL)) >= 10 ^ digits){
      genotypesL <- mapply(suballele, genotypesL, SIMPLIFY = FALSE)
    }
    names(genotypesL) <- samples
    # add zeros up to correct ploidy, delete alleles if necessary
    zerostoadd <- Ploidies(object, samples, L) - mapply(length, genotypesL)
    names(zerostoadd) <- samples
    for(s in samples){
      if(length(genotypesL[[s]]) == 1){
        # replicate allele if totally homozygous
        genotypesL[[s]] <- rep(genotypesL[[s]], Ploidies(object, s, L))
      } else {
        if(zerostoadd[s] < 0){
          # randomly remove alleles if there are too many
          genotypesL[[s]] <- sample(genotypesL[[s]],
                                    Ploidies(object, s, L),
                                    replace=FALSE)
          cat("Alleles randomly removed to get to ploidy:", L, s, "\n")
        } else {
          # add zeros for partial heterozygotes
          genotypesL[[s]] <- c(genotypesL[[s]], rep(0, zerostoadd[s]))
        }
      }
      # also make each allele the right number of digits if necessary
      if(allelesep == ""){
        genotypesL[[s]] <- as.character(genotypesL[[s]])
        for(a in 1:length(genotypesL[[s]])){
          while(nchar(genotypesL[[s]][a]) < digits){
            genotypesL[[s]][a] <- paste(0, genotypesL[[s]][a],
                                        sep = "")
          }
        }
      }
    }
    # concatenate into strings
    genvect <- mapply(paste, genotypesL, collapse = allelesep)
    # add the vector to the data frame
    gentable <- data.frame(gentable, genvect)
    names(gentable)[dim(gentable)[2]] <- L
  }
  # write file
  SpagTemp <- tempfile()
  write.table(gentable, file = SpagTemp, sep = "\t", row.names = FALSE,
              col.names = TRUE, quote = FALSE)
  cat(paste(length(samples), length(unique(PopInfo(object)[samples])),
            dim(spatcoord)[2],
            length(loci), digits, max(Ploidies(object,samples,loci)),
            sep="\t"),
      "0",
      readLines(SpagTemp), "END", sep = "\n", file = file)
}

write.GeneMapper <- function(object, file = "", samples = Samples(object),
                             loci = Loci(object)){
  if(!all(!is.na(Ploidies(object, samples, loci))))
    stop("Ploidies needed to determine number of columns.")
  # figure out how many allele columns are needed
  numallelecol <- max(Ploidies(object, samples, loci))
  # figure out how many rows are needed
  numrows <- length(samples) * length(loci)
  # set up data frame
  gentable <- data.frame(Sample.Name = rep(samples, times = length(loci)),
                         Marker = rep(loci, each = length(samples)))
  # put empty allele columns into data frame
  alcollabels <- paste("Allele.", 1:numallelecol, sep = "")
  for(ac in 1:numallelecol){
    gentable[[ac + 2]] <- rep("", times = numrows)
    names(gentable)[ac + 2] <- alcollabels[ac]
  }
  
  # put alleles into their respective cells
  currentrow <- 1
  for(L in loci){
    for(s in samples){
      thesealleles <- as.character(Genotype(object, s, L))
      while((length(thesealleles) + 2) > dim(gentable)[2])
        gentable <- data.frame(gentable, rep("", numrows),
                               stringsAsFactors = FALSE)
      gentable[currentrow, 3:(length(thesealleles) + 2)] <- thesealleles
      currentrow <- currentrow + 1
    }
  }
  
  # write the table to file
  write.table(gentable, file = file, sep = "\t", quote = FALSE,
              row.names = FALSE, col.names = TRUE)
}

write.Tetrasat <- function(object, samples = Samples(object), loci = Loci(object),
                           file = ""){
  if(!all(!is.na(PopInfo(object)[samples])))
    stop("PopInfo needed.")
  if(!all(!is.na(Usatnts(object)[loci])))
    stop("Usatnts needed.")
  if(!all(Ploidies(object, samples, loci) == 4))
    stop("Ploidy must be 4.")
  
  # a vector of populations to cycle through
  allpops <- unique(PopInfo(object)[samples])
  # set up vector of lines to write to file
  lines <- c(Description(object)[1], loci)
  # where does population/genotype data begin?
  datastart <- length(lines) + 1
  
  # make lines with "Pop" and sample names, and an index of where samples are
  sampleindex <- integer(0)
  currentline <- datastart
  for(pop in allpops){
    lines[currentline] <- "Pop"
    currentline <- currentline + 1
    # make a vector of all samples that belong to this population
    thesesamples <- samples[PopInfo(object)[samples] == pop]
    # make an index of the lines that they will go on
    indextoadd <- currentline:(currentline + length(thesesamples) - 1)
    names(indextoadd) <- thesesamples
    # add this to the sample index
    sampleindex <- c(sampleindex, indextoadd)
    # write each sample name (plus spaces up to 20 characters) to the
    # appropriate line
    for(s in thesesamples){
      samname <- s
      while(nchar(samname) < 20){
        samname <- paste(samname, " ", sep = "")
      }
      lines[currentline] <- samname
      currentline <- currentline + 1
    }
  }
  
  # now go through the loci, convert the genotypes, and fill them in
  for(L in loci){
    # convert alleles to repeat number
    divallele <- function(value){
      if(value[1] == Missing(object)){
        value
      } else {
        floor(value/Usatnts(object)[L])
      }
    }
    convertedalleles <- mapply(divallele, Genotypes(object, samples, L),
                               SIMPLIFY = FALSE)
    names(convertedalleles) <- samples
    # make sure alleles have two digits or fewer
    suballele <- function(value){
      if(value[1] == Missing(object)){
        value
      } else {
        value - 10
      }
    }
    while(max(mapply(max,convertedalleles)) >= 100){
      convertedalleles <- mapply(suballele, convertedalleles,
                                 SIMPLIFY = FALSE)
    }
    # go through the genotypes by sample
    for(s in samples){
      # convert missing data to blank spaces,
      # convert to alleles to characters
      if(convertedalleles[[s]][1] == Missing(object)){
        charalleles <- "         "
      } else {
        charalleles <- as.character(convertedalleles[[s]])
        # duplicate allele if fully homozygous
        if(length(charalleles) == 1){
          charalleles <- rep(charalleles, 4)
        }
        # randomly remove alleles if there are more than 4
        if(length(charalleles) > 4){
          charalleles <- sample(charalleles, 4, replace = FALSE)
          cat(c("Alleles randomly removed:", s, L, "\n"), sep = " ")
        }
      }
      # concatenate all alleles into one string
      allelestring <- ""
      for(ca in charalleles){
        allele <- ca
        # Add zeros if necessary
        while(nchar(allele) < 2){
          allele <- paste("0", allele, sep="")
        }
        # add this allele to the string
        allelestring <- paste(allelestring, allele, sep = "")
      }
      # add spaces up to nine characters
      while(nchar(allelestring) < 9){
        allelestring <- paste(allelestring, " ", sep = "")
      }
      # add the concatenated genotype to the appropriate line
      lines[sampleindex[s]] <- paste(lines[sampleindex[s]], allelestring,
                                     sep = "")
    }
  }
  
  cat(lines, sep = "\n", file = file)
}

write.ATetra<-function(object, samples = Samples(object), loci = Loci(object),
                       file = ""){
  if(!all(!is.na(PopInfo(object)[samples])))
    stop("PopInfo needed.")
  if(!all(Ploidies(object, samples, loci) == 4))
    stop("Ploidy must be 4.")
  
  # set up a character vector to hold the lines for the file
  lines <- paste("TIT", Description(object)[1], sep = ",")
  currentline <- 2 # a variable to say which line to write to next
  # make numbers to go with loci, pops, and samples
  locnums <- 1:length(loci)
  names(locnums) <- loci
  samnums <- 1:length(samples)
  names(samnums) <- samples
  popnames <- PopNames(object)[sort(unique(PopInfo(object)[samples]))]
  popnums <- sort(unique(PopInfo(object)[samples]))
  names(popnums) <- popnames
  
  # fill in data using a loop
  for(L in loci){
    # write a line describing the locus
    lines[currentline] <- paste("LOC", locnums[L], L, sep = ",")
    currentline <- currentline + 1
    for(p in popnames){
      # write a line describing the population
      lines[currentline] <- paste("POP", locnums[L], popnums[p], p, sep = ",")
      currentline <- currentline + 1
      # get a vector of individuals in this population
      thesesamples <- Samples(object, populations = p)
      thesesamples <- thesesamples[thesesamples %in% samples]
      # write lines for individuals
      for(s in thesesamples){
        # first put sample info into the line
        lines[currentline] <- paste("IND", locnums[L], popnums[p],
                                    samnums[s], s, sep = ",")
        # get the alleles and print a warning if there is missing data
        thesealleles<-Genotype(object, s, L)
        if(isMissing(object, s, L)){
          thesealleles <- ""
          cat("Missing data:", s, L, "\n", sep = " ")
        }
        # take a random sample if there are more than 4
        if(length(thesealleles) > 4){
          thesealleles <- sample(thesealleles, 4, replace = FALSE)
          cat("More than 4 alleles:", s, L, "\n", sep = " ")
        }
        # add alleles to the line
        for(a in 1:4){
          if(length(thesealleles) >= a){
            lines[currentline] <- paste(lines[currentline],
                                        thesealleles[a], sep = ",")
          } else {
            lines[currentline] <- paste(lines[currentline], "", sep = ",")
          }
        }
        currentline <- currentline + 1
      }
    }
  }
  
  # write the "END-record"
  lines[currentline] <- "END"
  
  # write the file
  cat(lines, sep = "\n", file = file)
}

write.POPDIST <- function(object, samples = Samples(object),
                          loci = Loci(object), file = ""){
  # error messages
  if(!all(!is.na(PopInfo(object)[samples])))
    stop("PopInfo needed")
  if(!all(!is.na(Ploidies(object,samples,loci))))
    stop("Ploidies needed")
  # start a vector of lines to write to the file
  Lines <- c(Description(object)[1], loci)
  # make a vector to index all the lines containing samples
  samindex <- integer(0)
  
  # add one population at a time
  for(p in unique(PopInfo(object)[samples])){
    Lines <- c(Lines, "Pop")
    # add each individual
    for(s in samples[samples %in% Samples(object, populations=p)]){
      Lines <- c(Lines, paste(PopNames(object)[p], "\t,", sep = ""))
      samindex <- c(samindex, length(Lines))
      names(samindex)[length(samindex)] <- s
    }
    # warning message for mixed ploidy
    if(length(unique(
      Ploidies(object,samples[samples %in% Samples(object,
                                                   populations = p)],
               loci))) > 1)
      warning(paste("Mixed ploidy in population", p,
                    "; POPDIST may reject file."))
  }
  
  # Convert alleles to two digits then add to lines
  for(L in loci){
    thesegen <- Genotypes(object, samples, L)
    if(max(mapply(max, thesegen)) > 99){
      if(!is.na(Usatnts(object)[L]) && Usatnts(object)[L] > 1){
        thesegen <- array(mapply(function(value){
          if(value[1] == Missing(object)){
            value
          } else {
            floor(value/Usatnts(object)[L])
          }
        },
        thesegen, SIMPLIFY = FALSE),
        dim = c(length(samples), 1),
        dimnames=list(samples, L))
      }
      
      while(max(mapply(max, thesegen)) > 99){
        thesegen <- array(mapply(function(value){
          if(value[1] == Missing(object)){
            value
          } else {
            value - 10
          }
        },
        thesegen, SIMPLIFY = FALSE),
        dim = c(length(samples), 1),
        dimnames = list(samples, L))
      }
    }
    
    for(s in samples){
      if(thesegen[[s,L]][1] == Missing(object)){
        alleles <- "00"
      } else {
        alleles <- as.character(thesegen[[s, L]])
      }
      if(length(alleles) > Ploidies(object, s, L)){
        alleles <- sample(alleles, Ploidies(object, s, L))
        warning(paste("Alleles randomly removed:", s, L))
      }
      Lines[samindex[s]] <- paste(Lines[samindex[s]], " ", sep="")
      for(a in 1:Ploidies(object, s, L)){
        if(a > length(alleles)){
          Lines[samindex[s]] <- paste(Lines[samindex[s]], "00", sep = "")
        } else {
          if(nchar(alleles[a]) == 1)
            alleles[a] <- paste("0", alleles[a], sep="")
          Lines[samindex[s]] <- paste(Lines[samindex[s]],
                                      alleles[a], sep = "")
        }
      }
    }
  }
  
  cat(Lines, file = file, sep = "\n")
}

gendata.to.genind <- function(object, samples = Samples(object),
                              loci = Loci(object)){
  # Errors and setup
  if(!is(object, "gendata")) stop("genambig or genbinary object needed.")
  object <- object[samples, loci]
  if(!"ploidysample" %in% class(object@Ploidies)){
    object <- reformatPloidies(object, output = "sample")
  }
  ploidy <- Ploidies(object)
  if(any(is.na(ploidy))) stop("Please specify ploidy.")
  
  # Get genbinary if necessary
  if(is(object, "genambig"))
    object <- genambig.to.genbinary(object)
  
  # export genotypes table
  tab <- Genotypes(object)
  # convert missing data to NA
  tab[tab == Missing(object)] <- NA
  # make column headings usable
  dimnames(tab)[[2]] <- gsub(".", "-", dimnames(tab)[[2]], fixed = TRUE)
  
  # create genind object
  x <- adegenet::genind(tab, pop = PopNames(object)[PopInfo(object)[samples]],
                        ploidy = ploidy, type = "PA")
  return(x)
}

Try the polysat package in your browser

Any scripts or data that you put into this service are public.

polysat documentation built on Aug. 23, 2022, 5:07 p.m.