R/data_export.R
In polyRAD: Genotype Calling with Uncertainty from Sequencing Data in Polyploids and Diploids

Documented in Export_adegenet_genind ExportGAPIT Export_GWASpoly Export_MAPpoly Export_polymapR Export_polymapR_probs Export_rrBLUP_Amat Export_rrBLUP_GWAS Export_Structure Export_TASSEL_Numeric RADdata2VCF

# functions to export data into formats for use by other software

.chromosome_to_integer <- function(chrvect){
  ## Convert chromosomes as strings to chromosomes as integers ##
  
  if(is.integer(chrvect)) return (chrvect)
  
  if(!class(chrvect) %in% c("numeric", "character", "factor")){
    stop("Unexpected values in chromosome column.")
  }
  
  if(is.numeric(chrvect)){
    if(any(chrvect %% 1 != 0)){
      stop("Unexpected numeric non-integer values in chromosome column.")
    }
    chrnum <- as.integer(chrvect)
  }
  if(is.factor(chrvect)){
    chrvect <- as.character(chrvect)
  }
  if(is.character(chrvect)){
    # check if there are multiple sections with digits in chromosome strings
    weirdstrings <- grep("[[:digit:]+][^[:digit:]]+[[:digit:]+]",
                         chrvect)
    if(length(weirdstrings) > 0){
      warning("Strings naming chromosomes have multiple sections with numbers.")
      cat(chrvect[weirdstrings][1:max(c(10, length(weirdstrings)))],
          sep = "\n")
    }
    # remove everything that is not a number from the character strings
    chrnum <- as.integer(gsub("[^[:digit:]]", "", chrvect))
    # get indices for chromosome names starting with "chr"
    chromInd <- grep("^chr", chrvect, ignore.case = TRUE)
    # get highest chromosome number, and add this to non-"chr" sequences
    if(length(chromInd) > 0 && length(chromInd) < length(chrnum)){
      nChr <- max(chrnum[chromInd], na.rm = TRUE)
      chrnum[-chromInd] <- chrnum[-chromInd] + nChr
    }
    # check that strings with same num are the same
    for(cn in sort(unique(chrnum))){
      thisnumind <- which(chrnum == cn)
      if(length(unique(chrvect[thisnumind])) > 1){
        stop("Chromosome names not resolvable to chromosome numbers; number manually.")
      }
    }
  }
  
  return(chrnum)
}

.ordered_gen_and_loctable <- function(object, numgen, 
                                      omit1allelePerLocus = TRUE,
                                      omitCommonAllele = TRUE,
                                      chromAsInteger = TRUE){
  ## Internal function to make a data frame of loci with one row per     ##
  ## allele.  The data frame, with alignment data, and the matrix of     ##
  ## genotypes are output with alleles in the same order.                ##
  ## object = RADdata object                                             ##
  ## numgen = output of GetWeightedMeanGenotypes                         ##
  ## omit1allelePerLocus = TRUE if the same argument was used for GWMG   ##
  ## omitCommonAllele = TRUE if the same argument was used for GwMG      ##
  ## chromAsInteger = should chromosomes be forced to be integers?       ##
  
  # look up alleles in loc table
  if(omit1allelePerLocus){
    alindex <- object$alleles2loc[-OneAllelePerMarker(object,
                                                      commonAllele = omitCommonAllele)]
  } else {
    alindex <- object$alleles2loc
  }
  
  if(all(c("Chr", "Pos") %in% names(object$locTable))){
    loctable <- object$locTable[alindex,c("Chr", "Pos")]
    # sort the genotypes by chromosome and position
    alOrder <- order(loctable$Chr, loctable$Pos)
    numgen <- numgen[,alOrder]
    loctable <- loctable[alOrder,]
    
    if(chromAsInteger && !is.integer(loctable$Chr)){
      # conversion of chromosomes to integer format if needed
      loctable$Chr <- .chromosome_to_integer(loctable$Chr)
    }
  } else {
    # insert fake alignment data if it doesn't exist
    loctable <- data.frame(Chr = rep(1, length(alindex)),
                           Pos = alindex)
    cat("Alignment data not found in object.  Creating dummy data", sep = "\n")
  }
  
  return(list(numgen = numgen, loctable = loctable))
}
 
ExportGAPIT <- function(object, onePloidyPerAllele = FALSE){
  ## From a RADdata object, export weighted mean genotypes and alignment ##
  ## information formatted for GAPIT.                                    ##
  
  # get numeric genotypes
  numgen <- GetWeightedMeanGenotypes(object, minval = 0, maxval = 2,
                                     omit1allelePerLocus = TRUE,
                                     omitCommonAllele = TRUE,
                                     onePloidyPerAllele = onePloidyPerAllele)
  
  # look up alleles in loc table
  ord <- .ordered_gen_and_loctable(object, numgen, omit1allelePerLocus = TRUE,
                                   chromAsInteger = TRUE, omitCommonAllele = TRUE)
  rm(numgen)
  
  taxa <- GetTaxa(object)
  
  # finish converting to GAPIT format
  GD <- data.frame(taxa = taxa, ord$numgen, stringsAsFactors = FALSE)
  GM <- data.frame(Name = dimnames(ord$numgen)[[2]],
                   Chromosome = ord$loctable$Chr,
                   Position = ord$loctable$Pos,
                   stringsAsFactors = FALSE)
  
  return(list(GD = GD, GM = GM))
}

Export_rrBLUP_Amat <- function(object, naIfZeroReads = FALSE,
                               onePloidyPerAllele = FALSE){
  ## From a RADdata object, export weighted mean genotypes formatted ##
  ## for the A.mat function in rrBLUP.                               ##
  
  numgen <- GetWeightedMeanGenotypes(object, minval = -1, maxval = 1,
                                    omit1allelePerLocus = TRUE,
                                    omitCommonAllele = TRUE,
                                    naIfZeroReads = naIfZeroReads,
                                    onePloidyPerAllele = onePloidyPerAllele)
  return(numgen)
}

Export_rrBLUP_GWAS <- function(object, naIfZeroReads = FALSE,
                               onePloidyPerAllele = FALSE){
  ## From a RADdata object, export weighted mean gentoypes formatted ##
  ## for the GWAS function in rrBLUP.                                ##
  
  numgen <- Export_rrBLUP_Amat(object, naIfZeroReads = naIfZeroReads,
                               onePloidyPerAllele = onePloidyPerAllele)
  
  # look up alleles in loc table
  ord <- .ordered_gen_and_loctable(object, numgen, omit1allelePerLocus = TRUE,
                                   omitCommonAllele = TRUE,
                                   chromAsInteger = FALSE)
  rm(numgen)
  
  # set up output data frame
  geno <- data.frame(Marker = dimnames(ord$numgen)[[2]],
                     Chr = ord$loctable$Chr,
                     Pos = ord$loctable$Pos,
                     t(ord$numgen),
                     stringsAsFactors = FALSE)
  return(geno)
}

Export_TASSEL_Numeric <- function(object, file, naIfZeroReads = FALSE,
                                  onePloidyPerAllele = FALSE){
  ## From a RADdata object, export numeric genotypes for TASSEL ##
  
  numgen <- GetWeightedMeanGenotypes(object, minval = 0, maxval = 1,
                                     omit1allelePerLocus = TRUE,
                                     omitCommonAllele = TRUE,
                                     naIfZeroReads = naIfZeroReads,
                                     onePloidyPerAllele = onePloidyPerAllele)
  # file header
  cat(c("<Numeric>",
        paste(c("<Marker>", colnames(numgen)), collapse = "\t")),
      sep = "\n", file = file)
  # genotypes table
  write.table(round(numgen, 3), file = file, row.names = TRUE, col.names = FALSE,
              append = TRUE, sep = "\t", quote = FALSE)
}

Export_polymapR <- function(object, naIfZeroReads = TRUE,
                            progeny = GetTaxa(object)[!GetTaxa(object) %in% 
                                                        c(GetDonorParent(object),
                                                          GetRecurrentParent(object),
                                                          GetBlankTaxa(object))]){
  if(!is(object, "RADdata")){
    stop("RADdata object needed")
  }
  if(nrow(object$posteriorProb) > 1){
    stop("Only one ploidy allowed for Export_polymapR.")
  }
  # Should support parents of different ploidy; see polymapR::checkF1 documentation
  
  out <- t(GetProbableGenotypes(object, naIfZeroReads = naIfZeroReads,
                                correctParentalGenos = TRUE)[[1]])
  
  # make sure parents are first
  don <- GetDonorParent(object)
  rec <- GetRecurrentParent(object)
  neworder <- c(don, rec, progeny)
  
  return(out[, neworder])
}

Export_MAPpoly <- function(object, file, pheno = NULL, ploidyIndex = 1,
                          progeny = GetTaxa(object)[!GetTaxa(object) %in% c(GetDonorParent(object),
                                                                            GetRecurrentParent(object),
                                                                            GetBlankTaxa(object))],
                          digits = 3){
  # Confirm that genotype calling has been performed
  if(is.null(object$likelyGeno_donor) || is.null(object$posteriorProb)){
    stop("PipelineMapping2Parents needs to be run before using Export_MAPpoly.")
  }
  # Check phenotypes
  if(!is.null(pheno) && is.null(colnames(pheno))){
    stop("pheno should be a matrix or data frame with column names")
  }
  if(!is.null(pheno) && nrow(pheno) != length(progeny)){
    stop("Need one row of pheno for every progeny.")
  }
  if(!is.null(pheno) && !is.null(row.names(pheno)) && !identical(progeny, row.names(pheno))){
    warning("Please check that progeny vector and rows of pheno are in same order.")
  }
  if(!is.null(pheno) && any(grepl(" ", colnames(pheno)))){
    stop("Phenotype names should not have spaces.")
  }
  # Check progeny names
  if(!all(progeny %in% GetTaxa(object))){
    stop("Not all progeny names found in object.")
  }
  if(any(grepl(" ", progeny))){
    stop("Taxa names should not have spaces.")
  }
  # Determine the ploidy
  if(ploidyIndex > length(object$possiblePloidies)){
    stop("ploidyIndex should be the index of the desired ploidy within object$possiblePloidies (not the ploidy itself).")
  }
  ploidy <- object$possiblePloidies[[ploidyIndex]]
  if(length(ploidy) != 1){
    stop("Export is for autopolyploids only.")
  }
  pld.r <- object$taxaPloidy[GetRecurrentParent(object)] * ploidy / 2
  pld.d <- object$taxaPloidy[GetDonorParent(object)] * ploidy / 2
  pld.p <- unique(object$taxaPloidy[progeny]) * ploidy / 2
  if(length(pld.p) > 1){
    stop("All progeny must be same ploidy")
  }
  if(pld.r != pld.d){
    warning("MAPpoly may not support populations in which parents differ in ploidy.")
  }
  
  # Get parent genotypes
  donorGen <- object$likelyGeno_donor[as.character(pld.d),]
  recurGen <- object$likelyGeno_recurrent[as.character(pld.r),]
  
  # Identify markers to use
  keepal <- which(!is.na(donorGen) & !is.na(recurGen) & 
    !(donorGen == 0 & recurGen == ploidy) & !(donorGen == ploidy & recurGen == 0))
  keepal <- keepal[!keepal %in% OneAllelePerMarker(object, commonAllele = TRUE)]
  if(any(grepl(" ", GetAlleleNames(object)[keepal]))){
    stop("Allele and locus names should not have spaces.")
  }
  
  # Get chromosome and position
  if(is.null(object$locTable$Chr) || all(is.na(object$locTable$Chr))){
    chrnum <- NA_integer_
  } else {
    chrnum <- .chromosome_to_integer(object$locTable$Chr[object$alleles2loc[keepal]])
  }
  if(is.null(object$locTable$Pos) || all(is.na(object$locTable$Pos))){
    position <- NA_integer_
  } else {
    position <- object$locTable$Pos[object$alleles2loc[keepal]]
  }
  
  # Write file header
  cat(c(paste("ploidy", pld.p),
        paste("nind", length(progeny)),
        paste("nmrk", length(keepal)),
        paste("mrknames", paste(GetAlleleNames(object)[keepal], collapse = " ")),
        paste("indnames", paste(progeny, collapse = " ")),
        paste("dosageP", paste(donorGen[keepal], collapse = " ")),
        paste("dosageQ", paste(recurGen[keepal], collapse = " ")),
        paste("seq", paste(chrnum, collapse = " ")),
        paste("seqpos", paste(position, collapse = " ")),
        paste("nphen", ifelse(is.null(pheno), 0, ncol(pheno))),
        "pheno---------------------------------------"), file = file, sep = "\n")
  # Write phenotypes
  if(!is.null(pheno)){
    for(i in 1:ncol(pheno)){
      cat(paste(colnames(pheno)[i], paste(pheno[,i], collapse = " ")),
          file = file, sep = "\n", append = TRUE)
    }
  }
  cat("geno----------------------------------------", 
      file = file, sep = "\n", append = TRUE) # line 12 + nphen
  
  # Write genotype posterior probabilities
  genotab <- data.frame(rep(GetAlleleNames(object)[keepal], each = length(progeny)),
                        rep(progeny, times = length(keepal)),
                        matrix(round(object$posteriorProb[[ploidyIndex,as.character(pld.p)]][, progeny, keepal], digits),
                               byrow = TRUE, nrow = length(progeny) * length(keepal),
                               ncol = pld.p + 1))
  write.table(genotab, file = file, append = TRUE, quote = FALSE,
              col.names = FALSE, row.names = FALSE)
}

Export_GWASpoly <- function(object, file, naIfZeroReads = TRUE, postmean = TRUE,
                            digits = 3, splitByPloidy = TRUE){
  pldsums <- sapply(object$possiblePloidies, sum)
  if(length(unique(pldsums)) > 1){
    stop("Multiple ploidies possible for loci, but only one ploidy allowed by GWASpoly. Use SubsetByPloidy first.")
  }
  # only split files by ploidy if there are multiple ploidies
  txpld <- sort(unique(GetTaxaPloidy(object)))
  splitByPloidy <- splitByPloidy && length(txpld) > 1
  
  if(postmean){
    if(splitByPloidy){
      mygeno <- sapply(txpld,
                       function(x){
                         thesetaxa <- GetTaxaByPloidy(object, x)
                         mat <- GetWeightedMeanGenotypes(SubsetByTaxon(object, thesetaxa),
                                                  maxval = max(pldsums) * x / 2L,
                                                  omit1allelePerLocus = TRUE,
                                                  omitCommonAllele = TRUE,
                                                  naIfZeroReads = naIfZeroReads)
                         return(round(t(mat), digits = digits))
                       },
                       simplify = FALSE)
      names(mygeno) <- as.character(txpld)
    } else {
      mygeno <- t(GetWeightedMeanGenotypes(object,
                                           maxval = max(pldsums) *
                                             max(GetTaxaPloidy(object)) / 2L,
                                           omit1allelePerLocus = TRUE,
                                           omitCommonAllele = TRUE,
                                           naIfZeroReads = naIfZeroReads))
      mygeno <- round(mygeno, digits = digits)
    }
  } else {
    # matrix of discrete genotypes
    mygeno <- t(GetProbableGenotypes(object,
                                     omit1allelePerLocus = TRUE,
                                     omitCommonAllele = TRUE,
                                     naIfZeroReads = naIfZeroReads)$genotypes)
    if(splitByPloidy){
      mygeno <- sapply(txpld,
                       function(x){
                         thesetaxa <- GetTaxaByPloidy(object, x)
                         return(mygeno[,thesetaxa])
                       },
                       simplify = FALSE)
      names(mygeno) <- as.character(txpld)
    } else {
      if(length(unique(GetTaxaPloidy(object))) > 1){
        warning("Individuals in dataset vary in ploidy, and genotypes are expressed as allele copy numbers.  GWASpoly assumptions may be violated.")
      }
    }
  }
  
  # get loci to correspond to these alleles
  locindex <- object$alleles2loc[-OneAllelePerMarker(object,
                                                     commonAllele = TRUE)]
  loctable <- object$locTable
  if(is.null(loctable$Chr)){
    loctable$Chr <- rep(0L, nrow(loctable))
  }
  if(is.null(loctable$Pos)){
    loctable$Pos <- 1:nrow(loctable)
  }
  
  # data frame for export
  if(splitByPloidy){
    for(x in txpld){
      xC <- as.character(x)
      outdata <- data.frame(Marker = rownames(mygeno[[xC]]),
                            Chrom = .chromosome_to_integer(loctable$Chr[locindex]),
                            Position = loctable$Pos[locindex],
                            mygeno[[xC]],
                            check.names = FALSE)
      write.csv(outdata,
                file = sub("(\\.csv)?$", paste0("_", x, "x.csv"), file),
                row.names = FALSE, quote = FALSE)
    }
  } else {
    outdata <- data.frame(Marker = rownames(mygeno),
                          Chrom = .chromosome_to_integer(loctable$Chr[locindex]),
                          Position = loctable$Pos[locindex],
                          mygeno,
                          check.names = FALSE)
    
    # export
    write.csv(outdata, file = file, row.names = FALSE, quote = FALSE)
  }
}

RADdata2VCF <- function(object, file = NULL, asSNPs = TRUE, hindhe = TRUE,
                        sampleinfo = data.frame(row.names = GetTaxa(object)),
                        contigs = data.frame(row.names = unique(object$locTable$Chr))){
  # shortcuts to functions to use
  DataFrame <- S4Vectors::DataFrame
  
  varok <- attr(object$alleleNucleotides, "Variable_sites_only")
  if(is.null(varok) || varok){
    stop("Complete haplotype information not provided; unable to determine SNP positions.  Use refgenome argument in VCF2RADdata.")
  }
  if(is.null(object$locTable$Chr) || is.null(object$locTable$Pos)){
    stop("Need chromosome and position information (Chr and Pos in locTable).")
  }
  if(is.null(object$locTable$Ref)){
    warning("Reference allele not indicated.  Using the major allele for each locus.")
    object$locTable$Ref <- object$alleleNucleotides[OneAllelePerMarker(object, commonAllele = TRUE)]
  }
  if(nrow(sampleinfo) != nTaxa(object) || !all(rownames(sampleinfo) %in% GetTaxa(object))){
    "sampleinfo doesn't match taxa in RADdata object."
  }
  
  # Determine most probable genotypes, and their ploidies
  temp <- GetProbableGenotypes(object, omit1allelePerLocus = FALSE)
  geno <- temp$genotypes
  pld_index <- temp$ploidy_index
  pld_ind_per_loc <- 
    tapply(pld_index, object$alleles2loc,
           function(x){
             u <- unique(x)
             if(length(u) == 1){
               return(u)
             } else {
               return(as.integer(names(which.max(table(x)))))
             }
           })
  pld_per_loc <- sapply(object$possiblePloidies, sum)[pld_ind_per_loc]
  
  # Process data with internal RCpp function
  temp <- PrepVCFexport(geno, object$alleles2loc, object$alleleDepth,
                        object$alleleNucleotides, object$locTable, pld_per_loc,
                        GetTaxaPloidy(object), asSNPs)
  REF <- Biostrings::DNAStringSet(temp$REF)
  ALT <- Biostrings::DNAStringSetList(temp$ALT)
  CHROM <- as.character(object$locTable$Chr)[temp$Lookup]
  rr <- GenomicRanges::GRanges(CHROM,
                IRanges::IRanges(start = temp$POS, 
                                 width = BiocGenerics::width(REF)))
  fixed <- DataFrame(REF = REF, ALT = ALT)
  cd <- DataFrame(row.names = GetTaxa(object),
                  Ploidy = GetTaxaPloidy(object) * min(pld_per_loc) / 2L)
  if(ncol(sampleinfo) > 0){
    cd <- cbind(cd, sampleinfo[GetTaxa(object),])
    colnames(cd)[-1] <- colnames(sampleinfo)
  }
  DP <- t(object$locDepth[,as.character(temp$Lookup)])
  rownames(DP) <- NULL
  info <- DataFrame(NS = rowSums(DP > 0), DP = rowSums(DP), LU = temp$Lookup)
  infohdr <- DataFrame(row.names = c("NS", "DP", "LU"), Number = c("1", "1", "1"),
                       Type = c("Integer", "Integer", "Integer"),
                       Description = c("Number of samples with data", "Combined depth across samples",
                                       "Lookup index of marker in RADdata object"))
  metahdr <- DataFrame()
  if(ncol(contigs) > 0){
    ctg <- DataFrame(row.names = rownames(contigs), contigs)
    colnames(ctg) <- colnames(contigs)
  } else {
    ctg <- DataFrame(row.names = rownames(contigs))
  }
  
  # Add Hind/He if desired
  if(hindhe){
    infohdr <- rbind(infohdr,
                     DataFrame(row.names = "HH", Number = "1", Type = "Float",
                               Description = "Hind/He for the locus in the RADdata object"))
    metahdr <- DataFrame(row.names = "HH", Number = "1", Type = "Float",
                         Description = "Hind/He for the sample, averaged across loci in the RADdata object")
    hh <- HindHe(object)
    info$HH <- colMeans(hh, na.rm = TRUE)[temp$Lookup]
    cd$HH <- rowMeans(hh, na.rm = TRUE)
  }
  
  # Build VCF object
  hdr <- VariantAnnotation::VCFHeader(reference = rownames(ctg),
    samples = GetTaxa(object),
    IRanges::DataFrameList(fileformat = DataFrame(row.names = "fileformat", Value = "VCFv4.3"),
                           fileDate = DataFrame(row.names = "fileDate", Value = gsub("-", "", Sys.Date())),
                           source = DataFrame(row.names = "source", Value = paste0("polyRADv", packageVersion("polyRAD"))),
                           FORMAT = DataFrame(row.names = c("GT", "AD", "DP"),
                                              Number = c("1", "R", "1"), Type = c("String", "Integer", "Integer"),
                                              Description = c("Genotype", "Read depth for each allele", "Read depth")),
                           INFO = infohdr, META = metahdr, contig = ctg))
  if(ncol(cd) > 0){
    VariantAnnotation::meta(hdr)$SAMPLE <- cd
  }
  vcf <- VariantAnnotation::VCF(rowRanges = rr, fixed = fixed, info = info, colData = cd,
                                geno = S4Vectors::SimpleList(GT = temp$GT, AD = temp$AD, DP = DP),
                                exptData = list(header = hdr), collapsed = TRUE)
  
  # output
  if(!is.null(file)){
    VariantAnnotation::writeVcf(vcf, file)
  }
  return(vcf)
}

Export_Structure <- function(object, file, includeDistances = FALSE,
                             extraCols = NULL, missingIfZeroReads = TRUE){
  # sort data if necessary
  if(includeDistances){
    if(is.null(object$locTable$Chr) || is.null(object$locTable$Pos)){
      stop("Chromosome and position needed in locTable if distances are to be output.")
    }
    ord <- order(object$locTable$Chr, object$locTable$Pos)
    if(!identical(ord, seq_len(nLoci(object)))){
      object <- SubsetByLocus(object, ord)
    }
    # get inter marker distances
    distrow <- rep(-1, nLoci(object))
    for(chr in unique(object$locTable$Chr)){
      theserow <- which(object$locTable$Chr == chr)
      distrow[theserow[-1]] <-
        object$locTable$Pos[theserow[-1]] - object$locTable$Pos[theserow[-length(theserow)]]
    }
  }
  # get genotype matrix
  geno <- GetProbableGenotypes(object, omit1allelePerLocus = FALSE,
                               naIfZeroReads = missingIfZeroReads)
  # get ploidy
  pind <- unique(geno$ploidy_index)
  ploidies <- sapply(object$possiblePloidies[pind], sum)
  ploidy <- max(ploidies) * max(GetTaxaPloidy(object)) / 2L
  stopifnot(all(geno$genotypes <= ploidy, na.rm = TRUE))
  # put data into Structure format (Rcpp function)
  strdata <- FormatStructure(geno$genotypes, object$alleles2loc, ploidy)
  #colnames(strdata) <- GetLoci(object)
  
  # write file
  if(is.null(extraCols)){
    ncolx <- 1
    outdf <- data.frame(rep(GetTaxa(object), each = ploidy),
                        strdata)
  } else {
    ncolx <- 1 + ncol(extraCols)
    if(!is.null(rownames(extraCols)) &&
       !identical(rownames(extraCols), as.character(seq_len(nTaxa(object)))) &&
       !identical(rownames(extraCols), GetTaxa(object))){
      if(!all(GetTaxa(object) %in% rownames(extraCols))){
        stop("Row names of extraCols don't match sample names.")
      }
      extraCols <- extraCols[GetTaxa(object),]
    }
    if(nrow(extraCols) != nTaxa(object)){
      stop("Number of rows in extraCols is different from number of taxa in object.")
    }
    outdf <- data.frame(rep(GetTaxa(object), each = ploidy),
                        extraCols[rep(seq_len(nTaxa(object)), each = ploidy),],
                        strdata)
  }
  cat(paste(c(rep("", ncolx), GetLoci(object)), collapse = "\t"),
      file = file, sep = "\n")
  if(includeDistances){
    cat(paste(c(rep("", ncolx), distrow), collapse = "\t"), file = file,
        sep = "\n", append = TRUE)
  }
  write.table(outdf, file = file, append = TRUE, sep = "\t", col.names = FALSE,
              row.names = FALSE, quote = FALSE)
  cat(c(paste("Number of individuals:", nTaxa(object)),
        paste("Number of loci:", nLoci(object)),
        paste("Ploidy of data:", ploidy),
        "Missing data value: -9", "", "File contains:",
        "Row of marker names"),
      sep = "\n")
  if(includeDistances){
    cat("Map distances between loci", sep = "\n")
  }
  cat("Individual ID for each individual", sep = "\n")
  if(ncolx > 1){
    cat(paste(ncolx - 1, "extra columns that you should define for Structure"))
  }
}

Export_adegenet_genind <- function(object, ploidyIndex = 1){
  object <- SubsetByPloidy(object, ploidies = object$possiblePloidies[ploidyIndex])
  
  tab <- GetProbableGenotypes(object, omit1allelePerLocus = FALSE,
                              multiallelic = "correct")$genotypes
  colnames(tab) <- paste(sub("\\.", "_", GetLoci(object)[object$alleles2loc]),
                         object$alleleNucleotides, sep = ".")
  pldT2 <- sum(object$possiblePloidies[[1]]) * GetTaxaPloidy(object)
  stopifnot(all(pldT2 %% 2L == 0L))
  
  out <- methods::new("genind", tab = tab,
             ploidy = pldT2 %/% 2L,
             type = "codom")

  return(out)
}

Export_polymapR_probs <- function(object, maxPcutoff = 0.9,
                                  correctParentalGenos = TRUE,
                                  multiallelic = "correct"){
  if(!is(object, "RADdata")){
    stop("RADdata object needed")
  }
  if(nrow(object$posteriorProb) > 1){
    stop("Only one ploidy allowed for Export_polymapR_probs.")
  }
  
  object <- RemoveUngenotypedLoci(object, removeNonvariant = TRUE)
  p1 <- max(sapply(object$posteriorProb[1,], function(x) dim(x)[1])) # ploidy plus one
  omitals <- OneAllelePerMarker(object, commonAllele = TRUE)
  keepals <- GetAlleleNames(object)[-omitals]
  
  probmat <- matrix(NA_real_,
                    nrow = nTaxa(object) * length(keepals),
                    ncol = p1,
                    dimnames = list(NULL, paste0("P", seq_len(p1) - 1L)))
  out <- data.frame(SampleName = rep(GetTaxa(object), times = length(keepals)),
                    MarkerName = rep(keepals, each = nTaxa(object)))
  for(pld in colnames(object$posteriorProb)){
    p1a <- dim(object$posteriorProb[[1,pld]])[1] # ploidy plus one for this set of taxa
    thesetaxa <- dimnames(object$posteriorProb[[1,pld]])[[2]]
    theserows <- out$SampleName %in% thesetaxa
    stopifnot(identical(thesetaxa, unique(out$SampleName[theserows])))
    probmat[theserows, 1:p1a] <-
      matrix(object$posteriorProb[[1,pld]][,,keepals],
             nrow = length(thesetaxa) * length(keepals),
             ncol = p1a, byrow = TRUE)
    # Note that .priorTimesLikelihood already uses even priors for the parents
  }
  out <- cbind(out, probmat)
  
  genomat <- GetProbableGenotypes(object, omit1allelePerLocus = TRUE,
                                  omitCommonAllele = TRUE,
                                  correctParentalGenos = correctParentalGenos,
                                  multiallelic = multiallelic)$genotypes
  genovect <- as.vector(genomat)
  out$maxP <- numeric(nTaxa(object) * length(keepals))
  for(i in seq_len(p1) - 1L){
    theserows <- which(genovect == i)
    out$maxP[theserows] <- out[[paste0("P", i)]][theserows]
  }
  out$maxgeno <- genovect
  out$geno <- genovect
  out$geno[out$maxP < maxPcutoff] <- NA_real_
  
  return(out)
}
Any scripts or data that you put into this service are public.
polyRAD documentation built on Nov. 10, 2022, 5:14 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
polyRAD
Genotype Calling with Uncertainty from Sequencing Data in Polyploids and Diploids

R/data_export.R
In polyRAD: Genotype Calling with Uncertainty from Sequencing Data in Polyploids and Diploids

Defines functions Export_polymapR_probs Export_adegenet_genind Export_Structure RADdata2VCF Export_GWASpoly Export_MAPpoly Export_polymapR Export_TASSEL_Numeric Export_rrBLUP_GWAS Export_rrBLUP_Amat ExportGAPIT .ordered_gen_and_loctable .chromosome_to_integer

Documented in Export_adegenet_genind ExportGAPIT Export_GWASpoly Export_MAPpoly Export_polymapR Export_polymapR_probs Export_rrBLUP_Amat Export_rrBLUP_GWAS Export_Structure Export_TASSEL_Numeric RADdata2VCF

Try the polyRAD package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

polyRAD Genotype Calling with Uncertainty from Sequencing Data in Polyploids and Diploids

R/data_export.R In polyRAD: Genotype Calling with Uncertainty from Sequencing Data in Polyploids and Diploids

Defines functions Export_polymapR_probs Export_adegenet_genind Export_Structure RADdata2VCF Export_GWASpoly Export_MAPpoly Export_polymapR Export_TASSEL_Numeric Export_rrBLUP_GWAS Export_rrBLUP_Amat ExportGAPIT .ordered_gen_and_loctable .chromosome_to_integer

Documented in Export_adegenet_genind ExportGAPIT Export_GWASpoly Export_MAPpoly Export_polymapR Export_polymapR_probs Export_rrBLUP_Amat Export_rrBLUP_GWAS Export_Structure Export_TASSEL_Numeric RADdata2VCF

Try the polyRAD package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

polyRAD
Genotype Calling with Uncertainty from Sequencing Data in Polyploids and Diploids

R/data_export.R
In polyRAD: Genotype Calling with Uncertainty from Sequencing Data in Polyploids and Diploids