R/create_crimap_input.R

Defines functions create_crimap_input

Documented in create_crimap_input

#' create_crimap_input: Create a CriMAP input file.
#' @param gwaa.data GenABEL gwaa.data object containing genomic data
#' @param familyPedigree data.frame containing columns ANIMAL, FATHER, MOTHER
#'   and FAMILY. FAMILY defines family groups for crimap. IDs in ANIMAL can be
#'   repeated within and between families if necessary.
#' @param analysisID string. Used as an identifier for the chromosome. Must
#'   begin with a numeric character as per crimap convention; for example,
#'   analysisID = "1a" will output the file "chr1a.gen".
#' @param snplist vector, optional. A list of ordered SNPs. UNless build is run,
#'   SNPs will be assumed to be in this order.
#' @param chr string. Chromosome ID if whole chromosome is to be run.
#' @param is.X logical. If true, then all heterozygotes are scored as missing in
#'   males, except for pseudoautosomal SNPs when defined (see below)
#' @param is.Z logical. If true, then all heterozygotes are scored as missing in
#'   females, except for pseudoautosomal SNPs when defined (see below)
#' @param pseudoautoSNPs Character vector of pseudoautosomal SNP IDs. Only used
#'   if is.X or is.Z == TRUE.
#' @param use.mnd logical, default = FALSE. Masks parent offspring genotype
#'   mismatches based on output from parse_mend_err
#' @param use.specific.mnd string, path to specific .mnd file to use.
#' @param outdir String, optional. Specify the path of the directory in which
#'   the output file should be written.
#' @param verbose Logical. FALSE will suppress messages.
#' @param clear.existing.analysisID Logical, default is TRUE. Overwrites .gen
#'   file and deletes crimap files that have the analysisID in the target
#'   directory.
#' @param dummy.parent.id numeric. This is an identifier for IDs that should
#'   be filled in with blank genotypes, for example, because CRIMAP cannot deal
#'   with a single parent.
#' @import GenABEL
#' @import data.table
#' @import plyr
#' @export
#
#

# snplist = NULL
# chr = NULL
# is.X = FALSE
# is.Z = FALSE
# pseudoautoSNPs = NULL
# use.mnd = FALSE
# use.specific.mnd = NULL
# outdir = NULL
# verbose = TRUE
# clear.existing.analysisID = TRUE
# dummy.mother.id = NULL
# dummy.father.id = NULL
#
# gwaa.data <- deer.abel
# familyPedigree <- deer.famped
# analysisID <- "1a"
# dummy.mother.id = c(8888, 9999)
# dummy.father.id = NULL

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# Create Crimap Files
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


create_crimap_input <- function(gwaa.data,
                                familyPedigree,
                                analysisID,
                                snplist = NULL,
                                chr = NULL,
                                is.X = FALSE,
                                is.Z = FALSE,
                                pseudoautoSNPs = NULL,
                                use.mnd = FALSE,
                                use.specific.mnd = NULL,
                                outdir = NULL,
                                verbose = TRUE,
                                clear.existing.analysisID = TRUE,
                                dummy.mother.id = NULL,
                                dummy.father.id = NULL) {

  #~~ Check that analysisID starts with a number


  if(!substr(analysisID, 1, 1) %in% 0:9) stop("analysisID must start with a numeric character. Please rename it and try again.")

  if(is.X == TRUE & is.Z == TRUE) stop("Cannot specify as both X and Z chromosome")

  if(!is.null(chr) & !is.null(snplist)) stop("Please only specify only SNP list or chromosome")

  if(is.X == TRUE) message("Running as X chromosome: specify pseudoautosomalSNPs to retain some male PAR heterozygotes")

  if(is.Z == TRUE) message("Running as Z chromosome: specify pseudoautosomalSNPs to retain some female PAR heterozygotes")

  if(any(which(!c(familyPedigree[,1], familyPedigree[,2], familyPedigree[,3]) %in% c(idnames(gwaa.data), 0, NA, dummy.father.id, dummy.mother.id)))){

    stop("familyPedigree must not contain any IDs that are not in the gwaa.data object.")

  }

  if(is.null(outdir)){
    if(use.mnd == TRUE & !paste0("chr", analysisID, ".mnd") %in% dir()){
      stop(paste0("use.mnd == TRUE, but there is no file named chr",
                  analysisID, ".mnd. Change to FALSE and/or run parse_mend_err()."))
    }
  } else if(use.mnd == TRUE & !paste0("chr", analysisID, ".mnd") %in% dir(outdir)){
    stop(paste0("use.mnd == TRUE, but there is no file named chr",
                analysisID, ".mnd. Change to FALSE and/or run parse_mend_err()."))
  }

  if(!is.null(outdir)) dir.create(outdir, showWarnings = FALSE)



  #~~ Delete existing files if specified

  if(!is.null(outdir)) out.path.stem <- paste0(outdir, "/chr", analysisID)
  if( is.null(outdir)) out.path.stem <- paste0("chr", analysisID)


  if(clear.existing.analysisID == TRUE){

    if(!is.null(outdir)) del.vec <- grep(paste0("chr", analysisID, "."), dir(outdir), value = T)
    if( is.null(outdir)) del.vec <- grep(paste0("chr", analysisID, "."), dir(), value = T)


    if(use.mnd == TRUE & paste0("chr", analysisID, ".mnd") %in% del.vec){

      del.vec <- del.vec[-which(del.vec == paste0("chr", analysisID, ".mnd"))]

      if(paste0("chr", analysisID, ".mndverbose") %in% del.vec){

        del.vec <- del.vec[-which(del.vec == paste0("chr", analysisID, ".mndverbose"))]

      }

    }


    if(Sys.info()["sysname"] == "Windows") {

      if(length(del.vec) > 0){

        if(!is.null(outdir)) del.vec <- paste0(outdir, "\\", del.vec)

        for(i in del.vec){

          system("cmd", input = paste0("del ", gsub("/", "\\\\", i)), show.output.on.console = F)
        }

      }

    } else {

      if(!is.null(outdir)) del.vec <- paste0(outdir, "/", del.vec)


      for(i in del.vec){

        system(paste0("rm ", i), show.output.on.console = F)
      }

    }

  }


  #~~ Check family pedigree format

  familyPedigree <- format_family_pedigree(familyPedigree)

  #~~ subset based on snplist

  if(!is.null(snplist)) gwaa.data <- gwaa.data[,snplist]
  if(!is.null(chr))     gwaa.data <- gwaa.data[,which(chromosome(gwaa.data) == chr)]


  nfamilies <- length(unique(familyPedigree$Family))

  nloci <- nsnps(gwaa.data)

  locus.names <- snp.names(gwaa.data)

  #~~ define outfile and write initial information to file


  outfile <- paste0(out.path.stem, ".gen")

  write.table(nfamilies, outfile, row.names = F, quote = F, col.names = F)
  write.table(nloci, outfile, row.names = F, quote = F, col.names = F, append=T)
  write.table("", outfile, row.names = F, quote = F, col.names = F, append=T)
  write.table(locus.names, outfile, row.names = F, quote = F, col.names = F, append=T)
  write.table("", outfile, row.names = F, quote = F, col.names = F, append=T)

  #~~ Subset gwaa.data to include only individuals which are within the pedigree provided

  ids.keep <- idnames(gwaa.data)[which(idnames(gwaa.data) %in% c(familyPedigree[,1], familyPedigree[,2], familyPedigree[,3]))]

  gwaa.data <- gwaa.data[ids.keep,]

  #~~ Recode ACGT as 1234 with space delimited between alleles

  temp.geno <- as.character.snp.data(gtdata(gwaa.data))

  temp.geno <- as.matrix(temp.geno)

  if(verbose == TRUE) message("Recoding alleles to numeric values...")

  temp.geno[which(is.na(temp.geno))] <- "0 0"
  temp.geno[which(temp.geno == "A/A")] <- "1 1"
  temp.geno[which(temp.geno == "A/C")] <- "1 2"
  temp.geno[which(temp.geno == "A/G")] <- "1 3"
  temp.geno[which(temp.geno == "A/T")] <- "1 4"
  temp.geno[which(temp.geno == "C/A")] <- "2 1"
  temp.geno[which(temp.geno == "C/C")] <- "2 2"
  temp.geno[which(temp.geno == "C/G")] <- "2 3"
  temp.geno[which(temp.geno == "C/T")] <- "2 4"
  temp.geno[which(temp.geno == "G/A")] <- "3 1"
  temp.geno[which(temp.geno == "G/C")] <- "3 2"
  temp.geno[which(temp.geno == "G/G")] <- "3 3"
  temp.geno[which(temp.geno == "G/T")] <- "3 4"
  temp.geno[which(temp.geno == "T/A")] <- "4 1"
  temp.geno[which(temp.geno == "T/C")] <- "4 2"
  temp.geno[which(temp.geno == "T/G")] <- "4 3"
  temp.geno[which(temp.geno == "T/T")] <- "4 4"
  temp.geno[which(temp.geno == "1/1")] <- "1 1"
  temp.geno[which(temp.geno == "1/2")] <- "1 2"
  temp.geno[which(temp.geno == "2/1")] <- "2 1"
  temp.geno[which(temp.geno == "2/2")] <- "2 2"


  if(verbose == TRUE) message("...done")


  #~~ Create a data frame with ID, FATHER, MOTHER, SEX, and Genotypes.

  temp.geno <- data.frame(temp.geno, stringsAsFactors=F)

  temp.geno$ANIMAL <- idnames(gwaa.data)

  temp.geno$SEX <- phdata(gwaa.data)$sex

  if(!is.null(dummy.father.id)){
    for(i in 1:length(dummy.father.id)){
      temp.geno <- rbind(temp.geno, c(rep("0 0", times = length(locus.names)), dummy.father.id[i], 1))
    }
  }

  if(!is.null(dummy.mother.id)){
    for(i in 1:length(dummy.mother.id)){
      temp.geno <- rbind(temp.geno, c(rep("0 0", times = length(locus.names)), dummy.mother.id[i], 0))
    }
  }


  familyPedigree$ANIMAL <- as.character(familyPedigree$ANIMAL)

  temp.geno <- data.table(temp.geno, key = "ANIMAL")

  ped2 <- data.table(unique(familyPedigree[,1:3]), key = "ANIMAL")


  if(verbose == TRUE) message("Merging pedigree and genotype information...")

  suppressMessages(temp.geno <- join(ped2, temp.geno))

  if(verbose == TRUE) message("...done.")

  temp.geno <- data.frame(temp.geno)

  temp.geno <- cbind(temp.geno[,c("ANIMAL", "MOTHER", "FATHER", "SEX")], temp.geno[,which(!names(temp.geno) %in% c("ANIMAL", "MOTHER", "FATHER", "SEX"))])

  #~~ Deal with NA's

  temp.geno$MOTHER[which(is.na(temp.geno$MOTHER))] <- 0

  temp.geno$FATHER[which(is.na(temp.geno$FATHER))] <- 0

  for(i in 1:ncol(temp.geno)) temp.geno[which(is.na(temp.geno[,i])),i] <- "0 0"

  #~~ Format temp.geno

  temp.geno <- unique(temp.geno)

  suppressMessages(temp.geno <- join(familyPedigree, temp.geno))

  temp.geno <- temp.geno[,c("Family", "ANIMAL", "MOTHER", "FATHER", "SEX", names(temp.geno)[6:ncol(temp.geno)])]

  #~~ Deal with sex chromosomes

  if(is.X == TRUE){

    xmarkers <- which(!names(temp.geno) %in% pseudoautoSNPs)

    xmarkers <- xmarkers[6:length(xmarkers)]

    heterozygotes <- c("1 2", "1 3", "1 4", "2 1", "2 3", "2 4", "3 1", "3 2", "3 4", "4 1", "4 2", "4 3")

    for(j in xmarkers){

      #~~ remove heterozygotes

      temp.geno[which(temp.geno$SEX == 1 & temp.geno[,j] %in% heterozygotes),j] <- "0 0"

      temp.geno[which(temp.geno$SEX == 1 & temp.geno[,j] == "1 1"),j] <- "1 0"
      temp.geno[which(temp.geno$SEX == 1 & temp.geno[,j] == "2 2"),j] <- "2 0"
      temp.geno[which(temp.geno$SEX == 1 & temp.geno[,j] == "3 3"),j] <- "3 0"
      temp.geno[which(temp.geno$SEX == 1 & temp.geno[,j] == "4 4"),j] <- "4 0"

    }

    rm(xmarkers, heterozygotes)

  }

  if(is.Z == TRUE){

    xmarkers <- which(!names(temp.geno) %in% pseudoautoSNPs)

    xmarkers <- xmarkers[6:length(xmarkers)]

    heterozygotes <- c("1 2", "1 3", "1 4", "2 1", "2 3", "2 4", "3 1", "3 2", "3 4", "4 1", "4 2", "4 3")

    for(j in xmarkers){

      #~~ remove heterozygotes

      temp.geno[which(temp.geno$SEX == 0 & temp.geno[,j] %in% heterozygotes),j] <- "0 0"

      temp.geno[which(temp.geno$SEX == 0 & temp.geno[,j] == "1 1"),j] <- "1 0"
      temp.geno[which(temp.geno$SEX == 0 & temp.geno[,j] == "2 2"),j] <- "2 0"
      temp.geno[which(temp.geno$SEX == 0 & temp.geno[,j] == "3 3"),j] <- "3 0"
      temp.geno[which(temp.geno$SEX == 0 & temp.geno[,j] == "4 4"),j] <- "4 0"

    }

    rm(xmarkers, heterozygotes)

  }



  #~~ deal with mendelian errors if specified

  if(use.mnd == TRUE || !is.null(use.specific.mnd)){

    if(verbose == TRUE) message("Masking Mendelian errors...")

    if(!is.null(use.specific.mnd)){

      menderrtab <- read.table(use.specific.mnd, header = T, stringsAsFactors = F)

    } else {

      menderrtab <- read.table(paste0(out.path.stem, ".mnd"), header = T, stringsAsFactors = F)

    }

    if(nrow(menderrtab) > 0){

      menderrtab <- subset(menderrtab, SNP.Name %in% snpnames(gwaa.data))


      for(i in 1:nrow(menderrtab)){



        temp.geno[which(temp.geno$ANIMAL == menderrtab$ANIMAL[i]),
                  which(names(temp.geno) == menderrtab$SNP.Name[i])] <- "0 0"

      }

    }

    if(verbose == TRUE) message("...done.")


  }

  if(verbose == TRUE) message(paste0("Parsing and writing to ", outfile, "..."))


  temp.geno <- as.matrix(temp.geno)

  #~~ Create the family and genotype part of the file.

  test <- data.frame(FamSize = tapply(familyPedigree$Family, familyPedigree$Family, length))

  test$Family <- row.names(test)

  familyPedigree <- arrange(familyPedigree, Family)

  all(unique(test$Family) == unique(familyPedigree$Family))

  final.frame <- matrix("", ncol = (ncol(temp.geno)-1), nrow = (sum(test$FamSize) + (nrow(test)*4)))

  counter <- 1

  for(i in unique(familyPedigree$Family)){

    famsize <- test$FamSize[which(test$Family == i)]

    final.frame[counter, 1] <- i
    final.frame[counter+1, 1] <- famsize
    final.frame[(counter+3):(counter+2+famsize),] <- temp.geno[which(temp.geno[,"Family"] == i),-1]

    counter <- counter + 4 + famsize

  }

  #~~ remove white space

  final.frame2 <- apply(final.frame, 1, function (x) paste(x, collapse = " "))
  final.frame2 <- gsub("\\s+$", "", final.frame2)

  #~~ Write to file

  write.table(final.frame2, outfile, row.names = F, quote = F, col.names = F, append=T)

  if(verbose == TRUE) message("...done")

}
susjoh/crimaptools documentation built on Oct. 13, 2020, 3:24 p.m.