R/readMWril.R

######################################################################
#
# readMWril.R
#
# copyright (c) 2009-2011, Karl W Broman
# last modified Dec, 2011
# first written Apr, 2009
#
#     This program is free software; you can redistribute it and/or
#     modify it under the terms of the GNU General Public License,
#     version 3, as published by the Free Software Foundation.
# 
#     This program is distributed in the hope that it will be useful,
#     but without any warranty; without even the implied warranty of
#     merchantability or fitness for a particular purpose.  See the GNU
#     General Public License, version 3, for more details.
# 
#     A copy of the GNU General Public License, version 3, is available
#     at http://www.r-project.org/Licenses/GPL-3
#
# Part of the R/qtl package
# Contains: readMWril
#
######################################################################

######################################################################
#
# readMWril
#
# read multi-way RIL data in comma-delimited format,
# with a separate file for the founder genotypes, and possible a
# separate file for the phenotype data.
#
######################################################################

readMWril <-
function(dir, rilfile, founderfile, 
         type=c("ri4self", "ri4sib", "ri8self", "ri8sib", "bgmagic16"),
         na.strings=c("-","NA"), rotate=FALSE, 
         ...)
{
  # create file names
  if(missing(rilfile) || missing(founderfile))
    stop("Need to specify rilfile and founderfile.")

  if(!missing(dir) && dir != "") {
    rilfile <- file.path(dir, rilfile)
    founderfile <- file.path(dir, founderfile)
  }

  type <- match.arg(type)

  args <- list(...)

  # if user wants to use comma for decimal point, we need
  if(length(args) > 0 && "dec" %in% names(args)) {
    dec <- args[["dec"]]
  }
  else dec <- "."

  # read the data file
  if(length(args) < 1 || !("sep" %in% names(args))) {
    # "sep" not in the "..." argument and so take sep=","
    if(length(args) < 1 || !("comment.char" %in% names(args))) {
      gen <- read.table(rilfile, sep=",", na.strings=na.strings,
                        colClasses="character", fill=TRUE,
                        blank.lines.skip=TRUE, comment.char="", ...)
      founder <- read.table(founderfile, sep=",", na.strings=na.strings,
                        colClasses="character", fill=TRUE,
                        blank.lines.skip=TRUE, comment.char="", ...)
    }
    else {
      gen <- read.table(rilfile, sep=",", na.strings=na.strings,
                        colClasses="character", fill=TRUE,
                        blank.lines.skip=TRUE, ...)
      founder <- read.table(founderfile, sep=",", na.strings=na.strings,
                            colClasses="character", fill=TRUE,
                            blank.lines.skip=TRUE, ...)
    }
  }
  else {
    if(length(args) < 1 || !("comment.char" %in% names(args))) {
      gen <- read.table(rilfile, na.strings=na.strings,
                        colClasses="character", fill=TRUE,
                        blank.lines.skip=TRUE, comment.char="", ...)
      founder <- read.table(founderfile, na.strings=na.strings,
                            colClasses="character", fill=TRUE,
                            blank.lines.skip=TRUE, comment.char="", ...)
    }
    else {
      gen <- read.table(rilfile, na.strings=na.strings,
                        colClasses="character", fill=TRUE,
                        blank.lines.skip=TRUE, ...)
      founder <- read.table(founderfile, na.strings=na.strings,
                            colClasses="character", fill=TRUE,
                            blank.lines.skip=TRUE, ...)
    }
  }

  if(rotate) {
    gen <- as.data.frame(t(gen), stringsAsFactors=FALSE)
    founder <- as.data.frame(t(founder), stringsAsFactors=FALSE)
  }
  rn <- founder[-1,1]
  cn <- founder[1,-1]
  founder <- founder[-1,-1, drop=FALSE]
  founder <- matrix(as.numeric(unlist(founder)), ncol=length(cn))
  dimnames(founder) <- list(rn, cn)

  # determine number of phenotypes based on initial blanks in row 2
  n <- ncol(gen)
  temp <- rep(FALSE,n)
  for(i in 1:n) {
    temp[i] <- all(gen[2,1:i]=="")
    if(!temp[i]) break
  }

  if(!any(temp)) # no phenotypes!
    stop("You must include at least one phenotype (e.g., an index).")
  n.phe <- max((1:n)[temp])

  # Is map included?  yes if first n.phe columns in row 3 are all blank
  if(all(!is.na(gen[3,1:n.phe]) & gen[3,1:n.phe]=="")) {
    map.included <- TRUE
    map <- asnumericwithdec(unlist(gen[3,-(1:n.phe)]), dec=dec)
    if(any(is.na(map))) 
      stop("There are missing marker positions.")
    nondatrow <- 3
  }
  else {
    map.included <- FALSE
    map <- rep(0,ncol(gen)-n.phe)
    nondatrow <- 2 # last non-data row
  }
  pheno <- as.data.frame(gen[-(1:nondatrow),1:n.phe,drop=FALSE], stringsAsFactors=FALSE)
  colnames(pheno) <- as.character(gen[1,1:n.phe])

  # replace empty cells with NA
  gen <- sapply(gen,function(a) { a[!is.na(a) & a==""] <- NA; a })

  # pull apart phenotypes, genotypes and map
  mnames <- gen[1,-(1:n.phe)]
  if(any(is.na(mnames)))  stop("There are missing marker names.")
  chr <- gen[2,-(1:n.phe)]
  if(any(is.na(chr))) stop("There are missing chromosome IDs.")

  gen <- matrix(as.numeric(gen[-(1:nondatrow),-(1:n.phe)]),
                ncol=ncol(gen)-n.phe)

  # Fix up phenotypes
  sw2numeric <-
    function(x, dec) {
      wh1 <- is.na(x)
      n <- sum(!is.na(x))
      y <- suppressWarnings(asnumericwithdec(as.character(x), dec=dec))
      wh2 <- is.na(y)
      m <- sum(!is.na(y))
      if(n==m || (n-m) < 2 || (n-m) < n*0.05) {
        if(sum(!wh1 & wh2) > 0) {
          u <- unique(as.character(x[!wh1 & wh2]))
          if(length(u) > 1) {
            themessage <- paste("The phenotype values", paste("\"", u, "\"", sep="", collapse=" "))
                themessage <- paste(themessage, " were", sep="")
              }
              else {
                themessage <- paste("The phenotype value \"", u, "\" ", sep="")
                themessage <- paste(themessage, " was", sep="")
              }
              themessage <- paste(themessage, "interpreted as missing.")
              warning(themessage)

        }
        return(y)
      }
      else return(x)
    }
  pheno <- data.frame(lapply(pheno, sw2numeric, dec=dec), stringsAsFactors=TRUE)

  n.str <- nrow(founder)

  if(!("cross" %in% names(pheno))) {
    warning("Need a phenotype named \"cross\"; assuming all come from the cross ",
            paste(LETTERS[1:n.str], collapse="x"))
    crosses <- matrix(1:n.str, ncol=n.str, nrow=nrow(gen), byrow=TRUE)
  }
  else {
    pheno$cross <- as.character(pheno$cross)
    if(any(nchar(pheno$cross) != n.str))
      stop("Mismatches in length of \"cross\" phenotype.")
    thecross <- matrix(unlist(strsplit(pheno$cross, "")), byrow=TRUE, ncol=n.str)
    crosses <- matrix(NA, ncol=n.str, nrow=nrow(gen))
    for(i in 1:nrow(thecross))
      crosses[i,] <- match(thecross[i,], LETTERS[1:n.str])
    if(any(is.na(crosses)))
      stop("Problems in the \"cross \" phenotype.")
  }

  # check founder data matches in dimension
  if(ncol(founder) != ncol(gen))
    stop("Different numbers of markers in RIL and founder files.")
  if(any(colnames(founder) != mnames)) {
    cnf <- colnames(founder)
    if(any(is.na(match(cnf, mnames))) || any(is.na(match(mnames, cnf))))
      stop("Mismatch in markers in RIL and founder files.")
    founder <- founder[,match(mnames, cnf),drop=FALSE]
  }

  wh <- which(is.na(gen))
  missingval <- min(as.numeric(c(gen, founder)), na.rm=TRUE)-1
  gen[wh] <- missingval
  founder[is.na(founder)] <- missingval
  d <- dim(gen)
  gen <-
    .C("R_reviseMWril",
       as.integer(d[1]),
       as.integer(d[2]),
       as.integer(n.str),
       as.integer(founder),
       gen=as.integer(gen),
       as.integer(crosses),
       as.integer(missingval),
       PACKAGE="qtl")$gen
  gen[wh] <- NA
  gen <- matrix(gen, nrow=d[1])
  gen[gen==0 | gen==(2^n.str-1)] <- NA
  

  # re-order the markers by chr and position
  # try to figure out the chr labels
  if(all(chr %in% c(1:999,"X","x"))) { # 1...19 + X
    tempchr <- chr
    tempchr[chr=="X" | chr=="x"] <- 1000
    tempchr <- as.numeric(tempchr)
    if(map.included) neworder <- order(tempchr, map)
    else neworder <- order(tempchr)

    chr <- chr[neworder]
    map <- map[neworder]
    gen <- gen[,neworder,drop=FALSE]
    mnames <- mnames[neworder]
  }
  
  # fix up dummy map
  if(!map.included) {
    map <- split(rep(0,length(chr)),chr)[unique(chr)]
    map <- unlist(lapply(map,function(a) seq(0,length=length(a),by=5)))
    names(map) <- NULL
  }

  # fix up map information
  # number of chromosomes
  uchr <- unique(chr)
  n.chr <- length(uchr)
  geno <- vector("list",n.chr)
  names(geno) <- uchr
  min.mar <- 1
  for(i in 1:n.chr) { # loop over chromosomes
    # create map
    temp.map <- map[chr==uchr[i]]
    names(temp.map) <- mnames[chr==uchr[i]]

    # pull out appropriate portion of genotype data
    data <- gen[,min.mar:(length(temp.map)+min.mar-1),drop=FALSE]
    min.mar <- min.mar + length(temp.map)
    colnames(data) <- names(temp.map)

    geno[[i]] <- list(data=data,map=temp.map)
    if(uchr[i] == "X" || uchr[i] == "x")
      class(geno[[i]]) <- "X"
    else 
      class(geno[[i]]) <- "A"
  }

  cross <- list(geno=geno,pheno=pheno)

  # check that data dimensions match
  n.mar1 <- sapply(geno,function(a) ncol(a$data))
  n.mar2 <- sapply(geno,function(a) length(a$map))
  n.phe <- ncol(pheno)
  n.ind1 <- nrow(pheno)
  n.ind2 <- sapply(geno,function(a) nrow(a$data))
  if(any(n.ind1 != n.ind2)) {
    cat(n.ind1,n.ind2,"\n")
    stop("Number of individuals in genotypes and phenotypes do not match.");
  }
  if(any(n.mar1 != n.mar2)) {
    cat(n.mar1,n.mar2,"\n")
    stop("Numbers of markers in genotypes and marker names files do not match.");
  }

  # print some information about the amount of data read
  cat(" --Read the following data:\n");
  cat("\t", n.ind1, " individuals\n");
  cat("\t", sum(n.mar1), " markers\n");
  cat("\t", n.phe, " phenotypes\n");

  if(all(is.na(gen)))
    warning("There is no genotype data!\n")

  cross$cross <- crosses

  # save founder genotypes in data
  founder[founder==missingval] <- NA
  ua <- apply(founder, 2, function(a) unique(a[!is.na(a)]))
  nua <- sapply(ua, length)
  if(all(nua <= 2)) { # re-code as 0/1
    for(i in 1:ncol(founder)) {
      if(nua[i]==1)
        founder[!is.na(founder[,i]),i] <- 0
      else if(nua[i]==2) {
        founder[!is.na(founder[,i]) & founder[,i]==ua[[i]][1],i] <- 0
        founder[!is.na(founder[,i]) & founder[,i]==ua[[i]][2],i] <- 1
      }
    }
  }
  cross$founderGeno <- founder

  class(cross) <- c(type, "cross")

  # check data
  summary(cross)

  cross
}


# end of readMWril.R
byandell/qtl documentation built on May 13, 2019, 9:28 a.m.