R/read_cross2.R

# read_cross2
#' Read QTL data from files
#'
#' Read QTL data from a set of files
#'
#' @param file Character string with path to the
#' \href{http://www.yaml.org}{YAML} file containing all of the control
#' information. This could instead be a zip file containing all of the
#' data files, in which case the contents are unzipped to a temporary
#' directory and then read.
#' @param quiet If \code{FALSE}, print progress messages.
#'
#' @return Object of class \code{"cross2"}. For details, see the
#' \href{http://kbroman.org/qtl2/assets/vignettes/developer_guide.html}{R/qtl2 developer guide}.
#'
#' @details A control file in \href{http://www.yaml.org}{YAML} format contains information about
#' basic parameters as well as the names of the series of data files
#' to be read. See the
#' \href{http://kbroman.org/qtl2/pages/sampledata.html}{sample data files}
#' and the
#' \href{http://kbroman.org/qtl2/assets/vignettes/input_files.html}{vignette describing the input file format}.
#'
#' @export
#' @importFrom magrittr "%>%"
#' @keywords IO
#' @seealso \code{\link{write_control_file}}, sample data files at \url{http://kbroman.org/qtl2/pages/sampledata.html}
#' @examples
#' \dontrun{
#' yaml_file <- "http://kbroman.org/qtl2/assets/sampledata/grav2/grav2.yaml"
#' grav2 <- read_cross2(yaml_file)
#' }
#' zip_file <- system.file("extdata", "grav2.zip", package="qtl2")
#' grav2 <- read_cross2(zip_file)
read_cross2 <-
function(file, quiet=TRUE)
{
    if(length(grep("\\.zip$", file)) > 0) { # zip file
        dir <- tempdir()
        if(is_web_file(file)) {
            tmpfile <- tempfile()
            if(!quiet) message(" - downloading ", file, "\n       to ", tmpfile)
            download.file(file, tmpfile, quiet=TRUE)
            file <- tmpfile
            on.exit(unlink(tmpfile))
        }

        if(!quiet) message(" - unzipping ", file, "\n       to ", dir)
        file <- path.expand(file)
        stop_if_no_file(file)
        unzipped_files <- utils::unzip(file, exdir=dir)
        file <- unzipped_files[grep("\\.yaml$", unzipped_files)]

        on.exit({ # clean up when done
            if(!quiet) message(" - cleaning up")
            unlink(unzipped_files)
        }, add=TRUE)
    }

    # directory containing the data
    file <- path.expand(file)
    dir <- dirname(file)

    # load the control file
    stop_if_no_file(file)
    control <-  yaml::yaml.load_file(file)

    # grab cross type
    if("crosstype" %in% names(control))
        output <- list(crosstype=control$crosstype)
    else
        stop("crosstype not found")

    # grab genotype encodings or use defaults
    if("genotypes" %in% names(control))
        genotypes <- control$genotypes
    else
        genotypes <- list(A=1, H=2, B=3, D=4, C=5)

    # grab the major bits
    sections <- c("geno", "gmap", "pmap", "pheno", "covar", "phenocovar", "founder_geno")
    for(section in sections) {
        if(section %in% names(control)) {
            if(!quiet) message(" - reading ", section)
            file <- file.path(dir, control[[section]])
            stop_if_no_file(file)
            sheet <- read_csv(file, na.strings=control$na.strings, sep=control$sep)

            # change genotype codes and convert phenotypes to numeric matrix
            if(section=="geno" || section=="founder_geno") {
                if(!quiet) message(" - encoding ", section)
                sheet <- encode_geno(sheet, genotypes)
            }
            else if(section=="pheno")
                sheet <- pheno2matrix(sheet)

            output[[section]] <- sheet
        }
    }

    # pull out a map
    if("gmap" %in% names(output))
        map <- output$gmap
    else if("pmap" %in% output)
        map <- output$pmap
    else stop("Need a genetic or physical marker map")

    if(!("geno" %in% names(output)))
       stop("No genotype data found.")

    # split genotypes by chromosome
    geno <- c("geno", "founder_geno")
    for(section in geno) {
        if(section %in% names(output)) {
            if(!quiet) message(" - splitting up ", section)
            output[[section]] <- split_geno(output[[section]], map)
        }
    }

    # split maps by chr
    maps <- c("gmap", "pmap")
    for(section in maps) {
        if(section %in% names(output)) {
            if(!quiet) message(" - splitting up ", section)
            output[[section]] <- split_map(output[[section]])
        }
    }

    # X chr?
    chr <- names(output$geno)
    output$is_x_chr <- rep(FALSE, length(chr))
    names(output$is_x_chr) <- chr
    if("x_chr" %in% names(control)) {
        x_chr <- control$x_chr # name of X chromosome
        output$is_x_chr[x_chr] <- TRUE
    }

    # sex
    output$is_female <- convert_sex(control$sex, output$covar, control$sep, dir, quiet=quiet)
    if(is.null(output$is_female)) { # missing; assume all FALSE
        output$is_female <- rep(FALSE, nrow(output$geno[[1]]))
        names(output$is_female) <- rownames(output$geno[[1]])
    }

    # cross_info
    output$cross_info <- convert_cross_info(control$cross_info, output$covar, control$sep, dir, quiet=quiet)
    if(is.null(output$cross_info)) { # missing; make a 0-column matrix
        output$cross_info <- matrix(0L, ncol=0, nrow=nrow(output$geno[[1]]))
        rownames(output$cross_info) <- rownames(output$geno[[1]])
    }

    # line map (mapping of individuals to lines)
    output$linemap <- convert_linemap(control$linemap, output$covar, control$sep, dir, quiet=quiet)

    # alleles?
    if("alleles" %in% names(control))
        output$alleles <- control$alleles

    class(output) <- "cross2"
    output
}

# convert genotype data, using genotype encodings
# genotypes is a list with names = code in data
#                     and values = new numeric code
encode_geno <-
function(geno, genotypes)
{
    newgeno <- geno <- as.matrix(geno)

    if(any(unlist(genotypes)==0))
        stop("Can't encode genotypes as 0, that's used for missing values.")

    # genotype codes
    gnames <- names(genotypes)
    codes <- genotypes

    # any mismatches?
    gch <- as.character(geno)
    mismatch <- !is.na(gch) & is.na(match(gch,gnames))
    if(any(mismatch)) {
        warning(sum(mismatch), " genotypes treated as missing: ",
                paste0('"', unique(gch[mismatch]), '"', collapse=", "))
        newgeno[mismatch] <- NA
    }

    # re-code
    for(g in gnames)
        newgeno[!is.na(geno) & geno==g] <- genotypes[[g]]

    # turn missing values to 0s
    newgeno[is.na(newgeno)] <- "0"

    #
    storage.mode(newgeno) <- "integer"

    newgeno
}

# convert a phenotype data.frame to a matrix of doubles
pheno2matrix <-
function(pheno)
{
    pheno <- as.matrix(pheno)
    storage.mode(pheno) <- "double"

    pheno
}

# make the first column of a data frame the row names
firstcol2rownames <-
function(mat, name="")
{
    # assume the first column is the ID
    idcolumn <- mat[,1]

    # look for duplicates
    check4duplicates(idcolumn, name)

    # drop that column and make it the row names
    mat <- mat[,-1,drop=FALSE]
    rownames(mat) <- as.character(idcolumn)

    mat
}

# check a list of IDs for duplicates
check4duplicates <-
function(ids, name="")
{
    if(name != "") name <- paste("in", name)
    dup <- duplicated(ids)
    if(any(dup)) {
        stop("Not all ids in ", name, " are unique: ",
             paste0('"', unique(ids[dup]), '"', collapse=", "))
    }
    TRUE
}

# split genotype data into list of chromosomes
split_geno <-
function(geno, map)
{
    gmark <- colnames(geno)
    mmark <- rownames(map)

    m <- match(gmark, mmark)
    if(any(is.na(m)))
        stop("Some markers in genotype data not found in map: ",
             paste0('"', gmark[is.na(m)], '"', collapse=", "))

    chr <- map[match(colnames(geno), rownames(map)), 1]
    pos <- map[match(colnames(geno), rownames(map)), 2]

    uchr <- unique(chr)
    newgeno <- vector("list", length(uchr))
    names(newgeno) <- uchr
    for(i in uchr) {
        wh <- which(chr==i)
        o <- order(pos[chr==i])
        newgeno[[i]] <- geno[,wh[o], drop=FALSE]
    }

    newgeno
}

# split a data frame with chr, pos (and marker names as rows)
#   into a list of chromosomes
split_map <-
function(map)
{
    pos <- map[,2]
    chr <- map[,1]
    uchr <- unique(chr)

    names(pos) <- rownames(map)

    lapply(split(pos, factor(chr, uchr)), sort)
}

# grab sex information
convert_sex <-
function(sex_control, covar, sep, dir, quiet=TRUE)
{
    if(is.null(sex_control)) return(NULL)

    if("covar" %in% names(sex_control)) { # sex within the covariates
        sex <- covar[,sex_control$covar, drop=FALSE]
    }
    else if("file" %in% names(sex_control)) { # look for file
        if(!quiet) message(" - reading sex")
        file <- file.path(dir, sex_control$file)
        stop_if_no_file(file)
        sex <- read_csv(file, sep=sep, na.strings=NULL)
    }
    else return(NULL)

    # convert to vector
    id <- rownames(sex)
    sex <- sex[,1]
    names(sex) <- id

    # missing values?
    if(any(is.na(sex))) {
        stop(sum(is.na(sex)), " missing sexes (sex can't be missing).")
    }

    # grab the rest of sex_control as conversion codes
    codes <- sex_control[is.na(match(names(sex_control), c("file", "covar")))]
    codes <- convert_sexcodes(codes) # convert to 0 and 1
    sexcode <- names(codes)

    if(length(codes)==0) { # no codes, pass over as is
        storage.mode(sex) <- "integer"
        return(sex==0)
    }

    # any mismatches?
    sexch <- as.character(sex)
    mismatch <- !is.na(sexch) & is.na(match(sexch,sexcode))
    if(any(mismatch)) {
        stop(sum(mismatch), " sexes don't match the codes (sex can't be missing): ",
                paste0('"', unique(sexch[mismatch]), '"', collapse=", "))
    }

    # re-code
    newsex <- sex
    for(code in sexcode)
        newsex[sex==code] <- codes[[code]]

    storage.mode(newsex) <- "integer"

    (newsex==0)
}

convert_sexcodes <-
function(codes)
{
    result <- codes %>%
        tolower() %>%
        substr(1,1) %>%
        match(c("f", "m")) - 1

    names(result) <- names(codes)
    result
}


# grab cross_info
convert_cross_info <-
function(cross_info_control, covar, sep, dir, quiet=TRUE)
{
    if(is.null(cross_info_control)) return(NULL)

    if(!is.list(cross_info_control)) { # provided file name directly?
        if(file.exists(file.path(dir, cross_info_control)))
            cross_info_control <- list(file=cross_info_control)
    }

    if("file" %in% names(cross_info_control)) { # look for file
        if(!quiet) message(" - reading cross_info")

        file <- file.path(dir, cross_info_control$file)
        stop_if_no_file(file)
        cross_info <- read_csv(file, sep=sep)

        if(any(is.na(cross_info)))
            stop(sum(is.na(cross_info)), " missing values in cross_info (cross_info can't be missing.")
    }
    else if("covar" %in% names(cross_info_control)) { # cross_info within the covariates
        cross_info <- covar[,cross_info_control$covar, drop=FALSE]
    }
    else return(NULL)

    # make it a matrix
    cross_info <- as.matrix(cross_info)

    if(any(is.na(cross_info))) {
        stop(sum(is.na(cross_info)), " missing values in cross_info (cross_info can't be missing).")
    }

    # grab the rest of cross_info_control as conversion codes
    codes <- cross_info_control[is.na(match(names(cross_info_control), c("file", "covar")))]
    cicode <- names(codes)

    if(length(codes)==0) { # no codes, pass over as is
        storage.mode(cross_info) <- "integer"
        return(cross_info)
    }

    # any mismatches?
    cich <- as.character(cross_info)
    mismatch <- !is.na(cich) & is.na(match(cich,cicode))
    if(any(mismatch)) {
        stop(sum(mismatch), " cross_info vals don't match the codes (cross_info can't be missing): ",
                paste0('"', unique(cich[mismatch]), '"', collapse=", "))
    }
    # re-code
    newci <- cross_info
    for(code in cicode)
        newci[cross_info==code] <- codes[[code]]

    storage.mode(newci) <- "integer"

    newci
}

# grab linemap information
convert_linemap <-
function(linemap_control, covar, sep, dir, quiet=TRUE)
{
    if(is.null(linemap_control)) return(NULL)

    if(!is.list(linemap_control)) { # provided directly
        if(file.exists(file.path(dir, linemap_control))) # is it a file name?
            linemap_control <- list(file=linemap_control)
        else # assumed to be a covariate column
            linemap_control <- list(covar=linemap_control)
    }

    # see if it's a file
    if("file" %in% names(linemap_control)) {
        if(!quiet) message(" - reading linemap")
        filename <- file.path(dir, linemap_control$file)
        linemap <- read_csv(filename, sep=sep, na.strings=NULL)
    }
    else if("covar" %in% names(linemap_control)) { # column name in the covariate data
        linemap <- covar[,linemap_control$covar, drop=FALSE]
    }
    else { # can't figure it out; just ignore it
        return(NULL)
    }

    # convert to vector
    id <- rownames(linemap)
    linemap <- linemap[,1]
    names(linemap) <- id

    # missing values?
    if(any(is.na(linemap)))
        stop(sum(is.na(linemap)), " missing linemapes (linemap can't be missing).")

    linemap
}

# is file on web (starts with http://, https://, or file://)
is_web_file <-
function(file)
{
    patterns <- c("^http://", "^https://", "^file://")
    vapply(patterns, function(a,b) length(grep(a, b)), 0, file) %>% sum %>% ">"(0)
}

stop_if_no_file <-
function(filename)
{
    if(is_web_file(filename)) return(TRUE)
    if(!file.exists(filename))
        stop('file "', filename, '" does not exist.')
}

# read a csv file
read_csv <-
function(filename, sep=",", na.strings=c("NA", "-"))
{
    x <- data.table::fread(filename, na.strings=na.strings, sep=sep, header=TRUE,
                           verbose=FALSE, showProgress=FALSE, data.table=FALSE)

    firstcol2rownames(x)
}
simecek/qtl2 documentation built on May 29, 2019, 10:01 p.m.