R/csv.R

Defines functions writePhenoCSV writeMapframeCSV writeMapCSV writeGenoCSV writeCrossCSV sniffCSV recodeCSV readPhenoCSV readMapframeCSV readMapCSV readGenoCSV readCrossCSV readCovarCSV hasMapCSV getMetadataCSV

Documented in getMetadataCSV hasMapCSV readCovarCSV readCrossCSV readGenoCSV readMapCSV readMapframeCSV readPhenoCSV recodeCSV sniffCSV writeCrossCSV writeGenoCSV writeMapCSV writeMapframeCSV writePhenoCSV

# Start of csv.R ###############################################################

# getMetadataCSV ---------------------------------------------------------------
#' Get parameters of \pkg{R/qtl} input data.
#' 
#' Given the path to an \pkg{R/qtl} input CSV file, or a \code{data.frame}
#' loaded from such a file, this function determines the parameters of that
#' data (e.g. phenotype columns).
#' 
#' @details The returned list includes information on the following parameters:
#' 
#' \itemize{
#'  \item{\code{pheno.cols}}{Indices of phenotype columns,
#'  or \code{NULL} if none found.}
#'  \item{\code{id.col}}{Index of ID column, or \code{NA} if not found.}
#'  \item{\code{geno.cols}}{Indices of genotype columns,
#'  or \code{NULL} if none found.}
#'  \item{\code{dat.rows}}{Row indices of data.}
#'  \item{\code{map.present}}{\code{TRUE} if the data contains a map;
#'  \code{FALSE} otherwise.}
#'  \item{\code{class}}{Expected class of the input data.
#'  This is \code{'unknown'} by default.}
#' }
#' 
#' This information can then guide further processing of the input data.
#' 
#' @param x An \pkg{R/qtl} input CSV file or
#' an \pkg{R/qtl} input \code{data.frame}.
#'
#' @return A list containing parameters of \pkg{R/qtl} input data.
#' 
#' @keywords internal
#' @rdname getMetadataCSV
getMetadataCSV <- function(x) {
    
    if ( is.data.frame(x) ) {
        
        data.class <- 'unknown'
        
        if ( nrow(x) < 2 ) {
            stop("data not found")
        }
        
        if ( anyNA(x[1, ]) ) {
            stop("invalid headings found")
        }
        
        # Get logical vector indicating which columns are blank
        # in the first row after the initial heading row.
        seq.blanks <- ! is.na(x[2, ]) & x[2, ] == ''
        
        # If third row present, get indices of columns with blanks
        # (where the optional genetic map data is placed)..
        if ( nrow(x) >= 3 ) {
            map.blanks <- ! is.na(x[3, ]) & x[3, ] == ''
        } else { # ..otherwise assume no map data.
            map.blanks <- rep.int(FALSE, ncol(x))
        }
        
        # Verify genetic map correctly formatted, if present.
        if ( any(map.blanks) && ( length(map.blanks) != length(seq.blanks) ||
            any(map.blanks != seq.blanks) ) ) {
            stop("map data row is invalid")
        }
        
        # Set phenotype columns from those with blank sequence row.
        # This includes the ID column for now.
        pheno.cols <- which(seq.blanks)
        
        # Get index of 'id' columns.
        id.col <- which(tolower(x[1, ]) == 'id')
        
        if ( length(id.col) > 1 ) {
            stop("multiple ID columns found")
        }
        
        # If there are phenotype columns, this is either cross or geno data..
        if ( length(pheno.cols) > 0 ) {
            
            if ( pheno.cols[1] != 1 || any(diff(pheno.cols) != 1) ) {
                stop("sequence data row is invalid")
            }
            
            if ( length(id.col) > 0 ) {
                
                if ( ! id.col %in% pheno.cols ) {
                    stop("ID column not in phenotype columns")
                }
                
                pheno.cols <- pheno.cols[-id.col]
            }
            
            geno.cols <- which( ! seq.blanks )
            
            if ( length(geno.cols) == 0 ) {
                stop("genotype data not found")
            }
            
            map.present <- any(map.blanks)
            head.rows <- if (map.present) {1:3} else {1:2}
            
            if ( length(pheno.cols) == 0 ) {
                
                if ( length(id.col) == 0 ) {
                    stop("ID column not found")
                }
                
                data.class <- 'geno'
                
            } else {
                
                data.class <- 'cross'
            }
            
        } else { # ..otherwise it's either map or pheno data.
            
            if ( length(id.col) == 0 ) {
                stop("ID column not found")
            }
            
            geno.cols <- NULL
            head.rows <- 1
            
            seq.col <- which( x[1, ] == 'chr' )
            pos.col <- which(grepl(const$pattern$poscol, x[1, ], ignore.case=TRUE))
            
            if ( length(seq.col) == 1 && length(pos.col) == 1 &&
                 id.col == 1 && seq.col == 2 && pos.col == 3 ) {
                
                data.class <- 'map'
                map.present <- TRUE
                
            } else {
                
                data.class <- 'pheno'
                map.present <- FALSE
                pheno.cols <- getColIndices(x)[-id.col] # every column (except ID column) is a pheno column
            }
        }
        
        first.data.row <- length(head.rows) + 1
        last.data.row <- nrow(x)
        
        if ( last.data.row < first.data.row ) {
            stop("data not found")
        }
        
        dat.rows <- first.data.row : last.data.row
        
        if ( length(pheno.cols) == 0 ) {
            pheno.cols <- NULL
        }
        
        if ( length(id.col) == 0 ) {
            id.col <- NA
        }
        
        params <- list(
            pheno.cols  = pheno.cols,
            id.col      = id.col,
            geno.cols   = geno.cols,
            dat.rows    = dat.rows,
            map.present = map.present,
            class       = data.class
        )
        
    } else if ( isSingleString(x) ) {
        
        stopifnot( file.exists(x) )
        
        x <- utils::read.csv(x, header=FALSE, nrows=4, check.names=FALSE,
            quote='', na.strings=const$missing.value, colClasses='character',
            strip.white=TRUE, stringsAsFactors=FALSE)
        
        x <- bstripBlankRows( rstripBlankCols(x) )
        
        params <- getMetadataCSV(x)
        
    } else {
        
        stop("cannot get CSV metadata from object of class - '", class(x), "'")
    }
    
    return(params)
}

# hasMapCSV --------------------------------------------------------------------
#' Test if CSV file contains map data.
#' 
#' @param infile Input CSV file path.
#'
#' @return \code{TRUE} if CSV file contains map data in a known format;
#' \code{FALSE} otherwise.
#'
#' @export
#' @family CSV functions
#' @family map utility functions
#' @rdname hasMapCSV
hasMapCSV <- function(infile) {
    
    stopifnot( isSingleString(infile) )
    stopifnot( file.exists(infile) )
    
    status <- tryCatch({
        params <- getMetadataCSV(infile)
        result <- params$map.present
    }, error=function(e) {
        result <- FALSE
    })
    
    return(status)
}

# readCovarCSV -----------------------------------------------------------------
#' Read covariate matrix from a CSV file.
#' 
#' This function reads a table of covariates (with optional \code{'ID'} column)
#' from a CSV file and returns a numeric \code{matrix} of covariate data. Sample
#' IDs are not included in the returned matrix, as \pkg{R/qtl} matches the
#' covariate matrix row-by-row to the sample rows of the \code{cross} object.
#' The contents of the input table do not need to be numeric, but they will be
#' converted to numeric values when being read from file. For example, columns
#' containing character values are read as factors, which are converted to
#' their numeric representation.
#' 
#' If a \code{cross} object is specified, the covariate matrix is matched
#' sample-by-sample to the given \code{cross}; matching per-sample requires
#' that sample IDs are included in both the covariate file and the given
#' \code{cross} object.
#' 
#' @param infile Input CSV file path.
#' @param cross An \pkg{R/qtl} \code{cross} object.
#' 
#' @return A numeric \code{matrix} of covariate data.
#' 
#' @export
#' @family CSV functions
#' @importFrom utils read.csv
#' @importFrom utils type.convert
#' @rdname readCovarCSV
readCovarCSV <- function(infile, cross=NULL) {
    
    stopifnot( isSingleString(infile) )
    stopifnot( file.exists(infile) )
    
    # Read covariate data as character data-frame from CSV file.
    covar.table <- utils::read.csv(infile, header=TRUE, check.names=FALSE,
        quote='', na.strings=const$missing.value, colClasses='character',
        strip.white=TRUE, stringsAsFactors=TRUE)
    stopif( anyDuplicated( colnames(covar.table) ) )
    
    # Trim any blank rows/columns from the bottom/right, respectively.
    covar.table <- bstripBlankRows( rstripBlankCols(covar.table) )
    
    # Get 'id' column of covariate table, if present.
    covar.id.col <- getIdColIndex(covar.table)
    
    # Get copy of covariate table with any duplicate rows removed.
    # NB: If each sample has consistent covariate data, this
    # should also deduplicate the ID column, if present.
    dedup.covar <- covar.table[ ! duplicated(covar.table), ]
    
    # If covariate table has an ID column, get vector of sample IDs, then
    # check for duplicates that would indicate inconsistent covariate data.
    if ( ! is.null(covar.id.col) ) {
        
        covar.ids <- as.character(dedup.covar[, covar.id.col])
        
        err.ids <- covar.ids[ duplicated(covar.ids) ]
        if ( length(err.ids) > 0 ) {
            stop("inconsistent covariate data for samples - '", toString(err.ids), "'")
        }
    }
    
    # If a cross object specified, try to match covariate data to cross.
    if ( ! is.null(cross) ) {
        
        stopifnot( 'cross' %in% class(cross) )
        
        matchable = FALSE
        
        cross.id.col <- getIdColIndex(cross)
        
        # Check: cross object has sample IDs.
        if ( ! is.null(cross.id.col) ) {
            
            # Get cross IDs.
            cross.ids <- as.character(cross$pheno[, cross.id.col])
            
            # Define new covariate table.
            covar.table <- data.frame( matrix( NA_character_,
                nrow = length(cross.ids), ncol = ncol(covar.table) ),
                stringsAsFactors = FALSE )
            colnames(covar.table) <- colnames(dedup.covar)
            
            # Check: covariate table has a sample ID column.
            if ( ! is.null(covar.id.col) ) {
                
                # Check: covariate sample IDs are a superset of the cross sample IDs.
                if ( all( cross.ids %in% covar.ids ) ) {
                    
                    matchable = TRUE
                    
                    for ( i in seq_along(cross.ids) ) {
                        j = which(covar.ids == cross.ids[i])
                        covar.table[i, ] <- unlist(dedup.covar[j, ])
                    }
                }
            }
        }
  
        # If 'matchable' flag equals to false, then check whether
        # have the same number between cross ID and covariate ID.
        if ( ! matchable && qtl::nind(cross) != nrow(covar.table) ) {
            stop("number of rows in covariate table (", nrow(covar.table),
                ") does not match number of cross samples (", qtl::nind(cross), ")")
        }
    }
    
    # Remove 'id' column from covariate table, if present.
    if ( ! is.null(covar.id.col) ) {
        covar.table <- deleteColumn(covar.table, col.index=covar.id.col)
    }
    
    # Convert columns of covariate data-frame
    # from character to appropriate datatypes.
    covar.table <- do.call(cbind.data.frame,
        lapply(covar.table, utils::type.convert))
    stopifnot( all( sapply(covar.table, class) %in%
        c('double', 'factor', 'integer', 'logical', 'numeric') ) )
    
    # Convert covariate table to numeric matrix.
    # NB: converts factors to their numeric encoding.
    covar.matrix <- data.matrix(covar.table, rownames.force=FALSE)
    
    return(covar.matrix)
}

# readCrossCSV -----------------------------------------------------------------
#' Read yeast \code{cross} from a CSV file.
#' 
#' This function reads yeast cross data from an \pkg{R/qtl} CSV file and
#' returns an \pkg{R/qtl} \code{cross} object. The returned \code{cross}
#' has an attribute \code{'info'} of type \code{\linkS4class{CrossInfo}},
#' which contains information about the input \code{cross}.
#' 
#' If a genetic map is not present in the input file, this is estimated when
#' loading the cross. In such cases, parameters \code{error.prob} and
#' \code{map.function} are passed to \pkg{R/qtl} for map estimation.
#' 
#' By default, any genetic map positions must include map units (i.e.
#' \code{'cM'}). To read a cross without requiring map units, set
#' parameter \code{require.mapunit} to \code{FALSE}. If map units
#' are not required and not found, map positions are assumed to be
#' in centiMorgans.
#' 
#' @param infile Input CSV file path.
#' @param error.prob Genotyping error rate.
#' @param map.function Genetic map function.
#' @param require.mapunit Require map unit information in map positions.
#'  
#' @return An \pkg{R/qtl} \code{cross} object with an attribute
#' \code{'info'} of type \code{\linkS4class{CrossInfo}}.
#' 
#' @export
#' @family CSV functions
#' @family cross object functions
#' @importFrom methods new
#' @importFrom stats na.omit
#' @importFrom utils read.csv
#' @importFrom utils write.table
#' @rdname readCrossCSV
readCrossCSV <- function(infile, error.prob=0.0001,
    map.function=c('haldane', 'kosambi', 'c-f', 'morgan'),
    require.mapunit=TRUE) {
    
    crosstype <- 'haploid' # TODO: support other cross types
    
    stopifnot( isSingleString(infile) )
    stopifnot( file.exists(infile) )
    error.prob <- as.numeric(error.prob)
    stopifnot( isSingleProbability(error.prob) )
    stopifnot( isBOOL(require.mapunit) )
    
    map.function <- match.arg(map.function)
    
    # Read cross input data as CSV file.
    cross.table <- utils::read.csv(infile, header=FALSE, check.names=FALSE,
        quote='', na.strings=const$missing.value, colClasses='character',
        strip.white=TRUE, stringsAsFactors=FALSE)
    
    # Trim any blank rows/columns from the bottom/right, respectively.
    cross.table <- bstripBlankRows( rstripBlankCols(cross.table) )
    
    # Get logical vector indicating which columns are blank 
    # in the first row after the initial heading row.
    seq.is.blank <- cross.table[2, ] == ''
    
    # Set phenotype columns from those with blank sequence row.   
    pheno.cols <- which( seq.is.blank )
    stopifnot( length(pheno.cols) > 0 )
    
    # Set marker columns from those with nonempty sequence row.   
    geno.cols <- which( ! seq.is.blank )
    stopifnot( length(geno.cols) > 0 )
    
    # Verify that phenotype columns form a contiguous block at left of table.
    if ( pheno.cols[1] != 1 || any(diff(pheno.cols) != 1) ) {
        stop("sequence data row is invalid")
    } 
    
    # Get indices of columns with blanks in second row after initial  
    # headings (where the optional genetic map data is placed).
    map.blanks <- which( cross.table[3, ] == '' )
    
    # Check whether genetic map data appears to be present.
    map.present <- length(map.blanks) > 0
    
    # Verify genetic map correctly formatted, if present.
    if ( map.present && ! all(map.blanks == pheno.cols) ) {
        stop("map data row is invalid")
    }
    
    # Get indices of initial rows.
    head.rows <- if (map.present) {1:3} else {1:2}
    
    # Get offset of main body of cross data.
    dat.offset = length(head.rows)
    
    # Get index of first and last data rows.
    first.data.row <- dat.offset + 1
    last.data.row <- nrow(cross.table)
    
    stopifnot( last.data.row >= first.data.row )
    
    # Get vector of data row indices.
    dat.rows <- first.data.row : last.data.row
    
    # Get vector of non-missing values in phenotype data.
    pheno.values <- stats::na.omit( unlist(cross.table[dat.rows, pheno.cols]) )
    
    # Verify that there are no blank phenotype values.
    if ( any( pheno.values == '' ) ) {
        stop("blank phenotype values found")
    }
    
    # Get set of unique symbols used in marker genotype data.
    genotypes <- sort( unique( as.character( # NB: sort removes NA values
        unlist(cross.table[dat.rows, geno.cols]) ) ) )
    
    # Get allele symbols from characters in genotype symbols.
    alleles <- makeAlleleSet(genotypes) # NB: validates genotypes
    
    # Get phenotype names.
    phenotypes <- as.character(cross.table[1, pheno.cols])
    
    # Get index of phenotype columns with R/qtl special heading 'id'. If present,
    # this contains identifiers of sampled individuals. Search is case-insensitive.
    id.col <- which( tolower(phenotypes) == 'id' )
    
    if ( length(id.col) > 0 ) {
        # If 'id' column present, set vector of sample IDs 
        # and remove column from phenotype column indices..
        stopifnot( length(id.col) == 1 )
        samples <- cross.table[dat.rows, id.col]
        pheno.cols <- pheno.cols[-id.col]
        phenotypes <- as.character(cross.table[1, pheno.cols])
    
    } else {
        # ..otherwise set vector of sample indices.
        samples <- seq_along(dat.rows)
    }
    
    # Get locus info.
    locus.ids <- as.character(cross.table[1, geno.cols])
    locus.seqs <- as.character(cross.table[2, geno.cols])
    
    # Create CrossInfo object.
    cross.info <- methods::new('CrossInfo')
    cross.info <- setMarkers(cross.info, markers=locus.ids)
    cross.info <- setMarkerSeqs(cross.info, sequences=locus.seqs)
    cross.info <- setPhenotypes(cross.info, phenotypes)
    cross.info <- setAlleles(cross.info, alleles)
    cross.info <- setGenotypes(cross.info, genotypes)
    cross.info <- setCrosstype(cross.info, crosstype)
    cross.info <- setSamples(cross.info, samples)
    
    # Get normalised marker sequences, replace these in original table.
    locus.seqs <- getMarkerSeqs(cross.info)
    cross.table[2, geno.cols] <- locus.seqs
    
    # Set sorted sequences in CrossInfo object.
    sequences <- sortSeq( unique(locus.seqs) )
    cross.info <- setSequences(cross.info, sequences)
    
    # If map is present, check for map unit info.
    if (map.present) {
        
        # Get cross map positions.
        map.pos <- as.character(cross.table[3, geno.cols])
        
        # Get map unit from map positions.
        map.unit <- getMapUnitSuffix(map.pos)
        
        if ( ! is.na(map.unit) ) {
            if ( map.unit != 'cM' ) {
                stop("cross map positions must be in centiMorgans (e.g. '47 cM')")
            }
        } else {
            if (require.mapunit) {
                stop("cross map positions must include map units (e.g. '47 cM')")
            }
        }
        
        # Strip map unit information.
        map.pos <- setPosColDataMapUnit(map.pos, NULL)
        
        # Replace original map positions.
        cross.table[3, geno.cols] <- map.pos
    }
    
    # Create temp file for adjusted cross data.
    temp.file <- tempfile(fileext='csv')
    utils::write.table(cross.table, file=temp.file, na='', sep=',',
        quote=FALSE, row.names=FALSE, col.names=FALSE)
    
    # Read adjusted cross data.
    cross <- qtl::read.cross('csv', '', temp.file, genotypes=genotypes, 
        alleles=alleles, na.strings=const$missing.value, error.prob=error.prob,
        map.function=map.function, crosstype=crosstype)
    
    # Infer strain indices..keep if there are replicate samples.
    strain.indices <- inferStrainIndices(cross)
    if ( max( table(strain.indices) ) > 1 ) {
        cross.info <- setStrainIndices(cross.info, strains=strain.indices)
    }
    
    # Infer tetrad indices..keep if samples are tetradic.
    tetrad.indices <- inferTetradIndices(cross)
    if ( ! is.null(tetrad.indices) ) {
        cross.info <- setTetradIndices(cross.info, tetrads=tetrad.indices)
    }
    
    attr(cross, 'info') <- cross.info
    
    return(cross)
}

# readGenoCSV ------------------------------------------------------------------
#' Read yeast genotype data from a CSV file.
#' 
#' This function reads yeast cross genotype data from an \pkg{R/qtl} CSV file
#' and returns a \code{geno} object. The returned \code{geno} object has an
#' attribute \code{'info'} of type \code{\linkS4class{CrossInfo}}. This
#' \code{\linkS4class{CrossInfo}} object contains information about the
#' genotype data, including sample IDs, which must be present in the
#' input genotype data file.
#' 
#' By default, any genetic map positions must include map units (i.e.
#' \code{'cM'}). To read a genotype file without requiring map units,
#' set parameter \code{require.mapunit} to \code{FALSE}. If map units
#' are not required and not found, map positions are assumed to be in
#' centiMorgans.
#' 
#' @param infile Input CSV file path.
#' @param require.mapunit Require map unit information in map positions.
#' 
#' @export
#' @family CSV functions
#' @importFrom utils read.csv
#' @rdname readGenoCSV
readGenoCSV <- function(infile, require.mapunit=TRUE) {
    
    stopifnot( isSingleString(infile) )
    
    # Read genotype input data as CSV file.
    geno.table <- utils::read.csv(infile, header=FALSE, check.names=FALSE,
        quote='', na.strings=const$missing.value, colClasses='character',
        strip.white=TRUE, stringsAsFactors=FALSE)
    
    # Trim any blank rows/columns from the bottom/right, respectively.
    geno.table <- bstripBlankRows( rstripBlankCols(geno.table) )
    
    if ( nrow(geno.table) < 3 || ncol(geno.table) < 2 ) {
        stop("invalid genotype data file")
    }
    
    # Make geno table column names from first row of input table.
    colnames(geno.table) <- make.names(geno.table[1, ])
    
    # Check for ID heading in first column.
    id.col <- getIdColIndex(geno.table)
    if ( is.null(id.col) || id.col != 1 ) {
        stop("ID column not found in genotype data file - '", infile, "'")
    }
    
    # Check for sample IDs - required in genotype file.
    head.rows <- if (geno.table[3, id.col] == '') {1:3} else {1:2}
    dat.rows <- getRowIndices(geno.table)[-head.rows]
    if ( any(geno.table[dat.rows, id.col] == '') ) {
        stop("ID column is incomplete in genotype data file - '", infile, "'")
    }
    
    # Normalise sequence IDs.
    geno.table[2, -id.col] <- normSeq( as.character(geno.table[2, -id.col]) )
    
    return( as.geno(geno.table, require.mapunit=require.mapunit) )
}

# readMapCSV -------------------------------------------------------------------
#' Read \code{map} from a CSV file.
#' 
#' This function reads a yeast cross genetic or physical map from any
#' \pkg{R/qtl} CSV file that contains map data, whether that file
#' contains a map table, genotype data, or a full cross.
#' 
#' By default, the input map must include map unit information (e.g.
#' \code{'cM'}, \code{'bp'}). In the case of a map table file, this information
#' can be indicated in the map headings (e.g. \code{'pos (cM)'}) or in the map
#' positions (e.g. \code{'47 cM'}). For a cross or genotype file, map units
#' should be included with map positions.
#' 
#' To read an input file without requiring map units, set parameter
#' \code{require.mapunit} to \code{FALSE}. If map units are not
#' required and not found, map positions are assumed to be in
#' centiMorgans.
#' 
#' @param infile Input CSV file path.
#' @param require.mapunit Require map unit information.
#'     
#' @return An \pkg{R/qtl} \code{map} object.
#' 
#' @export
#' @family CSV functions
#' @family map utility functions
#' @rdname readMapCSV
readMapCSV <- function(infile, require.mapunit=TRUE) {
    
    stopifnot( isBOOL(require.mapunit) )
    
    params <- getMetadataCSV(infile)
    
    if ( ! params$map.present ) {
        stop("no map data found in file - '", infile, "'")
    }
    
    if ( params$class == 'cross' ) {
        
        cross <- readCrossCSV(infile, require.mapunit=require.mapunit)
        cross.map <- qtl::pull.map(cross)
        
    } else if ( params$class == 'geno' ) {
        
        geno <- readGenoCSV(infile, require.mapunit=require.mapunit)
        cross.map <- pullMap(geno)
        
    } else if ( params$class == 'map' ) {
        
        cross.map <- as.map( readMapframeCSV(infile,
            require.mapunit=require.mapunit) )
        
    } else {
        
        stop("cannot read map from ", params$class, " data file - '", infile, "'")
    }
    
    return(cross.map)
}

# readMapframeCSV --------------------------------------------------------------
#' Read \code{mapframe} from a CSV file.
#' 
#' This function reads a yeast cross genetic or physical mapframe from any
#' \pkg{R/qtl} CSV file that contains map data, whether that file contains
#' a map table, genotype data, or a full cross.
#' 
#' By default, the input mapframe must include map unit information (e.g.
#' \code{'cM'}, \code{'bp'}). In the case of a map table file, this information
#' can be indicated in the map headings (e.g. \code{'pos (cM)'}) or in the map
#' positions (e.g. \code{'47 cM'}). For a cross or genotype file, map units
#' should be included with map positions.
#' 
#' To read an input file without requiring map units, set parameter
#' \code{require.mapunit} to \code{FALSE}. If map units are not
#' required and not found, map positions are assumed to be in
#' centiMorgans.
#' 
#' @param infile Input CSV file path.
#' @param require.mapunit Require map unit information.
#'     
#' @return A \code{mapframe} object.
#' 
#' @export
#' @family CSV functions
#' @family map utility functions
#' @importFrom utils read.csv
#' @rdname readMapframeCSV
readMapframeCSV <- function(infile, require.mapunit=TRUE) {
    
    stopifnot( isBOOL(require.mapunit) )
    
    params <- getMetadataCSV(infile)
    
    if ( ! params$map.present ) {
        stop("no map data found in file - '", infile, "'")
    }
    
    if ( params$class %in% c('cross', 'geno') ) {
        
        cross.map <- as.mapframe( readMapCSV(infile,
            require.mapunit=require.mapunit) )
        
    } else if ( params$class == 'map' ) {
        
        # Read mapframe from CSV file.
        x <- utils::read.csv(infile, check.names=FALSE, quote='', na.strings='',
            colClasses='character', strip.white=TRUE,
            comment.char='', stringsAsFactors=FALSE)
        
        # Trim any blank rows/columns from the bottom/right, respectively.
        x <- bstripBlankRows( rstripBlankCols(x) )
        
        # Validate map unit information.
        map.unit <- getMapUnit(x)
        
        if ( is.na(map.unit) ) {
            
            if (require.mapunit) {
                stop("input map must include map unit information (e.g. 'pos (cM)', '30 kb')")
            }
            
            map.unit <- 'cM'
        }
        
        # Set locus IDs from an input 'id' column, if present.
        id.col <- getIdColIndex(x)
        if ( ! is.null(id.col) ) {
            x <- setRownamesFromColumn(x, col.name=colnames(x)[id.col])
        }
        
        cross.map <- as.mapframe(x, map.unit=map.unit)
        
    } else {
        
        stop("cannot read mapframe from ", params$class, " data file - '", infile, "'")
    }
    
    return(cross.map)
}

# readPhenoCSV -----------------------------------------------------------------
#' Read yeast phenotype data from a CSV file.
#'
#' This function reads yeast cross phenotype data from an \pkg{R/qtl} CSV file
#' and returns a \code{pheno} object. The returned \code{pheno} object has an
#' attribute \code{'info'} of type \code{\linkS4class{CrossInfo}}. This
#' \code{\linkS4class{CrossInfo}} object contains information about the
#' phenotype data, including sample IDs, which must be present in the
#' input phenotype data file.
#' 
#' @param infile Input CSV file path.
#'
#' @export
#' @family CSV functions
#' @importFrom utils read.csv
#' @rdname readPhenoCSV
readPhenoCSV <- function(infile) {
    
    stopifnot( isSingleString(infile) )
    
    # Read phenotype input data as CSV file.
    pheno.table <- utils::read.csv(infile, header=FALSE, check.names=FALSE,
        quote='', na.strings=const$missing.value, colClasses='character',
        strip.white=TRUE, stringsAsFactors=FALSE)
    
    # Trim any blank rows/columns from the bottom/right, respectively.
    pheno.table <- bstripBlankRows( rstripBlankCols(pheno.table) )
    
    # Make pheno table column names from first row of input table.
    colnames(pheno.table) <- make.names(pheno.table[1, ])
    
    # Check for ID heading.
    id.col <- getIdColIndex(pheno.table)
    if ( is.null(id.col) ) {
        stop("ID column not found in phenotype data file - '", infile, "'")
    }
    
    return( as.pheno(pheno.table) )
}

# recodeCSV --------------------------------------------------------------------
#' Recode \pkg{R/qtl} data in CSV file.
#' 
#' This function recodes the data in an \pkg{R/qtl} cross or genotype CSV file.
#' Genotypes can be recoded by passing to the \code{geno} parameter a simple
#' \code{\link{mapping}} of old to new genotypes. Alternatively, genotype data
#' can be converted to enumerated genotypes by setting parameter
#' \code{enum.geno} to \code{TRUE}.
#' 
#' Any \code{\link{mapping}} of old to new symbols must be complete and
#' unambiguous. All existing values must be mapped to a new value, and
#' data with different input values cannot have the same output value.
#' 
#' @param infile Input CSV file path.
#' @param outfile Output CSV file path.
#' @param geno A simple \code{\link{mapping}} of old to new genotype symbols,
#' with names containing existing genotype symbols, and elements containing
#' their replacement values (incompatible with \code{enum.geno}).
#' @param enum.geno Option indicating if genotype data should be recoded as
#' enumerated genotypes (incompatible with \code{geno}).
#'
#' @export
#' @family CSV functions
#' @importFrom utils read.csv
#' @importFrom utils write.table
#' @rdname recodeCSV
recodeCSV <- function(infile, outfile, geno=NULL, enum.geno=FALSE) {
    
    # TODO: implement phenotype recoding
    # TODO: implement sample recoding ?
    # TODO: implement marker recoding ?
    
    stopifnot( isSingleString(infile) )
    stopifnot( file.exists(infile) )
    stopifnot( isSingleString(outfile) )
    stopifnot( isBOOL(enum.geno) )
    
    # Check if genotype recoding specified.
    recode.geno <- ! is.null(geno) && length(geno) > 0
    
    # At least one recoding parameter must be specified.
    if ( ! ( recode.geno || enum.geno ) ) {
        stop("cannot recode - no recoding option specified")
    }
    
    # Check no more than one genotype recoding option specified.
    if ( recode.geno && enum.geno ) {
        stop("multiple genotype recoding options specified")
    }
    
    # Read input CSV file.
    x <- utils::read.csv(infile, header=FALSE, check.names=FALSE, quote='',
        na.strings=const$missing.value, colClasses='character',
        strip.white=TRUE, stringsAsFactors=FALSE)
    
    # Trim any blank rows/columns from the bottom/right, respectively.
    x <- bstripBlankRows( rstripBlankCols(x) )
    
    params <- getMetadataCSV(x)
    
    if (recode.geno) {
        
        stopifnot( is.mapping(geno) )
        
        src.geno <- mappingKeys(geno)
        dest.geno <- as.character( mappingValues(geno) )
        
        if ( is.null(params$geno.cols) ) {
            stop("cannot recode - no genotype data")
        }
        
        if ( anyNA(src.geno) || anyNA(dest.geno) ) {
            stop("cannot recode - incomplete genotype recoding")
        }
        
        if ( anyDuplicated(src.geno) || anyDuplicated(dest.geno) ) {
            stop("cannot recode - ambiguous genotype recoding")
        }
        
        # Existing genotypes can be any string,
        # but replacements must be valid genotypes.
        validateGenotypeSet(dest.geno)
        
        # Get genotype data.
        recoded.geno <- original.geno <- x[params$dat.rows, params$geno.cols]
        
        # Get set of symbols in genotype data.
        curr.geno <- unique( as.character( unlist(original.geno) ) )
        curr.geno <- curr.geno[ ! is.na(curr.geno) ]
        
        err.geno <- curr.geno[ ! curr.geno %in% src.geno ]
        if ( length(err.geno) > 0 ) {
            stop("no recoding defined for genotype symbols - '", toString(err.geno), "'")
        }
        
        # Recode genotypes.
        for ( i in seq_along(geno) ) {
            recoded.geno[ original.geno == src.geno[i] ] <- dest.geno[i]
        }
        
        # Replace genotype data.
        x[params$dat.rows, params$geno.cols] <- recoded.geno
        
    } else if (enum.geno) {
        
        # Get original genotype data.
        original.geno <- x[params$dat.rows, params$geno.cols]
        
        # Init recoded genotype data.
        recoded.geno <- matrix(NA_integer_, nrow=nrow(original.geno),
            ncol=ncol(original.geno), dimnames=dimnames(original.geno) )
        
        # Recode each column independently.
        for ( i in getColIndices(original.geno) ) {
            
            # Get sample symbols and genotypes for this locus.
            locus.symbols <- original.geno[, i]
            locus.levels <- unique(locus.symbols)
            locus.genotypes <- locus.levels[ locus.levels != const$missing.value ]
            
            # Skip loci with more genotypes than can be represented.
            if ( length(locus.genotypes) > length(const$enum.geno.charset) ) {
                next
            }
            
            # Set genotype numbers for this locus.
            recoded.geno[, i] <- match(locus.symbols, locus.genotypes)
        }
        
        # Get number of null loci in recoded genotype data.
        null.count <- sum( apply(recoded.geno, 2, function(column)
            allNA( as.vector(column) ) ) )
        
        if ( null.count > 0 ) {
            warning("enumerated genotype data contains ", null.count, " null loci")
        }
        
        # Replace genotype data.
        x[params$dat.rows, params$geno.cols] <- recoded.geno
    }
    
    # Write recoded data to file.
    utils::write.table(x, file=outfile, na=const$missing.value, sep=',',
        quote=FALSE, row.names=FALSE, col.names=FALSE)
    
    return( invisible() )
}

# sniffCSV ---------------------------------------------------------------------
#' Identify type of \pkg{R/qtl} data in CSV file.
#' 
#' This function identifies the type of data in the input CSV file. This can
#' be \code{'cross'}, \code{'geno'}, \code{'pheno'}, or \code{'map'}.
#' Alternatively, it can be \code{'unknown'} if the data cannot be identified.
#' 
#' Note that this function should be used with some discretion, as some
#' \pkg{R/qtl} input file types cannot be distinguished. For example, a
#' covariate data file containing a sample ID column will be misidentified
#' as containing phenotype data.
#' 
#' @param infile Input CSV file path.
#'
#' @return A string describing the type of data in the input CSV file.
#'
#' @export
#' @family CSV functions
#' @rdname sniffCSV
sniffCSV <- function(infile) {
    
    stopifnot( isSingleString(infile) )
    stopifnot( file.exists(infile) )
    
    data.class <- tryCatch({
        params <- getMetadataCSV(infile)
        result <- params$class
    }, error=function(e) {
        result <- 'unknown'
    })
    
    return(data.class)
}

# writeCrossCSV ----------------------------------------------------------------
#' Write yeast \code{cross} to a CSV file.
#' 
#' This function writes a yeast \code{cross} to an \pkg{R/qtl} CSV file.
#' Phenotype, genotype, and sample IDs are taken from the \code{'info'}
#' attribute of the \code{cross}, if present.
#'  
#' @param cross An \pkg{R/qtl} \code{cross} object.
#' @param outfile Output CSV file path.
#' @param chr Vector of sequences for which genotype data should be included in 
#' the output file. If none are specified, genotype data are output for all 
#' sequences.
#' @param digits If specified, round genetic map positions and
#' numeric phenotype values to the specified number of digits.
#' @param include.mapunit Include map unit information in map positions.
#'  
#' @export
#' @family CSV functions
#' @family cross object functions
#' @importFrom utils write.table
#' @rdname writeCrossCSV
writeCrossCSV <- function(cross, outfile, chr=NULL, digits=NULL, 
    include.mapunit=TRUE) {
    
    stopifnot( isSingleString(outfile) )
    stopifnot( isBOOL(include.mapunit) )
    
    map.unit <- 'cM'
    
    # Get indices of phenotype columns.
    pheno.col <- getPhenoColIndices(cross)
    
    # Get CrossInfo object.
    cross.info <- attr(cross, 'info')
    
    # Get specified sequences.
    chr <- subsetBySeq(pull.chr(cross), chr)
    stopifnot( length(chr) > 0 )

    # Take cross info from CrossInfo, if available..
    if ( ! is.null(cross.info) ) {
        compareCrossInfo(cross, cross.info)
        crosstype <- getCrosstype(cross.info)
        phenotypes <- getPhenotypes(cross.info)
        alleles <- getAlleles(cross.info)
        markers <- getSeqMarkers(cross.info, normSeq(chr), simplify=TRUE)
        sample.ids <- getSamples(cross.info)
    } else { # ..otherwise take directly from cross.
        crosstype <- pull.crosstype(cross)
        phenotypes <- qtl::phenames(cross)[pheno.col]
        alleles <- pull.alleles(cross)
        markers <- qtl::markernames(cross, chr)
        sample.ids <- pull.ind(cross)
    }
    
    stopifnot( crosstype == 'haploid' ) # TODO: support other cross types
    stopifnot( length(phenotypes) > 0 )
    stopifnot( length(alleles) > 0 )
    stopifnot( length(markers) > 0 )
    stopifnot( length(sample.ids) > 0 )
    
    # Get phenotypes, map, genotypes.
    pheno.table <- cross$pheno[, pheno.col, drop=FALSE]
    map.table <- as.data.frame(qtl::pull.map(cross, chr), map.unit=map.unit)
    geno.table <- qtl::pull.geno(cross)[, markers]
    
    genotypes <- makeGenotypeSet(alleles, crosstype)
    
    # If digits specified, round numeric phenotype values and map positions.
    if ( ! is.null(digits) ) {
        stopifnot( isSinglePositiveWholeNumber(digits) )
        mask <- sapply(pheno.table, is.numeric)
        pheno.table[, mask] <- round(pheno.table[, mask], digits=digits)
        map.table$pos <- round(map.table$pos, digits=digits)
    }
    
    # Replace encoded genotypes with actual genotype values.
    geno.table <- decodeGenotypes(geno.table, genotypes)
    
    # If writing map unit, add map unit info to map table.
    if (include.mapunit) {
        map.table <- setPosColDataMapUnit(map.table, map.unit)
    }
    
    # Bind map and genotype tables into one output table.
    output.table <- rbind(t(map.table), geno.table)
    
    # Pad phenotypes with two blank rows at top, corresponding 
    # to the location of the map above the genotype data.
    pheno.padding <- data.frame(matrix('', nrow=2, ncol=ncol(pheno.table), 
        dimnames=list(NULL, colnames(pheno.table))), stringsAsFactors=FALSE)
    pheno.table <- rbind(pheno.padding, pheno.table)
    
    # Bind phenotype table to output table.
    output.table <- cbind(pheno.table, output.table)
    output.table <- data.frame( lapply(output.table, as.character) )
    
    # Set output table headings as first row of output table.
    # NB: we want table headings to be written unmodified.
    output.headings <- data.frame( t( c(phenotypes, markers) ), 
        stringsAsFactors=FALSE)
    colnames(output.headings) <- colnames(output.table)
    output.table <- rbind(output.headings, output.table)
    
    # If sample IDs defined, insert sample ID  
    # column between phenotypes and genotypes.
    if ( ! is.null(sample.ids) ) {
        
        # Get ID column name.
        id.col.name <- colnames(cross$pheno)[ getIdColIndex(cross) ]
        
        # Pad sample IDs with column heading and with two blank values, 
        # corresponding to the location of the map above the genotype data.
        padded.ids <- c(id.col.name, rep_len('', 2), sample.ids)
        
        # Set ID column index to first column after phenotypes.
        id.col <- ncol(pheno.table) + 1
        
        # Insert sample ID column into output table.
        output.table <- insertColumn(output.table, col.index=id.col, 
            col.name=id.col.name, data=padded.ids)
    } 
    
    # Write cross data to file.
    utils::write.table(output.table, file=outfile, na=const$missing.value, sep=',',
        quote=FALSE, row.names=FALSE, col.names=FALSE)
    
    return( invisible() )
}

# writeGenoCSV -----------------------------------------------------------------
#' Write yeast genotype data to a CSV file.
#'   
#' @description This function writes a yeast \code{geno} object to an
#' \pkg{R/qtl} CSV file. The \code{geno} object must have an attribute
#' \code{'info'} of type \code{\linkS4class{CrossInfo}}, from which
#' genotype and sample IDs are taken.
#' 
#' @param geno An \pkg{R/qtl} \code{cross} \code{geno} object.
#' @param outfile Output CSV file path.
#' @param chr Vector of sequences for which genotype data should be included in 
#' the output file. If none are specified, a genotype file is output for all 
#' available sequences.
#' @param digits If specified, round genetic map positions to the specified 
#' number of digits.
#' @param include.mapunit Include map unit information in map positions.
#'  
#' @export
#' @family CSV functions
#' @importFrom utils write.table
#' @rdname writeGenoCSV
writeGenoCSV <- function(geno, outfile, chr=NULL, digits=NULL, 
    include.mapunit=TRUE) {
        
    stopifnot( isSingleString(outfile) )
    
    # Convert geno data to a data frame for output.
    geno.table <- as.data.frame(geno, chr=chr, digits=digits, 
        include.mapunit=include.mapunit)
    
    # Check for sample IDs - required in genotype file.
    if ( any(geno.table[4:nrow(geno.table), 'id'] == '') ) {
        stop("cannot output genotype data without sample IDs")
    }
    
    # Write cross geno data to CSV file.
    utils::write.table(geno.table, file=outfile, na=const$missing.value, sep=',',
        quote=FALSE, row.names=FALSE, col.names=FALSE)
    
    return( invisible() )
}

# writeMapCSV ------------------------------------------------------------------
#' Write \code{map} to a CSV file.
#' 
#' This function writes a yeast \code{mapframe} object to any \pkg{R/qtl} CSV
#' file that can contain map data, whether it is a map table file, genotype
#' file, or a full cross file. If an output genotype or cross file already
#' exists, the mapframe is written to the map portion of the output file.
#' 
#' @param map An \pkg{R/qtl} \code{map} object.
#' @param outfile Output CSV file path.
#' @param include.mapunit Include map unit information in output file.
#'  
#' @export
#' @family CSV functions
#' @family map utility functions
#' @rdname writeMapCSV
writeMapCSV <- function(map, outfile, include.mapunit=TRUE) {
    
    stopifnot( 'map' %in% class(map) )
    stopifnot( isSingleString(outfile) )
    stopifnot( isBOOL(include.mapunit) )
    
    # Assume that we are not pushing map into output file.
    pushing <- FALSE
    
    # If output file exists and is a cross or geno CSV
    # file, we are pushing map into output file.
    if ( file.exists(outfile) ) {
        
        guess <- sniffCSV(outfile)
        
        if ( guess %in% c('cross', 'geno') ) {
            pushing <- TRUE
        } else if ( guess != 'map' ) {
            stop("cannot write map to existing file with ", guess, " data - '", outfile, "'")
        }
    }
    
    # If pushing map, push into cross or geno object and write result to file..
    if (pushing) {
        
        # Create output temp file.
        tmp <- tempfile()
        on.exit( file.remove(tmp) )
        
        if ( guess == 'cross' ) {
            
            cross <- readCrossCSV(outfile, require.mapunit=FALSE)
            cross <- qtl::replace.map(cross, map)
            writeCrossCSV(cross, tmp, include.mapunit=include.mapunit)
            
        } else { # guess == 'geno'
            
            geno <- readGenoCSV(outfile, require.mapunit=FALSE)
            geno <- pushMap(geno, map)
            writeGenoCSV(geno, tmp, include.mapunit=include.mapunit)
        }
        
        # Move temp file to final output file.
        # NB: file.copy is used here instead of file.rename because the latter
        # can sometimes fail when moving files between different file systems.
        file.copy(tmp, outfile, overwrite=TRUE)
        
    } else { # ..otherwise convert to mapframe and write to file.
        
        writeMapframeCSV(as.mapframe(map), outfile,
            include.mapunit=include.mapunit)
    }
    
    return( invisible() )
}

# writeMapframeCSV -------------------------------------------------------------
#' Write \code{mapframe} to a CSV file.
#' 
#' This function writes a yeast \code{mapframe} object to any \pkg{R/qtl} CSV
#' file that can contain map data, whether it is a map table file, genotype
#' file, or a full cross file. If an output genotype or cross file already
#' exists, the mapframe is written to the map portion of the output file.
#' 
#' @param x A \code{mapframe} object.
#' @param outfile Output CSV file path.
#' @param include.mapunit Include map unit information in output file.
#'  
#' @export
#' @family CSV functions
#' @family map utility functions
#' @importFrom utils write.csv
#' @rdname writeMapframeCSV
writeMapframeCSV <- function(x, outfile, include.mapunit=TRUE) {
    
    stopifnot( 'mapframe' %in% class(x) )
    stopifnot( isSingleString(outfile) )
    stopifnot( isBOOL(include.mapunit) )
    
    # Assume that we are not pushing mapframe into output file.
    pushing <- FALSE
    
    # If output file exists and is a cross or geno CSV
    # file, we are pushing mapframe into output file.
    if ( file.exists(outfile) ) {
        
        guess <- sniffCSV(outfile)
        
        if ( guess %in% c('cross', 'geno') ) {
            pushing <- TRUE
        } else if ( guess != 'map' ) {
            stop("cannot write map to existing file with ", guess, " data - '", outfile, "'")
        }
    }
    
    # If pushing mapframe, convert to map and push into output file..
    if (pushing) {
        
        writeMapCSV(as.map(x), outfile, include.mapunit=include.mapunit)
        
    } else { # ..otherwise write mapframe to file.
        
        x <- as.data.frame(x)
        
        # If including map unit, append a map
        # unit suffix to each map position.
        if (include.mapunit) {
            x <- setPosColNameMapUnit(x, getMapUnit(x))
        }
        
        # Set 'id' column from locus IDs, if present.
        if ( hasRownames(x) ) {
            x <- setColumnFromRownames(x, col.name='id')
        }
        
        # Write mapframe to CSV file.
        utils::write.csv(x, file=outfile, quote=FALSE, row.names=FALSE)
    }
    
    return( invisible() )
}

# writePhenoCSV ----------------------------------------------------------------
#' Write yeast pheno data to a CSV file.
#' 
#' This function writes a yeast \code{pheno} object to an \pkg{R/qtl} CSV file.
#' The \code{pheno} object must have an attribute \code{'info'} of type
#' \code{\linkS4class{CrossInfo}}, from which phenotype and sample IDs are taken.
#' 
#' @param pheno An \pkg{R/qtl} \code{cross} \code{pheno} object.
#' @param outfile Output CSV file path.
#' @param digits If specified, round numeric phenotype
#' values to the specified number of digits.
#' 
#' @export
#' @family CSV functions
#' @importFrom utils write.table
#' @rdname writePhenoCSV
writePhenoCSV <- function(pheno, outfile, digits=NULL) {
    
    stopifnot( isSingleString(outfile) )
    
    # Convert pheno data to a data frame for output.
    pheno.table <- as.data.frame(pheno, digits=digits)
    
    # Write cross pheno data to CSV file.
    utils::write.table(pheno.table, file=outfile, na=const$missing.value, sep=',',
        quote=FALSE, row.names=FALSE, col.names=FALSE)
    
    return( invisible() )
}

# End of csv.R #################################################################
gact/shmootl documentation built on Nov. 11, 2021, 6:23 p.m.