shmootl: QTL Analysis Utilities for Yeast

Documented in crossesEqual getFlankingPhenoColIndices hasTimeSeriesPhenotypes inferLodColNames inferStrainIndices.character inferStrainIndices.cross inferTetradIndices inferTetradIndices.character inferTetradIndices.cross inferTimeStep interpTimeSeries makeCross padTimeSeries permCross permIndices refreshCross

# Start of cross.R #############################################################

# crossesEqual -----------------------------------------------------------------
#' Test if two \pkg{R/qtl} \code{cross} objects are essentially equal.
#' 
#' This uses the \pkg{R/qtl} function \code{comparecrosses} to check if two
#' input \code{cross} objects are essentially equal, with key elements being
#' identical, and numeric values being equal (within the given tolerance).
#' 
#' @param cross1 An \pkg{R/qtl} \code{cross} object.
#' @param cross2 An \pkg{R/qtl} \code{cross} object.
#' @param tol Tolerance for comparing numeric values.
#' 
#' @return Returns \code{TRUE} if the two input \code{cross} objects are
#' essentially equal. Otherwise, this function returns a character vector
#' describing an observed difference between the two \code{cross} objects.
#' 
#' @template seealso-rqtl-manual
#' 
#' @export
#' @family cross object functions
#' @rdname crossesEqual
crossesEqual <- function(cross1, cross2, tol=1e-5) {
    
    diffs <- tryCatch({
        suppressMessages( qtl::comparecrosses(cross1, cross2, tol=tol) )
        result <- character()
    }, error=function(e) {
        result <- e[['message']]
    })
    
    return( if ( length(diffs) == 0 ) {TRUE} else {diffs} )
}

# getFlankingPhenoColIndices ---------------------------------------------------
#' Get flanking phenotype column indices.
#' 
#' Get the indices of the phenotype columns in the \code{cross} phenotype data 
#' frame that are flanking the specified index, excluding special columns such 
#' as the sample ID column.
#' 
#' @param cross An \pkg{R/qtl} \code{cross} object.
#' @param i Column index for which the flanking phenotype column indices 
#' should be returned.
#' 
#' @return Flanking phenotype column indices. If the specified index refers to
#' the first or last column, an \code{NA} value is returned for the lower or
#' upper flanking index, respectively.
#' 
#' @keywords internal
#' @rdname getFlankingPhenoColIndices
getFlankingPhenoColIndices <- function(cross, i) {
    
    stopifnot( isSinglePositiveNumber(i) )
    
    # Get indices of phenotype columns in cross.
    pheno.col <- getPhenoColIndices(cross)
    
    if ( i < 1 || i > ncol(cross$pheno) ) {
        stop("cross phenotype column index out of bounds - '", i, "'")
    }
    
    # Get lower flanking phenotype column index, or NA value if none available.
    lower.diff <- i - pheno.col
    if ( any(lower.diff > 0) ) {
        lower.index <- which( lower.diff == min(lower.diff[ lower.diff > 0 ]) )
    } else {
        lower.index <- NA
    }
    
    # Get upper flanking phenotype column index, or NA value if none available.
    upper.diff <- pheno.col - i
    if ( any(upper.diff > 0) ) {
        upper.index <- which( upper.diff == min(upper.diff[ upper.diff > 0 ]) )
    } else {
        upper.index <- NA
    }
    
    return( c(lower.index, upper.index) )
}

# hasTimeSeriesPhenotypes ------------------------------------------------------
#' Test if \code{cross} contains time-series phenotypes.
#' 
#' @param cross An \pkg{R/qtl} \code{cross} object.
#' @param allow.gaps Allow gaps in time series, provided
#' that any gaps are a multiple of the inferred time step.
#' @param tol Tolerance for time step equality.
#' 
#' @return \code{TRUE} if \code{cross} seems to have time-series phenotypes,
#' given the specified constraints; \code{FALSE} otherwise.
#' 
#' @template section-time-series
#' 
#' @export
#' @family cross object functions
#' @family time-series functions
#' @rdname hasTimeSeriesPhenotypes
hasTimeSeriesPhenotypes <- function(cross, allow.gaps=TRUE, tol=1e-5) {
    return( ! is.null( inferTimeStep(cross, allow.gaps=allow.gaps, tol=tol) ) )
}

# inferLodColNames -------------------------------------------------------------
#' Infer \code{scanone} result LOD column names.
#' 
#' Given a \code{cross} and set of phenotypes, get the expected the LOD column
#' names of the corresponding scanone result.
#'
#' @param cross An \pkg{R/qtl} \code{cross} object.
#' @param pheno.col Vector indicating the phenotypes for which LOD column names
#' should be inferred. This can be an integer vector of phenotype column indices
#' with respect to the columns of the \code{cross} phenotype \code{data.frame},
#' or a character vector that contains phenotype IDs or their syntactically
#' valid names. If none are specified, LOD names are inferred for all
#' \code{cross} phenotypes.
#'  
#' @return Vector of the LOD column headings that would be expected in a
#' \code{scanone} result for the given \code{cross} and phenotypes.
#' 
#' @keywords internal
#' @rdname inferLodColNames
inferLodColNames <- function(cross, pheno.col=NULL) {
    
    pheno.col <- getPhenoColIndices(cross, pheno.col)
    
    if ( length(pheno.col) == 1 ) {
        lodcol.names <- 'lod'
    } else if ( length(pheno.col) > 1 ) {
        lodcol.names <- colnames(cross$pheno)[pheno.col]
    } else {
        stop("cannot infer LOD column names - no phenotypes specified")
    }
    
    return(lodcol.names)
}

# inferStrainIndices -----------------------------------------------------------
#' Infer sample strain indices.
#'
#' Infers strain indices of a set of samples from the given object. A character
#' vector of sample IDs can be passed as an argument, in which case the strain
#' indices will be inferred from each set of identical consecutive sample IDs.
#' If a \code{cross} object is passed as an argument, strain indices will be
#' inferred from sample IDs, if present. Otherwise, strain indices are inferred
#' from each set of identical consecutive sample genotypes.
#'
#' @param x An \pkg{R/qtl} \code{cross} object,
#' or a character vector of sample IDs.
#'  
#' @return Vector of sample strain indices inferred from the input object.
#' 
#' @template section-sample-ids
#' 
#' @export
#' @family cross object functions
#' @rdname inferStrainIndices
inferStrainIndices <- function (x) {
    UseMethod('inferStrainIndices', x)
}

# inferStrainIndices.character -------------------------------------------------
#' @export
#' @rdname inferStrainIndices
inferStrainIndices.character <- function(x) {
    
    runs <- rle(x)
    
    if ( anyDuplicated(runs$values) ) {
        stop("non-consecutive replicate sample IDs")
    }
    
    runs$values <- 1:length(runs$values)
    
    return( inverse.rle(runs) )
}

# inferStrainIndices.cross -----------------------------------------------------
#' @export
#' @rdname inferStrainIndices
inferStrainIndices.cross <- function(x) {
    
    # For each sample, combine genotypes into a single string.
    geno <- apply(qtl::pull.geno(x), 1, paste0, collapse='')
    
    # Get cross sample IDs.
    cross.info <- attr(x, 'info')
    if ( ! is.null(cross.info) ) {
        compareCrossInfo(x, cross.info)
        sample.ids <- getSamples(cross.info)
    } else {
        sample.ids <- pull.ind(x)
    }
    
    # If cross has sample IDs, infer strain indices from these, then 
    # check genotypes are identical in the inferred strain replicates..
    if ( ! is.null(sample.ids) ) {
        
        strain.indices <- inferStrainIndices(sample.ids)
        
        for ( s in unique(strain.indices) ) {
            strain.geno <- geno[ which( strain.indices == s ) ]
            if ( any(strain.geno != strain.geno[1]) ) {
                stop("genotype mismatch in replicate samples")
            }
        } 
        
    } else { # ..otherwise infer strain indices directly from genotypes.
        runs <- rle(geno)
        runs$values <- 1:length(runs$values)
        strain.indices <- inverse.rle(runs)
    }
    
    return(strain.indices)
}

# inferTetradIndices -----------------------------------------------------------
#' Infer sample tetrad indices.
#'
#' @description Infers tetrad indices of a set of samples from the given object.
#' A character vector of sample IDs can be passed as the first argument, in
#' which case the tetrad indices will be inferred, where possible, from the
#' sample IDs themselves.
#'   
#' If a \code{cross} is the first argument, tetrad indices will be inferred
#' from sample IDs, if present, or otherwise by comparison of sample genotypes.
#' 
#' @template section-sample-ids
#' @template section-tetradic-samples
#'   
#' @param x An \pkg{R/qtl} \code{cross} object,
#' or a character vector of sample IDs.
#'  
#' @return Vector of sample tetrad indices inferred from the input object. 
#' Returns \code{NULL} if tetrad pattern could not be found.
#' 
#' @export
#' @family cross object functions
#' @rdname inferTetradIndices
inferTetradIndices <- function(x) {
    # TODO: improve identification of tetrads.
    UseMethod('inferTetradIndices', x)
}

# inferTetradIndices.character -------------------------------------------------
#' @export
#' @rdname inferTetradIndices
inferTetradIndices.character <- function(x) {

    # Tetrad indices corresponding to the input sample IDs.
    sample.tindices <- NULL
    
    # Get 'exemplar' sample IDs.
    exemplar.ids <- unique(x)
    
    # Get indices of exemplars with respect to the vector of sample IDs.
    exemplar.sindices <- sapply(exemplar.ids, match, x)
    
    # Get indices of replicates with respect to the vector of sample IDs.
    terminal.sindices <- c(sapply(unname(exemplar.sindices[2:length(exemplar.sindices)]),
        function(e) e - 1), length(x) )
    replicate.sindices <- mapply(function(i, j) i:j, exemplar.sindices, 
        terminal.sindices, SIMPLIFY=FALSE)
    
    # Indices of exemplars with respect to the 'notional' exemplar sample IDs, 
    # in the ideal situation where every sample is present in every tetrad. 
    # Notional exemplar indices are taken from the numeric/alphanumeric suffix 
    # of each sample ID. This is used to place exemplar samples in tetrads,
    # and can work even if there are missing samples.
    exemplar.nindices <- NULL
    
    # If exemplars have numeric suffixes, try to assign notional indices from these..
    if ( all( grepl(const$pattern$tetrad['numeric'], exemplar.ids) ) ) {
        
        # Get sample numbers from numeric sample ID suffixes.
        sample.numbers <- as.integer( sub(const$pattern$tetrad['numeric'], '\\1', exemplar.ids) )
        
        # Test if sample numbers are ordered.
        samples.ordered <- ! is.unsorted(sample.numbers)
        
        # If sample numbers are sorted, get notional indices from the sample 
        # numbers, as offset by the value of the first sample number.
        if (samples.ordered) {
            exemplar.nindices <- sample.numbers - sample.numbers[1] + 1
        }
    
    # ..otherwise if exemplars have alphanumeric suffixes, try to assign notional indices from those.    
    } else if ( all( grepl(const$pattern$tetrad['alphanumeric'], exemplar.ids) ) ) {
        
        # Get tetrad numbers from numeric part of sample ID suffixes.
        tetrad.numbers <- as.integer( sub(const$pattern$tetrad['alphanumeric'], '\\1', exemplar.ids) )
        
        # Test if tetrad numbers are ordered.
        tetrads.ordered <- ! is.unsorted(tetrad.numbers)
        
        # Get tetrad sample labels, assign a label index to each (e.g. 1 for 'A', 2 for 'B').
        sample.labels <- toupper( sub(const$pattern$tetrad['alphanumeric'], '\\2', exemplar.ids) )
        exemplar.lindices <- sapply(sample.labels, function(sample.label) 
            which( const$tetrad.sample.labels == sample.label ), USE.NAMES=FALSE)
        
        # Test if sample labels are ordered within tetrads.
        labels.ordered <- all( sapply(unique(tetrad.numbers), function(i) 
            ! is.unsorted(exemplar.lindices[ which( tetrad.numbers == i ) ]) ) )
        
        # If tetrad and sample numbers are sorted, get notional indices from the tetrad 
        # and sample numbers, as offset by the value of the first tetrad number.
        if ( tetrads.ordered && labels.ordered ) {
            exemplar.tindices <- tetrad.numbers - tetrad.numbers[1] + 1
            exemplar.nindices <- 4 * (exemplar.tindices - 1) + exemplar.lindices
        }
    }
    
    # If notional exemplar indices were identified, use these to assign tetrad indices.
    if ( ! is.null(exemplar.nindices) ) {
        
        # Get notional number of tetrads.
        ntetrads <- ceiling( max(exemplar.nindices) / 4 )
        
        # Get expected notional indices for the given number of tetrads.
        expected.nindices <- lapply(1:ntetrads, function(i)
            { o <- 4*(i-1); seq(o + 1, o + 4) } )
        
        # Get list containing notional indices for each tetrad.
        tetrad.nindices <- lapply(1:ntetrads, function(i)
            exemplar.nindices[ exemplar.nindices %in% expected.nindices[[i]] ])
        
        # Collapse tetrad notional indices to tetrad exemplar indices.
        tetrad.xindices <- tetrad.nindices[ lengths(tetrad.nindices) > 0 ]
        exemplar.count <- 0
        for ( i in 1:length(tetrad.xindices) ) {
            tetrad.xindices[[i]] <- exemplar.count + seq_along(tetrad.xindices[[i]])
            exemplar.count <- exemplar.count + length(tetrad.xindices[[i]])
        }
        
        # Get sample tetrad indices.
        sample.tindices <- integer( length=length(x) )
        for ( i in seq_along(tetrad.xindices) ) {
            tetrad.sindices <- unlist( replicate.sindices[ tetrad.xindices[[i]] ] )
            sample.tindices[tetrad.sindices] <- i
        }
    }
    
    return(sample.tindices)
}

# inferTetradIndices.cross -----------------------------------------------------
#' @export
#' @rdname inferTetradIndices
inferTetradIndices.cross <- function(x) {
    
    # Tetrad indices corresponding to the input sample IDs.
    sample.tindices <- NULL
    
    # Get cross sample IDs.
    cross.info <- attr(x, 'info')
    if ( ! is.null(cross.info) ) {
        compareCrossInfo(x, cross.info)
        sample.ids <- getSamples(cross.info)
    } else {
        sample.ids <- pull.ind(x)
    }
    
    stopifnot( length(sample.ids) > 0 )
    
    # Set minimum fraction of tetradic genotypes required to infer tetrad indices.
    threshold <- 0.95
    
    # Infer strain indices, as replicates will be included in each tetrad.
    strain.indices <- inferStrainIndices(x)
    
    # Get indices of exemplars with respect to tthe set of samples.
    exemplar.sindices <- sapply(unique(strain.indices), match, strain.indices)
    
    # Get indices of replicates with respect to the set of samples.
    terminal.sindices <- c(sapply(unname(exemplar.sindices[2:length(exemplar.sindices)]),
        function(e) e - 1), length(strain.indices) )
    replicate.sindices <- mapply(function(i, j) i:j, exemplar.sindices, 
        terminal.sindices, SIMPLIFY=FALSE)
    
    # Tetrad indices with respect to the set of exemplar 
    # sample indices. NB: no need to look at full sample 
    # data, as replicate samples have identical genotypes.
    tetrad.xindices <- NULL
    
    # If cross has sample IDs, try to infer tetrad indices from these.
    if ( ! is.null(sample.ids) ) {
        id.tindices <- inferTetradIndices(sample.ids)
        num.tetrads <- max(id.tindices)
        exemplar.tindices <- id.tindices[exemplar.sindices]
        tetrad.xindices <- lapply(1:num.tetrads, function(i) 
            which( exemplar.tindices == i ) )
    } 
    
    # If we failed to infer tetrad exemplar indices from sample IDs, 
    # fall back to assuming that samples form complete tetrads.
    if ( is.null(tetrad.xindices) ) {
        num.tetrads <- length(exemplar.sindices) / 4
        if ( isWholeNumber(num.tetrads) ) {
            tetrad.xindices <- lapply(1:num.tetrads, function(i) 
            { o <- 4*(i-1); seq(o + 1, o + 4) } )
        }
    }
    
    # If tetrad indices identified from sample IDs or assumed from number of 
    # exemplars, check this against genotype data and get tetrad indices 
    # corresponding to samples. 
    if ( ! is.null(tetrad.xindices) ) {
        
        # Calculate total number of possible tetrad genotypes.
        total.tetrad.genotypes <- sum(lengths(tetrad.xindices) > 0) * qtl::totmar(x)
        
        total.counted <- total.tetradic <- 0
        
        geno <- qtl::pull.geno(x)
        
        markers <- colnames(geno)
        
        # For each candidate tetrad, count tetradic genotypes.
        for ( i in 1:num.tetrads) {
            
            tetrad.exemplar.sindices <- exemplar.sindices[ tetrad.xindices[[i]] ]
            
            # If we have genotypes for a complete tetrad, check if appears to be tetradic.
            if ( length(tetrad.exemplar.sindices) == 4 ) {
                
                tetrad.geno <- geno[tetrad.exemplar.sindices, ]
                
                tetrad.tables <- lapply(markers, function(m) table(tetrad.geno[, m]))
                
                geno.counted <- sapply(tetrad.tables, sum) == 4
                
                geno.homozygous <- ! geno.counted | sapply(tetrad.tables, function(x) all(x == 4))
                
                # Create run-length encoding of homozygous genotype mask.., 
                runs <- rle(geno.homozygous)
                # ..set internal runs of homozygosity as FALSE..
                runs$values[ -c(1, length(runs$values)) ] <- FALSE
                # ..and then recreate the original homozygous genotype mask.
                geno.homozygous <- inverse.rle(runs)
                
                geno.counted <- geno.counted & ! geno.homozygous
                
                geno.tetradic <- geno.counted & sapply(tetrad.tables, function(x) all(x == 2))
                
                total.tetradic <- total.tetradic + sum(geno.tetradic)
                total.counted <- total.counted + sum(geno.counted)
            }
        }
        
        # If enough genotypes checked and enough of those 
        # were tetradic, get tetrad indices of samples.
        if ( total.counted > 0.5 * total.tetrad.genotypes ) {
            
            if ( total.tetradic / total.counted >= threshold ) {
                
                sample.tindices <- integer( length=qtl::nind(x) )
                
                for ( i in 1:num.tetrads ) {
                    tetrad.sindices <- unlist( replicate.sindices[ tetrad.xindices[[i]] ] )
                    sample.tindices[tetrad.sindices] <- i
                }
            } 
        }  
    }
    
    return(sample.tindices)
}  

# inferTimeStep ----------------------------------------------------------------
#' Infer time step of time-series phenotypes.
#' 
#' @param cross An \pkg{R/qtl} \code{cross} object.
#' @param allow.gaps Allow gaps in time series, provided
#' that any gaps are a multiple of the inferred time step.
#' @param tol Tolerance for time step equality.
#'     
#' @return Inferred time step. Returns \code{NULL}
#' if time step could not be inferred.
#' 
#' @template section-time-series
#' 
#' @export
#' @family cross object functions
#' @family time-series functions
#' @rdname inferTimeStep
inferTimeStep <- function(cross, allow.gaps=TRUE, tol=1e-5) {
    
    stopifnot( isBOOL(allow.gaps) )
    
    # Get time-series phenotypes.
    cross.info <- attr(cross, 'info')
    if ( ! is.null(cross.info) ) {
        compareCrossInfo(cross, cross.info)
        phenotypes <- getPhenotypes(cross.info)
        times <- as.numeric(phenotypes)
    } else {
        pheno.col <- getPhenoColIndices(cross)
        phenotypes <- colnames(cross$pheno)[pheno.col]
        times <- makeNumbers(phenotypes)
    }
    
    stopifnot( length(phenotypes) > 0 )
    
    # One time point cannot form a series.
    if ( length(times) == 1 ) {
        return(NULL)
    }
    
    # Time points must be valid non-negative numbers.
    if ( ! all ( isNonNegativeNumber(times) ) ) {
        return(NULL)
    }
    
    # Times must be strictly monotonically increasing.
    if ( is.unsorted(times, strictly=TRUE) ) {
        return(NULL)
    }
    
    # Get vector of time steps.
    time.steps <- diff(times)
    
    # Infer size of time step.
    time.step <- inferStepSize(time.steps, tol=tol)
    
    if ( is.null(time.step) ) {
        return(NULL)
    }
    
    # Time steps must be consistent.
    if ( any(time.step - time.steps > tol) ) {
        return(NULL)
    }
    
    # Identify gaps where difference between two times deviates from time step.
    gap.indices <- which( time.steps - time.step > tol )
    
    # If there are any gaps, verify that gaps are allowed and 
    # that all gaps are a multiple of the apparent time step.
    if ( length(gap.indices) > 0 ) {  
        
        if ( ! allow.gaps ) {
            return(NULL)
        }
        
        step.counts <- time.steps[gap.indices] / time.step - 1
        
        if ( any( ! isWholeNumber(step.counts, tol=tol) ) ) {
            return(NULL)
        }
    }
    
    return(time.step)
}

# interpTimeSeries -------------------------------------------------------------
#' Interpolate gaps in a time-series.
#' 
#' Interpolate values for gaps in time-series
#' phenotypes of a \code{cross} object.
#' 
#' @param cross An \pkg{R/qtl} \code{cross} object.
#' @param tol Tolerance for time step equality.
#'     
#' @return The input \code{cross} object is returned with gaps in time-series
#' phenotypes filled with values interpolated from the gap endpoints.
#' 
#' @template section-time-series
#' 
#' @export
#' @family cross object functions
#' @family time-series functions
#' @importFrom stats approx
#' @rdname interpTimeSeries
interpTimeSeries <- function(cross, tol=1e-5) {
    
    cross <- padTimeSeries(cross, tol=tol)
    
    # Get indices of phenotype columns in cross.
    pheno.col <- getPhenoColIndices(cross)
    
    # Get time-series phenotypes.
    cross.info <- attr(cross, 'info')
    if ( ! is.null(cross.info) ) {
        compareCrossInfo(cross, cross.info)
        phenotypes <- getPhenotypes(cross.info)
        times <- as.numeric(phenotypes)
    } else {
        phenotypes <- colnames(cross$pheno)[pheno.col]
        times <- makeNumbers(phenotypes)
    }
    
    numeric.phenotypes <- sapply(pheno.col, 
        function (i) is.numeric(cross$pheno[[i]]) )
    
    if ( ! all(numeric.phenotypes) ) {
        stop("cannot interpolate non-numeric phenotypes")
    }
    
    if ( anyNA(cross$pheno[, pheno.col[1] ]) ) {
        stop("cannot interpolate data at start of time series")
    }
    
    if ( anyNA(cross$pheno[, pheno.col[ length(pheno.col) ] ]) ) {
        stop("cannot interpolate data at end of time series")
    }
    
    for ( i in 1:nrow(cross$pheno) ) {
        
        # Get time-series values for sample (including blanks).
        time.series <- as.numeric( cross$pheno[i, pheno.col] )
        
        # Get runs of blank values, then get list of vectors, such that each  
        # vector contains the indices of a consecutive set of blank columns.
        blank.runs <- rle( is.na(time.series) )
        J <- cumsum(blank.runs$lengths)
        I <- c( 1, sapply(J[1:(length(J)-1)], function(j) j + 1) )
        b <- blank.runs$values == TRUE
        blank.index.list <- mapply(function(i, j) i:j, I[b], J[b], SIMPLIFY=FALSE)
        
        # We know from the code immediately above that consecutive runs of blank 
        # values are grouped together, and we have checked above that none of 
        # these run off either end of the time series. Therefore, we can safely 
        # take the flanking columns as our interpolation endpoints (below).
        
        for ( blank.indices in blank.index.list ) {
            
            # Take values from flanking columns as interpolation endpoints.
            x <- c(blank.indices[1] - 1, blank.indices[ length(blank.indices) ] + 1)
            
            # Get time series values for this blank region.
            y <- time.series[x]
            
            # Interpolate phenotype values for this blank region.
            interpolation <- stats::approx(x, y, blank.indices, 'linear')
            
            # Set blank values from interpolation results.
            time.series[blank.indices] <- interpolation$y
        }
        
        # Set time-series values for sample (with blanks interpolated).
        cross$pheno[i, pheno.col] <- time.series
    }
    
    return(cross)
}

# makeCross --------------------------------------------------------------------
#' Make an \pkg{R/qtl} \code{cross} object.
#' 
#' This function makes an \pkg{R/qtl} \code{cross} - essentially a list of two
#' elements named \code{'geno'} and \code{'pheno'} - from the input \code{geno}
#' and \code{pheno} objects. Both input objects must have an \code{'info'}
#' attribute of class \code{\linkS4class{CrossInfo}} containing sample IDs. The
#' output \code{cross} will be created from the intersection set of the samples
#' in the input \code{geno} and \code{pheno} objects. Sample replicates in the
#' input \code{pheno} object are preserved. Sample replicates in the input
#' \code{geno} object must have identical genotypes, which will be replicated
#' as needed to match the input phenotypes.
#' 
#' @param geno A \code{geno} object.
#' @param pheno A \code{pheno} object.
#' 
#' @return An \pkg{R/qtl} \code{cross} object.
#' 
#' @keywords internal
#' @rdname makeCross
makeCross <- function(geno, pheno) {
    
    stopifnot( 'geno' %in% class(geno) )
    stopifnot( 'pheno' %in% class(pheno) )
    
    # Get sequences from geno object.
    geno.seqs <- names(geno)
    
    # Get alleles from geno object.
    alleles <- attr(geno, 'alleles')
    
    # Get CrossInfo from geno object.
    geno.info <- attr(geno, 'info')
    
    # Validate CrossInfo of geno object.
    compareCrossInfo(geno, geno.info)
    
    # Clear unneeded attributes of geno object.
    attr(geno, 'alleles') <- NULL
    attr(geno, 'info') <- NULL

    # Get CrossInfo for pheno data.
    pheno.info <- attr(pheno, 'info')
    compareCrossInfo(pheno, pheno.info)
    attr(pheno, 'info') <- NULL
    
    if ( ! hasSampleIDs(geno.info) ) {
        stop("cannot make cross - no genotype sample IDs")
    }
    
    if ( ! hasSampleIDs(pheno.info) ) {
        stop("cannot make cross - no phenotype sample IDs")
    }
    
    # Get sample IDs from genotype info.
    geno.samples <- getSamples(geno.info)
    
    # Get sample IDs from phenotype info.
    pheno.samples <- getSamples(pheno.info)
    
    if ( length( intersect(geno.samples, pheno.samples) ) == 0 ) {
        stop("cannot make cross - no matching samples")
    }
    
    if ( is.na(geno.info@crosstype) ) {
        stop("cannot make cross - unknown cross type")
    }
    
    # Check that genotype replicates have identical genotype data.
    run.index.list <- getRunIndexList(geno.samples)
    for ( run.indices in run.index.list ) {
        if ( length(run.indices) > 1 ) {
            for ( geno.seq in names(geno) ) {
                run.data <- geno[[geno.seq]]$data[run.indices, ]
                if ( ! all( duplicated(run.data)[-1] ) ) {
                    stop("genotype replicates have conflicting data")
                }
            }
        }
    }
    
    # Get indices of phenotype samples in genotype data.
    gindices <- match(pheno.samples, geno.samples)
    
    # Expand genotype data to match phenotype samples.
    for ( geno.seq in names(geno) ) {
        geno[[geno.seq]]$data <- geno[[geno.seq]]$data[gindices, ]
    }
    
    # Expand sample table from geno object to match phenotype samples.
    sample.table <- geno.info@samples[gindices, ]
    rownames(sample.table) <- sample.table$sample.index <- getRowIndices(sample.table)
    
    # Create CrossInfo object from genotype and phenotype info.
    cross.info <- methods::new('CrossInfo',
        seq       = geno.info@seq,
        pheno     = pheno.info@pheno,
        markers   = geno.info@markers,
        samples   = sample.table,
        alleles   = alleles,
        genotypes = geno.info@genotypes,
        crosstype = geno.info@crosstype
    )
    
    # Prepare genotype and phenotype objects.
    class(pheno) <- 'data.frame'
    class(geno) <- 'list'
    
    # Create cross list.
    cross <- list(geno=geno, pheno=pheno)
    
    # Set cross alleles.
    attr(cross, 'alleles') <- alleles
    
    # Set cross info.
    attr(cross, 'info') <- cross.info
    
    # Set cross class.
    class(cross) <- c(cross.info@crosstype, 'cross')
    
    return(cross)
}

# padTimeSeries ----------------------------------------------------------------
#' Pad gaps in a time-series.
#' 
#' @param cross An \pkg{R/qtl} \code{cross} object.
#' @param tol Tolerance for time step equality.
#'     
#' @return The input \code{cross} object is returned with all
#' gaps in time-series phenotypes padded with \code{NA} values.
#' 
#' @template section-time-series
#' 
#' @export
#' @family cross object functions
#' @family time-series functions
#' @rdname padTimeSeries
padTimeSeries <- function(cross, tol=1e-5) {
    
    # Get indices of phenotype columns in cross.
    pheno.col <- getPhenoColIndices(cross)
    
    # Get index of ID column in cross.
    id.col <- getIdColIndex(cross)
    
    # Get ID column name.
    id.col.name <- colnames(cross$pheno)[id.col]
    
    # If ID column found, get flanking phenotypes. The ID column  
    # is reinserted between these in the padded phenotype data.
    if ( ! is.null(id.col) ) {
        flanking.indices <- getFlankingPhenoColIndices(cross, id.col)
        flanking.phenames <- colnames(cross$pheno)[flanking.indices]
    }
    
    # Get phenotype data.
    pheno <- qtl::pull.pheno(cross, pheno.col)
    
    # Get phenotype data class.
    pheno.class <- unique( sapply(pheno, class) )
    
    if ( length(pheno.class) != 1 ) {
        stop("inconsistent phenotype data types")
    }
    
    # Get missing value for this phenotype data.
    missing.value <- getMissingValueFromClassS3(pheno.class)
    
    # Get phenotypes and sample IDs.
    cross.info <- attr(cross, 'info')
    if ( ! is.null(cross.info) ) {
        compareCrossInfo(cross, cross.info)
        phenotypes <- getPhenotypes(cross.info)
        times <- as.numeric(phenotypes)
        sample.ids <- getSamples(cross.info)
    } else {
        phenotypes <- colnames(cross$pheno)[pheno.col]
        times <- makeNumbers(phenotypes)
        sample.ids <- pull.ind(cross)
    }
    
    # Time step is the mode of time differences.
    time.step <- inferTimeStep(cross, tol=tol)
    
    # Get differences between times.
    time.diff <- diff(times)
    
    # Identify gaps where difference between two times deviates from time step.
    gap.indices <- which( time.diff - time.step > tol )
    
    # Get number of such gaps.
    num.gaps <- length(gap.indices)
    
    # If there are gaps, pad them with NA values.
    if ( num.gaps > 0 ) {
        
        # Calculate step counts for all gaps.
        step.counts <- as.integer( round(time.diff[gap.indices] / time.step - 1) )
        
        # Set missing time points for each gap.
        gap.steps <- vector('list', num.gaps)
        for ( i in 1:num.gaps ) {
            g <- gap.indices[i]
            first.step <- times[g] + time.step
            last.step <- times[g + 1] - time.step
            gap.steps[[i]] <- seq(first.step, last.step, by=time.step)
        }
        
        # Get expected number of time points after gaps are padded.
        num.times <- length(times) + sum(step.counts)
        
        # Pad phenotype data and time points.
        padded.pheno <- data.frame( matrix(nrow=nrow(pheno), ncol=num.times), 
            stringsAsFactors=FALSE)
        padded.times <- rep(NaN, length=num.times)
        
        padded.pheno[, 1] <- pheno[, 1]
        padded.times[1] <- times[1]
        
        t <- 2
        
        for ( d in 1:length(time.diff) ) {
            
            if ( d %in% gap.indices ) {
                
                g <- which( gap.indices == d )
                gap.times <- gap.steps[[g]]
                
                for ( gap.time in gap.times ) {
                    
                    padded.pheno[, t] <- rep(missing.value, nrow(pheno))
                    padded.times[t] <- gap.time
                    t <- t + 1L
                }
            } 
            
            i <- d + 1L
            padded.pheno[, t] <- pheno[, i]
            padded.times[t] <- times[i]
            t <- t + 1L
        }
        
        # Set syntactically valid column names for padded phenotypes.
        colnames(padded.pheno) <- make.names(padded.times)
        
        # If phenotype has ID column, reinsert ID column 
        # as close as possible to its original position.
        if ( ! is.null(id.col) ) {
            
            stopifnot( length(sample.ids) > 0 )
            
            if ( ! is.na(flanking.phenames[1]) ) {
                i <- which( colnames(padded.pheno) == flanking.phenames[1] ) + 1
            } else {
                i <- which( colnames(padded.pheno) == flanking.phenames[2] ) - 1
            }
            
            padded.pheno <- insertColumn(padded.pheno, i, col.name=id.col.name, data=sample.ids)
        }
        
        # Replace padded phenotype data.
        cross$pheno <- padded.pheno
        
        # If cross has CrossInfo, update phenotypes to reflect padded data.
        if ( ! is.null(cross.info) ) {
            cross.info <- setPhenotypes(cross.info, padded.times)
            attr(cross, 'info') <- cross.info
        }
    } 
    
    return(cross)
}

# permIndices ------------------------------------------------------------------
#' Generate permutation indices for a \code{cross} object.
#' 
#' @description Given an input \code{cross} object with \code{N} samples, this
#' function generates a non-redundant set of \code{N} permutation indices in
#' the range \code{1-N}, which can be used to permute the phenotype or genotype
#' data of a \code{cross} object, as well as any associated data.
#' 
#' @param cross An \pkg{R/qtl} \code{cross} object.
#'     
#' @return Vector of sample indices that can be used to permute a
#' \code{cross} object or associated data.
#' 
#' @template section-permutations
#' 
#' @export
#' @family cross object functions
#' @family permutation functions
#' @rdname permIndices
permIndices <- function(cross) {
  
    stopifnot( 'cross' %in% class(cross) )
    
    num.samples <- nrow(cross$pheno)
    sample.indices <- 1:num.samples
    
    cross.info <- attr(cross, 'info')
    
    if ( ! is.null(cross.info) ) {
        compareCrossInfo(cross, cross.info)
        replicate.indices <- getStrainIndices(cross.info)
        tetrad.indices <- getTetradIndices(cross.info)
    } else {
        replicate.indices <- inferStrainIndices(cross)
        tetrad.indices <- inferTetradIndices(cross)
    }
    
    tetradic <- ! is.null(tetrad.indices)
    
    if (tetradic) {
        sample.bindices <- tetrad.indices
    } else {
        sample.bindices <- rep.int(1, num.samples)
    }
    
    blocks <- lapply( 1:max(sample.bindices),
        function(s) which( sample.bindices == s ) )
    
    imbalanced.tetrads <- imbalanced.replicates <- FALSE
    
    perm.indices <- rep(0, num.samples)
    
    for ( block in blocks ) {
        
        rep.freq <- table(replicate.indices[block])
        
        rep.freq <- sort(rep.freq, decreasing=TRUE)
        
        if (tetradic) {
            
            if ( length(rep.freq) == 1 ) {
                stop("cannot generate permutation indices - tetrads too imbalanced")
            } else if ( length(rep.freq) != 4 ) {
                imbalanced.tetrads <- TRUE
            }
        }
        
        if( length(rep.freq) == 1 || rep.freq[1] > sum(rep.freq[2:length(rep.freq)]) ) {
            stop("cannot generate permutation indices - replicates too imbalanced")
        } else if ( length( unique(rep.freq) ) > 1 ) {
            imbalanced.replicates <- TRUE
        }
        
        perm.block <- sample(sample.indices[block])

        while ( any(replicate.indices[perm.block] == replicate.indices[block]) ) {
            perm.block <- sample(sample.indices[block])
        }
        
        perm.indices[block] <- perm.block
    }
    
    if (imbalanced.replicates) {
        warning("sample replicates are imbalanced")
    }
    
    if (imbalanced.tetrads) {
        warning("tetrads are imbalanced")
    }
    
    return(perm.indices)
}
    
# permCross --------------------------------------------------------------------
#' Permute \code{cross} phenotype or genotype data.
#' 
#' @description Given an input \code{cross} object, this function permutes
#' either the phenotype or genotype data (but not both), and returns the
#' permuted \code{cross} object.
#' 
#' Note that when permuting phenotypes, any covariates must also be permuted
#' in the same way. In such cases, it is recommended to generate permutation
#' indices with \code{\link{permIndices}}, pass the result to the
#' \code{perm.indices} parameter of this function, and use the same
#' permutation indices to permute the covariate data.
#' 
#' When permuting genotypes, any derived data (e.g. genotype probabilities)
#' are recalculated by default. To prevent recalculation of derived data,
#' set the \code{refresh} parameter to \code{FALSE}.
#' 
#' @param cross An \pkg{R/qtl} \code{cross} object.
#' @param perm.indices Permutation indices.
#' @param perm.pheno Permute phenotype data (incompatible with \code{perm.geno}).
#' @param perm.geno Permute genotype data (incompatible with \code{perm.pheno}).
#' @param refresh Refresh any derived data.
#'     
#' @return Permuted \code{cross} object.
#' 
#' @template section-permutations
#' 
#' @export
#' @family cross object functions
#' @family permutation functions
#' @rdname permCross
permCross <- function(cross, perm.indices=NULL, perm.pheno=TRUE,
    perm.geno=FALSE, refresh=TRUE) {

    stopifnot( 'cross' %in% class(cross) )
    stopifnot( isBOOL(perm.pheno) )
    stopifnot( isBOOL(perm.geno) )
    stopifnot( isBOOL(refresh) )

    if ( ! xor(perm.pheno, perm.geno) ) {
        stop("permCross must permute either cross phenotypes or cross genotypes")
    }
    
    num.samples <- nrow(cross$pheno)
    
    if ( ! is.null(perm.indices) ) {
        stopifnot( length(perm.indices) == num.samples )
        stopifnot( all( perm.indices %in% 1:num.samples ) )
        stopif( anyDuplicated(perm.indices) )
    } else {
        perm.indices <- permIndices(cross)
    }
    
    if (perm.pheno) {
        
        pheno.col <- getPhenoColIndices(cross)
        
        cross$pheno[, pheno.col] <- cross$pheno[perm.indices, pheno.col]
    }
    
    if (perm.geno) {
        
        geno.seqs <- names(cross$geno)
        
        for ( geno.seq in geno.seqs ) {
            cross$geno[[geno.seq]]$data <- cross$geno[[geno.seq]]$data[perm.indices, ]
        }
        
        if (refresh) {
            cross <- refreshCross(cross)
        }
    }
    
    return(cross)
}

# refreshCross -----------------------------------------------------------------
#' Refresh derived data attributes of \pkg{R/qtl} \code{cross}.
#' 
#' @param cross An \pkg{R/qtl} \code{cross} object.
#' @param derived.attributes Names of derived data attributes to refresh. If 
#' none are specified, all derived data attributes in the \code{cross} are 
#' refreshed. 
#'  
#' @return Input \code{cross} object with derived data attributes refreshed.
#' 
#' @keywords internal
#' @rdname refreshCross
refreshCross <- function(cross, derived.attributes=NULL) {
    
    # This function is modelled on R/qtl function clean.cross; if that function
    # changes, this should be changed to reflect it. Also, this function relies 
    # on internal flag 'onlylod' for determining which function to use.
    
    stopifnot( 'cross' %in% class(cross) )
    
    known.derived <- c('argmax', 'draws', 'errorlod', 'prob', 'rf')
    
    if ( ! is.null(derived.attributes) ) {
        
        unknown <- derived.attributes[ ! derived.attributes %in% known.derived ]
        if ( length(unknown) > 0 ) {
            stop("cross has unknown derived data elements - '", 
                toString(unknown), "'")
        }
    
    } else {
        derived.attributes <- known.derived
    }
    
    geno.attr <- unique( as.vector( sapply( cross$geno, function (g) names(g) ) ) )
    
    if ( 'argmax' %in% derived.attributes && 'argmax' %in% geno.attr ) {
        
        arg.names <- c('step', 'off.end', 'error.prob', 'map.function', 'stepwidth')
        args <- lapply(arg.names, function (a) { unique( sapply( cross$geno, 
            function (g) attr(g$argmax, a) ) ) } )
        names(args) <- arg.names
        
        cross <- qtl::argmax.geno(cross, step=args$step, off.end=args$off.end, 
            error.prob=args$error.prob, map.function=args$map.function, 
            stepwidth=args$stepwidth)
    }
    
    if ( 'draws' %in% derived.attributes && 'draws' %in% geno.attr ) {
        
        n.draws <- unique( sapply( cross$geno, function (g) dim(g$draws)[3] ) )
        
        arg.names <- c('step', 'off.end', 'error.prob', 'map.function', 'stepwidth')
        args <- lapply(arg.names, function (a) { unique( sapply( cross$geno, 
            function (g) attr(g$draws, a) ) ) } )
        names(args) <- arg.names
        
        cross <- qtl::sim.geno(cross, n.draws=n.draws, step=args$step, 
            off.end=args$off.end, error.prob=args$error.prob, 
            map.function=args$map.function, stepwidth=args$stepwidth)
    }
    
    if ( 'errorlod' %in% derived.attributes && 'errorlod' %in% geno.attr ) {
        
        arg.names <- c('error.prob', 'map.function')
        args <- lapply(arg.names, function (a) { unique( sapply( cross$geno, 
            function (g) attr(g$errorlod, a) ) ) } )
        names(args) <- arg.names
        
        cross <- qtl::calc.errorlod(cross, error.prob=args$error.prob, 
            map.function=args$map.function)
    }
    
    if ( 'prob' %in% derived.attributes && 'prob' %in% geno.attr ) {
        
        arg.names <- c('step', 'off.end', 'error.prob', 'map.function', 'stepwidth')
        args <- lapply(arg.names, function (a) { unique( sapply( cross$geno, 
            function (g) attr(g$prob, a) ) ) } )
        names(args) <- arg.names
        
        cross <- qtl::calc.genoprob(cross, step=args$step, off.end=args$off.end, 
            error.prob=args$error.prob, map.function=args$map.function, 
            stepwidth=args$stepwidth)
    }
    
    if ( 'rf' %in% derived.attributes && 'rf' %in% names(cross) ) {
        
        lod.flag <- attr(cross$rf, 'onlylod')
        
        if ( ! is.null(lod.flag) && lod.flag ) {
            cross <- qtl::markerlrt(cross)
        } else {
            cross <- qtl::est.rf(cross)
        }
    }
    
    return(cross)
}

# End of cross.R ###############################################################