Nothing
# read genomic prediction data
#' Create genomic prediction data object
#'
#' This function combines all raw data sources in a single, unified data object
#' of class \code{gpData}. This is a \code{list} with elements for phenotypic,
#' genotypic, marker map, pedigree and further covariate data. All elements are
#' optional.
#'
#' The class \code{gpData} is designed to provide a unified framework for data
#' related to genomic prediction analysis. Every data source can be omitted. In
#' this case, the corresponding argument must be \code{NULL}. By default
#' (argument \code{reorderMap}), markers in \code{geno} are ordered by their
#' position in \code{map}. Individuals are ordered in alphabetical order.
#'
#' An object of class \code{gpData} can contain different subsets of
#' individuals or markers in the elements \code{pheno}, \code{geno} and
#' \code{pedigree}. In this case the \code{id} in \code{covar} comprises all
#' individuals that either appear in \code{pheno}, \code{geno} and
#' \code{pedigree}. Two additional columns in \code{covar} named
#' \code{phenotyped} and \code{genotyped} are automatically generated to
#' identify individuals that appear in the corresponding \code{gpData} object.
#'
#' @param pheno \code{data.frame} with individuals organized in rows and traits
#' organized in columns. For unrepeated measures unique \code{rownames} should
#' identify individuals. For repeated measures, the first column identifies
#' individuals and a second column indicates repetitions (see also argument
#' \code{repeated}).
#' @param geno \code{matrix} with individuals organized in rows and markers
#' organized in columns. Genotypes could be coded arbitrarily. Missing values
#' should be coded as \code{NA}. Colums or rows with only missing values not
#' allowed. Unique \code{rownames} identify individuals and unique
#' \code{colnames} markers. If no \code{rownames} are available, they are taken
#' from element \code{pheno} (if available and if dimension matches). If no
#' \code{colnames} are used, the \code{rownames} of \code{map} are used if
#' dimension matches.
#' @param map \code{data.frame} with one row for each marker and two columns
#' (named \code{chr} and \code{pos}). First columns gives the chromosome
#' (\code{numeric} or \code{character} but not \code{factor}) and second column
#' the position on the chromosome in centimorgan or the physical distance
#' relative to the reference sequence in basepairs. Unique \code{rownames}
#' indicate the marker names which should match with marker names in
#' \code{geno}. Note that order and number of markers must not be identical
#' with the order in \code{geno}. If this is the case, gaps in the map are
#' filled with \code{NA} to ensure the same number and order as in element
#' \code{geno} of the resulting \code{gpData} object.
#' @param pedigree Object of class \code{pedigree}.
#' @param family \code{data.frame} assigning individuals to families with names
#' of individuals in \code{rownames} This information could be used for
#' replacing of missing values with function \code{codeGeno}.
#' @param covar \code{data.frame} with further covariates for all individuals
#' that either appear in \code{pheno}, \code{geno} or \code{pedigree$ID}, e.g.
#' sex or age. \code{rownames} must be specified to identify individuals.
#' Typically this element is not specified by the user.
#' @param reorderMap \code{logical}. Should markers in \code{geno} and
#' \code{map} be reordered by chromosome number and position within chromosome
#' according to \code{map} (default = \code{TRUE})?
#' @param map.unit \code{character}. Unit of position in \code{map}, i.e. 'cM'
#' for genetic distance or 'bp' for physical distance (default = 'cM').
#' @param repeated This column is used to identify the replications of the
#' phenotypic values. The unique values become the names of the third dimension
#' of the pheno object in the \code{gpData}. This argument is only required for
#' repeated measurements.
#' @param modCovar \code{vector} with \code{colnames} which identify columns
#' with covariables in \code{pheno}. This argument is only required for
#' repeated measurements.
#' @param na.string \code{character} or vector of \code{characters}. You can
#' specify values with which \code{NA} is coded in your geno object. In case
#' you read missing values from a file not as missing, but as character
#' strings. It can be specified more than one value for missings in a vector.
#' Default is \code{"NA"}.
#' @param cores \code{numeric}. Here you can specify the number of cores you
#' like to use.
#' @return Object of class \code{gpData} which is a \code{list} with the
#' following elements \item{covar}{\code{data.frame} with information on
#' individuals} \item{pheno}{\code{array} (individuals x traits x replications)
#' with phenotypic data} \item{geno}{\code{matrix} marker matrix containing
#' genotypic data. Columns (marker) are in the same order as in \code{map} (if
#' \code{reorderMap=TRUE}.) } \item{pedigree}{object of class \code{pedigree}}
#' \item{map}{\code{data.frame} with columns 'chr' and 'pos' and markers sorted
#' by 'pos' within 'chr'} \item{phenoCovars}{\code{array} with phenotypic
#' covariates} \item{info}{\code{list} with additional information on data
#' (coding of data, unit in \code{map}) From synbreed version 0.11-11 on the
#' function \code{\link{codeGeno}} adds here the package version which was used
#' to do the coding. There are differences in codings between version 0.10-11
#' and 0.11-0! }
#' @note In case of missing row names or column names in one item, information
#' is substituted from other elements (assuming the same order of
#' individuals/markers) and a warning specifying the assumptions is returned.
#' Please check them carefully.
#' @author Valentin Wimmer and Hans-Juergen Auinger with contributions be Peter
#' VandeHaar
#' @seealso \code{\link{codeGeno}}, \code{\link{summary.gpData}},
#' \code{\link{gpData2data.frame}}
#' @keywords manip
#' @examples
#'
#' set.seed(123)
#' # 9 plants with 2 traits
#' n <- 9 # only for n > 6
#' pheno <- data.frame(Yield = rnorm(n, 200, 5), Height = rnorm(n, 100, 1))
#' rownames(pheno) <- letters[1:n]
#'
#' # marker matrix
#' geno <- matrix(sample(c("AA", "AB", "BB", NA),
#' size = n * 12, replace = TRUE,
#' prob = c(0.6, 0.2, 0.1, 0.1)
#' ), nrow = n)
#' rownames(geno) <- letters[n:1]
#' colnames(geno) <- paste("M", 1:12, sep = "")
#'
#' # genetic map
#' # one SNP is not mapped (M5) and will therefore be removed
#' map <- data.frame(chr = rep(1:3, each = 4), pos = rep(1:12))
#' map <- map[-5, ]
#' rownames(map) <- paste("M", c(1:4, 6:12), sep = "")
#'
#' # simulate pedigree
#' ped <- simul.pedigree(3, c(3, 3, n - 6))
#'
#' # combine in one object
#' gp <- create.gpData(pheno, geno, map, ped)
#' summary(gp)
#'
#'
#' # 9 plants with 2 traits , 3 replications
#' n <- 9 #
#' pheno <- data.frame(
#' ID = rep(letters[1:n], 3), rep = rep(1:3, each = n),
#' Yield = rnorm(3 * n, 200, 5), Height = rnorm(3 * n, 100, 1)
#' )
#'
#' # combine in one object
#' gp2 <- create.gpData(pheno, geno, map, repeated = "rep")
#' summary(gp2)
#' @export create.gpData
#' @importFrom doBy orderBy
#' @importFrom methods is
#' @importFrom stats as.formula rnorm
#' @importFrom utils sessionInfo
#'
create.gpData <- function(pheno = NULL, geno = NULL, map = NULL, pedigree = NULL, family = NULL, covar = NULL,
reorderMap = TRUE, map.unit = "cM", repeated = NULL, modCovar = NULL, na.string = "NA", cores = 1) {
infoCall <- match.call()
# start with some checks on data
# geno as matrix but not data.frame (storage)
if (!map.unit %in% c("cM", "bp", "kb", "Mb")) warning("The measurement unit for the positions in the map should be either 'cM', 'bp', 'kb' or 'Mb'")
if (!is.null(geno)) {
if (is.data.frame(geno)) {
geno <- as.matrix(geno)
# if(any(duplicated(geno,MARGIN=1))) warning("individuals with duplicated genotypes")
}
geno[geno %in% na.string] <- NA
if (!is.matrix(geno)) stop("geno must be a matrix or data.frame, not a ", class(geno))
if (anyDuplicated(rownames(geno))) {
stop(paste("In", substitute(geno), " are duplicated names of genotypes in rownames!"))
}
if (is.null(rownames(geno)) && is.null(pheno)) stop("rownames(geno) cannot be NULL unless a pheno of the same length is supplied")
if (is.null(colnames(geno)) && is.null(map)) warning("colnames(geno) should not be NULL unless a map of the same length is supplied")
}
if (!is.null(map)) {
if (!is.data.frame(map)) stop("map must be a data.frame, not a", class(map), "object")
# as a data.frame, map already has rownames
if (!all(c("chr", "pos") %in% colnames(map))) stop("colnames(map) must include 'chr' and 'pos'")
# test if positions in map are numeric
if (!is.numeric(map$pos)) stop("Position informations have to be numeric values!")
# test if chr in map is numeric or character
if (!(is.numeric(map$chr) | is.character(map$chr))) {
warning(paste("Chromosome information (in map$chr) should be numeric or character, not", class(map$chr)))
}
} else {
map.unit <- NULL
}
# match geno and map
if (!is.null(geno) & !is.null(map)) {
if (!all(colnames(geno) %in% rownames(map))) {
warning("not all markers in 'geno' mapped in 'map'. gaps filled with 'NA' \n")
}
if (ncol(geno) == nrow(map) && is.null(colnames(geno))) {
# assuming same markers in geno and map
warning("missing colnames in 'geno': assuming to be identical as rownames in 'map' because of identical length \n")
colnames(geno) <- rownames(map)
} else {
if (!identical(colnames(geno), rownames(map))) {
# markers must be reordered
map <- data.frame(
chr = map$chr[match(colnames(geno), rownames(map))],
pos = map$pos[match(colnames(geno), rownames(map))],
row.names = colnames(geno)
)
}
}
}
phenoCovars <- NULL
attrModCovars <- NULL
if (!is.null(pheno)) {
if (2 != length(dim(pheno))) stop("pheno must be 2-dimensional, not ", length(dim(pheno)), "-dimensional")
classList <- unlist(lapply(pheno, class))
if (!all((classList[!names(classList) %in% repeated & !names(classList) %in% modCovar])[-1] %in% c("numeric", "integer"))) stop("Trait values have to be numeric!")
# repeated measures? Use rownames of pheno as identifier for genotypes
if (is.null(repeated)) {
if (is.null(rownames(pheno))) stop("rownames(pheno) cannot be null when not using repeated measures!")
if (anyDuplicated(rownames(pheno))) warning("rownames in pheno should be unique when not using repeated measures!")
add <- 10^ceiling(log10(nrow(pheno)))
# if the rownames are numbers, then temporarily add a big number to allow alphabetical sorting.
if (all(rownames(pheno) %in% 1:nrow(pheno))) rownames(pheno) <- add + as.numeric(rownames(pheno)) else add <- NULL
if (dim(pheno)[2] == 1) { # only a vector of traits
phenoNames <- dimnames(pheno)
arrPheno <- array(pheno[order(phenoNames[[1]]), ], dim = c(length(phenoNames[[1]]), 1, 1))
dimnames(arrPheno) <- list(phenoNames[[1]][order(phenoNames[[1]])], phenoNames[[2]], "1")
} else { # more than one trait, still unreplicated
pheno <- pheno[order(rownames(pheno)), ]
pheno.not.modCovar <- pheno[, !(colnames(pheno) %in% modCovar)]
arrPheno <- array(as.matrix(pheno.not.modCovar), dim = c(dim(pheno.not.modCovar), 1))
dimnames(arrPheno) <- list(rownames(pheno), colnames(pheno.not.modCovar), "1")
}
if (!is.null(add)) dimnames(arrPheno)[[1]] <- as.numeric(dimnames(arrPheno)[[1]]) - add
if (!is.null(modCovar)) {
arrModCovars <- array(1, dim = c(dim(pheno[, colnames(pheno) %in% modCovar]), 1))
dimnames(arrModCovars) <- list(dimnames(arrPheno)[[1]], colnames(pheno)[colnames(pheno) %in% modCovar], "1")
for (i in colnames(pheno)[colnames(pheno) %in% modCovar]) {
arrModCovars[, i, 1] <- pheno[, i]
}
}
} else { # a vector with replication identifier is applied. The first column is the identifier for genotypes
dim3 <- data.frame(unique(pheno[, repeated]))
colnames(dim3) <- repeated
dim3 <- orderBy(as.formula(paste("~", paste(repeated, collapse = " + "))), data = dim3)
for (i in 1:ncol(dim3)) dim3[, i] <- as.character(dim3[, i])
rownam <- sort(unique(pheno[, 1]))
if (!is.null(modCovar)) repeated <- unique(c(repeated, modCovar))
arrPheno <- array(NA, dim = c(length(rownam), ncol(pheno) - (1 + length(repeated)), nrow(dim3)))
dimnames(arrPheno) <- list(rownam, (colnames(pheno)[!colnames(pheno) %in% repeated])[-1], as.character(apply(dim3, 1, paste, collapse = "_")))
for (i in 1:nrow(dim3)) {
vec.bool <- apply(as.matrix(pheno[, colnames(dim3)]) == as.matrix(dim3[rep(i, nrow(pheno)), ]), 1, all)
arrPheno[as.character(pheno[vec.bool, 1]), , i] <- as.matrix(pheno[vec.bool, (colnames(pheno)[!colnames(pheno) %in% repeated])[-1]])
}
if (!is.null(modCovar)) { # strip out covariates
arrModCovars <- arrPheno[, rep(1, length(modCovar)), ]
dimnames(arrModCovars)[[2]] <- colnames(pheno)[colnames(pheno) %in% modCovar]
for (i in 1:nrow(dim3)) {
vec.bool <- apply(matrix(pheno[, colnames(dim3)] == dim3[rep(i, nrow(pheno)), ], ncol = ncol(dim3)), 1, all)
arrModCovars[as.character(pheno[vec.bool, 1]), , i] <- as.matrix(pheno[vec.bool, colnames(pheno)[colnames(pheno) %in% modCovar]])
}
}
}
if (!is.null(modCovar)) { # take the correct class of the covariates
phenoCovars <- arrModCovars
attrModCovars <- classList[dimnames(arrModCovars)[[2]]]
for (i in names(attrModCovars)) {
if (attrModCovars[i] != "numeric") {
attrModCovars[i] <- "factor"
}
}
}
pheno <- arrPheno
rm(arrPheno)
}
# match geno and pheno
if (!is.null(geno) & !is.null(pheno)) {
if (is.null(dimnames(pheno)[[1]]) | is.null(rownames(geno))) {
if (dim(pheno)[1] == nrow(geno)) {
warning("assuming identical order of genotypes in 'pheno' and 'geno' because of identical length.\nControll the Output! There is no warranty of correctness!\n")
if (is.null(dimnames(pheno)[[1]])) {
dimnames(pheno)[[1]] <- rownames(geno)
} else {
rownames(geno) <- dimnames(pheno)[[1]]
}
if (is.null(dimnames(pheno)[[1]]) & is.null(rownames(geno))) dimnames(pheno)[[1]] <- rownames(geno) <- paste0("ID", 10^ceiling(log10(nrow(geno))) + 1:nrow(geno))
} else {
stop("missing rownames (animal IDs) for either 'pheno' or 'geno' and lengths do not agree.")
}
# now geno and pheno have rownames
}
}
# sort geno by rownames (alphabetical order)
if (!is.null(geno)) {
if (all(row.names(geno) %in% 1:nrow(geno))) {
geno <- geno[order(as.numeric(row.names(geno))), ]
} else {
geno <- geno[order(row.names(geno)), ]
}
}
# sort markers by chromosome and position within chromosome
if (!is.null(map)) {
if (reorderMap) {
map$sor <- substr(map$chr, nchar(as.character(map$chr)), nchar(as.character(map$chr)))
if (any(unique(map$sor)[!is.na(unique(map$sor))] %in% 0:9)) map$sor <- 1
# first order by rownames in alphabetical order (important for SNPs with the same position)
map <- map[order(as.character(rownames(map))), ]
map <- orderBy(~ sor + chr + pos, data = map)
map$sor <- NULL
# sortcolumns in geno, too
geno <- geno[, rownames(map)]
}
map$pos[is.na(map$chr)] <- NA
class(map) <- c("GenMap", "data.frame")
}
if (!is.null(pedigree)) {
if (!is(pedigree, "pedigree")) warning(paste("object pedigree should be of class 'pedigree', not class", paste(class(pedigree), collapse = " ")))
}
# return object
obj <- list(covar = NULL, pheno = pheno, geno = geno, map = map, pedigree = pedigree, phenoCovars = phenoCovars)
# add information to element covar
# sort all available individuals
ids <- sort(unique(c(dimnames(obj$pheno)[[1]], rownames(obj$geno), as.character(obj$pedigree$ID))))
if (all(ids %in% 1:length(ids))) ids <- sort(as.numeric(ids))
obj$covar <- data.frame(
id = ids,
phenotyped = ids %in% dimnames(obj$pheno)[[1]],
genotyped = ids %in% rownames(obj$geno),
stringsAsFactors = FALSE
)
# family information for genotyped indviduals
if (!is.null(family)) {
if (!is.data.frame(family) & !is.matrix(family)) stop("family must be either a data.frame or a matrix, not a ", class(family))
colnames(family)[1] <- "family"
family$id <- as.character(rownames(family))
obj$covar <- merge(obj$covar, family, by = "id", all = TRUE)
obj$covar$genotyped[is.na(obj$covar$genotyped)] <- FALSE
obj$covar$phenotyped[is.na(obj$covar$phenotyped)] <- FALSE
} else {
obj$covar$family <- rep(NA, nrow(obj$covar))
}
# add covar from arguments, if available
if (!is.null(covar)) {
if (is.matrix(covar)) {
if (is.null(rownames(covar))) warning("the supplied covar's rownames will default to 1:nrow(covar), which is likely not correct. Inspect the resulting $covar.")
covar <- as.data.frame(covar)
}
if (!is.data.frame(covar)) stop("covar must be a data.frame, not a ", class(covar))
# do not use any existing columns named 'genotyped', 'phenotyped' or 'id'
covar <- covar[!colnames(covar) %in% c("genotyped", "phenotyped", "id", "family")]
# merge with existing data
obj$covar <- merge(obj$covar, covar, by.x = 1, by.y = 0, all = TRUE)
}
if (!is.null(obj$pedigree)) {
obj$covar <- obj$covar[match(obj$pedigree$ID, obj$covar$id), ]
}
# further information
obj$info$map.unit <- map.unit
obj$info$codeGeno <- FALSE
obj$info$attrPhenoCovars <- attrModCovars
obj$info$version <- paste("gpData object was created by synbreed version", sessionInfo()$otherPkgs$synbreed$Version)
obj$info$Call <- infoCall
# return object of class 'gpData'
class(obj) <- "gpData"
return(obj)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.