# phenotype
#' @title Create a phenotype object
#'
#' @description This function creates a phenotype object.
#'
#' @param data a data frame (or object coercible by as.data.frame to a data
#' frame) containing the variables in the model.
#' @param formula an object of class "formula" (or one that can be coerced to
#' that class): a symbolic description of the model to be fitted.The details
#' of model specification are given under 'Details' in the "lm" help file.
#' @param id (optional) column name identifier of the unique subjects in
#' \code{data}. If given the phenotype sample ids will become the rownames
#' of \code{data}. If \code{id} is \code{NULL} the suject id's are assumed
#' to be the rownames of \code{data}.
#' @param gender (optional) column name identifier for the gender in \code{data}
#' @param include (optional) character vector of the subjects in \code{id} to be
#' included in the analysis. See Details.
#' @param exclude (optional) character vector of the subjects in \code{id} to be
#' excluded in the analysis. See Details.
#' @param reduce logical. Should the dataset be reduced to only columns used in
#' formula. See Details
#'
#' @details \code{data} and \code{formula} are similar to what is needed for
#' \code{lm}. If a formula is not specifed, no further analysis will run.
#'
#' If the \code{id} is not specified it is assumed that the rownames in
#' \code{data} are the unique subject identifier. If \code{id} is specifed
#' then \code{rownames(data)} will be set to \code{id}. Thus the subject id's
#' must be unique and without duplication.
#'
#' If the gender column is not specified it will be set to \code{NULL}.
#'
#' Typically either \code{include} or \code{exclude}, or both, is set to
#' \code{NULL}. It is important to note that return \code{include} and
#' \code{exclude} are taken from the subjects in \code{data} and not from the input parameters.
#' [TBD: explain include/exclude]
#'
#' If \code{reduce == TRUE} then \code{data} is reduced to a data.frame
#' containing the variables used in \code{formula} plus \code{gender}. See
#' get_all_vars for specifics.
#'
#' @seealso get_all_vars
#'
#' @return an object of class 'phenotype'. This is a list typicaly used to feed
#' into an analysis function.
#'
#' An object of class 'phenotype' is a list containing at least the following
#' components:
#'
#' \itemize{
#' \item data data frame of the data for analysis
#' \item formula formula to be used in further analysis
#' \item gender column name containing the gender of the subjects
#' \item include the subjects in \code{data} that will be included is further analysis.
#' \item exclude the subjects in \code{data} that will be excluded is further analysis.
#' }
#'
#' It is important to note that return \code{include} and \code{exclude} are taken from data and not from the input parameters.
#'
#' @export
#
# [TBD]
# -Do we auto drop missing values?
# -verbose option?
# -add genderChar??? something to demote which character is "MALE/FEMALE"
# -add print method to show the meta data
# -add "problems" (a la readr)
phenotype <- function(data, formula, family, id, gender,
include, exclude) {
# check data
if (!is.data.frame(data)) {
stop("Error in parameter 'data': Must be a data.frame or a class which extends a data.frame.")
}
# check formula
if (missing(formula)) {
stop("'formula' must be specified!")
}
# - make sure all varaibles are in the data frame
vars <- all.vars(as.formula(formula))
vars <- vars[!(vars %in% ".")]
if (all(vars %in% colnames(data))) {
mf <- get_all_vars(formula, data)
vars <- colnames(mf)
if (nrow(na.omit(mf)) <= 1L) {
stop("'formula' results in empty 'data'")
}
} else {
stop("One or more formula variables not in data")
}
# check family
if (missing(family) || is.null(family) || length(family) == 0L || is.na(family)) {
stop("'family' must be specified!")
}
if (is.character(family)) {
if (length(family) > 1L) {
stop("Family can be only one of: 'gaussian', 'binomial', or 'cox'")
}
if (!all(family %in% c('gaussian', 'binomial', 'cox'))) {
stop("Only family 'gaussian', 'binomial', or 'cox' currently supported")
}
} else {
stop("'family' must be of type character.")
}
# check id
if (missing(id)) {
warning("id not specified. Using row names")
id <- ".id"
data$.id <- rownames(data)
} else {
if (length(id) != 1L) {
stop("Only one id column can be specified.")
} else {
if (!(id %in% colnames(data))) {
stop(paste0(id, " is not a column in data."))
}
}
if (!is.character(data[ , id])) {
warning("Converting id column to character.")
data[ , id] <- as.character(data[ , id])
}
if (anyDuplicated(data[ , id])) {
stop("Duplicate phenotype ids found.")
}
if (anyNA(data[ , id])) {
stop("Missing phenotype ids")
}
}
idCol <- match(id, colnames(data))
# check gender
if (missing(gender)) {
gender <- NULL
} else {
if (length(gender) != 1L) {
stop("Only one gender column can be specified.")
} else {
if (!(gender %in% colnames(data))) {
stop(paste0(gender, " is not a column in data."))
}
}
if (anyNA(data[ , gender])) {
stop("Missing gender data")
}
# [TBD] - is a type than can be grouped
# [TBD] - has no more than 2 groups
# [TBD] - can be converted to TRUE/FALSE??
}
subjects_all <- data[ , id]
# check include
if (!missing(include)) {
if (!is.character(include)) {
stop("include must be a character vector")
}
if (!all(include %in% data[ ,id])) {
warning("Not all ids in 'include' are in 'data'")
}
subjects_include <- intersect(include, data[ , id])
data <- data[(data[, id] %in% subjects_include), , drop = FALSE]
}
# check exclude
if (!missing(exclude)) {
if (!is.character(exclude)) {
stop("exclude must be a character vector")
}
if (!all(exclude %in% data[ ,id])) {
warning("Not all ids in 'exclude' are in 'data'")
}
subjects_exclude <- setdiff(data[ , id], exclude)
data <- data[(data[, id] %in% subjects_exclude), , drop = FALSE]
}
# check null model can be built with the parameters given.
nullmodel <- if (family == "binomial" || family == "gaussian") {
try(glm(formula = as.formula(formula), family = family, data = data[ , -idCol]))
} else if (family == "cox") {
try(coxph(formula = as.formula(formula), data = data[ , -idCol]))
} else {
stop("Unknown family type. Only 'gaussian', 'binomial', and 'cox' are currently supported.")
}
if (is_try_error(nullmodel)) {
stop("nullmodel can not be built.")
}
structure(
data,
formula = formula,
family = family,
idCol = id,
genderCol = gender,
included = data[, id],
excluded = setdiff(subjects_all, data[, id]),
class = unique(c("phenotype", class(data)))
)
}
#' @rdname phenotype
#' @export
is_phenotype <- function(x) inherits(x, "phenotype")
# get functions ---------------------------------------------------------------
#' @export
get_subjects.phenotype <- function(x, excluded = FALSE) {
# [TBD] check exlude is a logical of length 1
stopifnot(length(excluded) == 1L)
if (excluded) {
attr(x, "excluded")
} else {
intersect(x[[attr(x, "idCol")]], attr(x, "included"))
}
}
#' @export
get_formula <- function(x) { attr(x, "formula") }
#' @export
get_family <- function(x) { attr(x, "family") }
#' @export
get_idCol <- function(x) { attr(x, "idCol") }
#' @export
get_genderCol <- function(x) { attr(x, "genderCol") }
#' @export
get_included <- function(x) { attr(x, "included") }
#' @export
get_excluded <- function(x) { attr(x, "excluded") }
# single verbs -----------------------------------------------------------------
#' @export
reduce.phenotype <- function(p, common) {
dropped <- setdiff(get_subjects(p), common$subjects)
if (length(dropped) == 0L) {
attr(p, "dropped") <- NA
} else {
attr(p, "dropped") <- dropped
}
o <- match(common$subjects, p[ , get_idCol(p)])
p[o, ]
}
# This function provides a basic check that the phenotype file is a suitable
# for seqMeta. It checks that the required column names are in the phenotype
# data frame.
#
checkPhenotype <- function(p, pformula, idCol=NULL, genderCol=NULL) {
if (!is.null(idCol)) {
if (!(idCol %in% colnames(p))) {
stop("ID column not in the phenotype data frame.")
}
if (any(duplicated(p[ , idCol]))) {
stop("Duplicated phenotype ids.")
}
}
if(!is.null(genderCol)) {
if (!(genderCol %in% colnames(p))) {
msg <- paste(genderCol, "not found in phenotype file.", sep=" ")
stop(msg)
}
gtype <- typeof(p[ , genderCol])
g <- unique(p[ , genderCol])
if(gtype == "integer") {
if (!all(g %in% c(0, 1))) {
stop("Gender must be (0/1 or F/T) indicating female/male.")
}
} else if(gtype == "character") {
if (!all(g %in% c("F", "M"))) {
stop("Gender must be (0/1 or F/T) indicating female/male.")
}
}
} else {
wmsg <- "No column given to identify Males in phenotype file. No special
handling of the X chromosme."
warning(wmsg)
}
tf <- terms(as.formula(pformula), data=p)
if (grepl("^Surv\\(", pformula)) {
cn <- attr(tf, "term.labels")
} else {
cn <- rownames(attr(tf, "factors"))
}
if (any(!(cn %in% colnames(p)))) {
msg <- paste("Formula varaibles:", cn[!(cn %in% colnames(p))], "not found in phenotype file.", sep=" ")
stop(msg)
}
return(invisible(NULL))
}
# This function reduces a data set to only the variables used in a model
# removing subjects with missing data. Also, it makes the row names of
# the resulting data fram the subject identifier
#
# p: a data frame containing the variables in the model
#
# formula: a character vector which can be coered to an object of class
# "formula" (with as.formula): a symbolic description of the model to
# be fitted. The details of model specification are given under
# 'Details' in the "lm" help file.
#
# id: (optional) colunm name identifier of the subjects
#
# gender: (optional) colunm name identifier for the gender classification of
# the subjects.
#
# returns: data frame with only the columns specified in the formula and with
# the (optional) row names as the subject identifier.
#
reducePheno <- function(p, pformula, id=NULL, gender=NULL) {
checkPhenotype(p, pformula, idCol=id, genderCol=gender)
if (!is.null(id)) {
if (!(id %in% colnames(p))) {
stop("ID column not in the phenotype data frame.")
} else {
rownames(p) <- p[ , id]
p <- p[, -match(id, colnames(p))]
}
}
if (!is.null(gender)) {
gtype <- typeof(p[ , gender])
g <- unique(p[ , gender])
if(gtype == "character") {
# if (all.equal(g, c("F", "M"))) {
if (all.equal(sort(g), c("F", "M"))) {
MF <- p[, gender] == "M"
p[, gender] <- MF
} else {
stop("Unable to convert gender. Gender must be (0/1 or F/T) indicating female/male.")
}
}
}
# tf <- terms(as.formula(pformula), data=p)
# cn <- rownames(attr(tf, "factors"))
tf <- terms(as.formula(pformula), data=p)
if (grepl("^Surv\\(", pformula)) {
tcn <- attr(tf, "term.labels")
fcn <- min(match(tcn, colnames(p)), na.rm=TRUE)
cn <- union(colnames(p)[1:(fcn-1)], tcn)
} else {
cn <- rownames(attr(tf, "factors"))
}
if ((!is.null(gender)) & (!(gender %in% cn))) {
cn <- c(cn, gender)
}
# if (!(gender %in% cn)) {
# cn <- c(cn, gender)
# }
if (any(!(cn %in% colnames(p)))) {
msg <- paste("Formula varaibles:", cn[!(cn %in% colnames(p))], "not found in phenotype file.", sep=" ")
stop(msg)
} else {
if (!is.null(cn)) {
pheno <- na.omit(p[, cn])
} else {
pheno <- na.omit(p)
}
}
return(pheno)
}
summarize_phenotype <- function(pheno, gender=NULL) {
if(!is.null(gender)) {
gtype <- typeof(pheno[ , gender])
g <- unique(pheno[ , gender])
if(gtype == "integer") {
if (!all(g %in% c(0, 1))) {
stop("Gender must be (0/1 or F/T) indicating female/male.")
}
MALES <- which(pheno[ , gender] == 1L)
} else if(gtype == "character") {
if (!all(g %in% c("F", "M"))) {
stop("Gender must be (0/1 or F/T) indicating female/male.")
}
MALES <- which(pheno[ , gender] == "M")
} else if(gtype == "logical") {
MALES <- which(pheno[ , gender] == TRUE)
} else {
wmsg("Gender must be (0/1 or F/T) indicating female/male. Cannot calculate gener specific phenotype summaries")
warning(wmsg)
MALES <- NULL
}
} else {
wmsg("No column given to identify Males in phenotype file. Cannot calculate gener specific phenotype summaries")
warning(wmsg)
MALES <- NULL
}
pooled.summary <- CalculatePhenotypeSummary(pheno)
if (is.null(MALES)) {
male.summary <- NA
female.summary <- NA
} else {
male.summary <- CalculatePhenotypeSummary(pheno[MALES,])
female.summary <- CalculatePhenotypeSummary(pheno[-MALES,])
}
return(list(Pooled=pooled.summary, Male=male.summary, Female=female.summary))
}
CalculatePhenotypeSummary <- function(p) {
cols <- sapply(p, function(x) { (is.numeric(x) | is.integer(x))})
N <- colSums(!is.na(p[, cols]))
Min <- sapply(p[, cols], min, na.rm=TRUE)
Q1 <- sapply(p[, cols], quantile, probs=0.25, na.rm=TRUE)
Median <- sapply(p[, cols], median, na.rm=TRUE)
Mean <- colMeans(p[, cols], na.rm=TRUE)
SD <- sapply(p[, cols], sd, na.rm=TRUE)
Q3 <- sapply(p[, cols], quantile, probs=0.75, na.rm=TRUE)
Max <- sapply(p[, cols], max, na.rm=TRUE)
p.summary <- rbind(N, Min, Q1, Median, Mean, SD, Q3, Max)
return(p.summary)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.