#' Set the GP mean vector, taking TMM or other normalization
#' into account
#'
#' @export
#' @description Creates the \code{c_hat} input for \code{lgp},
#' so that it accounts for normalization between data points in the
#' \code{"poisson"} or \code{"nb"} observation model
#' @param y response variable, vector of length \code{n}
#' @param norm_factors normalization factors, vector of length \code{n}
#' @return a vector of length \code{n}, which can be used as
#' the \code{c_hat} input to the \code{lgp} function
#' @family data frame handling functions
adjusted_c_hat <- function(y, norm_factors) {
check_lengths(y, norm_factors)
check_non_negative_all(y)
check_integer_all(y)
check_positive_all(norm_factors)
c_hat <- log(mean(y))
c_hat <- rep(c_hat, length(y))
c_hat <- c_hat + log(norm_factors)
return(c_hat)
}
#' Easily add a categorical covariate to a data frame
#'
#' @export
#' @param data the original data frame
#' @param x A named vector containing the category for each individual.
#' The names should specify the individual id.
#' @param id_var name of the id variable in \code{data}
#' @return A data frame with one column added. The new column will
#' have same name as the variable passed as input \code{x}.
#' @family data frame handling functions
add_factor <- function(data, x, id_var = "id") {
data <- convert_to_data_frame(data)
name <- deparse(substitute(x))
bad <- name %in% colnames(data)
if (bad) stop("<data> already contains a variable called '", name, "'!")
check_named(x)
x_id <- as.numeric(names(x))
data_id <- dollar(data, id_var)
uid <- unique(x_id)
new_factor <- rep(0, length(data_id))
for (id in uid) {
i_data <- which(data_id == id)
i_new <- which(x_id == id)
new_factor[i_data] <- x[i_new]
}
data[[name]] <- as.factor(new_factor)
return(data)
}
#' Add a crossing of two factors to a data frame
#'
#' @export
#' @param data a data frame
#' @param fac1 name of first factor, must be found in \code{df}
#' @param fac2 name of second factor, must be found in \code{df}
#' @param new_name name of the new factor
#' @return a data frame
#' @family data frame handling functions
add_factor_crossing <- function(data, fac1, fac2, new_name) {
df <- convert_to_data_frame(data)
a <- dollar(df, fac1)
b <- dollar(df, fac2)
check_not_null(new_name)
check_type(a, "factor")
check_type(b, "factor")
df[[new_name]] <- interaction(a, b, sep = "*")
return(df)
}
#' Easily add the disease-related age variable to a data frame
#'
#' @export
#' @description Creates the disease-related age covariate vector based on the
#' disease initiation times and adds it to the data frame
#' @param data the original data frame
#' @param t_init A named vector containing the observed initiation or onset
#' time for each individual. The names, i.e. \code{names(t_init)}, should
#' specify the individual id.
#' @param id_var name of the id variable in \code{data}
#' @param time_var name of the time variable in \code{data}
#' @return A data frame with one column added. The new column will
#' be called \code{dis_age}. For controls, its value will be \code{NaN}.
#' @family data frame handling functions
add_dis_age <- function(data, t_init, id_var = "id", time_var = "age") {
data <- convert_to_data_frame(data)
bad <- "dis_age" %in% colnames(data)
if (bad) stop("<data> already contains a variable called 'dis_age'!")
check_named(t_init)
x_id <- as.numeric(names(t_init))
data_id <- dollar(data, id_var)
data_age <- dollar(data, time_var)
uid <- unique(x_id)
dage <- rep(NaN, length(data_id))
for (id in uid) {
i_data <- which(data_id == id)
i_new <- which(x_id == id)
dage[i_data] <- data_age[i_data] - t_init[i_new]
}
data$dis_age <- dage
return(data)
}
#' Split data into training and test sets
#'
#' @name split
#' @param data a data frame
#' @param var_name name of a factor in the data
#' @description
#' \itemize{
#' \item \code{split_by_factor} splits according to given factor
#' \item \code{split_within_factor} splits according to given
#' data point indices within the same level of a factor
#' \item \code{split_within_factor_random} selects k points
#' from each level of a factor uniformly at random as test data
#' \item \code{split_random} splits uniformly at random
#' \item \code{split_data} splits according to given data rows
#' }
#' @return a named list with names \code{train}, \code{test}, \code{i_train}
#' and \code{i_test}
#' @family data frame handling functions
NULL
#' @export
#' @rdname split
#' @param test the levels of the factor that will be used as test data
split_by_factor <- function(data, test, var_name = "id") {
check_type(data, "data.frame")
fac <- dollar(data, var_name)
check_type(fac, "factor")
i_test <- which(fac %in% test)
split_data(data, i_test)
}
#' @export
#' @rdname split
#' @param idx_test indices point indices with the factor
split_within_factor <- function(data, idx_test, var_name = "id") {
data <- convert_to_data_frame(data)
id <- dollar(data, var_name)
check_type(id, "factor")
uid <- unique(id)
i_test <- c()
for (i in uid) {
inds <- which(id == i)
i_test <- c(i_test, inds[idx_test])
}
split_data(data, i_test)
}
#' @export
#' @rdname split
#' @param k_test desired number of test data points per each level of the
#' factor
split_within_factor_random <- function(data, k_test = 1, var_name = "id") {
data <- convert_to_data_frame(data)
id <- dollar(data, var_name)
check_type(id, "factor")
uid <- unique(id)
i_test <- c()
for (i in uid) {
inds <- which(id == i)
L <- length(inds)
i_sel <- sample.int(L, k_test)
i_test <- c(i_test, inds[i_sel])
}
split_data(data, i_test)
}
#' @export
#' @rdname split
#' @param p_test desired proportion of test data
#' @param n_test desired number of test data points (if NULL, \code{p_test}
#' is used to compute this)
split_random <- function(data, p_test = 0.2, n_test = NULL) {
data <- convert_to_data_frame(data)
n_total <- dim(data)[1]
if (is.null(n_test)) {
n_test <- round(p_test * n_total)
}
i_test <- sample.int(n_total, size = n_test)
split_data(data, i_test)
}
#' @export
#' @rdname split
#' @param i_test test data row indices
#' @param sort_ids should the test indices be sorted into increasing order
split_data <- function(data, i_test, sort_ids = TRUE) {
data <- convert_to_data_frame(data)
i_test <- if (sort_ids) sort(i_test, decreasing = FALSE) else sort_ids
n_total <- dim(data)[1]
i_train <- setdiff(seq_len(n_total), i_test)
# Return
list(
train = data[i_train, ],
test = data[i_test, ],
i_train = i_train,
i_test = i_test
)
}
#' Create test input points for prediction
#'
#' @description Replaces a continuous variable \code{x} in the data frame, and
#' possibly another continuous variable \code{x_ns} derived from it, with new
#' values, for each level of a grouping factor (usually id)
#' @export
#' @param data A data frame. Can also be an \linkS4class{lgpfit} or
#' \linkS4class{lgpmodel} object, in which case data is extracted from it.
#' @param group_by name of the grouping variable, must be a factor
#' in \code{data} (or use \code{group_by=NA} to create a dummy grouping
#' factor which has only one value)
#' @param x of the variable along which to extend,
#' must be a numeric in \code{data}
#' @param x_ns of a nonstationary variable derived from \code{x},
#' must be a numeric in \code{data}
#' @param x_values the values of \code{x} to set for each individual
#' @return a data frame containing the following columns
#' \itemize{
#' \item all factors in the original \code{data}
#' \item \code{x}
#' \item \code{x_ns} (unless it is NULL)
#' }
#'
#' @family data frame handling functions
new_x <- function(data, x_values, group_by = "id", x = "age", x_ns = NULL) {
data <- allow_data_model_or_fit(data)
data <- convert_to_data_frame(data)
check_not_null(x)
check_not_null(x_values)
check_in_data(x, data, "data")
if (is.na(group_by)) {
x_grp <- create_grouping_factor(data, group_by) # util
data["group__"] <- x_grp
group_by <- "group__"
} else {
check_in_data(group_by, data, "data")
}
df <- pick_one_row_each(data, group_by)
k <- length(x_values)
col_names <- if (is.null(x_ns)) x else c(x, x_ns)
df <- select_factors_and(df, col_names)
N <- nrow(df)
inds <- rep(seq_len(N), each = k)
df <- df[inds, ]
x_val_rep <- rep(x_values, times = N)
df[[x]] <- x_val_rep
if (!is.null(x_ns)) {
t0 <- get_teff_obs(data, group_by, x, x_ns)
t0 <- as.numeric(t0)
df[[x_ns]] <- x_val_rep - rep(t0, each = k)
}
rownames(df) <- NULL
return(df)
}
# Get observed effect times from a data frame
get_teff_obs <- function(data, group_by = "id", x = "age",
x_ns = "diseaseAge") {
check_type(data, "data.frame")
df <- pick_one_row_each(data, group_by)
times <- dollar(df, x) - dollar(df, x_ns)
names(times) <- dollar(df, group_by)
return(times)
}
# For each unique value of a factor fac, pick one row from data
pick_one_row_each <- function(data, fac) {
check_type(data, "data.frame")
z <- dollar(data, fac)
check_type(z, "factor")
rows <- c()
for (lev in levels(z)) {
inds <- which(z == lev)
rows <- c(rows, inds[1])
}
data[rows, ]
}
# Select data columns which are factors or have name specified by valid
select_factors_and <- function(data, valid) {
check_type(data, "data.frame")
col_inds <- c()
D <- ncol(data)
nams <- names(data)
for (j in seq_len(D)) {
a <- is.factor(data[, j])
b <- nams[j] %in% valid
if (a || b) {
col_inds <- c(col_inds, j)
}
}
data[, col_inds]
}
#' Vizualizing longitudinal data
#'
#' @export
#' @param data A data frame.
#' @param x_name Name of x-axis variable.
#' @param y_name Name of the y-axis variable.
#' @param group_by Name of grouping variable (must be a factor).
#' @param color_by Name of coloring variable (must be a factor).
#' @param facet_by Name of the faceting variable (must be a factor).
#' @param highlight Value of category of the \code{group_by} variable
#' that is highlighted. Can only be used if \code{color_by} is \code{NULL}.
#' @param main main plot title
#' @param sub plot subtitle
#' @return a \code{\link[ggplot2]{ggplot}} object
#' @family data frame handling
plot_data <- function(data,
x_name = "age",
y_name = "y",
group_by = "id",
facet_by = NULL,
color_by = NULL,
highlight = NULL,
main = NULL,
sub = NULL) {
data <- allow_data_model_or_fit(data)
data <- convert_to_data_frame(data)
# Check that needed variables exist
check_in_data(x_name, data, "data")
check_in_data(y_name, data, "data")
check_in_data(group_by, data, "data")
# Create initial plot and add data
df <- data[c(x_name, y_name, group_by, color_by, facet_by)]
df <- plot_data_add_highlight_factor(df, group_by, highlight)
skip_hl <- is.null(highlight)
color_by <- if (skip_hl) color_by else paste0(group_by, "_")
aes <- plot_data_aes(x_name, y_name, group_by, color_by)
h <- ggplot2::ggplot(df, aes)
# Add data
h <- h + ggplot2::geom_line() + ggplot2::geom_point()
# Add titles, faceting and coloring
titles <- plot_data_titles(main, sub, data, group_by)
label <- dollar(titles, "main")
subtitle <- dollar(titles, "sub")
h <- h + ggplot2::ggtitle(label = label, subtitle = subtitle)
if (!is.null(facet_by)) {
f <- stats::as.formula(paste("~", facet_by))
h <- h + ggplot2::facet_wrap(f, labeller = ggplot2::label_both)
}
return(h)
}
# Create aes for plot_data
plot_data_aes <- function(x_name, y_name, group_by, color_by) {
if (is.null(color_by)) {
aes <- ggplot2::aes_string(x = x_name, y = y_name, group = group_by)
} else {
aes <- ggplot2::aes_string(
x = x_name, y = y_name, group = group_by,
color = color_by
)
}
return(aes)
}
# Create titles for plot_data
plot_data_titles <- function(main, sub, data, group_by) {
g <- data[[group_by]]
N <- length(unique(g))
n <- dim(data)[1]
if (is.null(main)) {
main <- paste0(n, " data points")
}
if (is.null(sub)) {
sub <- paste0(
"Points that share the same value for '", group_by,
"' are connected by a line (", N, " levels)."
)
}
list(main = main, sub = sub, N = N, n = n)
}
# Add factor to data frame for highlighting in plot
plot_data_add_highlight_factor <- function(df, group_by, highlight) {
if (!is.null(highlight)) {
check_length(highlight, 1)
g <- df[[group_by]]
s <- sum(g == highlight)
if (s == 0) {
str <- paste(levels(g), collapse = ", ")
msg <- paste0(
"Invalid <highlight> argument ", highlight, "! The ",
"possible values of ", group_by, " are: {", str, "}."
)
stop(msg)
}
hl <- 1 + as.numeric(g == highlight)
levels <- c("other", highlight)
name <- paste0(group_by, "_")
df[[name]] <- as.factor(levels[hl])
}
return(df)
}
# This allows data utilities to work with lgpmodel or lgpfit, too
allow_data_model_or_fit <- function(data) {
if (is(data, "lgpfit")) data <- get_data(get_model(data))
if (is(data, "lgpmodel")) data <- get_data(data)
check_type(data, "data.frame")
return(data)
}
# Convert data to data.frame
convert_to_data_frame <- function(data) {
check_type(data, "data.frame") # check if inherits from data.frame
cname <- class(data)
if (length(cname) > 1) {
warning("data is not a plain data.frame, converting using as.data.frame()")
data <- as.data.frame(data)
}
data
}
#' Function for reading the built-in proteomics data
#'
#' @export
#' @param parentDir Path to local parent directory for the data.
#' If this is \code{NULL}, data is downloaded from
#' \url{https://github.com/jtimonen/lgpr-usage/tree/master/data/proteomics}.
#' @param protein Index or name of protein.
#' @param verbose Can this print some output?
#' @return a \code{data.frame}
#' @family built-in data
read_proteomics_data <- function(parentDir = NULL, protein = NULL,
verbose = TRUE) {
if (is.null(protein)) stop("specify name or index of protein!")
REPO_PATH <- "jtimonen/lgpr-usage/master/data/proteomics/"
RAW_PATH <- paste0("https://raw.githubusercontent.com/", REPO_PATH)
fn_X <- "liu_preproc_X.csv"
fn_Y <- "liu_preproc_Y.csv"
if (is.null(parentDir)) {
msg <- "Given parentDir was NULL, downloading the data from internet..."
log_info(msg, verbose)
fn_X <- paste0(RAW_PATH, fn_X)
fn_Y <- paste0(RAW_PATH, fn_Y)
X_data <- read_csv_url(fn_X, verbose, header = TRUE, sep = ",")
Y_data <- read_csv_url(fn_Y, verbose, header = TRUE, sep = ",")
} else {
fn_X <- paste0(parentDir, "/", fn_X)
fn_Y <- paste0(parentDir, "/", fn_Y)
X_data <- utils::read.csv(fn_X, header = TRUE, sep = ",")
Y_data <- utils::read.csv(fn_Y, header = TRUE, sep = ",")
}
# Get protein
names <- colnames(Y_data)
if (!is.character(protein)) {
pname <- names[protein]
} else {
pname <- protein
}
msg <- paste0("Read data for protein '", pname, "'.")
log_info(msg, verbose)
# Remove rows containing NaNs for the protein
y <- Y_data[[pname]]
notnan <- which(!is.nan(y))
num_nan <- length(which(is.nan(y)))
data <- data.frame(cbind(X_data, y))
data <- data[notnan, ]
msg <- paste0(
"Removed ", num_nan, " rows with NaN value as the ",
"protein measurement."
)
if (num_nan > 0) log_info(msg, verbose)
# Categorical variables to factors
data$id <- as.factor(data$id)
data$group <- as.factor(data$group)
data$sex <- as.factor(data$sex)
levels(data$group)[levels(data$group) == 0] <- "Control"
levels(data$group)[levels(data$group) == 1] <- "Case"
levels(data$sex)[levels(data$sex) == 0] <- "Female"
levels(data$sex)[levels(data$sex) == 1] <- "Male"
return(data)
}
# Read a csv from URL
read_csv_url <- function(fn, verbose, ...) {
msg <- paste0("Reading ", fn, " with ?accessType=DOWNLOAD")
log_info(msg, verbose)
url <- paste0(fn, "?accessType=DOWNLOAD")
text <- RCurl::getURL(url)
utils::read.csv(text = text, ...)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.