library(SomaDataIO) library(ggplot2) library(dplyr) library(tidyr) library(purrr) knitr::opts_chunk$set( echo = TRUE, collapse = TRUE, comment = "#>", fig.path = "figures/linear-reg-" )
Although targeted statistical analyses are beyond the scope of
the SomaDataIO
package, below is an example analysis
that typical users/customers would perform on 'SomaScan' data.
It is not intended to be a definitive guide in statistical
analysis and existing packages do exist in the R
ecosystem that perform
parts or extensions of these techniques. Many variations of the workflow
below exist, however the framework highlights how one could perform standard
preliminary analyses on 'SomaScan' data.
# the `example_data` package data dim(example_data) table(example_data$SampleType) # center/scale cs <- function(.x) { # .x = numeric vector out <- .x - mean(.x) # center out / sd(out) # scale } # prepare data set for analysis cleanData <- example_data |> filter(SampleType == "Sample") |> # rm control samples drop_na(Age) |> # rm NAs if present log10() |> # log10-transform (Math Generic) modify_at(getAnalytes(example_data), cs) # center/scale analytes summary(cleanData$Age)
# idx = hold-out idx <- withr::with_seed(3, sample(1:nrow(cleanData), size = nrow(cleanData) - 50)) train <- cleanData[idx, ] test <- cleanData[-idx, ] # assert no overlap isTRUE( all.equal(intersect(rownames(train), rownames(test)), character(0)) )
We use the cleanData
, train
, and test
data objects from above.
LinR_tbl <- getAnalyteInfo(train) |> # `train` from above select(AptName, SeqId, Target = TargetFullName, EntrezGeneSymbol, UniProt) |> mutate( formula = map(AptName, ~ as.formula(paste("Age ~", .x, collapse = " + "))), model = map(formula, ~ stats::lm(.x, data = train, model = FALSE)), # fit models slope = map(model, coef) |> map_dbl(2L), # pull out B_1 p.value = map2_dbl(model, AptName, ~ { summary(.x)$coefficients[.y, "Pr(>|t|)"] }), # pull out p-values fdr = p.adjust(p.value, method = "BH") # FDR for multiple testing ) |> arrange(p.value) |> # re-order by `p-value` mutate(rank = row_number()) # add numeric ranks LinR_tbl
Fit an 8-marker model with the top 8 features from LinR_tbl
:
feats <- head(LinR_tbl$AptName, 8L) form <- as.formula(paste("Age ~", paste(feats, collapse = "+"))) fit <- stats::lm(form, data = train, model = FALSE) n <- nrow(test) p <- length(feats) # Results res <- tibble( true_age = test$Age, pred_age = predict(fit, newdata = test), pred_error = pred_age - true_age ) # Lin's Concordance Correl. Coef. # Accounts for location + scale shifts linCCC <- function(x, y) { stopifnot(length(x) == length(y)) a <- 2 * cor(x, y) * sd(x) * sd(y) b <- var(x) + var(y) + (mean(x) - mean(y))^2 a / b } # Regression metrics tibble( rss = sum(res$pred_error^2), # residual sum of squares tss = sum((test$Age - mean(test$Age))^2), # total sum of squares rsq = 1 - (rss / tss), # R-squared rsqadj = max(0, 1 - (1 - rsq) * (n - 1) / (n - p - 1)), # Adjusted R-squared R2 = stats::cor(res$true_age, res$pred_age)^2, # R-squared Pearson approx. MAE = mean(abs(res$pred_error)), # Mean Absolute Error RMSE = sqrt(mean(res$pred_error^2)), # Root Mean Squared Error CCC = linCCC(res$true_age, res$pred_age) # Lin's CCC )
lims <- range(res$true_age, res$pred_age) res |> ggplot(aes(x = true_age, y = pred_age)) + geom_point(colour = "#24135F", alpha = 0.5, size = 4) + expand_limits(x = lims, y = lims) + # make square geom_abline(slope = 1, colour = "black") + # add unit line geom_rug(colour = "#286d9b", linewidth = 0.2) + labs(y = "Predicted Age", x = "Actual Age") + ggtitle("Concordance in Predicted vs. Actual Age") + theme(plot.title = element_text(size = 21, face = "bold"), axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.