library(SomaDataIO)
library(ggplot2)
library(dplyr)
library(tidyr)
library(purrr)
knitr::opts_chunk$set(
  echo = TRUE,
  collapse = TRUE,
  comment = "#>",
  fig.path = "figures/linear-reg-"
)

Regression of Continuous Variables

Although targeted statistical analyses are beyond the scope of the SomaDataIO package, below is an example analysis that typical users/customers would perform on 'SomaScan' data.

It is not intended to be a definitive guide in statistical analysis and existing packages do exist in the R ecosystem that perform parts or extensions of these techniques. Many variations of the workflow below exist, however the framework highlights how one could perform standard preliminary analyses on 'SomaScan' data.

Data Preparation

# the `example_data` package data
dim(example_data)

table(example_data$SampleType)

# center/scale
cs <- function(.x) {    # .x = numeric vector
  out <- .x - mean(.x)  # center
  out / sd(out)         # scale
}

# prepare data set for analysis
cleanData <- example_data |>
  filter(SampleType == "Sample") |>          # rm control samples
  drop_na(Age) |>                            # rm NAs if present
  log10() |>                                 # log10-transform (Math Generic)
  modify_at(getAnalytes(example_data), cs)   # center/scale analytes

summary(cleanData$Age)

Set up Train/Test Data

# idx = hold-out 
idx   <- withr::with_seed(3, sample(1:nrow(cleanData), size = nrow(cleanData) - 50))
train <- cleanData[idx, ]
test  <- cleanData[-idx, ]

# assert no overlap
isTRUE(
  all.equal(intersect(rownames(train), rownames(test)), character(0))
)

Linear Regression

We use the cleanData, train, and test data objects from above.

Predict Age

LinR_tbl <- getAnalyteInfo(train) |>                # `train` from above
  select(AptName, SeqId, Target = TargetFullName, EntrezGeneSymbol, UniProt) |>
  mutate(
    formula = map(AptName, ~ as.formula(paste("Age ~", .x, collapse = " + "))),
    model   = map(formula, ~ stats::lm(.x, data = train, model = FALSE)), # fit models
    slope   = map(model, coef) |> map_dbl(2L),     # pull out B_1
    p.value = map2_dbl(model, AptName, ~ {
      summary(.x)$coefficients[.y, "Pr(>|t|)"] }), # pull out p-values
    fdr     = p.adjust(p.value, method = "BH")     # FDR for multiple testing
  ) |>
  arrange(p.value) |>            # re-order by `p-value`
  mutate(rank = row_number())    # add numeric ranks

LinR_tbl

Fit Model | Calculate Performance

Fit an 8-marker model with the top 8 features from LinR_tbl:

feats <- head(LinR_tbl$AptName, 8L)
form  <- as.formula(paste("Age ~", paste(feats, collapse = "+")))
fit   <- stats::lm(form, data = train, model = FALSE)
n     <- nrow(test)
p     <- length(feats)

# Results
res   <- tibble(
  true_age   = test$Age,
  pred_age   = predict(fit, newdata = test),
  pred_error = pred_age - true_age
)

# Lin's Concordance Correl. Coef.
# Accounts for location + scale shifts
linCCC <- function(x, y) {
  stopifnot(length(x) == length(y))
  a <- 2 * cor(x, y) * sd(x) * sd(y)
  b <- var(x) + var(y) + (mean(x) - mean(y))^2
  a / b
}

# Regression metrics
tibble(
  rss  = sum(res$pred_error^2),                 # residual sum of squares
  tss  = sum((test$Age - mean(test$Age))^2),    # total sum of squares
  rsq  = 1 - (rss / tss),                       # R-squared
  rsqadj = max(0, 1 - (1 - rsq) * (n - 1) / (n - p - 1)), # Adjusted R-squared
  R2   = stats::cor(res$true_age, res$pred_age)^2,        # R-squared Pearson approx.
  MAE  = mean(abs(res$pred_error)),             # Mean Absolute Error
  RMSE = sqrt(mean(res$pred_error^2)),          # Root Mean Squared Error
  CCC  = linCCC(res$true_age, res$pred_age)     # Lin's CCC
)

Visualize Concordance

lims <- range(res$true_age, res$pred_age)
res |>
  ggplot(aes(x = true_age, y = pred_age)) +
  geom_point(colour = "#24135F", alpha = 0.5, size = 4) +
  expand_limits(x = lims, y = lims) +                # make square
  geom_abline(slope = 1, colour = "black") +         # add unit line
  geom_rug(colour = "#286d9b", linewidth = 0.2) +
  labs(y = "Predicted Age", x = "Actual Age") +
  ggtitle("Concordance in Predicted vs. Actual Age") +
  theme(plot.title = element_text(size = 21, face = "bold"),
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14))


SomaLogic/SomaDataIO documentation built on Feb. 8, 2025, 12:19 p.m.