Nothing
#' @title get.pgs.percentiles
#' @description Calculate percentiles and report decile and quartile ranks for a vector of polygenic scores
#' @param pgs numeric vector of polygenic scores
#' @param n.percentiles integer number of percentiles to calculate (optional)
#' @return data frame with columns for percentile, decile, quartile, and optional n.percentiles
#' @examples
#' x <- rnorm(100);
#' get.pgs.percentiles(x, n.percentiles = 20);
#' @export
get.pgs.percentiles <- function(pgs, n.percentiles = NULL) {
# check that pgs is numeric
if (!is.numeric(pgs)) {
stop('pgs must be a numeric vector');
}
# check that n.percentiles is a mathematical integer
if (!is.null(n.percentiles) && (n.percentiles %% 1 != 0)) {
stop('n.percentiles must be an integer');
}
# save the original order of the PGS
pgs.percentile.data <- data.frame(pgs = pgs, order = 1:length(pgs));
# calculate percentiles
pgs.percentile.data$percentile <- sapply(
X = pgs,
FUN = function(x) {
sum(x >= pgs, na.rm = TRUE) / length(pgs);
}
);
# sort pgs percentiles in descending order to facilitate decile and quartile calculations
pgs.percentile.data <- pgs.percentile.data[order(pgs.percentile.data$percentile, decreasing = TRUE),];
# compute which decile each PGS belongs to
pgs.percentile.data$decile <- cut(pgs.percentile.data$percentile, breaks = seq(0, 1, 0.1), labels = FALSE);
# compute which quartile each pgs belongs to
pgs.percentile.data$quartile <- cut(pgs.percentile.data$percentile, breaks = seq(0, 1, 0.25), labels = FALSE);
if (!is.null(n.percentiles)) {
# calculate user-specified percentiles
pgs.percentile.data$percentile.X <- cut(pgs.percentile.data$percentile, breaks = seq(0, 1, 1 / n.percentiles), labels = FALSE);
# replace column name with the number of percentiles
colnames(pgs.percentile.data) <- gsub('percentile.X', paste0('percentile.', n.percentiles), colnames(pgs.percentile.data));
}
# go back to the original order
pgs.percentile.data <- pgs.percentile.data[order(pgs.percentile.data$order),];
# remove extra columns
pgs.percentile.data$order <- NULL;
pgs.percentile.data$pgs <- NULL;
return(pgs.percentile.data);
}
# utility function for identifying data as continuous or binary for analysis and plotting purposes
classify.variable.type <- function(data, continuous.threshold = 4) {
# identify continuous and binary variables
continuous.vars.index <- sapply(
X = data,
FUN = function(x) {
('numeric' == class(x) | 'integer' == class(x)) & continuous.threshold < length(unique(na.omit(x)));
}
);
binary.vars.index <- sapply(
X = data,
FUN = function(x) {
2 == length(unique(na.omit(x)));
}
);
other.vars.index <- !continuous.vars.index & !binary.vars.index;
return(list(
continuous = continuous.vars.index,
binary = binary.vars.index,
other = other.vars.index
));
}
# utility function that runs linear and logistic regression on a polygenic score and a set of phenotypes
#' @title Run linear and logistic regression on a polygenic score and a set of phenotypes
#' @description Phenotype data variables are automatically classified as continuous or binary and a simple linear regression or logistic regression, respectively, is run between the polygenic score and each phenotype.
#' Categorical phenotypes with more than two category are ignored.
#' If a binary variable is not formatted as a factor, it is converted to a factor using \code{as.factor()} defaults. For logistic regression, the first level is classified as "failure" and the second "success" by \code{glm()} defaults.
#' @param pgs numeric vector of polygenic scores
#' @param phenotype.data data.frame of phenotypes
#' @return data frame with columns for phenotype, model, beta, se, p.value, r.squared, and AUC
#' @examples
#' set.seed(200);
#' pgs <- rnorm(200, 0, 1);
#' phenotype.data <- data.frame(
#' continuous.pheno = rnorm(200, 1, 1),
#' binary.pheno = sample(c(0, 1), 200, replace = TRUE)
#' );
#' run.pgs.regression(pgs, phenotype.data);
#' @export
run.pgs.regression <- function(pgs, phenotype.data) {
# initialize conditional outputs
linear.model.aggregated <- NULL;
logistic.model.aggregated <- NULL;
# identify continuous and binary phenotypes
variable.index.by.type <- classify.variable.type(phenotype.data);
# run linear regression on continuous phenotypes
if (any(variable.index.by.type$continuous)) {
continuous.data <- subset(phenotype.data, select = variable.index.by.type$continuous);
linear.model <- lapply(
X = continuous.data,
FUN = function(x) {
return(summary(lm(x ~ pgs, data = phenotype.data)));
}
);
# aggregate results in a data frame
linear.model.aggregated <- lapply(
X = linear.model,
FUN = function(x) {
coeff.index <- if (nrow(x$coefficients) == 1) NA else 'pgs';
data.frame(
beta = x$coefficients[coeff.index, 'Estimate'],
se = x$coefficients[coeff.index, 'Std. Error'],
p.value = x$coefficients[coeff.index, 'Pr(>|t|)'],
r.squared = x$r.squared,
AUC = NA
);
}
);
linear.model.aggregated <- do.call(rbind, linear.model.aggregated);
linear.model.aggregated <- data.frame(
phenotype = names(linear.model),
model = 'linear.regression',
linear.model.aggregated
);
}
# run logistic regression on binary phenotypes
if (any(variable.index.by.type$binary)) {
binary.data <- subset(phenotype.data, select = variable.index.by.type$binary);
# ensure binary data is formatted as factors
binary.data <- lapply(
X = binary.data,
FUN = function(x) {
if (!is.factor(x)) {
return(as.factor(x));
} else {
return(x);
}
}
);
logistic.model <- lapply(
X = binary.data,
FUN = function(x) {
return(glm(x ~ pgs, data = binary.data, family = binomial));
}
);
logistic.model.summary <- lapply(
X = logistic.model,
FUN = summary
);
logistic.auc <- lapply(
X = logistic.model,
FUN = function(x) {
predictions <- predict(x, type = 'response'); # get predicted probabilities on the training set
# auc is a bit wordy by default
auc <- suppressMessages(pROC::auc(x$y, predictions)); # compute area under the curve on training set
}
);
logistic.model.aggregated <- lapply(
X = logistic.model.summary,
FUN = function(x) {
coeff.index <- if (nrow(x$coefficients) == 1) NA else 'pgs';
data.frame(
beta = x$coefficients[coeff.index, 'Estimate'],
se = x$coefficients[coeff.index, 'Std. Error'],
p.value = x$coefficients[coeff.index, 'Pr(>|z|)'],
r.squared = NA
);
}
);
logistic.model.aggregated <- do.call(rbind, logistic.model.aggregated);
logistic.model.aggregated$AUC <- unlist(logistic.auc);
logistic.model.aggregated <- data.frame(
phenotype = names(logistic.model),
model = 'logistic.regression',
logistic.model.aggregated
);
}
all.model.results <- rbind(linear.model.aggregated, logistic.model.aggregated);
return(all.model.results);
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.