##' LFMM least-squares estimates with ridge penalty
##'
##' This function computes regularized least squares estimates
##' for latent factor mixed models using a ridge penalty.
##'
##' The algorithm minimizes the following penalized least-squares criterion
##' \deqn{ L(U, V, B) = \frac{1}{2} ||Y - U V^{T} - X B^T||_{F}^2
##' + \frac{\lambda}{2} ||B||^{2}_{2} ,}
##' where Y is a response data matrix, X contains all explanatory variables,
##' U denotes the score matrix, V is the loading matrix, B is the (direct) effect
##' size matrix, and lambda is a regularization parameter.
##'
##' @param Y a response variable matrix with n rows and p columns.
##' Each column corresponds to a distinct response variable (e.g., SNP genotype,
##' gene expression level, beta-normalized methylation profile, etc).
##' Response variables must be encoded as numeric.
##' @param X an explanatory variable matrix with n rows and d columns.
##' Each column corresponds to a distinct explanatory variable (eg. phenotype).
##' Explanatory variables must be encoded as numeric variables.
##' @param K an integer for the number of latent factors in the regression model.
##' @param lambda a numeric value for the regularization parameter.
##' @param algorithm exact (analytical) algorithm or numerical algorithm.
##' The exact algorithm is based on the global minimum of the loss function and
##' computation is very fast. The numerical algorithm converges toward a local
##' minimum of the loss function. The exact method should preferred. The numerical method is
##' for very large n.
##' @param it.max an integer value for the number of iterations for the
##' numerical algorithm.
##' @param relative.err.epsilon a numeric value for a relative convergence error. Test
##' whether the numerical algorithm converges or not (numerical algorithm only).
##' @return an object of class \code{lfmm} with the following attributes:
##' - U the latent variable score matrix with dimensions n x K,
##' - V the latent variable axis matrix with dimensions p x K,
##' - B the effect size matrix with dimensions p x d.
##' @details The response variable matrix Y and the explanatory variable are centered.
##' @export
##' @author cayek, francoio
##' @examples
##'
##' library(lfmm)
##'
##' ## a GWAS example with Y = SNPs and X = phenotype
##' data(example.data)
##' Y <- example.data$genotype
##' X <- example.data$phenotype
##'
##' ## Fit an LFMM with K = 6 factors
##' mod.lfmm <- lfmm_ridge(Y = Y,
##' X = X,
##' K = 6)
##'
##' ## Perform association testing using the fitted model:
##' pv <- lfmm_test(Y = Y,
##' X = X,
##' lfmm = mod.lfmm,
##' calibrate = "gif")
##'
##' ## Manhattan plot with causal loci shown
##'
##' pvalues <- pv$calibrated.pvalue
##' plot(-log10(pvalues), pch = 19,
##' cex = .2, col = "grey", xlab = "SNP")
##' points(example.data$causal.set,
##' -log10(pvalues)[example.data$causal.set],
##' type = "h", col = "blue")
##'
##'
##' ## An EWAS example with Y = methylation data and X = exposure
##' Y <- scale(skin.exposure$beta.value)
##' X <- scale(as.numeric(skin.exposure$exposure))
##'
##' ## Fit an LFMM with 2 latent factors
##' mod.lfmm <- lfmm_ridge(Y = Y,
##' X = X,
##' K = 2)
##'
##' ## Perform association testing using the fitted model:
##' pv <- lfmm_test(Y = Y,
##' X = X,
##' lfmm = mod.lfmm,
##' calibrate = "gif")
##'
##' ## Manhattan plot with true associations shown
##' pvalues <- pv$calibrated.pvalue
##' plot(-log10(pvalues),
##' pch = 19,
##' cex = .3,
##' xlab = "Probe",
##' col = "grey")
##'
##' causal.set <- seq(11, 1496, by = 80)
##' points(causal.set,
##' -log10(pvalues)[causal.set],
##' col = "blue")
lfmm_ridge <- function(Y, X, K, lambda = 1e-5, algorithm = "analytical",
it.max = 100, relative.err.min = 1e-6) {
## init
m <- ridgeLFMM(K = K, lambda = lambda)
m$algorithm <- algorithm[1]
dat <- LfmmDat(Y = scale(Y, scale = FALSE), X = scale(X, scale = FALSE))
## run
m <- lfmm_fit(m, dat, it.max = it.max, relative.err.min = relative.err.min)
## return
m
}
##' Cross validation of LFMM estimates with ridge penalty
##'
##' This function splits the data set into a train set and a test set, and returns
##' a prediction error. The function \code{\link{lfmm_ridge}} is run with the
##' train set and the prediction error is evaluated from the test set.
##'
##'
##' @param Y a response variable matrix with n rows and p columns.
##' Each column corresponds to a distinct response variable (e.g., SNP genotype,
##' gene expression level, beta-normalized methylation profile, etc).
##' Response variables must be encoded as numeric.
##' @param X an explanatory variable matrix with n rows and d columns.
##' Each column corresponds to a distinct explanatory variable (eg. phenotype).
##' Explanatory variables must be encoded as numeric.
##' @param Ks a list of integer for the number of latent factors in the regression model.
##' @param lambdas a list of numeric values for the regularization parameter.
##' @param n.fold.row number of cross-validation folds along rows.
##' @param p.fold.col number of cross-validation folds along columns.
##' @return a dataframe containing prediction errors for all values of lambda and K
##' @details The response variable matrix Y and the explanatory variables X are centered.
##'
##' @export
##' @author cayek, francoio
##' @examples
##' library(ggplot2)
##' library(lfmm)
##'
##' ## sample data
##' K <- 3
##' dat <- lfmm_sampler(n = 100, p = 1000, K = K,
##' outlier.prop = 0.1,
##' cs = c(0.8),
##' sigma = 0.2,
##' B.sd = 1.0,
##' U.sd = 1.0,
##' V.sd = 1.0)
##'
##' ## run cross validation
##' errs <- lfmm_ridge_CV(Y = dat$Y,
##' X = dat$X,
##' n.fold.row = 5,
##' n.fold.col = 5,
##' lambdas = c(1e-10, 1, 1e20),
##' Ks = c(1,2,3,4,5,6))
##'
##' ## plot error
##' ggplot(errs, aes(y = err, x = as.factor(K))) +
##' geom_boxplot() +
##' facet_grid(lambda ~ ., scale = "free")
##'
##' ggplot(errs, aes(y = err, x = as.factor(lambda))) +
##' geom_boxplot() +
##' facet_grid(K ~ ., scales = "free")
##'
##' @seealso \code{\link{lfmm_ridge}}
lfmm_ridge_CV <- function(Y, X, n.fold.row, n.fold.col, lambdas, Ks) {
## init
lfmm <- lfmm::ridgeLFMM(K = NULL,
lambda = NULL)
dat <- LfmmDat(Y = scale(Y, scale = FALSE), X = scale(X, scale = FALSE))
## run and return
return(lfmm::lfmm_CV(m = lfmm, dat = dat,
n.fold.row = n.fold.row,
n.fold.col = n.fold.col,
Ks = Ks,
lambdas = lambdas))
}
##' LFMM least-squares estimates with lasso penalty
##'
##' This function computes regularized least squares estimates
##' for latent factor mixed models using a lasso penalty.
##'
##' The algorithm minimizes the following penalized least-squares criterion
##'
##'
##' @param Y a response variable matrix with n rows and p columns.
##' Each column is a response variable (e.g., SNP genotype,
##' gene expression level, beta-normalized methylation profile, etc).
##' Response variables must be encoded as numeric.
##' @param X an explanatory variable matrix with n rows and d columns.
##' Each column corresponds to a distinct explanatory variable (eg. phenotype).
##' Explanatory variables must be encoded as numeric.
##' @param K an integer for the number of latent factors in the regression model.
##' @param nozero.prop a numeric value for the expected proportion of non-zero effect sizes.
##' @param lambda.num a numeric value for the number of 'lambda' values (obscure).
##' @param lambda.min.ratio (obscure parameter) a numeric value for the smallest `lambda` value,
##' A fraction of `lambda.max`, the data derived entry value (i.e. the smallest value for
##' which all coefficients are zero).
##' @param lambda (obscure parameter) Smallest value of `lambda`. A fraction of 'lambda.max',
##' the (data derived) entry value (i.e. the smallest value for which all
##' coefficients are zero).
##' @param it.max an integer value for the number of iterations of the algorithm.
##' @param relative.err.epsilon a numeric value for a relative convergence error. Determine
##' whether the algorithm converges or not.
##' @return an object of class \code{lfmm} with the following attributes:
##' - U the latent variable score matrix with dimensions n x K,
##' - V the latent variable axes matrix with dimensions p x K,
##' - B the effect size matrix with dimensions p x d.
##' @details The response variable matrix Y and the explanatory variable are centered.
##'
##' @export
##' @author cayek, francoio
##' @examples
##'
##' library(lfmm)
##'
##' ## a GWAS example with Y = SNPs and X = phenotype
##' data(example.data)
##' Y <- example.data$genotype
##' X <- example.data$phenotype
##'
##' ## Fit an LFMM with 6 factors
##' mod.lfmm <- lfmm_lasso(Y = Y,
##' X = X,
##' K = 6,
##' nozero.prop = 0.01)
##'
##' ## Perform association testing using the fitted model:
##' pv <- lfmm_test(Y = Y,
##' X = X,
##' lfmm = mod.lfmm,
##' calibrate = "gif")
##'
##' ## Manhattan plot with causal loci shown
##' pvalues <- pv$calibrated.pvalue
##' plot(-log10(pvalues),
##' pch = 19, cex = .2,
##' col = "grey", xlab = "SNP")
##'
##' points(example.data$causal.set,
##' -log10(pvalues)[example.data$causal.set],
##' type = "h", col = "blue")
##'
##' ## An EWAS example with Y = methylation data
##' ## and X = exposure
##' Y <- scale(skin.exposure$beta.value)
##' X <- scale(as.numeric(skin.exposure$exposure))
##'
##' ## Fit an LFMM with 2 latent factors
##' mod.lfmm <- lfmm_lasso(Y = Y,
##' X = X,
##' K = 2,
##' nozero.prop = 0.01)
##'
##' ## Perform association testing using the fitted model:
##' pv <- lfmm_test(Y = Y,
##' X = X,
##' lfmm = mod.lfmm,
##' calibrate = "gif")
##'
##' ## Manhattan plot with true associations shown
##' pvalues <- pv$calibrated.pvalue
##' plot(-log10(pvalues),
##' pch = 19,
##' cex = .3,
##' xlab = "Probe",
##' col = "grey")
##'
##' causal.set <- seq(11, 1496, by = 80)
##' points(causal.set,
##' -log10(pvalues)[causal.set],
##' col = "blue")
lfmm_lasso <- function(Y, X, K,
nozero.prop = 0.01,
lambda.num = 100,
lambda.min.ratio = 0.01,
lambda = NULL,
it.max = 100,
relative.err.epsilon = 1e-6) {
## init
m <- lassoLFMM(K = K,
nozero.prop = nozero.prop,
lambda.num = lambda.num,
lambda.min.ratio = lambda.min.ratio,
lambda = lambda)
dat <- LfmmDat(Y = scale(Y, scale = FALSE), X = scale(X, scale = FALSE))
## run
m <- lfmm_fit(m, dat, it.max = it.max, relative.err.epsilon = relative.err.epsilon)
## return
m
}
##' Statistical tests with latent factor mixed models (linear models)
##'
##'
##' This function returns significance values for the association between each column of the
##' response matrix, Y, and the explanatory variables, X, including correction for unobserved confounders
##' (latent factors). The test is based on an LFMM fitted with a ridge or lasso penalty (linear model).
##'
##'
##' @param Y a response variable matrix with n rows and p columns.
##' Each column is a response variable (numeric).
##' @param X an explanatory variable matrix with n rows and d columns.
##' Each column corresponds to an explanatory variable (numeric).
##' @param lfmm an object of class \code{lfmm} returned by the \link{lfmm_lasso}
##' or \link{lfmm_ridge} function
##' @param calibrate a character string, "gif" or "median+MAD". If the "gif" option is set (default),
##' significance values are calibrated by using the genomic control method. Genomic control
##' uses a robust estimate of the variance of z-scores called "genomic inflation factor".
##' If the "median+MAD" option is set, the pvalues are calibrated by computing the median and MAD of the zscores. If \code{NULL}, the
##' pvalues are not calibrated.
##' @return a list with the following attributes:
##' - B a p x d matrix of effect sizes for each locus and each explanatory variable. Note that the direction
##' of association is "Y explained by X",
##' - calibrated.pvalue a p x d matrix which contains calibrated p-values for each explanatory variable,
##' - gif a numeric value for the genomic inflation factor,
##' - epsilon.sigma2 a vector of length p containing the residual variances for each locus,
##' - B.sigma2 a matrix of size n x (d+K) that contains the variance of effect sizes for the d explanatory variables
##' and the K latent factors. It could be used to evaluate the proportion of the response variance (genetic variation)
##' explained by the exposure (X) and latent factors (U) at each locus,
##' - score a p x d matrix which contains z-scores for each explanatory variable (columns of X), before calibration.
##' This is equal to B[,j]/sqrt(B.sigma2[,j]) for variable j.
##' - pvalue a p x d matrix which contains uncalibrated p-values for each explanatory variable before calibration.
##' This may be useful to users preferring alternative methods to the GIF, like the local FDR method.
##' - calibrated.score2 a p x d matrix which contains squared Z-score after calibration.
##' This may be useful to expert users who may want to perform test recalibration with a different numeric value
##' for the GIF.
##'
##' @details The response variable matrix Y and the explanatory variables X are centered. Note that
##' scaling the Y and X matrices would convert the effect sizes into correlation coefficients. Calibrating
##' p-values means that their distribution is uniform under the null-hypothesis. Additional corrections are
##' required for multiple testing. For this, Benjamini-Hochberg or Bonferroni adjusted p-values could be obtained from
##' the calibrated values by using one of several the packages that implements multiple testing corrections.
##' @seealso \link{glm_test}
##' @export
##' @author cayek, francoio
##' @examples
##'
##' library(lfmm)
##'
##' ## a GWAS example with Y = SNPs and X = phenotype
##' data(example.data)
##' Y <- example.data$genotype
##' X <- example.data$phenotype
##'
##' ## Fit an LFMM with K = 6 factors
##' mod.lfmm <- lfmm_ridge(Y = Y,
##' X = X,
##' K = 6)
##'
##' ## Perform association testing using the fitted model:
##' pv <- lfmm_test(Y = Y,
##' X = X,
##' lfmm = mod.lfmm,
##' calibrate = "gif")
##'
##' ## Manhattan plot with causal loci shown
##'
##' pvalues <- pv$calibrated.pvalue
##' plot(-log10(pvalues), pch = 19,
##' cex = .2, col = "grey", xlab = "SNP")
##' points(example.data$causal.set,
##' -log10(pvalues)[example.data$causal.set],
##' type = "h", col = "blue")
##'
##'
##' ## An EWAS example with Y = methylation data and X = exposure
##' data("skin.exposure")
##' Y <- scale(skin.exposure$beta.value)
##' X <- scale(as.numeric(skin.exposure$exposure))
##'
##' ## Fit an LFMM with 2 latent factors
##' mod.lfmm <- lfmm_ridge(Y = Y,
##' X = X,
##' K = 2)
##'
##' ## Perform association testing using the fitted model:
##' pv <- lfmm_test(Y = Y,
##' X = X,
##' lfmm = mod.lfmm,
##' calibrate = "gif")
##'
##' ## Manhattan plot with true associations shown
##' pvalues <- pv$calibrated.pvalue
##' plot(-log10(pvalues),
##' pch = 19,
##' cex = .3,
##' xlab = "Probe",
##' col = "grey")
##'
##' causal.set <- seq(11, 1496, by = 80)
##' points(causal.set,
##' -log10(pvalues)[causal.set],
##' col = "blue")
lfmm_test <- function(Y, X, lfmm, calibrate = "gif") {
## init
dat <- LfmmDat(Y = scale(Y, scale = FALSE), X = scale(X, scale = FALSE)) ## WARNING : scale here !!
## hp
X <- cbind(dat$X, lfmm$U)
d <- ncol(dat$X)
if (class(lfmm) == "ridgeLFMM") {
lfmm.la <- lfmm$lambda } else {
lfmm.la <- 0.0
}
hp <- hypothesis_testing_lm(dat, X, lfmm.la)
hp$score <- hp$score[,1:d, drop = FALSE]
hp$pvalue <- hp$pvalue[,1:d, drop = FALSE]
hp$B <- hp$B[ ,1:d, drop = FALSE]
## calibrate
if (is.null(calibrate)) {
NULL ## nothing
} else if (calibrate == "median+MAD") {
hp$mad <- apply(hp$score, 2, mad)
hp$median <- apply(hp$score, 2, median)
hp$calibrated.score <- sweep(hp$score, 2, hp$median, FUN = "-")
hp$calibrated.score <- sweep(hp$score, 2, hp$mad, FUN = "/")
hp$calibrated.pvalue <- compute_pvalue_from_zscore(hp$calibrated.score)
} else if (calibrate == "gif") {
hp$gif <- compute_gif(hp$score)
hp$calibrated.score2 <- sweep(hp$score ^ 2, 2, hp$gif, FUN = "/")
hp$calibrated.pvalue <- compute_pvalue_from_zscore2(hp$calibrated.score2, df = 1)
}
hp
}
##' GLM tests with latent factor mixed models
##'
##'
##' This function returns significance values for the association between each column of the
##' response matrix, Y, and the explanatory variables, X, including correction for unobserved confounders
##' (latent factors). The test is based on an LFMM fitted with a ridge or lasso penalty and a generalized linear
##' model.
##'
##'
##' @param Y a response variable matrix with n rows and p columns.
##' Each column is a response variable (numeric).
##' @param X an explanatory variable matrix with n rows and d columns.
##' Each column corresponds to an explanatory variable (numeric).
##' @param lfmm.obj an object of class \code{lfmm} returned by the \link{lfmm_lasso}
##' or \link{lfmm_ridge} function
##' @param calibrate a character string, "gif". If the "gif" option is set (default),
##' significance values are calibrated by using the genomic control method. Genomic control
##' uses a robust estimate of the variance of z-scores called "genomic inflation factor".
##' @return a list with the following attributes:
##' - B the effect size matrix with dimensions p x d.
##' - score a p x d matrix which contains z-scores for each explanatory variable (columns of X),
##' - pvalue a p x d matrix which contains p-values for each explanatory variable,
##' - calibrated.pvalue a p x d matrix which contains calibrated p-values for each explanatory variable,
##' - gif a numeric value for the genomic inflation factor.
##' @details The response variable matrix Y and the explanatory variable are centered.
##' @seealso \link{lfmm_test}
##' @export
##' @author cayek, francoio
##' @examples
##'
##' library(lfmm)
##'
##'
##' ## An EWAS example with Y = methylation data
##' ## and X = "exposure" (categorical variable)
##' data("skin.exposure")
##' Y <- skin.exposure$beta.value
##' Y[Y == 0] <- 0.0001 #avoid NA
##' Y[Y == 1] <- 0.9999
##' Y <- qnorm(as.matrix(Y))
##' X <- skin.exposure$exposure == "exposure: sun exposed"
##'
##' ## Fit an LFMM with 2 latent factors
##' mod.lfmm <- lfmm_ridge(Y = Y,
##' X = X,
##' K = 2)
##'
##' ## Perform association testing using the fitted model:
##' pv <- glm_test(Y = pnorm(Y),
##' X = X,
##' lfmm.obj = mod.lfmm,
##' family = binomial(link = "probit"),
##' calibrate = "gif")
##'
##' ## Manhattan plot with true associations shown
##' pvalues <- pv$calibrated.pvalue
##' plot(-log10(pvalues),
##' pch = 19,
##' cex = .3,
##' xlab = "Probe",
##' col = "grey")
##'
##' causal.set <- seq(11, 1496, by = 80)
##' points(causal.set,
##' -log10(pvalues)[causal.set],
##' col = "blue")
glm_test <- function(Y, X, lfmm.obj, calibrate = "gif",family = binomial(link = "logit")) {
## init
dat <- LfmmDat(Y = Y, X = scale(X, scale = FALSE)) ## WARNING : scale here !!
d <- ncol(dat$X)
p <- ncol(dat$Y)
## tests based on GLMs
p.value <- NULL
z.score <- NULL
effect.size <- NULL
for (j in 1:p){
mod.glm <- glm(dat$Y[,j] ~ .,
data = data.frame(dat$X, lfmm.obj$U),
family = family)
p.value <- rbind(p.value, summary(mod.glm)$coeff[2:(d+1),4])
z.score <- rbind(z.score, summary(mod.glm)$coeff[2:(d+1),3])
effect.size <- rbind(effect.size, summary(mod.glm)$coeff[2:(d+1),1])
}
hp <- NULL
hp$score <- z.score
hp$pvalue <- p.value
hp$B <- effect.size
## calibrate
if (is.null(calibrate)) {
NULL ## nothing
} else {
hp$gif <- compute_gif(hp$score)
hp$calibrated.score2 <- sweep(hp$score ^ 2, 2, hp$gif, FUN = "/")
hp$calibrated.pvalue <- compute_pvalue_from_zscore2(hp$calibrated.score2, df = 1)
}
hp
}
##' Direct effect sizes estimated from latent factor models
##'
##'
##' This function returns 'direct' effect sizes for the regression of X (of dimension 1) on the matrix Y,
##' as usually computed in genome-wide association studies.
##'
##' @param Y a response variable matrix with n rows and p columns.
##' Each column is a response variable (numeric).
##' @param X an explanatory variable with n rows and d = 1 column (numeric).
##' @param lfmm.object an object of class \code{lfmm} returned by the \link{lfmm_lasso}
##' or \link{lfmm_ridge} function.
##' @return a vector of length p containing all effect sizes for the regression
##' of X on the matrix Y
##' @details The response variable matrix Y and the explanatory variable are centered.
##' @export
##' @author cayek, francoio
##' @examples
##' library(lfmm)
##'
##' ## Simulation of 1000 genotypes for 100 individuals (y)
##' u <- matrix(rnorm(300, sd = 1), nrow = 100, ncol = 2)
##' v <- matrix(rnorm(3000, sd = 2), nrow = 2, ncol = 1000)
##' y <- matrix(rbinom(100000, size = 2,
##' prob = 1/(1 + exp(-0.3*(u%*%v
##' + rnorm(100000, sd = 2))))),
##' nrow = 100,
##' ncol = 1000)
##'
##' #PCA of genotypes, 3 main axes of variation (K = 2)
##' plot(prcomp(y))
##'
##' ## Simulation of 1000 phenotypes (x)
##' ## Only the last 10 genotypes have significant effect sizes (b)
##' b <- matrix(c(rep(0, 990), rep(6000, 10)))
##' x <- y%*%b + rnorm(100, sd = 100)
##'
##' ## Compute effect sizes using lfmm_ridge
##' ## Note that centering is important (scale = F).
##' mod.lfmm <- lfmm_ridge(Y = y,
##' X = x,
##' K = 2)
##'
##' ## Compute direct effect sizes using lfmm_ridge estimates
##' b.estimates <- effect_size(y, x, mod.lfmm)
##'
##' ## plot the last 30 effect sizes (true values are 0 and 6000)
##' plot(b.estimates[971:1000])
##' abline(0, 0)
##' abline(6000, 0, col = 2)
##'
##' ## Prediction of phenotypes
##' candidates <- 991:1000 #set of causal loci
##' x.pred <- scale(y[,candidates], scale = F) %*% matrix(b.estimates[candidates])
##'
##' ## Check predictions
##' plot(x - mean(x), x.pred,
##' pch = 19, col = "grey",
##' xlab = "Observed phenotypes (centered)",
##' ylab = "Predicted from PRS")
##' abline(0,1)
##' abline(lm(x.pred ~ scale(x, scale = FALSE)), col = 2)
effect_size <- function(Y, X, lfmm.object){
if (ncol(X) > 1) stop("Indirect effect sizes are computed for
a single variable (d=1).")
Y = scale(Y, scale = FALSE)
X = scale(X, scale = FALSE)
reg.lm <- function(i){
datafr <- data.frame(Y[,i], lfmm.object$U)
lm(X ~ ., data = datafr)$coefficients[2]
}
p <- ncol(Y)
effect.sizes <- sapply(1:p, FUN = reg.lm)
return(effect.sizes)
}
##' Predict polygenic scores from latent factor models
##'
##'
##' This function computes polygenic risk scores from the estimates of latent factor models.
##' It uses the indirect' effect sizes for the regression of X (a single phenotype) on the matrix Y,
##' for predicting phenotypic values for new genotype data.
##'
##' @param Y a response variable matrix with n rows and p columns, typically containing genotypes.
##' Each column is a response variable (numeric).
##' @param X an explanatory variable with n rows and d = 1 column (numeric) representing a phenotype
##' with zero mean across the sample.
##' @param lfmm.object an object of class \code{lfmm} returned by the \link{lfmm_lasso}
##' or \link{lfmm_ridge} function, computed for X and Y.
##' @param fdr.level a numeric value for the FDR level in the lfmm test used to define
##' candidate variables for predicting new phenotypes.
##' @param newdata a matrix with n rows and p' columns, and similar to Y, on which
##' predictions of X will be based. If NULL, Y is used as new data.
##' @return a list with the following attributes:
##' - prediction: a vector of length n containing the predicted values for X. If newdata
##' = NULL, the fitted values are returned.
##' - candidates: a vector of candidate columns of Y on which the predictions are built.
##' @details The response variable matrix Y and the explanatory variable are centered.
##' @export
##' @author cayek, francoio
##' @examples
##' library(lfmm)
##'
##' ## Simulation of 1000 genotypes for 100 individuals (y)
##' u <- matrix(rnorm(300, sd = 1), nrow = 100, ncol = 2)
##' v <- matrix(rnorm(3000, sd = 2), nrow = 2, ncol = 1000)
##' y <- matrix(rbinom(100000, size = 2,
##' prob = 1/(1 + exp(-0.3*(u%*%v
##' + rnorm(100000, sd = 2))))),
##' nrow = 100,
##' ncol = 1000)
##'
##' #PCA of genotypes, 2 main axes of variation (K = 2)
##' plot(prcomp(y))
##'
##' ## Simulation of 1000 phenotypes (x)
##' ## Only the last 10 genotypes have significant effect sizes (b)
##' b <- matrix(c(rep(0, 990), rep(6000, 10)))
##' x <- y%*%b + rnorm(100, sd = 100)
##'
##' ## Compute effect sizes using lfmm_ridge
##' mod <- lfmm_ridge(Y = y,
##' X = x,
##' K = 2)
##'
##' x.pred <- predict_lfmm(Y = y,
##' X = x,
##' fdr.level = 0.25,
##' mod)
##'
##' x.pred$candidates
##'
##' ##Compare simulated and predicted/fitted phenotypes
##' plot(x - mean(x), x.pred$pred,
##' pch = 19, col = "grey",
##' xlab = "Observed phenotypes (centered)",
##' ylab = "Predicted from PRS")
##' abline(0,1)
##' abline(lm(x.pred$pred ~ scale(x, scale = FALSE)), col = 2)
predict_lfmm <- function(Y, X, lfmm.object, fdr.level = 0.1, newdata = NULL){
Y = scale(Y, scale = FALSE)
X = scale(X, scale = FALSE)
b.values <- effect_size(Y, X, lfmm.object)
pvalues <- lfmm_test(Y, X, lfmm.object, calibrate = "gif")$calibrated.pvalue
pvalues[is.na(pvalues)] <- 1
p = length(pvalues)
w = which(sort(pvalues) < fdr.level * (1:p)/p)
candidates = order(pvalues)[w]
if (is.null(newdata)) {newdata <- Y}
x.pred <- newdata[,candidates] %*% matrix(b.values[candidates])
return( list(prediction = x.pred, candidates = candidates) )
}
##' Forward inclusion tests with latent factor mixed models
##'
##'
##' This function tests for association between each column of the response matrix, Y,
##' and the explanatory variables, X, by recursively conditioning on the top hits in the set
##' of explanatory variables. The conditional tests are based on LFMMs with ridge penalty.
##'
##'
##' @param Y a response variable matrix with n rows and p columns.
##' Each column is a response variable (numeric).
##' @param X an explanatory variable matrix with n rows and d = 1 column (eg. phenotype).
##' @param K an integer for the number of latent factors in the regression model.
##' @param lambda a numeric value for the regularization parameter.
##' @param niter an integer value for the number of forward inclusion tests.
##' @param scale a boolean value, \code{TRUE} if the explanatory variable, X, is scaled
##' (recommended option).
##' @param rev.confounder a boolean value. If \code{TRUE} confounders are revaluated in each
##' conditional test. May take some time (default = \code{TRUE}).
##' @param candidate.list a vector of integers corresponding to response variables (columns in Y),
##' which are known candidates for association. If \code{NULL}, a list of candidates
##' is built in during the algorithm run.
##'
##' @return a list with the following attributes:
##' - candidates a vector of niter response variables (column labels in Y) detected as top hits in
##' each conditional association analysis.
##' - log.p a vector of uncorrected log p-values for checking that the algorithm behaves well (but not trustable for testing).
##' @details The response variable matrix Y and the explanatory variable are centered.
##'
##' @export
##' @author cayek, francoio
##' @examples
##' library(lfmm)
##' data("example.data")
##' Y <- example.data$genotype
##' X <- example.data$phenotype #scaled variable
##'
##' ## fits an LFMM, i.e, computes B, U, V:
##' mod.lfmm <- lfmm_ridge(Y = Y,
##' X = X,
##' K = 6)
##'
##' ## performs initial association testing using the fitted model:
##' pv <- lfmm_test(Y = Y,
##' X = X,
##' lfmm = mod.lfmm,
##' calibrate = "gif")
##' ## Manhattan plot
##' plot(-log10(pv$calibrated.pvalue),
##' pch = 19,
##' cex = .2,
##' col = "grey")
##'
##' ## Start forward tests (3 iterations)
##' obj <- forward_test(Y,
##' X,
##' K = 6,
##' niter = 3,
##' scale = TRUE)
##'
##' ## Record Log p.values for the 3 top hits
##' log.p <- obj$log.p
##' log.p
##'
##' ## Check perfect hits for each causal SNPs (labelled from 1 to 20)
##' obj$candidate %in% example.data$causal.set
##'
##' ## Check for candidates at distance 20 SNPs (about 10kb)
##' theta <- 20
##' ## Number of hits for each causal SNPs (1-20)
##' hit.3 <- as.numeric(
##' apply(sapply(obj$candidate,
##' function(x) abs(x - example.data$causal.set) < theta),
##' 2,
##' which))
##' ## Number of hits for each causal SNPs (1-20)
##' table(hit.3)
##'
##'
##' ## Continue forward tests (2 additional iterations)
##' obj <- forward_test(Y,
##' X,
##' K = 6,
##' niter = 2,
##' candidate.list = obj$candidates,
##' scale = TRUE)
##'
##' ## Record Log p.values for all 5 top hits
##' log.p <- c(log.p, obj$log.p)
##' log.p
##'
##' ## Check perfect hits for each causal SNPs (labelled from 1 to 20)
##' obj$candidate %in% example.data$causal.set
##'
##' ## Check for candidates at distance 5 SNPs (about 2.5kb)
##' theta <- 5
##' ## Number of hits for each causal SNPs (1-20)
##' hit.5 <- as.numeric(
##' apply(sapply(obj$candidate,
##' function(x) abs(x - example.data$causal.set) < theta),
##' 2,
##' which))
##' ## Number of hits for each causal SNPs (1-20)
##' table(hit.5)
##'
##' ## Plot log P
##' plot(log.p, xlab = "Conditional test iteration", ylab="Top hit log(p)")
forward_test <- function(Y,
X,
K,
niter = 5,
scale = FALSE,
candidate.list = NULL,
rev.confounder = TRUE,
lambda = 1e-5){
if (ncol(X) > 1) stop("The function works with
a single explanatory variable
(d = 1).")
if (!is.null(candidate.list)){
if (!is.integer(candidate.list))
{
stop("Error in candidate.list: A list
of integers is required.")}
if (max(candidate.list)>ncol(Y))
{
stop("Error in candidate.list:
max(candidate.list)>ncol(Y).")}
}
candidates <- candidate.list
log.pvalues <- NULL
for (i in 1:niter){
if (scale){
X.m <- scale(cbind(X, Y[,candidates]))}
else {
X.m <- cbind(X, Y[,candidates])
}
if (i == 1 | rev.confounder == TRUE){
mod.lfmm <- lfmm_ridge(Y = Y,
X = X.m,
K = K,
lambda = lambda)
}
pv <- lfmm_test(Y = Y,
X = X.m,
lfmm = mod.lfmm,
calibrate = "gif")
j <- 1
while (order(-log10(pv$calibrated.pvalue[,1]),
decreasing = TRUE)[j] %in% candidates) {
j <- j + 1
}
candidates <- c(candidates,
order(-log10(pv$calibrated.pvalue[,1]),
decreasing = TRUE)[j])
log.pvalues <- c(log.pvalues,
sort(-log10(pv$calibrated.pvalue[,1]),
decreasing = TRUE)[j])
}
return(list(candidates = candidates, log.p = log.pvalues))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.