#' GWAS with PCA
#'
#' @param pheno file with numeric phenotypic values
#' @param geno data.frame with genotype calls coded as 0,1,2.
#' @param Cov numeric data.frame with covariates values
#' @param GM genetic map of data with chr and position of each SNP
#' @param PCA.M number of principal components to use default is 3
#' @param QTN.position posistion of QTN if applicable
#' @param cutoff If cutoff is default, uses Bonferroni;0.05/number of SNPs
#' @param plots if TRUE, function plots PCA graphs, Manhatten Plot and QQ plot
#' @param messages if TRUE, returns messages for the GWAS function
#' @param print if TRUE, results are saved in a CSV
#' @param trait character value for trait name
#' @param rapid this option skips all side options and returns just GWAS results
#' @return Manhatten plot, QQ plot plus p-values, type-1 error and power for every SNP and results in a CSV
GWASapply<- function(pheno=NULL, geno=NULL, Cov=NULL, GM=NULL, PCA.M=3, QTN.position=NULL, cutoff=NULL,plots=FALSE,messages=FALSE,print=FALSE,trait="unknown",rapid=FALSE){
if(rapid==TRUE){
apply_start <- proc.time()
GD=geno
n=nrow(GD)
m=ncol(GD)
CV=Cov[,-1]
y=pheno
PCA=prcomp(GD)
P=apply(GD,2, function(x)
# CHECK FOR GENOTYPE DISTRIBUTION
if(max(x)==min(x)){p=1
P=p[length(p)]
# IF MAX NOT EQUAL TO MIN
}else{
# CHECK FOR DEPENDENCE
# IF NO CV INPUT
if (is.null(CV)){X=cbind(1, PCA$x[,1:PCA.M],x)
# IF THERE IS CV INPUT
}else{fD <- fix_Dep(PCA$x[,1:PCA.M], as.matrix(CV))
# WITH CV INPUT, CONDITION 1: NO DEPENDENCE
if (is.null(fD)){X=cbind(1, PCA$x[,1:PCA.M],as.matrix(CV), x)
# WITH CV INPUT, CONDITION 2: WITH DEPENDENCE
}else {X=cbind(1, PCA$x[,1:PCA.M],x)}
}# END FOR DEPENDECE
# SOLVE THE LINEAR REGRESSION MATRIX:
# X=as.matrix(X)
LHS=t(X)%*%X
C=solve(LHS)
RHS=t(X)%*%y
b=C%*%RHS
yb=X%*%b
e=y-yb
n=length(y)
ve=sum(e^2)/(n-1)
vt=C*ve
t=b/sqrt(diag(vt))
p=2*(1-pt(abs(t),n-2))
P=p[length(p)]
}# END FOR MAX NOT EQUAL TO MIN
# COLLECT P-VALUES
) #end of appply function for markers
P=t(matrix(P))
P.value=P
order.SNP=order(P.value)
#If cutoff is default, uses Bonferroni
cutoff.final=(ifelse(
is.null(cutoff),
0.05/m,
cutoff
))
sig.SNP <- order.SNP[sort(P.value)<=cutoff.final]
lsnp=length(sig.SNP)
###
zeros=P==0
P[zeros]=1e-20
P=data.frame(P)
GWAS.Results=list(P.value.res=P.value, cutoff.final.res=cutoff.final, sig.SNP.res=sig.SNP, sig.SNP.P.res=P.value[sig.SNP], order.SNP.res=order.SNP)
apply_end <- proc.time()
apply_elapsed <- apply_end[3] - apply_start[3]
if(messages==TRUE){print(paste0("GWASapply ran successfully and took ",as.character(round(apply_elapsed[[1]],2))," seconds"))}
return(GWAS.Results)
}else{
#If messages is true display gwas starting
if(messages==TRUE){print("GWASapply Starting")}
apply_start <- proc.time()
###check and copy input data
GD=geno
n=nrow(GD)
m=ncol(GD)
CV=Cov[,-1]
y=pheno
PCA=prcomp(GD)
if(messages==TRUE){print("Principal Components have been calculated Successfully")}
PCA_results <- summary(PCA)
if(plots==TRUE){print(kable(round(PCA_results$importance[,1:10], 2)))
#Variance Explained
var_exp_plot_data <- data.frame(t(PCA_results$importance)) # I transposed the data to be in long form and coerce to a data.frame for ggplot
names(var_exp_plot_data) <- c("sdev", "var_exp", "cum_var_exp") # rename the columns (because the original rownames has spaces)
var_exp_plot_data$pca_index <- 1:nrow(var_exp_plot_data) # Add a new column that is the integer index of the component
var_exp_plot <- ggplot(data = var_exp_plot_data, aes(x = pca_index, y = var_exp)) +
geom_line() +
geom_point() +
labs(x = "PCA component index", y = "Variance explained", title = "Variance Explained")
#Cumulative Variance Explained
cum_var_exp_plot <- ggplot(data = var_exp_plot_data, aes(x = pca_index, y = cum_var_exp)) +
geom_line() +
geom_point() +
labs(x = "PCA component index", y = "Cumulative Variance explained", title = "Cumulative Variance Explained")
grid.arrange(var_exp_plot, cum_var_exp_plot, nrow = 1, ncol = 2, top = "Variance of Principal Components")
#normal_print(pcav)
# Plotting the first three components
PCA_plot_data <- data.frame(PCA$x)
pca_comp_plot_12 <-
ggplot(data = PCA_plot_data, aes(x = PC1, y = PC2)) +
geom_point()
pca_comp_plot_13 <-
ggplot(data = PCA_plot_data, aes(x = PC1, y = PC3)) +
geom_point()
pca_comp_plot_23 <-
ggplot(data = PCA_plot_data, aes(x = PC2, y = PC3)) +
geom_point()
grid.arrange(pca_comp_plot_12, pca_comp_plot_13, pca_comp_plot_23, nrow = 2, ncol = 2, top = "Principal Components")
if(messages==TRUE){print("Principal Components plots have been printed Successfully")}}
P=apply(GD,2, function(x)
# CHECK FOR GENOTYPE DISTRIBUTION
if(max(x)==min(x)){p=1
P=p[length(p)]
# IF MAX NOT EQUAL TO MIN
}else{
# CHECK FOR DEPENDENCE
# IF NO CV INPUT
if (is.null(CV)){X=cbind(1, PCA$x[,1:PCA.M],x)
# IF THERE IS CV INPUT
}else{fD <- fix_Dep(PCA$x[,1:PCA.M], as.matrix(CV))
# WITH CV INPUT, CONDITION 1: NO DEPENDENCE
if (is.null(fD)){X=cbind(1, PCA$x[,1:PCA.M],CV, x)
# WITH CV INPUT, CONDITION 2: WITH DEPENDENCE
}else {X=cbind(1, PCA$x[,1:PCA.M],x)}
}# END FOR DEPENDECE
# SOLVE THE LINEAR REGRESSION MATRIX:
# X=as.matrix(X)
LHS=t(X)%*%X
C=solve(LHS)
RHS=t(X)%*%y
b=C%*%RHS
yb=X%*%b
e=y-yb
n=length(y)
ve=sum(e^2)/(n-1)
vt=C*ve
t=b/sqrt(diag(vt))
p=2*(1-pt(abs(t),n-2))
P=p[length(p)]
}# END FOR MAX NOT EQUAL TO MIN
# COLLECT P-VALUES
) #end of appply function for markers
P=t(matrix(P))
P.value=P
order.SNP=order(P.value)
#If cutoff is default, uses Bonferroni; else uses -log(value)
cutoff.final=(ifelse(
is.null(cutoff),
0.05/m,
cutoff
))
if(messages==TRUE){print(paste0("The final cuttoff for a significant p-value is ",as.character(cutoff.final)))}
sig.SNP <- order.SNP[sort(P.value)<=cutoff.final]
lsnp=length(sig.SNP)
minor_allele_freq <- apply(GD, 2, function(x) #Minor allele frequency calculation using Matt McGowan's code
{
allele_freq1 <- (sum(x == 0)*2 + sum(x == 1)) / (sum(!is.na(x)) * 2)
allele_freq2 <- (sum(x == 2)*2 + sum(x == 1)) / (sum(!is.na(x)) * 2)
return(min(allele_freq1, allele_freq2))
})
sig.MAF<-minor_allele_freq[sig.SNP]
if(messages==TRUE){print(paste0(as.character(lsnp), " Significant SNPs were found"))}
###
zeros=P==0
P[zeros]=1e-20
P=data.frame(P)
###Generate Manhattan plot
if (is.null(QTN.position)){
if(plots==TRUE){mp=manhattan_plot(GM,P,cutoff.final,trait=as.character(trait))
knit_print(mp)
if(messages==TRUE){print("Manhattan Plot Printed without QTN")}}
# WITH CV INPUT, CONDITION 2: WITH DEPENDENCE
}else {
###power.fdr.type1error calculation
power.fdr.type1error=NULL
if (!is.null(QTN.position)){
power.fdr.type1error=power.fdr(P.value, QTN.position,cutoff.final)
}
detected=intersect(sig.SNP,QTN.position)
if(length(detected)==0){detected=NULL}
falsePositive=setdiff(sig.SNP, QTN.position)
if(messages==TRUE){print(paste0("QTN's detected with ",as.character(length(detected))," True Positives and ",as.character(length(falsePositive))," False Positives."))}
if(plots==TRUE){ mp=manhattan_plot(GM,P,cutoff.final,QTN.position,falsePositive,detected,trait=as.character(trait))
knit_print(mp)
if(messages==TRUE){print("Manhattan Plot Printed with QTNs and False and True Positives")}}
}
###Generate QQ plot
if(plots==TRUE){qq=qq_plot(GM,P,trait=as.character(trait))
knit_print(qq)
if(messages==TRUE){print("QQ-Plot Printed")}}
#Generate Results and Return
##Create a table of P-values
P.value_df=data.frame(t(P.value))
#Create a table of results
SNP.MAF=data.frame(minor_allele_freq)
sig.table=data.frame(sig.SNP,P.value[sig.SNP],sig.MAF)
sig.table=data.frame(rownames(sig.table),sig.table)
rownames(sig.table) <- c()
colnames(sig.table)=c("SNP ID","SNP Position","P-value","MAF")
#Create out put for all markers and output to CSV
if(print==TRUE){GLM.results = data.frame(GM,P.value_df,rank(P.value_df),SNP.MAF)
colnames(GLM.results)=c("SNP ","Chromosome ","Position ","P value","Order","MAF")
fwrite(GLM.results,file="GLM.results.csv",row.names=FALSE)}
if(messages==TRUE){print("GWASapply results have printed")}
#Returns Values that include false and true positives if QTN are provided
if (!is.null(QTN.position)){
GWAS.Results=list(True.Positive=detected,False.Positive=falsePositive,P.value.res=P.value, cutoff.final.res=cutoff.final, sig.SNP.res=sig.SNP, sig.SNP.P.res=P.value[sig.SNP], sig.SNP.MAF.res=sig.MAF,order.SNP.res=order.SNP, power.fdr.type1error.res=power.fdr.type1error,sig.table=sig.table)
}else{
GWAS.Results=list(P.value.res=P.value, cutoff.final.res=cutoff.final, sig.SNP.res=sig.SNP, sig.SNP.P.res=P.value[sig.SNP],sig.SNP.MAF.res=sig.MAF, order.SNP.res=order.SNP,sig.table=sig.table)
}
apply_end <- proc.time()
apply_elapsed <- apply_end[3] - apply_start[3]
if(messages==TRUE){print(paste0("GWASapply ran successfully and took ",as.character(round(apply_elapsed[[1]],2))," seconds"))}
return(GWAS.Results)
}}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.