## Function to add gene for repective rsid, if genes are not provided by user
addgene <- function(gwasmulti){
ensembl <- useMart(biomart="ENSEMBL_MART_SNP", host="grch37.ensembl.org", path="/biomart/martservice", dataset="hsapiens_snp")
human = useMart(biomart="ENSEMBL_MART_ENSEMBL", host="feb2014.archive.ensembl.org", path="/biomart/martservice", dataset="hsapiens_gene_ensembl")
gwasmulti$gene <- "NA"
for(i in rownames(gwasmulti)){
xx <- getBM(attributes=c( "ensembl_gene_stable_id"),filters="snp_filter", values=gwasmulti[i, "SNP"],mart=ensembl, uniqueRows=TRUE)
if (nrow(xx) == 0){
gwasmulti[i,]$gene <- "NA"
}else{
gene <- getBM(attributes = c("hgnc_symbol"),
filters = "ensembl_gene_id", values = unlist(xx), mart = human)
gwasmulti[i,]$gene <- if (nrow(gene) == 0) "NA" else toString( unlist(gene))
}}
print("Finished mapping")
return(gwasmulti)
}
######################################### ######################################### #########################################
############## FAST PROCESSPHEGWAS - THIS IS USED FOR PROCESISNG THE PHENOTYPE BEFORE THE LANDSCAPE FUNCTION
######################################### ######################################### #########################################
#' Prepare the dataframe to pass to landscape function
#'
#' @import tidyverse
#' @import tidyr
#' @importFrom data.table setDT
#' @param phenos Vector of names of dataframes that need to do PheGWAS on. Arrange the dataframe in the order how the the phenotypes should align in y axis
#' @param LDpop Process using LD Block data summarized in T Berisa et.al paper 2016, Default eur. Available asn & afn
#' @param LDblock If want to pass a custom LDblock file for division of BP groups (applicable only for chromosomal level)
#' @return A processed dataframe to pass to PheGWAS landscape function
#' @details Make sure there are no duplicate rsid's in any of the dataframe, If there make sure to resolve it before passing it to this function.
#' @author George Gittu
#' @examples
#' \dontrun{
#' HDL <- hdl
#' LDL <- ldl
#' TRIGS <- trig
#' TOTALCHOLESTROL <- tchol
#' phenos <- c("HDL", "LDL", "TRIGS", "TOTALCHOLESTROL")
#' ## y is ready to be passed to function landscape
#' y <- fastprocessphegwas(phenos)
#' }
#' @export
fastprocessphegwas <- function(phenos,LDblock= FALSE,LDpop= "eur"){
if(is.vector(phenos) & !is.list(phenos)) {
list.df_pre = mget(phenos,envir = .GlobalEnv)
for (df in 1:length(list.df_pre)){
list.df_pre[[df]] <- list.df_pre[[df]] %>% rename(BETA = beta, SE = se)
}
}else{
phenos <- names(list.df_pre)
}
action3 = . %>% mutate(P.value = as.double(P.value))
list.df_pre <- lapply(list.df_pre, action3)
if(LDblock){
print("Processing for LDBlocks passed by the user")
bpp = read.table(LDblock,header=TRUE,sep = "\t")
}else if(LDpop == "asn"){
print("Processing for AFR LDBlocks")
bpp = bppafr
}else if(LDpop == "afr"){
print("Processing for ASN LDBlocks")
bpp = bppasn
}else {
bpp = bppeur
}
colnames(bpp)[1] <- "CHR"
bpp$CHR <-as.integer(gsub("chr","",bpp$CHR))
bpp$start <- as.integer(bpp$start)
bpp$stop <- as.integer(bpp$stop)
bpp <- na.omit(bpp)
##getting all the minimum from each label
labelfunction <- function(xx,bpp){
setDT(xx)[, label :=
setDT(bpp)[setDT(xx), on=.(CHR, start<=BP, stop>BP), paste(CHR, x.start, sep="_")]
]
}
#this list.df shoudnt be changed
list.df_pre <- lapply(list.df_pre, labelfunction,bpp =bpp)
# THis is addeed to the enviornment as we need this in nthe landscape functionn
assign('list.df_pre',list.df_pre,envir=.GlobalEnv)
# Following processing the dataframe for the labels
action = . %>% group_by(label) %>% slice(which.min(P.value))
list.df <- lapply(list.df_pre, action)
list.df <- Map(function(x, nm) {i1 <- !(names(x) %in% c('label','CHR'))
names(x)[i1] <- paste0(names(x)[i1], "_", nm)
x
}, list.df, names(list.df))
out <- Reduce(function(...) merge(..., by = c('CHR','label'), all = TRUE), list.df)
names(out)
for(i in 1:length(phenos))
{
a <- grep(paste0("^", phenos[i], "$", collapse = "|"), sapply(strsplit(names( out ), split="\\_"), tail, 1L))
selection <- colnames(out)[a]
hju <- paste0(phenos[i])
out <- unite_(out, hju, selection, sep = "and", remove = TRUE)
}
out %>% gather(color,Entire_Val,-CHR, -label) -> gwasmulti.meltF
if(length(grep("gene", colnames(list.df_pre[[1]]))) == 0){
gwasmulti.melt <- gwasmulti.meltF %>% separate(Entire_Val, c("BP", "SNP","A1","A2","BETA","SE","P"), "and")
## Getting the dataframe that can be used
suppressWarnings({d <- data.frame(CHR = as.numeric(gwasmulti.melt$CHR), BP = as.numeric(gwasmulti.melt$BP),A1 =gwasmulti.melt$A1,A2 =gwasmulti.melt$A2,SNP =gwasmulti.melt$SNP,
P = as.numeric(gwasmulti.melt$P),BETA = as.numeric(gwasmulti.melt$BETA),SE = as.numeric(gwasmulti.melt$SE),
PHENO = gwasmulti.melt$color,label = gwasmulti.melt$label)})
}else{
gwasmulti.melt <- gwasmulti.meltF %>% separate(Entire_Val, c("BP", "SNP","A1","A2","BETA","SE","P","gene"), "and")
## Getting the dataframe that can be used
suppressWarnings({d <- data.frame(CHR = as.numeric(gwasmulti.melt$CHR), BP = as.numeric(gwasmulti.melt$BP),A1 =gwasmulti.melt$A1,A2 =gwasmulti.melt$A2,SNP =gwasmulti.melt$SNP,
P = as.numeric(gwasmulti.melt$P),BETA = as.numeric(gwasmulti.melt$BETA),SE = as.numeric(gwasmulti.melt$SE),gene = gwasmulti.melt$gene,
PHENO = gwasmulti.melt$color,label = gwasmulti.melt$label)})
}
d <- subset(d, (is.numeric(CHR) & is.numeric(BP) & is.numeric(P) ))
d <- d[order(d$CHR, d$BP), ]
d$logp <- round(-log10(d$P),3)
d$BETA <- round(d$BETA,3)
d$logp[is.infinite(d$logp)] <- max(d$logp[!is.infinite(d$logp)],
na.rm = TRUE) + 10
# This dataframe is what used for the landscape function.
d
}
######################################### ######################################### #########################################
############## FAST PROCESSLANDSCAPE - THIS IS USED FOR VIWEING THE LANDSCAPE
######################################### ######################################### #########################################
#' Interactive 3-D association landscape for many phenotypes
#'
#' @import tidyverse
#' @import tidyr
#' @import plotly
#' @importFrom reshape2 acast
#' @importFrom biomaRt useMart getBM
#' @importFrom httr GET stop_for_status content_type content
#' @importFrom utils read.table
#' @import xml2
#' @rawNamespace import(jsonlite, except = flatten)
#' @importFrom stringr str_trim
#' @param d DataFrame output from processphegwas
#' @param sliceval Integer to indicate value of -log10(p) to do the sectionalcut. Usually value > -log10 6 is considered to be significant
#' @param chromosome Integer to indicate the chromosome number thats interested, If not given entire chromosome is given
#' @param genemap if true it map SNP's to gene. This takes time as its using BioMart package to map genes.
#' @param geneview This checks for the common genes across the section
#' @param calculateLD This shoudld be set to true if the calcualte LD logic needed to be added to the plot
#' @param pop The population to select for calculation the LD (default GB)
#' @param R2 The value to set to calculate LD
#' @param D THe value to set to calcualte LD
#' @param mutualLD Calcualte the mutual LD SNP between the phenotypes
#' @param levelsdown Used to find the independant signals
#' @param phenos Vector of names of dataframes that need to do PheGWAS on. Arrange the dataframe in the order how the the phenotypes should align in y axis
#' chromosome view the max peak is selected
#' @author George Gittu
#' @examples
#' \dontrun{
#' # here y is output from function fastprocessphegwas
#' # 3D landscape-viz of all the phenos across the base pair positions(threshold > -log10(p) 7.5)
#' # Without iPheGWAS you get output in the order you pass phenos
#' landscapefast(y,sliceval = 10,phenos =phenos)
#'
#' # 3D landscape visualization after applying iPheGWAS
#' # Passing the order from the iPheGWAS function
#' landscapefast(y,sliceval = 10,phenos = iphegwas(phenos))
#'
#' #3D landscape visualization of chromosome number 19 (above a threshold of -log10 (p) 7.5)
#' landscapefast(y,sliceval = 7.5,chromosome = 19,phenos =phenos)
#'
#' # 3D landscape-viz of chromosome 19, gene view active(threshold > -log10(p) 7.5)
#' landscape(y,sliceval = 7.5,chromosome = 19, geneview = TRUE,phenos =phenos)
#'
#' # 3D landscape-viz with calculate-LD and mutual-LD functionality active
#' landscapefast(y, sliceval = 30, chromosome = 19,calculateLD= TRUE,mutualLD = TRUE,phenos =phenos)
#'}
#' @export
landscapefast <- function(d,sliceval = 7,chromosome = FALSE,pop = "GBR",R2 = 0.75,
D = 0.75,calculateLD = FALSE, mutualLD = FALSE,phenos,genemap= FALSE,geneview = FALSE,levelsdown = 0){
if(chromosome == FALSE){
print("Processing for the entire chromosome")
# this is for the MAtrix to process the entire chromosome
gwasmultifull <- d %>%
### dont want to show the phenotypes thats doesnt have anything above a certain threshold
group_by(PHENO) %>%
filter(any(logp > sliceval)) %>%
ungroup() %>%
group_by(PHENO,CHR) %>%
slice(which.max(logp))
if(length(grep("gene", colnames(d))) == 0 & genemap == TRUE){
print("Applying BioMArt module for matching gene to rsid")
gwasmultifull <- addgene(gwasmultifull)
}
if(length(grep("gene", colnames(d))) == 0){
gwasmultifull$gene <- NA
}
gwasmultifull <- gwasmultifull[!is.na(gwasmultifull$CHR) & !is.na(gwasmultifull$BP),]
## Selecting only chr 1 to 22 ignoring X and Y, phenos is the order that we wannt the traits to be ordered
gwas_surface_full <- acast(gwasmultifull, gwasmultifull$PHENO ~ gwasmultifull$CHR, value.var = "logp")[phenos,1:22]
## To remove the cases which dont have values fro all the snps
gwas_surface_full <- gwas_surface_full[complete.cases(gwas_surface_full), ]
gwas_surface_prime_full_use <- gwas_surface_full_use <- gwas_surface_full
upperlimit = max(gwas_surface_full_use)
gwas_surface_prime_full_use[,]= upperlimit + 2
gwas_surface_copy_full_use <- gwas_surface_full_use
# preparing labels
for (i in 1:nrow(gwas_surface_copy_full_use)) {
for (j in 1:ncol(gwas_surface_copy_full_use))
gwas_surface_copy_full_use[i, j] <-
paste("Phenotype = ", rownames(gwas_surface_copy_full_use)[i], '\n',"Chromosome = ", colnames(gwas_surface_copy_full_use)[j],'\n',
"BP =",gwasmultifull[gwasmultifull$PHENO==rownames(gwas_surface_copy_full_use)[i]& gwasmultifull$CHR==colnames(gwas_surface_copy_full_use)[j],]$BP,"\n",
"A1 =",gwasmultifull[gwasmultifull$PHENO==rownames(gwas_surface_copy_full_use)[i]& gwasmultifull$CHR==colnames(gwas_surface_copy_full_use)[j],]$A1,' ',
"A2 =",gwasmultifull[gwasmultifull$PHENO==rownames(gwas_surface_copy_full_use)[i]& gwasmultifull$CHR==colnames(gwas_surface_copy_full_use)[j],]$A2,'\n',
"Effect Size =",gwasmultifull[gwasmultifull$PHENO==rownames(gwas_surface_copy_full_use)[i]& gwasmultifull$CHR==colnames(gwas_surface_copy_full_use)[j],]$BETA,' ',
"SE =",gwasmultifull[gwasmultifull$PHENO==rownames(gwas_surface_copy_full_use)[i]& gwasmultifull$CHR==colnames(gwas_surface_copy_full_use)[j],]$SE,'\n',
"-log10 (P-value) = ", gwas_surface_copy_full_use[i, j],'\n',
"SNPID =",gwasmultifull[gwasmultifull$PHENO==rownames(gwas_surface_copy_full_use)[i]& gwasmultifull$CHR==colnames(gwas_surface_copy_full_use)[j],]$SNP,'\n',
"Gene =",gwasmultifull[gwasmultifull$PHENO==rownames(gwas_surface_copy_full_use)[i]& gwasmultifull$CHR==colnames(gwas_surface_copy_full_use)[j],]$gene)
}
p <- plot_ly(x=colnames(gwas_surface_full_use),y= rownames(gwas_surface_full_use),z = gwas_surface_full_use) %>%
add_surface(cmin = sliceval,cmax = upperlimit,opacity=.95,surfacecolor = gwas_surface_full_use,text = gwas_surface_copy_full_use,hoverinfo = "text") %>%
layout(title = 'PheGWAS',scene = list(camera = list(eye = list(x=2, y=2, z=2)),yaxis = list(title = "Phenotypes", tickmode = "array",nticks = 8),xaxis =list(title= "Chromosomes",autotick = F, dtick = 1),
zaxis =list(title= "-log 10(P)",range=c(sliceval,upperlimit+2)),aspectmode = "manual",aspectratio = list( x = 2, y = .8, z = .8))) %>%
add_trace(x=colnames(gwas_surface_prime_full_use),y= rownames(gwas_surface_prime_full_use),z = gwas_surface_prime_full_use, type = "surface",surfacecolor = gwas_surface_full_use,showscale = FALSE,
text = gwas_surface_copy_full_use,hoverinfo = "text",cmin = sliceval,cmax = upperlimit) %>% colorbar(title = "-log10 (P-value)")
p
}else{
print(paste0("Processing for chromosome ",chromosome))
whileivar = 0
while (whileivar <= levelsdown) {
dchrom1initial <- d[d$CHR==chromosome,] %>% rename(lab = label)
gwasmulti <- dchrom1initial %>%
group_by(lab) %>%
filter(any(logp > sliceval)) %>%
ungroup() %>%
group_by(lab, PHENO) %>%
slice(which.max(logp))
if(length(grep("gene", colnames(d))) == 0 & genemap == TRUE){
print("Applying BioMArt module for matching gene to rsid")
gwasmulti <- addgene(gwasmulti)
}
if(length(grep("gene", colnames(d))) == 0){
gwasmultifull$gene <- NA
}
###rewriting calculating ld logic
if (calculateLD){
gwasmulti$snpsinld <- "NA"
gwasmulti$snpsinldcount <-0
gwasmulti$snpsinldup <- "NA"
z<-NULL
for (i in 1:nrow(gwasmulti)) {
if(gwasmulti[i,]$logp > sliceval){
stringcreate <- sprintf("/1000GENOMES:phase_3:%s?",pop)
server <- "http://grch37.rest.ensembl.org"
ext <- paste0("/ld/human/",gwasmulti[i,]$SNP,stringcreate)
r <- GET(paste(server, ext, sep = ""), content_type("application/json"))
tryCatch({
stop_for_status(r)
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
lddf <- fromJSON(toJSON(content(r)))
ldlist <- unlist(lddf$variation2)
if(!is.null(ldlist)){
ldlist <- unlist(lddf[lddf$r2 >= R2 | lddf$d_prime >= D,]$variation2)
var <- eval(parse(text=paste0("list.df_pre$",gwasmulti[i,]$PHENO)))
hhh <- var[var$label == gwasmulti[i,]$lab & var$P.value <= as.numeric(10^-sliceval)]
snpsinld <- intersect(hhh$rsid,ldlist)
snpsinldup <- length(snpsinld) / nrow(hhh)
gwasmulti[i,]$snpsinld <- paste0(toString(snpsinld))
gwasmulti[i,]$snpsinldcount <- length(snpsinld)
gwasmulti[i,]$snpsinldup <- paste0(snpsinldup)
}
}
}
if(mutualLD){
print("Calculating the mutually shared SNP's")
gwasmulti$common <- "NA"
for(i in unique(gwasmulti$lab)){
for (j in 1:length(phenos)) {
for (k in phenos[-j]) {
uuu <- strsplit(gwasmulti[gwasmulti$lab == i & gwasmulti$PHENO == phenos[j], ]$snpsinld,",")
vfg <- strsplit(gwasmulti[gwasmulti$lab == i & gwasmulti$PHENO == k, ]$snpsinld,",")
vfg <- lapply(vfg, str_trim)
uuu <- lapply(uuu, str_trim)
unld <- intersect(append(unlist(uuu),toString(gwasmulti[gwasmulti$lab == i & gwasmulti$PHENO == phenos[j], ]$SNP)), append(unlist(vfg), toString(gwasmulti[gwasmulti$lab == i & gwasmulti$PHENO == k, ]$SNP)))
if(length(unld)== 0){
unld <- NA
}
z <- c(z,paste(phenos[j],"with",k,":" ,paste0(toString(unld)), collapse = "," ))
}
gwasmulti[gwasmulti$lab == i & gwasmulti$PHENO == phenos[j],]$common <- paste(toString(z))
z <- NULL
}
}
}
}
if (levelsdown != 0 & calculateLD) {
for (ipheno in unique(gwasmulti$PHENO)) {
bgt <- strsplit(gwasmulti[gwasmulti$PHENO ==
ipheno, ]$snpsinld, ",")
xxxx <- lapply(bgt, str_trim)
ppp <- union(unlist(xxxx), gwasmulti[gwasmulti$PHENO ==
ipheno, ]$SNP)
xxlist <- as.list(ppp)
list.df_pre[[ipheno]] <- list.df_pre[[ipheno]][!list.df_pre[[ipheno]]$rsid %in% xxlist, ]
}
assign('list.df_pre',list.df_pre,envir=.GlobalEnv)
d <- fastprocessphegwas(list.df_pre)
}
whileivar = whileivar + 1
}
gwas_surface <- acast(gwasmulti, gwasmulti$PHENO ~ gwasmulti$lab, value.var = "logp")[phenos,]
gwas_surface_use <- gwas_surface
gwas_surface_copy_use <- gwas_surface
gwas_surface_copy_gene_use <- gwas_surface_use
colnames(gwas_surface) <- gsub(paste0(chromosome,"_"), "", colnames(gwas_surface), fixed = TRUE)
yy <- order(as.numeric(colnames(gwas_surface)))
gwas_surface <- gwas_surface[,yy]
gwas_surface_copy_use <- gwas_surface_copy_use[,yy]
gwas_surface_use <- gwas_surface_use[,yy]
gwas_surface_copy_gene_use <- gwas_surface_copy_gene_use[,yy]
for (i in 1:nrow(gwas_surface_copy_use)) {
for (j in 1:ncol(gwas_surface_copy_use))
if(calculateLD){
gwas_surface_copy_use[i, j] <-
paste("Phenotype: ", rownames(gwas_surface_copy_use)[i], ' ',"kbp: ", colnames(gwas_surface_use)[j],' ',
"Effect Size: ",gwasmulti[gwasmulti$PHENO==rownames(gwas_surface_copy_use)[i]& gwasmulti$lab==colnames(gwas_surface_use)[j],]$BETA,'\n',
"SE: ",gwasmulti[gwasmulti$PHENO==rownames(gwas_surface_copy_use)[i]& gwasmulti$lab==colnames(gwas_surface_copy_use)[j],]$SE,' ',
"-log10 (P-value): ", gwas_surface_copy_use[i, j],' ',
"SNPID: ",gwasmulti[gwasmulti$PHENO==rownames(gwas_surface_copy_use)[i]& gwasmulti$lab==colnames(gwas_surface_copy_use)[j],]$SNP,"\n",
"Gene =",gwasmulti[gwasmulti$PHENO==rownames(gwas_surface_copy_use)[i]& gwasmulti$lab==colnames(gwas_surface_copy_use)[j],]$gene,"\n",
gsub('(.{1,90})(\\s|$)', '\\1\n', paste("SNP's in LD: ",gwasmulti[gwasmulti$PHENO==rownames(gwas_surface_copy_use)[i]& gwasmulti$lab==colnames(gwas_surface_copy_use)[j],]$snpsinld)),"\n",
"Linked SNP's ratio: ",gwasmulti[gwasmulti$PHENO==rownames(gwas_surface_copy_use)[i]& gwasmulti$lab==colnames(gwas_surface_copy_use)[j],]$snpsinldup,"\n",
gsub('(.{1,90})(\\s|$)', '\\1\n', paste("With other pheno: ",gwasmulti[gwasmulti$PHENO==rownames(gwas_surface_copy_use)[i]& gwasmulti$lab==colnames(gwas_surface_copy_use)[j],]$common)))
}else{
gwas_surface_copy_use[i, j] <-
paste("Phenotype = ", rownames(gwas_surface_copy_use)[i], '\n',"kbp position range= ", colnames(gwas_surface_use)[j],'\n',
"BP =",gwasmulti[gwasmulti$PHENO==rownames(gwas_surface_copy_use)[i]& gwasmulti$lab==colnames(gwas_surface_copy_use)[j],]$BP,"\n",
"Effect Size =",gwasmulti[gwasmulti$PHENO==rownames(gwas_surface_copy_use)[i]& gwasmulti$lab==colnames(gwas_surface_use)[j],]$BETA,' ',
"SE =",gwasmulti[gwasmulti$PHENO==rownames(gwas_surface_copy_use)[i]& gwasmulti$lab==colnames(gwas_surface_copy_use)[j],]$SE,'\n',
"-log10 (P-value) = ", gwas_surface_copy_use[i, j],'\n',
"SNPID =",gwasmulti[gwasmulti$PHENO==rownames(gwas_surface_copy_use)[i]& gwasmulti$lab==colnames(gwas_surface_copy_use)[j],]$SNP,'\n',
"Gene =",gwasmulti[gwasmulti$PHENO==rownames(gwas_surface_copy_use)[i]& gwasmulti$lab==colnames(gwas_surface_copy_use)[j],]$gene)
}
}
gwas_surface_use <- gwas_surface
upperlimit = max(gwas_surface_use, na.rm = TRUE)
gwas_surface_prime_use <- gwas_surface_use
gwas_surface_prime_use[,]= upperlimit + 2
colorheatmap = NULL
if (geneview == TRUE) {
print("GENE View is active")
for (i in 1:nrow(gwas_surface_copy_gene_use)) {
for (j in 1:ncol(gwas_surface_copy_gene_use)) gwas_surface_copy_gene_use[i,
j] <- paste(gwasmulti[gwasmulti$PHENO == rownames(gwas_surface_copy_gene_use)[i] &
gwasmulti$lab == colnames(gwas_surface_copy_gene_use)[j],
]$gene)
}
xxx <- apply(gwas_surface_copy_gene_use, 2, function(x) unique(unlist(strsplit(x[!is.na(x)],
";"))[duplicated(unlist(strsplit(x[!is.na(x)],
";")))]))
for (i in 1:nrow(gwas_surface_copy_gene_use)) {
for (j in 1:ncol(gwas_surface_copy_gene_use)) {
x <- gwas_surface_copy_gene_use[i, j]
xx <- unlist(strsplit(x[!is.na(x)], ";"))
if (length(intersect(xx, unlist(xxx[j]))) >
0 & gwasmulti[gwasmulti$PHENO == rownames(gwas_surface_copy_gene_use)[i] &
gwasmulti$lab == colnames(gwas_surface_copy_gene_use)[j],
]$logp > sliceval & !is.na(gwasmulti[gwasmulti$PHENO ==
rownames(gwas_surface_copy_gene_use)[i] &
gwasmulti$lab == colnames(gwas_surface_copy_gene_use)[j],
]$gene)) {
gwas_surface_copy_gene_use[i, j] <- max(gwas_surface_use)
}
else {
gwas_surface_copy_gene_use[i, j] <- sliceval
}
}
}
surfacecolourheat = gwas_surface_copy_gene_use
colorheatmap = "Viridis"
}
else {
surfacecolourheat = gwas_surface_use
}
p <- plot_ly() %>%
layout(title = 'PheGWAS',scene = list(camera = list(eye = list(x=2, y=2, z=2)),yaxis = list(title = "Phenotypes",tickvals = rownames(gwas_surface_use))
,xaxis =list(title= "100 kbp divisions" ,
type = 'category',tickmode = "array",tickvals = colnames(gwas_surface_use), ticks = "outside"),
zaxis =list(title= "-log 10(P)",range=c(sliceval,upperlimit + 2)),aspectmode = "manual",
aspectratio = list( x = 2, y = 1, z = .8)))
p <- p %>% add_surface(x=colnames(gwas_surface_use),y= rownames(gwas_surface_use),z = gwas_surface_use,opacity=.98,
surfacecolor = gwas_surface_use, cmin = sliceval ,cmax = upperlimit,
showscale = FALSE,text = gwas_surface_copy_use,hoverinfo = "text") %>%
add_trace(x=colnames(gwas_surface_use),y= rownames(gwas_surface_use),z = gwas_surface_prime_use,
type = "surface",surfacecolor = surfacecolourheat,showscale = FALSE,text = gwas_surface_copy_use,hoverinfo = "text",colorscale = colorheatmap)
p
}
}
######################################### ######################################### #########################################
############## FAST PROCESSPHEGWAS - THIS IS USED FOR PROCESISNG THE PHENOTYPE BEFORE THE LANDSCAPE FUNCTION
######################################### ######################################### #########################################
#' Prepare the dataframe to pass to landscape function
#'
#' @import tidyverse
#' @import tidyr
#' @importFrom reshape2 acast
#' @import factoextra
#' @import cluster
#' @import seriation
#' @import stringr
#' @import purrr
#' @import fs
#' @import dplyr
#' @importFrom data.table fread rbindlist
#' @param phenos Vector of names of dataframes that need to do iPheGWAS on.
#' @param dentogram to show structural differences
#' @param pathname To read summarystats from a folder instead of passing it as phenos
#' @return A processed dataframe to pass to PheGWAS landscape function/ dentogram if it's TRUE
#' @details Make sure there are no duplicate rsid's in any of the dataframe, If there aremake sure to resolve it before passing it to this function.
#' @author George Gittu
#' @examples
#' \dontrun{
#' phenos <- c("HDL", "LDL", "TRIGS", "TOTALCHOLESTROL")
#' ## This gives the order after applying iPheGWAS. We can pass this order to the landscape function
#' iphegwas(phenos)
#' ## This gives the dentogram as output
#' iphegwas(phenos,dentogram = TRUE)
#' ## Give folder path
#' pathname <- system.file("extdata", "samplesummary", package = "iphegwas")
#' iphegwas(pathname = pathname,dentogram = TRUE)
#' }
#' @export
iphegwas <- function(phenos,dentogram = FALSE,pathname = FALSE){
if(pathname != FALSE){
files <- fs::dir_ls(pathname)
list.df_pre <- purrr::map(files,fread)
phenos <- names(list.df_pre) <- str_extract(path_file(files),"[^.]+")
}else{
list.df_pre = mget(phenos,envir = .GlobalEnv)
}
for (df in 1:length(list.df_pre)){
if(length(grep("Z", colnames( list.df_pre[[df]]))) == 0){
action3 = . %>% subset(rsid %in% skeltonsnps$rsid) %>% merge(skeltonsnps, by= "rsid", how='inner') %>%
mutate(ZZ = beta/se) %>%
mutate(Z = ifelse(toupper(A1.x) == toupper(A1.y), ZZ * -1,ZZ)) %>%
rename(FEATURE =rsid) %>%
select("FEATURE","Z") %>%
distinct(FEATURE, .keep_all= TRUE)
list.df_pre[[df]] <- list.df_pre[[df]] %>% action3()
}else{
action3 = . %>% drop_na(rsid) %>% subset(rsid %in% skeltonsnps$rsid) %>% merge(skeltonsnps, by= "rsid", how='inner') %>%
rename(FEATURE =rsid) %>%
select("FEATURE","Z") %>%
distinct(FEATURE, .keep_all= TRUE)
list.df_pre[[df]] <- list.df_pre[[df]] %>% action3()
}
colnames(list.df_pre[[df]]) <- toupper(colnames(list.df_pre[[df]]))
}
xfull <- rbindlist(list.df_pre,idcol = "PHENOS")
gwas_surface_pre <- xfull[,c("PHENOS","FEATURE","Z")]
gwas_surface <- acast(gwas_surface_pre, PHENOS~FEATURE, value.var="Z") #>>>>>>>>>>>>>Z ang logcal
gwas_surface <- gwas_surface[ , colSums(is.na(gwas_surface)) == 0]
res.dist <- get_dist(gwas_surface, method = "pearson") #>>>>>>>>>>>>>>>>>>
correlation.coefficent <- as.matrix(1 - res.dist)[phenos,phenos]
A <- abs(correlation.coefficent)
AA <- as.dist(1- A)
res.hc1 <- hclust(d = AA,method = "mcquitty") #>>
res.hc1 <- reorder(res.hc1, AA, method = "OLO")
if(dentogram){
## Finding optimal number of cluster using silhouette
silh <- fviz_nbclust(A, FUN = hcut, method = "silhouette",k.max=nrow(A)-1)
kcluster <- as.numeric(silh$data[which.max(silh$data$y),]$clusters)
suppressWarnings({fviz_dend(res.hc1, k = kcluster, # Cut in four groups
cex = .6, # label size
k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
color_labels_by_k = TRUE, # color labels by groups
rect = TRUE,
main = "Dentogram iPheGWAS"# Add rectangle around groups,
)})
}else{
phenos[res.hc1$order]
}
}
#' Function to use LDSC
#' @import tidyverse
#' @import purrr
#' @import factoextra
#' @import cluster
#' @import seriation
#' @import fs
#' @import stringr
#' @import corrplot
#' @import dplyr
#' @import glue
#' @param pathname The folder that contains the summary stats to use for LDSC
#' @param ldscpath Path to the LDSC GitHub folder in your computer
#' @param dentogram If TRUE returns dentogram using LDSC method
#' @param plot If TRUE returns rg correlation plot using LDSC method
#'
#' @return Default LDSC ordering/ dentogram is TRUE/ rg plot if TRUE
#' @details if used for LDSC ordering then summary stats file names should match with the phenos that you are passing to the landscape function.
#' @details You should have a ldsc conda environment that you use for running LDSC. Details here https://github.com/bulik/ldsc
#' @author George Gittu
#' @examples
#' \dontrun{
#' ## Giving absolute file path
#' pathname <- "/Users/ggeorg02/Desktop/databetasefiltered/files_ldsc/"
#' ldscpath <- "/Users/ggeorg02/Desktop/GitHub/ldsc"
#' # This gives the order after applying LDSC. We can pass this order to the landscape function
#' ldscmod(pathname,ldscpath)
#' ldscmod(pathname,ldscpath,dentogram = TRUE)
#' ldscmod(pathname,ldscpath,plot = TRUE)
#' }
#' @export
ldscmod <- function(pathname,ldscpath, dentogram = FALSE, plot = FALSE){
if(!exists("correlationmatrix")){
munge_summary <- function(pathfile,ldscpath){
hm3 <- system.file("extdata", "w_hm3.snplist", package = "iphegwas")
command <- glue::glue(
"{ldscpath}/munge_sumstats.py ",
"--sumstats {pathfile} ",
"--chunksize 500000 ",
"--out {pathfile} ",
"--merge-alleles {hm3}",
)
# assumption user use ldsc conda env
system(paste("source ~/.zshrc && conda activate ldsc && ",command),intern=TRUE,
ignore.stderr = TRUE,ignore.stdout = TRUE)
paste0("Munged ",path_file(pathname))
}
ldsc_rg <- function(pathname, ldscpath) {
files_sumstats <- as.character(dir_ls(pathname,type = "file",glob = "*.sumstats.gz"))
ld <- system.file("extdata", "eur_w_ld_chr", package = "iphegwas")
i = 0
while( i < length(files_sumstats) ){
rg_val <- stringr::str_flatten(files_sumstats,collapse = ",")
# print(glue::glue("Processing for {files_sumstats[1]}"))
command <- glue::glue(
"{ldscpath}/ldsc.py ",
"--rg {rg_val} ",
"--ref-ld-chr {ld}/ ",
"--w-ld-chr {ld}/ ",
"--out {files_sumstats[1]}.ldsc"
)
files_sumstats <- c(files_sumstats[-1],files_sumstats[1])
i = i + 1
# assumption user use ldsc conda env
system(paste("source ~/.zshrc && conda activate ldsc && ",command),intern=TRUE,
ignore.stderr = TRUE,ignore.stdout = TRUE)
}
}
rg_extract <- function(fileline){
x <- readLines(fileline)
selected_lines <- str_subset(x,str_replace(fileline,".ldsc.log",""))[-c(1:3)]
# getting the numbers to extract the first element
# selected_names <- unlist(purrr::map(selected_lines,strsplit,"\\s+"),recursive = FALSE)
## I can flatten
selected_names <- purrr::flatten(purrr::map(selected_lines,strsplit,"\\s+"))
# selected_numbers <- map(selected_lines,~discard(as.numeric(unlist(str_split(.,pattern = " "))),is.na))
# selecting the 3rd elelemt from list of list which is the rg
rg <- as.numeric(map_chr(selected_names, 3))
phenosrow <- purrr::map_chr(purrr::map_chr(selected_names, 1),~str_replace(path_file(.),".tsv.sumstats.gz",""))
phenoscol <- purrr::map_chr(purrr::map_chr(selected_names, 2),~str_replace(path_file(.),".tsv.sumstats.gz",""))
correlationmatrix[unique(phenosrow),phenoscol] <<- rg
diag(correlationmatrix) <<- 1
.GlobalEnv$correlationmatrix <- correlationmatrix
correlationmatrix
}
files <- dir_ls(pathname,type = "file")
purrr::map(files,munge_summary,ldscpath)
ldsc_rg(pathname,ldscpath)
files_ldsc <- as.character(dir_ls(pathname,type = "file",glob = "*.ldsc.log"))
phenos <- str_extract(path_file(files_ldsc),"[^.]+")
correlationmatrix <- data.frame(matrix(nrow = length(files_ldsc),ncol = length(files_ldsc)))
rownames(correlationmatrix) <- colnames(correlationmatrix) <- phenos
purrr::map(files_ldsc,rg_extract)
}
files_ldsc <- as.character(dir_ls(pathname,type = "file",glob = "*.ldsc.log"))
phenos <- str_extract(path_file(files_ldsc),"[^.]+")
if(plot){
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
col <- colorRampPalette(c( "#4477AA", "#77AADD", "#FFFFFF","#EE9988" ,"#BB4444"))
return(suppressWarnings({corrplot(as.matrix(correlationmatrix), method="circle", col=col(200),
addCoef.col = "black", # Add coefficient of correlation
tl.col="black", tl.srt=45, #Text label color and rotation
is.corr = FALSE)}))
}
B <- abs(correlationmatrix)
BB <- as.dist(1- B)
res.hc1 <- hclust(d = BB,method = "mcquitty") #>>
res.hc1 <- reorder(res.hc1, BB, method = "OLO")
if(dentogram){
## Finding optimal number of cluster using silhouette
silh <- fviz_nbclust(B, FUN = hcut, method = "silhouette",k.max=nrow(B)-1)
kcluster <- as.numeric(silh$data[which.max(silh$data$y),]$clusters)
dendo <- fviz_dend(res.hc1, k = kcluster, # Cut in four groups
cex = .6, # label size
k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
color_labels_by_k = TRUE, # color labels by groups
rect = TRUE,
main = "Dentogram LDSC"# Add rectangle around groups,
)
return(dendo)
} else {
return(phenos[res.hc1$order])
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.