# Script contains helper functions for drug response prediction
# label the drugs with proper u
giveDrugLabel3 = function(drid, dtab=BloodCancerMultiOmics2017::drugs,
ctab=BloodCancerMultiOmics2017::conctab) {
vapply(strsplit(drid, "_"), function(x) {
k <- paste(x[1:2], collapse="_")
paste0(dtab[k, "name"], " ",
switch(x[3], "1:5"="c1:5", "4:5"="c4:5",
paste0(ctab[k, as.integer(x[3])], " \u00B5","M")))
}, character(1))
}
# select features from the lpd object to use as response and predictors in the model
featureSelectionForLasso = function(objective, predictors, lpd) {
# quiets concerns of R CMD check "no visible binding for global variable"
dataType=NULL
dimobj = dimnames(objective)
objectiveName = dimobj[[1]][1]
#design matix
fx = t(Biobase::exprs(lpd)[predictors, ])
# check the objective
stopifnot(identical(rownames(fx), dimobj[[2]]))
fy = objective[,,drop=TRUE]
# select predictors
fx = fx[, (fData(lpd)[predictors, "type"] %in% c("Methylation_Cluster",
"IGHV","gen", "pretreat")),
drop=FALSE]
# make sure that there are no NAs in the table
stopifnot(all(!is.na(fx)))
# print table with numer of predictors from each group
# message("Objective: ", objectiveName, "\n")
prefix = paste(ifelse(dataType %in% unique(fData(lpd)[colnames(fx), "type"]),
names(dataType), "_"), collapse="")
n = length(unique(fy))
family = "gaussian"
return(list(fx=fx, fy=fy, objectiveName=objectiveName, prefix=prefix,
family=family))
}
# plotting the predictions
plotPredictions = function(fx, fy, objective, pred, coeffs, lpd, nm, lim,
objectiveName, colors) {
# quiets concerns of R CMD check "no visible binding for global variable"
X=NULL; Y=NULL; Measure=NULL
# PREPARE DATA FOR PLOTTING
stopifnot( identical(dim(pred), c(length(fy), 1L)), identical(rownames(pred),
names(fy)) )
ordy <- order(fy, pred[, 1])
# design matrix of selected predictors (unscaled)
mat <- t(fx[ordy, names(coeffs), drop=FALSE])
# human-readable names where available & drugs with concentrations
nicename = names(coeffs) %>% `names<-`(names(coeffs))
idx = grepl("D_", nicename)
nicename[idx] = giveDrugLabel3(nicename[idx])
nicename[!idx] = fData(lpd)[names(nicename)[!idx], "name"] %>%
`names<-`(names(nicename)[!idx])
nicename = ifelse(!is.na(nicename) & (nicename!=""), nicename, names(nicename))
nicename = gsub("Methylation_Cluster", "Methylation cluster", nicename)
rownames(mat) = nicename[rownames(mat)]
# prepare plot title
title=nm
# CREATE INDIVIDUAL PARTS OF THE FIGURE
# bar plot
stopifnot(all(coeffs<lim & coeffs>-lim))
part1df = data.frame(coeffs,
nm=factor(names(coeffs), levels=names(rev(coeffs))))
part1df$col= ifelse(rownames(part1df)=="IGHV", "I",
ifelse(rownames(part1df)=="Methylation_Cluster", "M",
ifelse(rownames(part1df)=="Pretreatment", "P","G")))
part1 = ggplot(data=part1df, aes(x=nm, y=coeffs, fill=col)) +
geom_bar(stat="identity", colour="black", position = "identity",
width=.66, size=0.2) +theme_bw() +
geom_hline(yintercept=0, size=0.3) + scale_x_discrete(expand=c(0,0.5)) +
scale_y_continuous(expand=c(0,0)) + coord_flip(ylim=c(-lim,lim)) +
theme(panel.grid.major=element_blank(),
panel.background=element_blank(),
panel.grid.minor = element_blank(),
axis.text=element_text(size=8),
panel.border=element_blank()) +
xlab("") + ylab("Model Coefficients") +
geom_vline(xintercept=c(0.5), color="black", size=0.6)+
scale_fill_manual(c("M", "I", "G", "P"),
values=c(M=colors[["M"]][2],
I=colors[["I"]],
G=colors[["G"]],
P=colors[["P"]]))
# heat map
# mat contains selected predictors with status for each patient
# (e.g. 0/1 for mutations and IGHV, 0/0.5/1 for M)
# to assign colors Gosia added 5 to meth and 2 to IGHV values,resuting in
# 5 LP, 5.5 IP, 6 HP, 2 unmut IGHV or mut, 4 IGHV mut, 3 mut, 7 pre-treatment
idx = grep("Methylation cluster", rownames(mat))
mat[idx,] = mat[idx,]+5
## ighv
idx = grep("IGHV", rownames(mat))
mat[idx,] = (mat[idx,]*2)+2
## gene
rnm = sapply(rownames(mat), function(nm) strsplit(nm," ")[[1]][1])
idx = rownames(fData(lpd))[fData(lpd)$type=="gen" & rownames(lpd) %in% rnm]
idx = match(idx, rnm)
mat[idx,] = mat[idx,]+2
## pretreat
idx = grep("Pretreatment", rownames(mat))
mat[idx,] = ifelse(mat[idx,]==0,2,7)
mat2 = meltWholeDF(mat)
mat2$Measure = factor(mat2$Measure, levels=sort(unique(mat2$Measure)))
mat2$X = factor(mat2$X, levels=colnames(mat))
mat2$Y = factor(mat2$Y, levels=rev(rownames(mat)))
part2 = ggplot(mat2, aes(x=X, y=Y, fill=Measure)) +
geom_tile() + theme_bw() +
scale_fill_manual(name="Mutated",
values=c(`2`="gray96", `3`=paste0(colors["G"], "E5"),
`5`=colors[["M"]][1], `5.5`=colors[["M"]][2],
`6`=colors[["M"]][3], `7`=colors[["P"]],
`4`=paste0(colors["I"],"E5")), guide=FALSE) +
scale_y_discrete(expand=c(0,0)) +
theme(axis.text.y=element_text(hjust=0, size=14),
axis.text.x=element_blank(),
axis.ticks=element_blank(),
panel.border=element_rect(colour="gainsboro"),
plot.title=element_text(size=12),
legend.title=element_text(size=12),
legend.text=element_text(size=12),
panel.background=element_blank(),
panel.grid.major=element_blank(),
panel.grid.minor=element_blank()) +
xlab("patients") + ylab("") + ggtitle(title)
if(length(levels(mat2$Y)) > 1) {
part2 = part2 + geom_hline(yintercept=seq(1.5, length(levels(mat2$Y)), 1),
colour="gainsboro", size=0.2)
}
# scatter plot
mat3 = fy[colnames(mat)]
mat3 = data.frame(X=factor(names(mat3), levels=names(mat3)), Y=mat3*100)
Yrange = range(mat3$Y)
Yhangs = diff(Yrange)*0.05
Ylims = c(Yrange[1]-Yhangs, Yrange[2]+Yhangs)
Yran = diff(Yrange)
Ybreaks = if(Yran<=15) 5 else if(Yran>15 & Yran<=30) 10 else if(Yran>30 & Yran<=40) 15 else if(Yran>40 & Yran<=60) 20 else 40
part4 = ggplot(mat3, aes(x=X, y=Y)) +
geom_point(shape=21, fill="dimgrey", colour="black", size=1.2) +
theme_bw() +
theme(panel.grid.minor=element_blank(),
panel.grid.major.x=element_blank(),
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_text(size=8),
panel.border=element_rect(colour="dimgrey", size=0.1),
panel.background=element_rect(fill="gray96")) +
ylab("Viability [%]") +
scale_y_continuous(limits=c(Yrange[1]-Yhangs, Yrange[2]+Yhangs),
breaks=seq(-200,200,Ybreaks))
# MERGE PARTS OF THE FIGURE
# construct the gtable
wdths = c(4, 0.5, 0.05*ncol(mat), 0.05)
hghts = c(0.3, 0.25*nrow(mat), 0.1, 0.4, 0.35)
gt = gtable(widths=unit(wdths, "in"), heights=unit(hghts, "in"))
## make grobs
gg1 = ggplotGrob(part1)
gg2 = ggplotGrob(part2)
gg4 = ggplotGrob(part4)
## fill in the gtable
gt = gtable_add_grob(gt, gtable_filter(gg1, "panel"), 2, 1) # add boxplot
gt = gtable_add_grob(gt, gtable_filter(gg2, "panel"), 2, 3) # add heatmap
gt = gtable_add_grob(gt, gtable_filter(gg4, "panel"), 4, 3) # add scatterplot
gt = gtable_add_grob(gt, gtable_filter(gg2, "xlab"), 5, 3)
gt = gtable_add_grob(gt, gg2$grobs[[whichInGrob(gg2, "title")]], 1, 3)
gt = gtable_add_grob(gt, gg4$grobs[[whichInGrob(gg4, "axis-l")]], 4, 2)
gt = gtable_add_grob(gt, gg1$grobs[[whichInGrob(gg1, "axis-b")]], 3, 1)
gt = gtable_add_grob(gt, gtable_filter(gg1, "xlab-b"), 4, 1)
# now complicated: add the axis-l labels from gg2
gia = which(gg2$layout$name == "axis-l")
gga = gg2$grobs[[gia]]
gax = gga$children[[2]]
gax$widths = rev(gax$widths)
gax$grobs = rev(gax$grobs)
gt = gtable_add_cols(gt, gg2$widths[gg2$layout[gia, ]$l])
gt = gtable_add_grob(gt, gax, 2, 5)
wdth = convertUnit(gt$widths, "in", valueOnly=TRUE)[5]
# add column on the right of appropriate width
maxwdth = 2
gt = gtable_add_cols(gt, unit(maxwdth-wdth, "in"))
return(list(plot=gt, width=sum(wdths)+maxwdth, height=sum(hghts),
name=make.names(objectiveName)))
}
# main function to fit Lasso and produce barplots to find genetic
# determinants of drug response
doLasso = function(objective, predictors, lpd,suffix="",
nm=NA, lim=0.21, ncv=10, nfolds=10, std=FALSE,
adaLasso = TRUE, colors) {
#construct design and response matrix
out = featureSelectionForLasso(objective, predictors, lpd)
# for simplification
fy = out[["fy"]]
fx = out[["fx"]]
family = out[["family"]]
objectiveName = out[["objectiveName"]]
prefix = out[["prefix"]]
fxdim = dim(fx)
print(sprintf("Prediction for: %s; #samples: %d; #features: %d",
objectiveName, fxdim[1], fxdim[2]))
# adaptive lasso for a more stable feature selection
set.seed(19087)
if(adaLasso){
if(ncol(fx)>= nrow(fx)) {
RidgeFit <- cv.glmnet(fx, fy, alpha = 0, standardize = std,
family = family, nfolds=10)
# wRidge <- pmin(1/abs((coef(RidgeFit, s = RidgeFit$lambda.min))), 1e+300)
wRidge <- 1/abs(coef(RidgeFit, s = RidgeFit$lambda.min))
wRidge <- wRidge[-1]
weights <- wRidge
} else {
lmFit <- lm(fy ~ fx)
# wLM <- pmin(1/abs(coefficients(lmFit)[-1]), 1e+300)
wLM <- 1/abs(coefficients(lmFit)[-1])
weights <- wLM
}
excludedFeatures <- which(weights==Inf)
} else {
weights <- rep(1, ncol(fx))
excludedFeatures <- NULL
}
#perform repeated cross-validation to find an optimal penalisatio
#parameter minimizing the cross-validated MSE
cv.out <- cvr.glmnet(Y=fy, X=fx, family=family,
alpha=1, nfolds=nfolds,
ncv=ncv, standardize=std,
exclude = excludedFeatures,
type.measure = "mse", penalty.factor = weights)
# #fit Lasso model for optimal lambda
fit = glmnet(y=fy, x=fx, family=family,
alpha=1,
standardize=std,
exclude = excludedFeatures,
lambda=cv.out$lambda, penalty.factor = weights)
#get optimal lambda and corresponding predictors with coefficients
lambda <- cv.out$lambda[which.min(cv.out$cvm)]
coeffs <- coef(fit, lambda)
coeffs_all <- coeffs
coeffs <- as.vector(coeffs) %>%
`names<-`(rownames(coeffs)) # cast from sparse matrix to ordinary vector
coeffs <- coeffs[ coeffs!=0 ]
# remove intercept term
stopifnot(names(coeffs)[1]=="(Intercept)")
if (length(coeffs) > 1) {
coeffs <- coeffs[-1]
} else {
print("No (0) predictors for given parameters!")
return(0)
}
coeffs <- sort(coeffs)
# Residuals in the model
pred <- predict(fit, newx = fx, s = lambda, type = "response")
residuals <- pred[,1]-fy
plot = plotPredictions(fx, fy, objective, pred, coeffs, lpd, nm, lim,
objectiveName, colors)
return(list(residuals=residuals, coeffs=coeffs_all, plot=plot))
}
# Make list of predictors for the given lpd
makeListOfPredictors = function(lpd) {
return(list(
predictorsM = rownames(fData(lpd))[fData(lpd)$type=="Methylation_Cluster"],
predictorsG = rownames(fData(lpd))[fData(lpd)$type=="gen"],
predictorsI = rownames(fData(lpd))[fData(lpd)$type=="IGHV"],
predictorsP = rownames(fData(lpd))[fData(lpd)$type=="pretreat"]
))
}
# Pre-process data: explore what is available & feature selection
prepareLPD = function(lpd, minNumSamplesPerGroup, withMC=TRUE) {
# PRETREATMENT
# update the expression set by adding row about pretreatment
pretreated <- t(matrix(ifelse(
BloodCancerMultiOmics2017::patmeta[colnames(lpd),
"IC50beforeTreatment"], 0, 1),
dimnames=list(colnames(lpd), "Pretreatment")))
fdata_pretreat <- data.frame(name=NA, type="pretreat", id=NA, subtype=NA,
row.names="Pretreatment")
lpd <- ExpressionSet(assayData=rbind(Biobase::exprs(lpd), pretreated),
phenoData=new("AnnotatedDataFrame", data=pData(lpd)),
featureData=new("AnnotatedDataFrame",
rbind(fData(lpd), fdata_pretreat)))
# METHYLATION
Biobase::exprs(lpd)[fData(lpd)$type=="Methylation_Cluster",] =
Biobase::exprs(lpd)[fData(lpd)$type=="Methylation_Cluster",]/2
# IGHV: changing name from Uppsala to IGHV
rownames(lpd)[which(fData(lpd)$type=="IGHV")] = "IGHV"
# GENETICS
# changing name od del13q any to del13q & remove the rest of del13q (mono, single)
idx = which(rownames(lpd) %in% c("del13q14_bi", "del13q14_mono"))
lpd = lpd[-idx,]
rownames(lpd)[which(rownames(lpd)=="del13q14_any")] = "del13q14"
# remove CHROMOTHRYPSIS
if("Chromothripsis" %in% rownames(lpd))
lpd = lpd[-which(rownames(lpd)=="Chromothripsis"),]
# SELECT GOOD SAMPLES
idx = !is.na(Biobase::exprs(lpd)["IGHV",])
if(withMC) idx = idx & !is.na(Biobase::exprs(lpd)["Methylation_Cluster",])
# cut out the data to have information about Methylation_Cluster and IGHV for all samples
lpd = lpd[, idx]
# for the genetics - remove the genes which do not have enough samples
which2remove = names(
which(!apply(Biobase::exprs(lpd)[rownames(lpd)[fData(lpd)$type %in%
c("gen")],], 1, function(cl) {
if(all(is.na(cl))) return(FALSE)
if(sum(is.na(cl)) >= 0.1*length(cl)) return(FALSE)
tmp = table(cl)
return(length(tmp)==2 & all(tmp>=minNumSamplesPerGroup))
})))
lpd = lpd[-match(which2remove, rownames(lpd)),]
# for the ones with NA put 0 instead
featOther = rownames(lpd)[fData(lpd)$type %in%
c("IGHV", "Methylation_Cluster", "gen", "viab",
"pretreat")]
tmp = Biobase::exprs(lpd)[featOther,]
tmp[is.na(tmp)] = 0
Biobase::exprs(lpd)[featOther,] = tmp
return(lpd)
}
# wrapper functions to do Lasso model fitting, plotting and prediction
makePredictions = function(drs, frq, lpd, predictorList, lim, std=FALSE,
adaLasso = TRUE, colors) {
res = lapply(names(drs), function(typ) {
setNames(lapply(drs[[typ]], function(dr) {
if(typ=="1:5")
nm <- paste0(BloodCancerMultiOmics2017::drugs[dr, "name"],
" (average of all concentrations)")
else if(typ=="4:5")
nm <- paste0(BloodCancerMultiOmics2017::drugs[dr, "name"],
" (average of ", paste(
BloodCancerMultiOmics2017::conctab[dr,4:5]*1000,
collapse = " and "), " nM)")
# G & I & M & P
doLasso(Biobase::exprs(lpd)[grepl(dr, rownames(lpd)) &
fData(lpd)$subtype==typ,, drop=FALSE],
predictors=with(predictorList,
c(predictorsI, predictorsG, predictorsM,
predictorsP)),
lpd=lpd,
suffix=paste0("_","th0", "_c",gsub(":","-",typ)),
nm=nm, lim=lim, colors=colors)
}), nm=drs[[typ]])
})
return(res)
}
# Function to plot the legends
makeLegends = function(legendFor, colors) {
# quiets concerns of R CMD check "no visible binding for global variable"
x=NULL; y=NULL
# select the colors needed
colors = colors[names(colors) %in% legendFor]
nleg = length(colors)
wdths = rep(2, length.out=nleg); hghts = c(2)
gtl = gtable(widths=unit(wdths, "in"), heights=unit(hghts, "in"))
n=1
# M
if("M" %in% names(colors)) {
Mgg = ggplot(data=data.frame(x=1, y=factor(c("LP","IP","HP"),
levels=c("LP","IP","HP"))),
aes(x=x, y=y, fill=y)) + geom_tile() +
scale_fill_manual(name="Methylation cluster",
values=setNames(colors[["M"]], nm=c("LP","IP","HP"))) +
theme(legend.title=element_text(size=12),
legend.text=element_text(size=12))
gtl = gtable_add_grob(gtl, gtable_filter(ggplotGrob(Mgg), "guide-box"), 1, n)
n = n+1
}
# I
if("I" %in% names(colors)) {
Igg = ggplot(data=data.frame(x=1,
y=factor(c("unmutated","mutated"),
levels=c("unmutated","mutated"))),
aes(x=x, y=y, fill=y)) + geom_tile() +
scale_fill_manual(name="IGHV",
values=setNames(c("gray96",
paste0(colors["I"], c("E5"))),
nm=c("unmutated","mutated"))) +
theme(legend.title=element_text(size=12),
legend.text=element_text(size=12))
gtl = gtable_add_grob(gtl, gtable_filter(ggplotGrob(Igg), "guide-box"), 1, n)
n = n+1
}
# G
if("G" %in% names(colors)) {
Ggg = ggplot(data=data.frame(x=1,
y=factor(c("wild type","mutated"),
levels=c("wild type","mutated"))),
aes(x=x, y=y, fill=y)) + geom_tile() +
scale_fill_manual(name="Gene",
values=setNames(c("gray96",
paste0(colors["G"], c("E5"))),
nm=c("wild type","mutated"))) +
theme(legend.title=element_text(size=12),
legend.text=element_text(size=12))
gtl = gtable_add_grob(gtl, gtable_filter(ggplotGrob(Ggg), "guide-box"), 1, n)
n = n+1
}
# P
if("P" %in% names(colors)) {
Pgg = ggplot(data=data.frame(x=1,
y=factor(c("no","yes"),
levels=c("no","yes"))),
aes(x=x, y=y, fill=y)) + geom_tile() +
scale_fill_manual(name="Pretreatment",
values=setNames(c(colors[["P"]], "white"),
nm=c("yes","no"))) +
theme(legend.title=element_text(size=12),
legend.text=element_text(size=12))
gtl = gtable_add_grob(gtl, gtable_filter(ggplotGrob(Pgg), "guide-box"), 1, n)
n = n+1
}
return(list(plot=gtl, width=sum(wdths), height=sum(hghts)))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.