#libs ####
library("plyr")
library("lattice")
library("tidyr")
library("qvalue")
library("MASS")
# exploratory plotting functions for raw data ####
plot.raw <- function(df, ...){
xyplot(FG.Quantity~R.Condition | EG.PrecursorId, main="Raw scale measurements", panel = function(x, y, ...){
panel.xyplot(x, y, ...)
panel.loess(x, y, ...)
panel.lmline(x, y, ...)
}, data = df, auto.key = TRUE)
}
plot.ylog <- function(df, ...){
xyplot(log2(FG.Quantity)~R.Condition | EG.PrecursorId, panel = function(x, y, ...){
panel.xyplot(x, y, ...)
panel.loess(x, y, ...)
panel.lmline(x, y, ...)
}, data = df, auto.key = TRUE, main="log-transformed measurements")
}
plot.xylog <- function(df, v = 2, ...){
if(v==1){
xyplot(log2(FG.Quantity)~log2(R.Condition), groups = EG.PrecursorId, panel = function(x, y, ...){
panel.xyplot(x, y, ...)
panel.loess(x, y, ...)
#panel.lmline(x, y, ...)
}, data = df, auto.key = TRUE, main="log-log transformed data (grouped)")
}
else{
xyplot(log2(FG.Quantity)~log2(R.Condition) | EG.PrecursorId, panel = function(x, y, ...){
panel.xyplot(x, y, ...)
panel.loess(x, y, ...)
#panel.lmline(x,y, ...)
}, data = df, auto.key = TRUE, main="log-log transformed data (conditioned)")
}
}
# co-ploting functions for fitted models and data
plot.linear.model <- function(lm, df){
#expects linear model object and df containing observations used for model fitting
#limitations: uses only coef 1 and 2 for drawing model line
xyplot(log2(FG.Quantity)~R.Condition|EG.PrecursorId, panel = function(x, y, ...){
panel.xyplot(x, y)
panel.abline(coef = coef(lm))
}, data=df)
}
plot.loglog.linear.model <- function(lm, df, c =1){
#expects linear model object and df containing observations used for model fitting
#limitations: uses only coef 1 and 2 for drawing model line
xyplot(log2(FG.Quantity)~log2(R.Condition+c)|EG.PrecursorId, panel = function(x, y, ...){
panel.xyplot(x, y)
panel.abline(coef = coef(lm))
}, data=df)
}
#ploting functions for model parameter estimates
vplot <- function(df, fc = 1){
#assumes df to contain parameter estimates and q-values and grouping var
#red line is 5% FDR
#green line is 1% FDR
xyplot(-log10(q.value)~Estimate | groups, panel = function(x,y){
panel.xyplot(x, y)
panel.abline(v=c(-fc, fc))
panel.abline(h=-log10(0.05), col="red")
panel.abline(h=-log10(0.01), col="green")},
data = df, main="Vulcano plot")
}
zplot <- function(x){
#x is ecpected to be a ecdf instance
plot(x, pch=1)
abline(h=0.01, col="red")
abline(h=0.99, col="blue")
abline(h=0.5, col="black")
abline(v=0, col="black")
}
#########################
# prepare input data ####
#########################
#read in Peptide Quant Default export from Spectronaut 9 ####
# pd workflow (3 pass SEQUESTHT search + Percolator validation) was executed using all DDA runs
# Spectronaut 9 was used to query DIA maps and resulst were exported using default scheme
PepQuantDef <- read.csv("~/Documents/RStudio/p2095_CasTC/CasTC_all_in_edition_PeptideQuantDefault.csv", sep=";")
str(PepQuantDef)
#diagnostic plots
bwplot(log10(FG.Quantity)~R.FileName, data = PepQuantDef, main="Signal distribution per run")
histogram(~EG.Qvalue, data = PepQuantDef, main="Q-value distribution per runs")
# load pd 2.1 results export for Peptide Group level####
# spectronaut assays were build using pd results file and fasta DB used for SEQUESTHT searches
PeptideGroups <- read.delim("~/Documents/RStudio/p2095_CasTC/CasTC_3pass_analysis_PeptideGroups.txt")
str(PeptideGroups)
# load degrabase 1.0 ####
# was downloaded from degrabase website and saved as csv
degrabase <- read.csv("~/Documents/RStudio/p2095_CasTC/degrabase_071712.csv")
str(degrabase)
get.stripped.peptides <- function(x){
y <- sub("^.*-", "", as.character(x), perl = TRUE)
z <- sub("\\(.*\\)", "", y, perl=TRUE)
return(z)
}
degrabase <- mutate(degrabase, stripped.peptide = get.stripped.peptides(degrabase$peptide))
####################
# model fitting ####
####################
# LS - ordinary linear regression using least squares estimation ####
# Version 1 : incl. charge as independent variable
fit.lm <- function(df){
df <- droplevels(df)
if (nlevels(df$EG.PrecursorId)>1) {
lm(log2(FG.Quantity)~R.Condition + EG.PrecursorId, data = df)
}
else {
lm(log2(FG.Quantity)~R.Condition, data = df)
}
}
model.list <- dlply(.variables = "EG.ModifiedSequence", .fun = failwith(NA, fit.lm), .data = PepQuantDef, .parallel=FALSE, .progress = "text")
# Version 2 : fit a model for every precursor using split-apply-combine strategy
fit.lm <- function(df){
df <- droplevels(df)
return(lm(log2(FG.Quantity)~R.Condition, data = df))
}
model.list <- dlply(.variables = "EG.PrecursorId", .fun = failwith(NA, fit.lm), .data = PepQuantDef, .parallel=FALSE, .progress = "text")
head(model.list)
save(model.list, file = "lm_models.RData")
# WLS - estimation using weighted least squares according to signal intensity ####
fit.w.lm <- function(df){
df <- droplevels(df)
if (nlevels(df$EG.PrecursorId)>1) {
lm(log2(FG.Quantity)~R.Condition + EG.PrecursorId, weights = FG.Quantity, data = df)
}
else {
lm(log2(FG.Quantity)~R.Condition, weights = FG.Quantity, data = df)
}
}
w.model.list <- dlply(.variables = "EG.ModifiedSequence", .fun = failwith(NA, fit.w.lm), .data = PepQuantDef, .parallel=FALSE, .progress = "text")
head(w.model.list)
save(w.model.list, file = "wlm_models.RData")
# RL - robust linear regression using M estimators ####
# not there yet
#log-log model ####
#log2 transformation of response and explanatory variable
#normaly used as a transformation for power functions : y = axˆb
# parameter a : slope
# parameter b: shape
#since log(0) is not def. -> log(0+c)
fit.loglog.lm <- function(df, c = 1){
df <- droplevels(df)
if (nlevels(df$EG.PrecursorId)>1) {
lm(log2(FG.Quantity)~log2(I(R.Condition+c)) + EG.PrecursorId, data = df)
}
else {
lm(log2(FG.Quantity)~log2(I(R.Condition+c)), data = df)
}
}
loglog.model.list <- dlply(.variables = "EG.ModifiedSequence", .fun = failwith(NA, fit.loglog.lm), .data = PepQuantDef, .parallel=FALSE, .progress = "text")
save(loglog.model.list, file = "logloglm_models.RData")
###################
# test section ####
###################
iRT.protein <- c("LGGNEQVTRYILAGVENSKGTFIIDPGGVIRGTFIIDPAAVIRGAGSSEPVTGLDAKTPVISGGPYEYRVEATFGVDESNAKTPVITGAPYEYRDGLDAASYYAPVRADVTPADFSEWSKLFLQFGAQGSPFLK")
iRT <- gsub(x = iRT.protein, pattern = "R", replacement = "R,")
iRT <- gsub(x = iRT, pattern = "K", replacement = "K,")
iRT <- unlist(strsplit(iRT, split = ","))
# easy to expand to protein digest function
#digest.protein <- function(x, spec="RK"){
#
#}
poi <- "_[+28]VSK[+28]PDLTAALR_" #two charge states
poi <- "_[+28]GQVITIGNER_" #one charge state
# select subset according to modified sequence
poi.subset <- droplevels(PepQuantDef[PepQuantDef$EG.ModifiedSequence == poi,])
summary(poi.subset)
# select subset according to precursor ID
poi <- "_[+28]GQVITIGNER_.2"
poi.subset <- droplevels(PepQuantDef[PepQuantDef$EG.PrecursorId == poi,])
# select according to stripped sequence
poi.subset <- droplevels(PepQuantDef[PepQuantDef$EG.StrippedSequence %in% iRT,])
plot.raw(poi.subset)
plot.xylog(poi.subset)
lm1 <- fit.lm(poi.subset)
lm2 <- fit.w.lm(poi.subset)
lm3 <- fit.loglog.lm(poi.subset)
summary(lm1)
summary(lm2)
summary(lm3)
#is the shape parameter of the log-log model sig different from 1?
t.value <- abs(0.38284-1)/0.07296
dt(t.value, df=1) #not sure about correct df
#compare the difference between log-log model and linear model in raw space
plot(FG.Quantity~R.Condition, data=poi.subset[poi.subset$EG.PrecursorId=="_[+28]VSK[+28]PDLTAALR_.2",])
plot(FG.Quantity~R.Condition, data=poi.subset[poi.subset$EG.PrecursorId=="_[+28]VSK[+28]PDLTAALR_.3",])
abline(lm(FG.Quantity~R.Condition, data=poi.subset[poi.subset$EG.PrecursorId=="_[+28]VSK[+28]PDLTAALR_.2",]))
abline(lm(FG.Quantity~R.Condition, data=poi.subset[poi.subset$EG.PrecursorId=="_[+28]VSK[+28]PDLTAALR_.3",]))
xv <- seq(0,5,0.1)
a <- coef(lm3)[1]
a <- 2^a
b <- coef(lm3)[2]
yv <- a*xv^b
lines(xv, yv, col="red")
##########################
# Examine the results ####
##########################
# helper functions
get.R2 <- function(linear.model){summary(linear.model)$r.squared}
get.model.p <- function(linear.model){
f.stat <- summary(linear.model)$fstatistic
if(class(f.stat) == "numeric"){
p <- pf(f.stat[1], f.stat[2], f.stat[3], lower.tail=FALSE)
attributes(p) <- NULL
return(p)
}
else return(NA)
}
get.coef <- function(linear.model, n){
if(class(linear.model) != "lm"){
warning("First argument is not an instance of class lm!")
return(c(NA, NA, NA, NA))
}
else {
s <- summary(linear.model)$coefficients
n.coef <- c(NA, NA, NA, NA)
tryCatch(expr = n.coef <- s[n,], finally = return(n.coef))
}
}
# compile model quality table with model R2 and model p and adjusted p (here q for simplicity) ####
model.quality <- ldply(model.list, function(x){c(R2 = get.R2(x), p.value=get.model.p(x))}, .progress = "text")
xyplot(R2~p.value, data = model.quality, main="Model p-value vs. R2")
# choose multiple testing adjustment method by # usage
#model.quality <- transform(model.quality, q.value=qvalue(model.quality$p.value)$qvalues)
model.quality <- transform(model.quality, q.value=p.adjust(model.quality$p.value, method = "BH"))
histogram(~p.value + q.value, data=model.quality, main="Model p-value distribution before and after multiple testing adjustment", breaks = 50)
summary(model.quality)
good.model.index <- model.quality$q.value <= 0.05 #5% FDR
# compile table with 2nd coef (slope) plus q-value ####
model.2ndCoef <- ldply(model.list, failwith(NA, function(x){get.coef(x, 2)}), .progress = "text")
str(model.2ndCoef)
#choose multiple testing adjustment
add.q <- function(df){
names <- colnames(df)
names[5] <- "p.value"
colnames(df) <- names
return(transform(df, q.value = p.adjust(df$p.value, method = "BH")))
}
add.q <- function(df){
names <- colnames(df)
names[5] <- "p.value"
colnames(df) <- names
q <- qvalue(df$p.value)
return(cbind(df, q.value=q$qvalues))
}
model.2ndCoef <- add.q(model.2ndCoef)
summary(model.2ndCoef)
# Disgnostic plots ####
# multiple testing adjustment
histogram(~q.value + p.value, data = model.2ndCoef, main="Slope p & q-value distribution", breaks = 50)
#vulcano plot
#vulcano plots for good and bad models and fc cutoff as used for candidate list
vplot(transform(model.2ndCoef, groups = as.character(good.model.index)), fc = 0.8)
# marginal distributions
histogram(~q.value, data = model.2ndCoef, main="slope q-value distribution", breaks = 50)
histogram(~Estimate, data = model.2ndCoef, main="slope effect size distribution", breaks = 50)
#############################
# compile candidate list ####
#############################
#example: select good models with abs(slope)>0.8 at 1% sig. level (look at vulcano plot)
candidates.pos <- na.omit(model.2ndCoef[good.model.index & (model.2ndCoef$Estimate > 0.8 & model.2ndCoef$q.value < 0.01),])
candidates.neg <- na.omit(model.2ndCoef[good.model.index & (model.2ndCoef$Estimate < -0.8 & model.2ndCoef$q.value < 0.01),])
#########################################################
# estimation of cutoff from the data using a 0-model ####
#########################################################
# workflow
# 1: select a set of peptides detected by DDA that should not show a dependence on time
# 2: examine the distribution of estimated coefs
#instead of using a fixed cutoff one can determine a 0 distribution
# set1 : natural N-term. peptides
# Problem : split variable has changed -> name of feature is now "EG.PrecursorId"
# solution could be: rename column to generic name: model.Id or use column number for reference
model.quality <- rename(model.quality, c("EG.PrecursorId" = "ModelId"))
Met.removed <- grepl("_[-131]", model.quality$ModelId, fixed=TRUE)
Nterm.acet <- grepl("_[+42]", model.quality$ModelId, fixed=TRUE)
Met.removed.acet <- grepl("_[-89]", model.quality$ModelId, fixed=TRUE)
natural <- Met.removed | Nterm.acet | Met.removed.acet
#set2: unmodified, semitryptic peptides without internal D
semi.tryptic <- grepl("R_", model.quality$ModelId)
free.nterm <- !grepl("_\\[", model.quality$ModelId)
noD <- !grepl("D", model.quality$ModelId)
trp <- semi.tryptic & free.nterm & noD
#diagnostic plots ####
#group TRUE is semi-tryptic, free peptides without internal Ds
vplot(transform(model.2ndCoef, groups=as.character(trp)))
#marginal x distribution
densityplot(~Estimate, groups = groups, data = transform(model.2ndCoef, groups = as.character(trp)), auto.key = TRUE)
#compute empirical density function of zero distribution of slope effect size
zero.dist.ecdf <- ecdf(model.2ndCoef$Estimate[trp])
zplot(zero.dist.ecdf)
#compute empirical density function of zero distribution of slope p-value
zero.dist.ecdf.pvalue <- ecdf(model.2ndCoef$p.value[trp])
plot(zero.dist.ecdf.pvalue, pch = 1, main="slope p.value ECDF")
abline(a=0, b=1) #for zero we expect uniform distribution of p-value = diagonal
(quantile(zero.dist.ecdf.pvalue, 0.01))
#compute empirical density function of zero distribution of slope q-value
zero.dist.ecdf.qvalue <- ecdf(model.2ndCoef$q.value[trp])
plot(zero.dist.ecdf.qvalue, pch = 1, main="slope q-value ECDF")
abline(a=0, b=1) #for zero we expect uniform distribution of q-value
(quantile(zero.dist.ecdf.qvalue, 0.01))
# Determine the critical slope value from the data distribution ####
n <- length(model.list)
(n*0.01) #expected number of false positives at 1% FDR
(cutoff <- quantile(zero.dist.ecdf, 0.99)) #critical value for slope at 1% FDR
vplot(transform(model.2ndCoef, groups = as.character(good.model.index)), fc = cutoff)
#override with new candidate list using 0-model
candidates.pos <- na.omit(model.2ndCoef[good.model.index & (model.2ndCoef$Estimate > cutoff & model.2ndCoef$q.value < 0.01),])
candidates.pos.nterm <- na.omit(model.2ndCoef[good.model.index & (model.2ndCoef$Estimate > cutoff & model.2ndCoef$q.value < 0.01) & grepl("^_\\[\\+28]", model.2ndCoef$EG.ModifiedSequence, perl = TRUE),])
#write candidate lists to hard drive
write.csv(candidates.pos, file = "candidate_pos.csv", row.names = FALSE, quote = FALSE)
##########################################
# 1-Model - true cases from degrabase ####
##########################################
# workflow:
# 1. select peptides iden. in DDA runs that start after D and are dimethylated at the N-term
# 2. retrieve corresponding protein acc and position of peptide within protein
# 3. query degrabase based on protein acc and position (implemented by SQL left join)
# 4. query list of models and raw data - How do known Cas-dependent peptides behave?
# helper functions
get.acc <- function(x){
acc.pattern <- "^.*\\s"
grep(pattern = acc.pattern, x=x, perl=TRUE)
m <- regexpr(pattern = acc.pattern, x)
n <- sub(pattern = " ", "", regmatches(x=x, m))
return(n)
}
get.start <- function(x){
start.pattern <- "\\[.*-"
m <- regexpr(pattern = start.pattern, x)
n <- sub("\\[","", sub(pattern = "-", "", regmatches(x=x, m)))
return(as.numeric(n))
}
get.interval <- function(x){
#expects x to be a character, returns numeric vector x1=start, x2=stop
interval.pattern <- "\\[\\d*-\\d*\\]"
m <- regexpr(pattern = interval.pattern, x)
n <- strsplit(regmatches(x=x, m),split = "-")
return(as.numeric(sub(pattern = "\\[|\\]", replacement = "", unlist(n))))
}
# select all peptide groups indentified in DDA starting after D and having N-term. dimethyl mark
# queries pd 2.1 peptide groups export
after.D.pattern <- "^\\[D\\]\\."
nterm.dim.pattern <- "Dimethyl\\s\\[N-Term"
subset <- PeptideGroups[grepl(after.D.pattern, PeptideGroups$Annotated.Sequence, perl = TRUE) & grepl(nterm.dim.pattern, PeptideGroups$Modifications, perl = TRUE),]
subset <- separate_rows(data = subset[,c(3:4,11,13)], Positions.in.Master.Proteins, sep = ";\\s") #ungroups
subset <- mutate(subset, acc = get.acc(subset$Positions.in.Master.Proteins), start = get.start(subset$Positions.in.Master.Proteins)) #extracts acc and start position of peptide
#SCL-type left join
joined.subset <- droplevels(join(x = subset, y = degrabase, by=c("acc","start"), type="left"))
summary(joined.subset)
# Diagnostic plots ####
histogram(~P4 + P3 + P2 + P1 + P1. + P2. + P3. + P4., joined.subset)
summary(is.na(joined.subset)) #check for missing values
# filter joined data for relevant experimental context ####
# CasTC exp has been performed using Jurkat cells and Stau. treatment
annotated <- !is.na(joined.subset$uniprot.ID) #select all features (queries) that are known to degrabase
jurkat <- joined.subset$cell.type %in% c("Jurkat", "MM1S, DB, Jurkat") #select all fatures that are known in the context of Jurkat cells
staurosporine <- joined.subset$perturbagen %in% c("Staurosporine (2uM) or TRAIL (2nM), extracts combined", "Staurosporine, Bortezomib, Doxorubicin") #select all features detected in Staurosporine treatent
known <- as.character(unique(joined.subset[annotated & jurkat & staurosporine, "Annotated.Sequence"]))
#helper functions
# intended use : each model is named according to split factor (e.g. EG.PrecursorId),
# version 1 : EG.ModifiedSequence gives pattern: _XX[mod]XX_
# version 2 : EG.PrecursorId gives pattern: _XX[mod]XX_.z
isolate.peptide <- function(x){
y <- sub(x, pattern = "^\\[.\\].", replacement = "", perl = TRUE)
z <- sub(y, pattern = "\\.\\[.\\]$", replacement = "", perl = TRUE)
return(z)
}
modify.peptide <- function(x){
y <- sub(x, pattern = "M", replacement = "M[+16]", perl = TRUE)
z <- sub(y, pattern = "K", replacement = "M[+28]", perl = TRUE)
return(z)
}
strip.peptide <- function(x, aa.only =FALSE){
#strips off modifications
if(aa.only){
z <- gsub(x, pattern = "(!?[ABCDEFGHIJKLMNOPQRSTUVWYZX])", replacement = "", perl = TRUE)
}
else{
y <- gsub(x, pattern = "\\[(\\+||\\-)\\d+\\]", replacement = "", perl = TRUE)
z <- gsub(y, pattern = "_||_\\.\\d", replacement = "", perl = TRUE)
}
return(z)
}
#test statement
grep(pattern = isolate.peptide(known[1]), x = strip.peptide(names(model.list)), value = TRUE)
# See how known Cas-dependent peptides behave in CasTC exp ####
length(known)
i <- 30
known[i] #Peptide known to degrabase in the context of Jurkat cells treated with Stau.
l <- model.list[grep(pattern = isolate.peptide(known[i]), x = strip.peptide(names(model.list)))] #corresponding linear model, based on stripped sequence
# Model IDs
names(l)
# How does the raw data over these models look like?
#poi.subset <- droplevels(PepQuantDef[PepQuantDef$EG.ModifiedSequence %in% names(l), ])
poi.subset <- droplevels(PepQuantDef[PepQuantDef$EG.PrecursorId %in% names(l), ]) #substrings also match!
plot.raw(poi.subset)
plot.ylog(poi.subset)
plot.xylog(poi.subset, v = 2)
# Look at raw data for a specific model
j <- 14
#poi.subset <- droplevels(PepQuantDef[PepQuantDef$EG.ModifiedSequence == names(l)[j], ])
poi.subset <- droplevels(PepQuantDef[PepQuantDef$EG.PrecursorId == names(l)[j], ])
plot.raw(poi.subset)
plot.ylog(poi.subset)
#next step
#generate QC plots over a list of features to determine distribution of estimated coef
# grep protein group information for candidate ####
(pep <- as.character(candidates.pos.nterm$EG.ModifiedSequence[7]))
p <- strip.ends(strip.mods(pep))
grep(pattern = p, x = CasTC_3pass_analysis_PeptideGroups$Annotated.Sequence, fixed = TRUE, value = TRUE) #gets also substring matches :-)
CasTC_3pass_analysis_PeptideGroups[grep(pattern = p, x = CasTC_3pass_analysis_PeptideGroups$Annotated.Sequence, fixed = TRUE),]
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.