## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)
## -----------------------------------------------------------------------------
library(ggplot2)
library(reshape)
library(MAUDE)
maudeGitPathRoot = "https://raw.githubusercontent.com/de-Boer-Lab/MAUDE/master"
## -----------------------------------------------------------------------------
allSamples = read.table(file=sprintf("%s/vignettes/BACH2_data/sample_metadata.txt",maudeGitPathRoot), sep="\t", quote="", header = T, row.names = NULL, stringsAsFactors = F)
head(allSamples)
#remove some of the control samples
allSamples = allSamples[!grepl("HEK62", allSamples$replicate),]
## -----------------------------------------------------------------------------
if (FALSE){
#this was run on my computer to load the data from many files spread out over many subdirectories. Rather than upload all these files separately, I have loaded them all locally and saved the resulting concatenated file onto github. I leave this code here so that others may view how the data was loaded, should they have their own CRISPResso files to analyze.
inDir = "/Path/To/CRISPResso/Files";
setwd(inDir)
allCRISPRessoData= data.frame()
for (i in 1:nrow(allSamples)){
curData = read.table(file=sprintf("%s/%s/Alleles_frequency_table.txt",inDir, allSamples$directory[i]), sep="\t", quote="", header = T, row.names = NULL, stringsAsFactors = F, comment.char = "")
curData2 = read.table(file=sprintf("%s/Deep_resequencing_analysis/%s/Alleles_frequency_table.txt",inDir, allSamples$directory[i]), sep="\t", quote="", header = T, row.names = NULL, stringsAsFactors = F, comment.char = "")
curData = rbind(curData, curData2);
curData$locus = allSamples$locus[i];
curData$replicate = allSamples$replicate[i];
curData$sortBin = allSamples$sortBin[i];
allCRISPRessoData = rbind(allCRISPRessoData, curData)
}
#remove gaps from sequences
allCRISPRessoData$SeqSpecies = gsub("-","",allCRISPRessoData$Aligned_Sequence); # Allele
#remove unwanted fields
allCRISPRessoData$Aligned_Sequence=NULL; allCRISPRessoData$Reference_Sequence=NULL; allCRISPRessoData$X.Reads.1=NULL;
write.table(allCRISPRessoData, file=sprintf("%s/loaded_BACH2_BE_CRISPResso_data.txt", inDir),row.names = F, col.names = T, quote=F, sep="\t")
#I then gzipped this file and uploaded it to github
}else{
#load the CRISPResso files from GitHub.
z= gzcon(url(sprintf("%s/vignettes/BACH2_data/loaded_BACH2_BE_CRISPResso_data.txt.gz",maudeGitPathRoot)));
fileConn=textConnection(readLines(z));
allCRISPRessoData = read.table(fileConn, sep="\t", quote="", header = T, row.names = NULL, stringsAsFactors = F)
close(fileConn)
}
## -----------------------------------------------------------------------------
#CRISPResso has split some sequences into two or maybe more lines; below is an example
#We also input multiple files from different sequencing runs. This merges all identical seq species'/samples
allCRISPRessoData = cast(melt(allCRISPRessoData, id.vars = c("SeqSpecies","Reference_Name","Read_Status","n_deleted","n_inserted","n_mutated", "locus","replicate","sortBin")), SeqSpecies + Reference_Name + Read_Status + n_deleted + n_inserted + n_mutated + locus + replicate + sortBin ~ variable, value="value", fun.aggregate=sum)
#Get read totals per replicate
allCRISPRessoDataTotals = cast(allCRISPRessoData, locus + replicate + sortBin ~ ., value="X.Reads", fun.aggregate = sum)
names(allCRISPRessoDataTotals)[ncol(allCRISPRessoDataTotals)] = "totalReads";
#Add totalReads column to allCRISPRessoData, calculate read fractions
allCRISPRessoData = merge(allCRISPRessoData, allCRISPRessoDataTotals, by=c("locus","replicate","sortBin"))
allCRISPRessoData$readFraction = allCRISPRessoData$X.Reads/allCRISPRessoData$totalReads;
## -----------------------------------------------------------------------------
p = ggplot(allCRISPRessoDataTotals, aes(x=replicate, fill=sortBin, y=totalReads)) + geom_bar(stat="identity", position="dodge") + facet_grid(. ~ locus) + theme_classic() + scale_y_continuous(expand=c(0,0))+theme(axis.text.x=element_text(hjust=1, angle=90)); print(p)
## -----------------------------------------------------------------------------
p = ggplot(allCRISPRessoDataTotals, aes(x=replicate, fill=sortBin, y=totalReads)) + geom_bar(stat="identity", position="dodge") + facet_grid(. ~ locus) + theme_classic() + scale_y_log10(expand=c(0,0))+theme(axis.text.x=element_text(hjust=1, angle=90)); print(p)
## -----------------------------------------------------------------------------
allCRISPRessoData = allCRISPRessoData[ !(allCRISPRessoData$locus=="HEK" & grepl("^[ABCR][12]$", allCRISPRessoData$replicate)) & !(allCRISPRessoData$locus=="BACH2" & grepl("^HEK9[12]", allCRISPRessoData$replicate)),]
## -----------------------------------------------------------------------------
p = ggplot(allCRISPRessoData[allCRISPRessoData$sortBin=="NS",], aes(x= readFraction, colour=replicate)) + stat_ecdf()+facet_grid(locus ~ .) + scale_x_log10() + theme_bw(); print(p)
## -----------------------------------------------------------------------------
temp = cast(allCRISPRessoData[allCRISPRessoData$Read_Status=="UNMODIFIED" & allCRISPRessoData$Reference_Name=="Reference",], formula = sortBin + locus+ replicate ~. ,value = "readFraction", fun.aggregate = max)
names(temp)[ncol(temp)]="readFraction";
p = ggplot(temp, aes(x=sortBin, y=replicate, fill=readFraction)) + geom_tile() + facet_grid(locus ~., scales="free_y")+ggtitle("just unmodified read fractions") + scale_fill_gradientn(colours=c("red","orange", "green","cyan","blue","violet"), limits=c(0,0.8)); print(p)
min(temp$readFraction, na.rm = T) # 0.0356963
## -----------------------------------------------------------------------------
allCRISPRessoData = allCRISPRessoData[allCRISPRessoData$replicate!="A1-HEK",] # this sample had no data for bin D (BACH2) because it didn't PCR well
allCRISPRessoData = allCRISPRessoData[allCRISPRessoData$replicate!="B2-HEK",] # this sample had very low coverage for bin D for BACH2
## -----------------------------------------------------------------------------
seqsObserved = cast(allCRISPRessoData, SeqSpecies + locus + Read_Status + Reference_Name~ .)
names(seqsObserved)[ncol(seqsObserved)]="seqRunsObserved";
retainedSamples = unique(allCRISPRessoData[,c("locus","replicate","sortBin")])
for (l in unique(allSamples$locus)){
seqsObserved$inAll[seqsObserved$locus == l] = seqsObserved$seqRunsObserved[seqsObserved$locus == l]==sum(retainedSamples$locus==l)
}
#require that all samples have at least one read for every species considered. This will help exclude read errors
keepAlleles = seqsObserved[seqsObserved$inAll,]
## -----------------------------------------------------------------------------
#First make a count matrix, but only of alleles seen in all replicates
readCountMat = cast(allCRISPRessoData[allCRISPRessoData$SeqSpecies %in% keepAlleles$SeqSpecies,], SeqSpecies + replicate +locus ~ sortBin, value="X.Reads")
readCountMat[is.na(readCountMat)] = 0
readCountMat = readCountMat[order(readCountMat$NS, decreasing = T),]
#Label WT alleles
wtSeqs = data.frame(locus = c("HEK","BACH2"), SeqSpecies = c("GGTAGCCAGAGACCCGCTGGTCTTCTTTCCCCTCCCCTGCCCTCCCCTCCCTTCAAGATGGCTGACAA","TGCCCCACCCTGTGCCCTTTTTACATTACACACAAATAGGGACGGATTTCCTGTAAGCTGATCTTGAAGAAAAAAAACATGTTAGACAAAGAAAATCAGAACTAAGA"), isWT=T, stringsAsFactors = F);
readCountMat = merge(readCountMat, wtSeqs, by=c("locus","SeqSpecies"), all.x=T)
readCountMat$isWT[is.na(readCountMat$isWT)]=F;
allCRISPRessoData = merge(allCRISPRessoData, wtSeqs, by=c("locus","SeqSpecies"), all.x=T)
allCRISPRessoData$isWT[is.na(allCRISPRessoData$isWT)]=F;
binBounds = makeBinModel(data.frame(Bin=c("A","B","C","D","E","F"), fraction=rep(0.105,6))) # 10.5% bins
# but only the top ~21% (CD) and bottom ~21% (AB) were retained
binBounds = binBounds[binBounds$Bin %in% c("A","B","E","F"),]
binBounds$Bin[binBounds$Bin=="E"]="C"
binBounds$Bin[binBounds$Bin=="F"]="D"
## -----------------------------------------------------------------------------
p = ggplot(binBounds, aes(colour=Bin)) +
geom_density(data=data.frame(x=rnorm(100000)), aes(x=x), fill="gray", colour=NA)+
ggplot2::geom_segment(aes(x=binStartZ, xend=binEndZ, y=fraction, yend=fraction)) +
xlab("Bin bounds as expression Z-scores") +
ylab("Fraction of the distribution captured") +theme_classic()+scale_y_continuous(expand=c(0,0))+
coord_cartesian(ylim=c(0,0.7)); print(p)
## -----------------------------------------------------------------------------
curExpts = unique(readCountMat[c("replicate","locus")])
binStatsAll = data.frame();
for(i in 1:nrow(curExpts)){
binStatsAll = rbind(binStatsAll, cbind(binBounds, curExpts[i,]))
}
## -----------------------------------------------------------------------------
#perform MAUDE analysis at guide (allele in this case) level for each locus.
guideLevelStats = data.frame();
for( l in unique(allSamples$locus)){
message(l);
statsA = findGuideHitsAllScreens(experiments = curExpts[curExpts$locus==l,], countDataFrame = readCountMat[readCountMat$locus==l,], binStats = binStatsAll[binStatsAll$locus==l,], sortBins = c("A","B","C","D"), unsortedBin = "NS", negativeControl = "isWT")
guideLevelStats = rbind(guideLevelStats, statsA)
}
## -----------------------------------------------------------------------------
baseChanges = data.frame()
for (l in wtSeqs$locus){
wtSeq = wtSeqs$SeqSpecies[wtSeqs$locus==l];
wtSplit = strsplit(wtSeq,"")[[1]]
curSeqs = unique(guideLevelStats$SeqSpecies[guideLevelStats$locus==l & guideLevelStats$libFraction > 0.001])
for (i in 1:length(curSeqs)){
curSplit = strsplit(curSeqs[i],"")[[1]]
mismatches = c();
mismatchPoss=c();
for (j in 1:min(length(wtSplit),length(curSplit))){
if (wtSplit[j]!=curSplit[j]){
mismatches = c(mismatches, curSplit[j]);
mismatchPoss = c(mismatchPoss, j)
}
}
if (length(mismatchPoss)>0){
baseChanges = rbind(baseChanges, data.frame(mismatch=mismatches, position = mismatchPoss, SeqSpecies=curSeqs[i], locus=l))
}
}
}
baseChangesSummary = cast(baseChanges, SeqSpecies +locus~ ., value="mismatch")
names(baseChangesSummary)[ncol(baseChangesSummary)] = "numMismatches";
p = ggplot(baseChangesSummary, aes(x=numMismatches, colour=locus)) + stat_ecdf()+ geom_vline(xintercept = 15); print(p)
## -----------------------------------------------------------------------------
#exclude any where the number of mismatches is greater than 15 - these are also likely read or PCR artifacts.
baseChangesSummary$keepAlleles= baseChangesSummary$numMismatches<15;
baseChanges$has = 1;
baseChangesNumWith = cast(baseChanges[baseChanges$SeqSpecies %in% baseChangesSummary$SeqSpecies[baseChangesSummary$keepAlleles],], position + mismatch + locus~ ., value="has", fun.aggregate = sum)
names(baseChangesNumWith)[ncol(baseChangesNumWith)] = "numSeqsWith"
baseChanges = merge(baseChanges, baseChangesNumWith, by=c("position","mismatch","locus"))
baseChanges$mismatch = as.character(baseChanges$mismatch)
baseChanges$SeqSpecies = as.character(baseChanges$SeqSpecies)
p = ggplot(baseChanges[baseChanges$SeqSpecies %in% baseChangesSummary$SeqSpecies[baseChangesSummary$keepAlleles] & !(baseChanges$SeqSpecies %in% baseChanges$SeqSpecies[baseChanges$numSeqsWith==1]),], aes(x=position, y=SeqSpecies, fill=mismatch)) + geom_tile()+scale_fill_manual(values=c("orange","green","blue","red"))+scale_x_continuous(expand=c(0,0))+theme_classic() + theme(axis.text.y = element_text(size=5, family = "Courier")) + facet_grid(locus ~ ., scales="free", space ="free"); print(p)
## -----------------------------------------------------------------------------
#A_53 is the mutation of interest
baseChangesMatrix = cast(baseChanges[baseChanges$locus == "BACH2" & baseChanges$SeqSpecies %in% baseChangesSummary$SeqSpecies[baseChangesSummary$keepAlleles] & !(baseChanges$SeqSpecies %in% baseChanges$SeqSpecies[baseChanges$numSeqsWith==1]),], SeqSpecies ~ mismatch + position, value="has", fill = 0)
A53_seq = baseChangesMatrix$SeqSpecies[apply(baseChangesMatrix[2:ncol(baseChangesMatrix)],1, sum)==1 & baseChangesMatrix$A_53==1]
guideLevelStats$pooled = grepl("-",guideLevelStats$replicate);
## -----------------------------------------------------------------------------
wtAndVarMaudeMu = cast(guideLevelStats[guideLevelStats$SeqSpecies %in% c(A53_seq, wtSeqs$SeqSpecies[wtSeqs$locus=="BACH2"]),], replicate +pooled ~ SeqSpecies, value="mean")
names(wtAndVarMaudeMu)[names(wtAndVarMaudeMu)==A53_seq]="rs72928038";
names(wtAndVarMaudeMu)[names(wtAndVarMaudeMu)==wtSeqs$SeqSpecies[wtSeqs$locus=="BACH2"]]="WT";
ttestResults = t.test(x = wtAndVarMaudeMu$rs72928038[!wtAndVarMaudeMu$pooled], y=wtAndVarMaudeMu$WT[!wtAndVarMaudeMu$pooled], alternative="less", paired=TRUE, var.equal = FALSE)
ttestResults_mixedCells = t.test(x = wtAndVarMaudeMu$rs72928038[wtAndVarMaudeMu$pooled], y=wtAndVarMaudeMu$WT[wtAndVarMaudeMu$pooled], alternative="less", paired=TRUE, var.equal = FALSE)
wtAndVarMaudeMu$replicate2 = gsub("-HEK","",wtAndVarMaudeMu$replicate)
## -----------------------------------------------------------------------------
meltedSNPMus = melt(as.data.frame(wtAndVarMaudeMu), id.vars=c("pooled","replicate", "replicate2"));
meltedSNPMus$genotype = factor(ifelse(meltedSNPMus$variable=="WT", "G", "A (risk)"),levels=c("G","A (risk)"))
p = ggplot(meltedSNPMus, aes(x=genotype, y=value, group=replicate)) + geom_point() + geom_line()+facet_grid(. ~ pooled)+theme_bw()+xlab("Genotype") + ylab("Mean expression") + ggtitle(sprintf("P=%f; P=%f", ttestResults$p.value, ttestResults_mixedCells$p.value)); print(p)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.