doc/BACH2_base_editor_screen.R

## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)

## -----------------------------------------------------------------------------
library(ggplot2)
library(reshape)
library(MAUDE)
maudeGitPathRoot = "https://raw.githubusercontent.com/de-Boer-Lab/MAUDE/master"

## -----------------------------------------------------------------------------
allSamples = read.table(file=sprintf("%s/vignettes/BACH2_data/sample_metadata.txt",maudeGitPathRoot), sep="\t", quote="", header = T, row.names = NULL, stringsAsFactors = F)
head(allSamples)
#remove some of the control samples
allSamples = allSamples[!grepl("HEK62", allSamples$replicate),]


## -----------------------------------------------------------------------------
if (FALSE){
  #this was run on my computer to load the data from many files spread out over many subdirectories. Rather than upload all these files separately, I have loaded them all locally and saved the resulting concatenated file onto github. I leave this code here so that others may view how the data was loaded, should they have their own CRISPResso files to analyze.
  inDir = "/Path/To/CRISPResso/Files";
  setwd(inDir)
  
  
  allCRISPRessoData= data.frame()
  for (i in 1:nrow(allSamples)){
    curData = read.table(file=sprintf("%s/%s/Alleles_frequency_table.txt",inDir, allSamples$directory[i]), sep="\t", quote="", header = T, row.names = NULL, stringsAsFactors = F, comment.char = "")
    curData2 = read.table(file=sprintf("%s/Deep_resequencing_analysis/%s/Alleles_frequency_table.txt",inDir, allSamples$directory[i]), sep="\t", quote="", header = T, row.names = NULL, stringsAsFactors = F, comment.char = "")
    curData = rbind(curData, curData2);
    curData$locus = allSamples$locus[i];
    curData$replicate = allSamples$replicate[i];
    curData$sortBin = allSamples$sortBin[i];
    allCRISPRessoData = rbind(allCRISPRessoData, curData)
  }
  #remove gaps from sequences
  allCRISPRessoData$SeqSpecies = gsub("-","",allCRISPRessoData$Aligned_Sequence); # Allele
  #remove unwanted fields
  allCRISPRessoData$Aligned_Sequence=NULL; allCRISPRessoData$Reference_Sequence=NULL;  allCRISPRessoData$X.Reads.1=NULL;
  write.table(allCRISPRessoData, file=sprintf("%s/loaded_BACH2_BE_CRISPResso_data.txt", inDir),row.names = F, col.names = T, quote=F, sep="\t")
  #I then gzipped this file and uploaded it to github
}else{
  #load the CRISPResso files from GitHub. 
  z= gzcon(url(sprintf("%s/vignettes/BACH2_data/loaded_BACH2_BE_CRISPResso_data.txt.gz",maudeGitPathRoot)));
  fileConn=textConnection(readLines(z));
  allCRISPRessoData =  read.table(fileConn, sep="\t", quote="", header = T, row.names = NULL, stringsAsFactors = F)
  close(fileConn)
}

## -----------------------------------------------------------------------------
#CRISPResso has split some sequences into two or maybe more lines; below is an example
#We also input multiple files from different sequencing runs. This merges all identical seq species'/samples
allCRISPRessoData = cast(melt(allCRISPRessoData, id.vars = c("SeqSpecies","Reference_Name","Read_Status","n_deleted","n_inserted","n_mutated", "locus","replicate","sortBin")), SeqSpecies + Reference_Name + Read_Status + n_deleted + n_inserted + n_mutated + locus + replicate + sortBin ~ variable, value="value", fun.aggregate=sum)

#Get read totals per replicate
allCRISPRessoDataTotals = cast(allCRISPRessoData, locus + replicate + sortBin ~ ., value="X.Reads", fun.aggregate = sum)
names(allCRISPRessoDataTotals)[ncol(allCRISPRessoDataTotals)] = "totalReads";

#Add totalReads column to allCRISPRessoData, calculate read fractions
allCRISPRessoData = merge(allCRISPRessoData, allCRISPRessoDataTotals, by=c("locus","replicate","sortBin"))
allCRISPRessoData$readFraction = allCRISPRessoData$X.Reads/allCRISPRessoData$totalReads;

## -----------------------------------------------------------------------------
p = ggplot(allCRISPRessoDataTotals, aes(x=replicate, fill=sortBin, y=totalReads)) + geom_bar(stat="identity", position="dodge") + facet_grid(. ~ locus) + theme_classic() + scale_y_continuous(expand=c(0,0))+theme(axis.text.x=element_text(hjust=1, angle=90)); print(p)

## -----------------------------------------------------------------------------
p = ggplot(allCRISPRessoDataTotals, aes(x=replicate, fill=sortBin, y=totalReads)) + geom_bar(stat="identity", position="dodge") + facet_grid(. ~ locus) + theme_classic() + scale_y_log10(expand=c(0,0))+theme(axis.text.x=element_text(hjust=1, angle=90)); print(p)

## -----------------------------------------------------------------------------
allCRISPRessoData = allCRISPRessoData[ !(allCRISPRessoData$locus=="HEK" & grepl("^[ABCR][12]$", allCRISPRessoData$replicate)) & !(allCRISPRessoData$locus=="BACH2" & grepl("^HEK9[12]", allCRISPRessoData$replicate)),]

## -----------------------------------------------------------------------------
p = ggplot(allCRISPRessoData[allCRISPRessoData$sortBin=="NS",], aes(x= readFraction, colour=replicate)) + stat_ecdf()+facet_grid(locus ~ .) + scale_x_log10() + theme_bw(); print(p)

## -----------------------------------------------------------------------------
temp = cast(allCRISPRessoData[allCRISPRessoData$Read_Status=="UNMODIFIED" & allCRISPRessoData$Reference_Name=="Reference",], formula = sortBin + locus+ replicate ~. ,value = "readFraction", fun.aggregate = max)
names(temp)[ncol(temp)]="readFraction";
p = ggplot(temp, aes(x=sortBin, y=replicate, fill=readFraction)) + geom_tile() + facet_grid(locus ~., scales="free_y")+ggtitle("just unmodified read fractions") + scale_fill_gradientn(colours=c("red","orange", "green","cyan","blue","violet"), limits=c(0,0.8)); print(p)
min(temp$readFraction, na.rm = T) # 0.0356963

## -----------------------------------------------------------------------------
allCRISPRessoData = allCRISPRessoData[allCRISPRessoData$replicate!="A1-HEK",] # this sample had no data for bin D (BACH2) because it didn't PCR well

allCRISPRessoData = allCRISPRessoData[allCRISPRessoData$replicate!="B2-HEK",] # this sample had very low coverage for bin D for BACH2

## -----------------------------------------------------------------------------
seqsObserved = cast(allCRISPRessoData, SeqSpecies + locus + Read_Status + Reference_Name~ .)
names(seqsObserved)[ncol(seqsObserved)]="seqRunsObserved";

retainedSamples = unique(allCRISPRessoData[,c("locus","replicate","sortBin")])

for (l in unique(allSamples$locus)){
  seqsObserved$inAll[seqsObserved$locus == l] = seqsObserved$seqRunsObserved[seqsObserved$locus == l]==sum(retainedSamples$locus==l)
}

#require that all samples have at least one read for every species considered. This will help exclude read errors
keepAlleles = seqsObserved[seqsObserved$inAll,]

## -----------------------------------------------------------------------------
#First make a count matrix, but only of alleles seen in all replicates
readCountMat = cast(allCRISPRessoData[allCRISPRessoData$SeqSpecies %in% keepAlleles$SeqSpecies,], SeqSpecies  + replicate +locus ~ sortBin, value="X.Reads")
readCountMat[is.na(readCountMat)] = 0
readCountMat = readCountMat[order(readCountMat$NS, decreasing = T),]

#Label WT alleles
wtSeqs = data.frame(locus = c("HEK","BACH2"), SeqSpecies =  c("GGTAGCCAGAGACCCGCTGGTCTTCTTTCCCCTCCCCTGCCCTCCCCTCCCTTCAAGATGGCTGACAA","TGCCCCACCCTGTGCCCTTTTTACATTACACACAAATAGGGACGGATTTCCTGTAAGCTGATCTTGAAGAAAAAAAACATGTTAGACAAAGAAAATCAGAACTAAGA"), isWT=T, stringsAsFactors = F);
readCountMat = merge(readCountMat, wtSeqs, by=c("locus","SeqSpecies"), all.x=T)
readCountMat$isWT[is.na(readCountMat$isWT)]=F;

allCRISPRessoData = merge(allCRISPRessoData, wtSeqs, by=c("locus","SeqSpecies"), all.x=T)
allCRISPRessoData$isWT[is.na(allCRISPRessoData$isWT)]=F;


binBounds = makeBinModel(data.frame(Bin=c("A","B","C","D","E","F"), fraction=rep(0.105,6))) # 10.5% bins
# but only the top ~21% (CD) and bottom ~21% (AB) were retained
binBounds = binBounds[binBounds$Bin %in% c("A","B","E","F"),] 
binBounds$Bin[binBounds$Bin=="E"]="C"
binBounds$Bin[binBounds$Bin=="F"]="D"

## -----------------------------------------------------------------------------
p = ggplot(binBounds, aes(colour=Bin))  + 
  geom_density(data=data.frame(x=rnorm(100000)), aes(x=x), fill="gray", colour=NA)+ 
  ggplot2::geom_segment(aes(x=binStartZ, xend=binEndZ, y=fraction, yend=fraction)) + 
  xlab("Bin bounds as expression Z-scores") + 
  ylab("Fraction of the distribution captured") +theme_classic()+scale_y_continuous(expand=c(0,0))+
  coord_cartesian(ylim=c(0,0.7)); print(p)

## -----------------------------------------------------------------------------
curExpts = unique(readCountMat[c("replicate","locus")])

binStatsAll = data.frame();
for(i in 1:nrow(curExpts)){
  binStatsAll = rbind(binStatsAll, cbind(binBounds, curExpts[i,]))
}

## -----------------------------------------------------------------------------
#perform MAUDE analysis at guide (allele in this case) level for each locus.
guideLevelStats = data.frame();
for( l in unique(allSamples$locus)){
  message(l);
  statsA = findGuideHitsAllScreens(experiments = curExpts[curExpts$locus==l,], countDataFrame = readCountMat[readCountMat$locus==l,], binStats = binStatsAll[binStatsAll$locus==l,], sortBins = c("A","B","C","D"), unsortedBin = "NS", negativeControl = "isWT")
  guideLevelStats = rbind(guideLevelStats, statsA)
}

## -----------------------------------------------------------------------------
baseChanges = data.frame()
for (l in wtSeqs$locus){
  wtSeq = wtSeqs$SeqSpecies[wtSeqs$locus==l];
  wtSplit = strsplit(wtSeq,"")[[1]]
  curSeqs = unique(guideLevelStats$SeqSpecies[guideLevelStats$locus==l & guideLevelStats$libFraction > 0.001])
  for (i in 1:length(curSeqs)){
    curSplit = strsplit(curSeqs[i],"")[[1]]
    mismatches = c();
    mismatchPoss=c();
    for (j in 1:min(length(wtSplit),length(curSplit))){
      if (wtSplit[j]!=curSplit[j]){
        mismatches = c(mismatches, curSplit[j]);
        mismatchPoss = c(mismatchPoss, j)
      }
    }
    if (length(mismatchPoss)>0){
      baseChanges = rbind(baseChanges, data.frame(mismatch=mismatches, position = mismatchPoss, SeqSpecies=curSeqs[i], locus=l))
    }
  }
}

baseChangesSummary = cast(baseChanges, SeqSpecies +locus~ ., value="mismatch")
names(baseChangesSummary)[ncol(baseChangesSummary)] = "numMismatches";
p = ggplot(baseChangesSummary, aes(x=numMismatches, colour=locus)) + stat_ecdf()+ geom_vline(xintercept = 15); print(p)

## -----------------------------------------------------------------------------
#exclude any where the number of mismatches is greater than 15 - these are also likely read or PCR artifacts.
baseChangesSummary$keepAlleles= baseChangesSummary$numMismatches<15;

baseChanges$has = 1;

baseChangesNumWith = cast(baseChanges[baseChanges$SeqSpecies %in% baseChangesSummary$SeqSpecies[baseChangesSummary$keepAlleles],], position + mismatch + locus~ ., value="has", fun.aggregate = sum)
names(baseChangesNumWith)[ncol(baseChangesNumWith)] = "numSeqsWith"
baseChanges = merge(baseChanges, baseChangesNumWith, by=c("position","mismatch","locus"))
baseChanges$mismatch = as.character(baseChanges$mismatch)
baseChanges$SeqSpecies = as.character(baseChanges$SeqSpecies)


p = ggplot(baseChanges[baseChanges$SeqSpecies %in% baseChangesSummary$SeqSpecies[baseChangesSummary$keepAlleles] & !(baseChanges$SeqSpecies %in% baseChanges$SeqSpecies[baseChanges$numSeqsWith==1]),], aes(x=position, y=SeqSpecies, fill=mismatch)) + geom_tile()+scale_fill_manual(values=c("orange","green","blue","red"))+scale_x_continuous(expand=c(0,0))+theme_classic() + theme(axis.text.y = element_text(size=5, family = "Courier")) + facet_grid(locus ~ ., scales="free", space ="free"); print(p)

## -----------------------------------------------------------------------------
#A_53 is the mutation of interest

baseChangesMatrix = cast(baseChanges[baseChanges$locus == "BACH2" & baseChanges$SeqSpecies %in% baseChangesSummary$SeqSpecies[baseChangesSummary$keepAlleles] & !(baseChanges$SeqSpecies %in% baseChanges$SeqSpecies[baseChanges$numSeqsWith==1]),], SeqSpecies ~ mismatch + position, value="has", fill = 0)
A53_seq = baseChangesMatrix$SeqSpecies[apply(baseChangesMatrix[2:ncol(baseChangesMatrix)],1, sum)==1 & baseChangesMatrix$A_53==1]

guideLevelStats$pooled = grepl("-",guideLevelStats$replicate);

## -----------------------------------------------------------------------------
wtAndVarMaudeMu = cast(guideLevelStats[guideLevelStats$SeqSpecies %in% c(A53_seq, wtSeqs$SeqSpecies[wtSeqs$locus=="BACH2"]),], replicate +pooled ~ SeqSpecies, value="mean")
names(wtAndVarMaudeMu)[names(wtAndVarMaudeMu)==A53_seq]="rs72928038";
names(wtAndVarMaudeMu)[names(wtAndVarMaudeMu)==wtSeqs$SeqSpecies[wtSeqs$locus=="BACH2"]]="WT";

ttestResults = t.test(x = wtAndVarMaudeMu$rs72928038[!wtAndVarMaudeMu$pooled], y=wtAndVarMaudeMu$WT[!wtAndVarMaudeMu$pooled], alternative="less", paired=TRUE, var.equal = FALSE)

ttestResults_mixedCells = t.test(x = wtAndVarMaudeMu$rs72928038[wtAndVarMaudeMu$pooled], y=wtAndVarMaudeMu$WT[wtAndVarMaudeMu$pooled], alternative="less", paired=TRUE, var.equal = FALSE)

wtAndVarMaudeMu$replicate2 = gsub("-HEK","",wtAndVarMaudeMu$replicate)


## -----------------------------------------------------------------------------
meltedSNPMus = melt(as.data.frame(wtAndVarMaudeMu), id.vars=c("pooled","replicate", "replicate2"));
meltedSNPMus$genotype = factor(ifelse(meltedSNPMus$variable=="WT", "G", "A (risk)"),levels=c("G","A (risk)"))
p = ggplot(meltedSNPMus, aes(x=genotype, y=value, group=replicate)) + geom_point() + geom_line()+facet_grid(. ~ pooled)+theme_bw()+xlab("Genotype") + ylab("Mean expression") + ggtitle(sprintf("P=%f; P=%f", ttestResults$p.value, ttestResults_mixedCells$p.value)); print(p)
Carldeboer/MAUDE documentation built on March 27, 2022, 8:50 p.m.