#' @title getStutterData
#' @description helpfunction to obtain stutter data table used for making inference
#' @details Does not require getFilteredData first anymore
#' The nomenlclature for stutters are as follows:
#' BW1 means that the LUS motif is providing a backward stutter (n-1).
#' FW1 means that the LUS motif is providing a forward stutter (n+1).
#' BW2 means that the non-LUS motif is providing a backward stutter (n-1).
#' etc... prefix "D" means double (n-2)
#' n0 stutters are named as either FWBW or BWFW depending on which occuring first in the sequence (hence orientation matters!)
#'
#' @param dat Datatable with columns (RefName,SampleName,Locus,Allele,Coverage,Dose)
#' @param minStuttOccurence The number of required observations per stutter type (remaining put to noise)
#' @param platform Platform type {"MPS","CE"}. NOTE: CE has only consider only some stutter types (but also include -2bp)
#' @param stutterTypes Stutter types. See getStutterTypeRule for supported types
#' @param verbose Whether to print out progress
#' @export
getStutterData = function(dat, minStuttOccurence=5, platform="MPS",stutterTypes=NULL,verbose=FALSE) {
sep = "/"
if(!platform%in%c("CE","MPS")) stop("Platform not supported. Please use either CE or MPS!")
isCE = platform=="CE"
isMPS = platform=="MPS"
#ASSUME COMMON STUTTER TYPES FOR EACH PLATFORM
if(is.null(stutterTypes)) {
stutterTypes=c("BW1","FW1","DBW1","DFW1","TBW1") #also for CE
if(isMPS) {
stutterTypes=c(stutterTypes,"BW2","FWBW","BWFW") #exclusive for MPS
} else if(isCE) {
stutterTypes = substr(stutterTypes, 1,nchar(stutterTypes)-1) #remove rank letters
stutterTypes = c(stutterTypes,"2BP") #-2BP stutters (important for SE33)
}
}
#Extracting unique samples and loci
samples = unique(dat$SampleName) #obtain unique samples
locs = unique(dat$Locus) #obtain unique loci
nLocs = length(locs) #number of loci
nSamples = length(samples) #number of samples
#Define column name for output tables
cn = c("Locus","Sample","Type","Allele","Coverage","Parental","Ratio","Prop","BLMM")
if(verbose) print("Performing indexing for different stutter types...")
stuttertable <- noisetable <- NULL #output tables
for(loc in locs) { #traversing each locus
# loc=locs[1]
nAmbiguous = 0 #count number of sequences that are removed since being ambiguous stutter type
if(verbose) print(loc)
for(sample in samples) {
# sample=samples[2]
#sample="DSU 0137-2"
#sub = dat[dat$Locus==loc,]
sub = dat[dat$Locus==loc & dat$SampleName==sample,,drop=F]
donor = sub$Allele[sub$Dose>0] #donor allele
isArtefact = sub$Dose==0 #get index of artefact
artefacts = sub$Allele[isArtefact] #artefacts
#if(length(donor)==nrow(sub)) next #skip if homozygous OR all alleles are donors (no artefacts)
if(sum(isArtefact)==0) next #skip if no artefacts
if(length(donor)>2) stop( paste0("Sample=",sample," Locus=",loc,": Not possible with more than 2 true alleles") )
# stop()
#STEP 1: Obtain canidate stutter types for each artefacts
candStuttType = rep(NA,length(artefacts)) #get candidate stutter types for each artefact
candBLMM <- candParental <- candStuttType
for(type in stutterTypes) {
for(donor1 in donor) { #traverse one donor allele at the time
# donor1=donor[1]
stuttObj = MPSproto::getStutteredSequence(donor1,type) #get expected stutter
stuttAllele = stuttObj$stutter #get stutter
if(is.na(stuttAllele)) next
indStutt = stuttAllele==artefacts #get which could be parental allele
if(!any(indStutt)) next
isNAins = is.na(candStuttType[indStutt])
indStutt1 = indStutt[isNAins] #non inserted before
indStutt2 = indStutt[!isNAins] # already included before
candStuttType[indStutt1] = type
candStuttType[indStutt2] = paste0(candStuttType[indStutt2],sep,type)
candBLMM[indStutt1] = stuttObj$BLMM
candParental[indStutt1] = donor1
}
}
#Alleles not having any parental allele are indicated as Noise
isNoise = is.na(candStuttType)
if(any(isNoise)) {
isNoise2 = which(isArtefact)[isNoise]
noiseAlleles = sub$Allele[isNoise2]
newrows = data.frame(loc,sample,"NOISE",noiseAlleles,sub$Coverage[isNoise2],NA,NA,NA,NA)
colnames(newrows) = cn
#Check if the noise allele is a 1bp error
donor_seqs = MPSproto::convertBracket2seq(donor)
noise_seqs = MPSproto::convertBracket2seq(noiseAlleles) #obtain sequence of noise
nchanges = adist(noise_seqs,donor_seqs) #obtain sequence of donors
is1errorTab = which(nchanges==1,arr.ind = TRUE) #check which is 1 bp error
for(i in seq_len(nrow(is1errorTab)) ) {
inds = is1errorTab[i,]
parental = donor[inds[2]]
ratio = newrows$Coverage[inds[1]]/sub$Coverage[sub$Allele==parental] #calc ratio
newrows$Type = "ERR1" #one base error
newrows$Parental[inds[1]] = parental
newrows$Ratio[inds[1]] = ratio
newrows$Prop[inds[1]] = 1/(1+1/ratio) #convert to proportion
#get broken part (BLMM)
allele1 = MPSproto::getMotifRepsSequences(newrows$Allele)[[1]] #errored
allele2 = MPSproto::getMotifRepsSequences(parental)[[1]] #parental
for(j in seq_along(allele2)) {
blmm = allele2[j] #obtain motif length
if(blmm!=allele1[j]) break #found if repat length was not the same
}
newrows$BLMM = blmm
}
noisetable = rbind(noisetable, newrows)
}
isOKstutter = !grepl(sep,candStuttType) & !isNoise #obtain stutters to use: Not noise and not multiple stutter candidates
isOKstutterInd = which(isOKstutter)
#Extract stutter allele info
for(ind in isOKstutterInd) { #traverse each OK stutter
#stop()
ToInd = which(isArtefact)[ind] #get index of stutter
if(sub$Allele[ToInd]!=artefacts[ind]) stop() # must be true
type = candStuttType[ind]
FromBLMM = candBLMM[ind]
Parental = candParental[ind] #obtain perantal allele
FromInd = Parental==sub$Allele
coverage_stutter = sub$Coverage[ToInd] #this is coverage of stutter
stutter_rate = sub$Coverage[ToInd]/sub$Coverage[FromInd]
stutter_prop = 1/(1/stutter_rate + 1) #convert to stutter prop
newrow = data.frame(loc,sample,type,artefacts[ind],coverage_stutter,Parental,stutter_rate,stutter_prop,FromBLMM)
colnames(newrow) = cn
stuttertable = rbind(stuttertable,newrow)
} #end for each stutter type
nAmbiguous = nAmbiguous + sum(!isOKstutter)
} #end for each sample
if(verbose) print(paste0("#Ambiguous stutters removed=",nAmbiguous))
} #end for each locus
#LAST: IF TOO FEW STUTTER TYPES OBTAINED FOR A MARKER
#THESE ARE MOVED TO NOISE TABLE
if(minStuttOccurence>0) {
if(verbose) print("Attaching too few stutter observations to noise data...")
#create a overview table:
tab = table(stuttertable$Locus,stuttertable$Type)
mustMove = which(tab < minStuttOccurence & tab>0,arr.ind = TRUE) #obtain
for(i in seq_len( nrow(mustMove)) ) {
stuttType = colnames(tab)[mustMove[i,2]]
stuttLoc = rownames(tab)[mustMove[i,1]]
#INDICATE
moveAsNoise = stuttertable$Locus==stuttLoc & stuttertable$Type==stuttType
#ADD TO NOISE
noisetable = rbind(noisetable,stuttertable[moveAsNoise,,drop=FALSE])
#ADD TO NOISE
stuttertable = stuttertable[!moveAsNoise,,drop=FALSE] #update by removing
}
}
return(list(stutterdata=stuttertable,noisedata=noisetable))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.