#' Predict the probability of homogolous recombination deficiency using mutational signatures
#'
#' @description A wrapper for predict.randomForest() from the randomForest package
#'
#' @param features The output of extractSigsChord(), which is a dataframe containing the SNV, indel
#' and SV context counts.
#' @param rf.model The random forest model. Defaults to CHORD.
#' @param hrd.cutoff Default=0.5. Samples greater or equal to this cutoff will be marked as HRD
#' (is_hrd==TRUE).
#' @param trans.func Function used to transform raw features. Raw features should be in the format:
#' list(snv=matrix(), indel=matrix(), sv=matrix())
#' @param min.indel.load Default=50. The minimum number of indels required to make an accurate HRD
#' prediction. Samples with fewer indels than this value will be marked as is_hrd==NA (HR status
#' could not be confidently determined).
#' @param min.sv.load Default=30. The minimum number of SVs required to make an accurate prediction
#' of BRCA1-type vs. BRCA2-type HRD. Samples with fewer SVs than this value will be marked as
#' hrd_type==NA (HRD type could not be confidently determined).
#' @param min.msi.indel.rep Default=14000 (changing this value is not advised). Samples with more
#' indels within repeats than this threshold will be considered to have microsatellite instability.
#' @param do.bootstrap Test the stability of prediction probabilities? NOTE: this is computationally
#' expensive. Resamples the feature vector for each sample (number of times provided by
#' bootstrap.iters) and calculates HRD probabilities for each iteration. Returns the probabilities
#' at the quantiles specifying in bootstrap.quantiles
#' @param bootstrap.iters Number of resampling iterations for determining the confidence intervals
#' @param bootstrap.quantiles A numeric vector of length 2 specifying the quantiles used to calculate the
#' confidence intervals
#' @param detailed.remarks If TRUE, shows min.indel.load and min.sv.load numbers in the remarks columns
#' @param show.features If TRUE, appends features to output
#' @param verbose Show messages?
#'
#' @return A dataframe containing the HRD probabilities, bootstrap probabilities, and input features
#' @export
#'
#' @examples
#' ## Extract mutation contexts
#' vcf_dir <- '/path_to_vcfs/'
#' vcf_snv <- paste0(vcf_dir,'SampleX_post_processed_v2.2.vcf.gz')
#' vcf_indel <- paste0(vcf_dir,'SampleX_post_processed_v2.2.vcf.gz')
#' vcf_sv <- paste0(vcf_dir,'SampleX_somaticSV_bpi.vcf.gz')
#' contexts <- extractSigsChord(vcf_snv, vcf_indel, vcf_sv, sample.name='SampleX')
#'
#' ## Predict HRD probability with CHORD
#' chordPredict(contexts)
chordPredict <- function(
features, rf.model=CHORD, hrd.cutoff=0.5,
trans.func=NULL,
## QC thresholds
min.indel.load=100, min.sv.load=30, min.msi.indel.rep=14000,
## Confidence interval estimation
do.bootstrap=F, bootstrap.iters=20, bootstrap.quantiles=c(0.05, 0.5, 0.95),
## Other
detailed.remarks=T, show.features=F, verbose=T
){
# features=read.delim('/Users/lnguyen/hpc/cog_bioinf/cuppen/project_data/Luan_projects/CHORD/scripts_main/CHORD/example/output/merged_contexts.txt', check.names=F)
# features=readRDS('/Users/lnguyen/hpc/cog_bioinf/cuppen/project_data/Luan_projects/CHORD/HMF_DR010_DR047/matrices/merged_contexts.rds')
# features=do.call(cbind, unname(features))
# features=features[1:20,]
if(verbose){ message('Preprocessing features...') }
## Converts the raw signature counts from extractSigsChord() to features used by CHORD
if(is.list(features) & !is.data.frame(features)){
features_split <- features
} else {
features_split <- mutSigExtractor::splitDfRegex(features, c(snv='>',indel='[a-z]{3}[.]',sv='[A-Z]{3}'))
}
#features_split <- features_split[c('snv','indel','sv')] ## Remove other mut types if they exist
## Default feature transformation function
if(is.null(trans.func)){
trans.func <- function(features_split){
## Merge del mh bimh 2-5
indels <- features_split$indel
indel_types <- c('del.rep','ins.rep','del.mh','ins.mh','del.none','ins.none')
indels_split <- mutSigExtractor::splitDfRegex(indels, indel_types)
names(indels_split) <- indel_types
counter <- 0
indels_custom <- lapply(indels_split, function(i){
counter <<- counter + 1
df <- as.data.frame(rowSums(i))
colnames(df) <- indel_types[counter]
return(df)
})
indels_custom$del.mh <- with(indels_split,{
cbind(
del.mh['del.mh.bimh.1'],
'del.mh.bimh.2.5'=rowSums(del.mh[!(colnames(del.mh) %in% 'del.mh.bimh.1')])
)
})
## Add new indels back to list
l <- features_split[c('snv','indel','sv')]
l$indel <- do.call(cbind, unname(indels_custom))
## Simplify SNVs to the 6 types of base substitions (C>A, C>G, C>T, T>A, T>C, T>G) by ignoring
## the immediate flanking nucleotides
## Convert absolute counts to relative counts. Calculated per variant type.
m <- mutSigExtractor::transformContexts(l, simplify.types='snv', rel.types='all')
return(m)
}
}
features_processed <- trans.func(features_split)
#--------- Prediction ---------#
if(verbose){ message('Calculating HRD probabilities...') }
doPredict <- function(m){
df <- as.data.frame(
randomForest:::predict.randomForest(rf.model, m, type='prob'),
stringsAsFactors=F
)
df <- df[,c('none','BRCA1','BRCA2')]
colnames(df) <- paste0('p_',colnames(df))
df$p_hrd <- df$p_BRCA1 + df$p_BRCA2
#df <- cbind(sample=rownames(df), df); rownames(df) <- NULL
return(df)
}
df <- doPredict(features_processed)
#--------- Bootstrap predictions ---------#
if(do.bootstrap){
if(verbose){ message('Calculating bootstrap confidence intervals...') }
resampleFeatureMatrix <- function(m, repeats=bootstrap.iters){
#m=features_split$sv
feature_ids <- as.factor(1:ncol(m)) ## Replace feature names by integers to save memory
resampleFeatureVector <- function(v){
#v=unlist(m[229,])
total_counts <- sum(v)
if(total_counts==0){ ## Prevent divide by 0 errors
return(
matrix(0, nrow=repeats, ncol=ncol(m))
)
}
prob <- v/total_counts
t(replicate(
repeats,
table( sample(feature_ids, total_counts, replace=T, prob=prob) )
))
}
if(verbose){ pb <- txtProgressBar(max=nrow(m), style=3) }
out <- lapply(1:nrow(m), function(i){
#i=1
#print(i)
if(verbose){ setTxtProgressBar(pb, i) }
m_resampled <- resampleFeatureVector(m[i,])
colnames(m_resampled) <- colnames(m)
return(m_resampled)
})
message('\n')
names(out) <- rownames(m)
return(out)
}
## list structure: mut type -> sample name
features_split_resampled <- lapply(names(features_split), function(i){
if(verbose){ message('> Resampling ',i,' matrix...') }
resampleFeatureMatrix(features_split[[i]])
})
names(features_split_resampled) <- names(features_split)
if(verbose){ message('> Performing bootstrap predictions...') }
bootstrap_pred <- lapply(1:nrow(features), function(i){
#i=229
#print(i)
## convert to: sample name --> mut type
l <- list(
snv=features_split_resampled$snv[[i]],
indel=features_split_resampled$indel[[i]],
sv=features_split_resampled$sv[[i]]
)
## Transform features
m <- trans.func(l)
## Predict
pred <- doPredict(m)
## Calculate quantiles
unlist(lapply(pred,function(pred.class){ ## Unlist automatically prepends pred class names
quantile(pred.class, bootstrap.quantiles)
}))
})
bootstrap_pred <- do.call(rbind, bootstrap_pred)
rownames(bootstrap_pred) <- rownames(features)
}
#--------- QC ---------#
if(verbose){ message('Performing QC checks...') }
qc <- list()
qc$has_msi <- with(features_split,{
rowSums(indel[,grep('rep',colnames(indel)),drop=F]) > min.msi.indel.rep
})
qc$low_indel_load <- rowSums(features_split$indel) < min.indel.load
qc$low_sv_load <- rowSums(features_split$sv) < min.sv.load
qc <- as.data.frame(qc)
failed_qc <- with(qc,{ has_msi | low_indel_load | low_sv_load })
## Informative QC tags
if(!detailed.remarks){
failed_qc_strings <- colnames(qc)
names(failed_qc_strings) <- colnames(qc)
} else {
## Note to self: make sure order of qc names is correct upon editing
failed_qc_strings <- c(
has_msi=paste0('Has MSI (>',min.msi.indel.rep,' indel.rep)'),
low_indel_load=paste0('<',min.indel.load,' indels'),
low_sv_load=paste0('<',min.sv.load,' SVs')
)
}
## is_hrd qc tags
df_qc_is_hrd <- data.frame(
has_msi=ifelse(qc$has_msi,failed_qc_strings['has_msi'],''),
low_indel_load=ifelse(qc$low_indel_load,failed_qc_strings['low_indel_load'],'')
)
qc_is_hrd <- with(df_qc_is_hrd,{
paste(has_msi, low_indel_load,sep=';')
})
qc_is_hrd <- gsub('^;|;$','',qc_is_hrd)
qc_is_hrd[nchar(qc_is_hrd)==0] <- ''
## hrd_type qc tags
qc_hrd_type <- ifelse(
qc$low_sv_load,
failed_qc_strings['low_sv_load'],
''
)
qc_out <- data.frame(
remarks_hr_status=qc_is_hrd,
remarks_hrd_type=qc_hrd_type
)
#--------- Determine HR status ---------#
if(verbose){ message('Finalizing output...') }
## Is HRD?
df$hr_status <- ifelse(
df$p_hrd >= hrd.cutoff,
'HR_deficient','HR_proficient'
)
## Which HRD subtype?
df$hrd_type <- unlist(
Map(function(p_BRCA1, p_BRCA2, p_hrd){
if(p_hrd>=hrd.cutoff){
c('BRCA1_type','BRCA2_type')[ which.max(c(p_BRCA1,p_BRCA2)) ]
} else {
'none'
}
}, df$p_BRCA1, df$p_BRCA2, df$p_hrd)
)
## Add qc
df <- cbind(df, qc_out)
## Clean up tags
df$hr_status[ qc$low_indel_load | qc$has_msi ] <- 'cannot_be_determined'
df$hrd_type[ qc$low_sv_load ] <- 'cannot_be_determined'
df$hrd_type[ df$hr_status %in% c('HR_proficient','cannot_be_determined') ] <- 'none'
df$remarks_hrd_type[ df$hr_status=='HR_proficient' ] <- ''
#df <- df[,!grepl('^p_none',colnames(df))] ## Remove redunant 'none' class
#--------- Gather outputs ---------#
out <- cbind(sample=rownames(df), df)
if(do.bootstrap){ out <- cbind(out, bootstrap_pred) }
if(show.features){ out <- cbind(out, features_processed) }
out <- out[,!grepl('^p_none',colnames(out))] ## Remove redunant 'none' class
rownames(out) <- NULL
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.