#' Read FASTA file as character vector.
#'
#' @param fname name of file to be read.
#'
#' @return character vector with names corresponding to annotations from FASTA.
#' @export
read_fasta = function(fname) {
fas = readLines(fname)
dat = grep(">", fas, value=TRUE)
seq_coords = cbind(grep(">", fas)+1, c(c(grep(">", fas)-1)[-1], length(fas)))
seqs = apply(seq_coords, 1, function(x) paste(fas[x[1]:x[2]], collapse=""))
names(seqs) = dat
names(seqs) = gsub(">", "", sapply(strsplit(names(seqs), split="\\|"), '[[', 1))
seqs
}
map_one_active_site = function(pos, residue, my_pep, gene, flank=7) {
pos_end = pos+nchar(residue)-1
if (!all(c(pos, pos_end) %in% 1:length(my_pep))) {
cat("warning: Indicated signaling sites of ", pos, pos_end, "not present in gene", gene, "of length", length(my_pep), "; skipping\n")
return(NULL)
}
if (paste(my_pep[pos:pos_end], collapse="") != residue) {
cat("warning: Indicated signaling sequence", residue, "not equal to wildtype sequence", paste(my_pep[pos:pos_end], collapse="") ,"of", gene, "at ", pos, pos_end, "; skipping\n")
return(NULL)
}
regions = rep(FALSE, length(my_pep))
regions[intersect((pos-flank):(pos_end+flank), 1:length(my_pep))] = TRUE
regions
}
map_all_active_sites = function(active_site, pep, gene, flank=7) {
if (nrow(active_site)==0){
return(NULL)
}
my_pep = strsplit(pep, split="")[[1]]
mat = do.call("rbind", lapply(1:nrow(active_site), function(i) map_one_active_site(active_site[i, "position"], as.character(active_site[i, "residue"]), my_pep, gene, flank=flank)))
if (is.null(mat[[1]]) || nrow(mat)==0){
return(NULL)
}
active_site_reg = apply(mat, 2, any)
active_site_rle_v = rle(active_site_reg)$values
active_site_rle_p1 = cumsum(rle(active_site_reg)$lengths)
active_site_rle_p0 = c(1, active_site_rle_p1[-length(active_site_rle_p1)]+1)
counter = 1
for (i in 1:length(active_site_rle_v)) {
active_site_reg[active_site_rle_p0[i]:active_site_rle_p1[i]] = ifelse(active_site_rle_v[i], counter, 0)
counter = ifelse(active_site_rle_v[i], counter+1, counter)
}
active_site_reg
}
#pos = muts[i, "position"]; wt_residue = muts[i, "wt_residue"]; mut_residue = muts[i, "mut_residue"]; my_count = muts[i, "count"]; my_pep = strsplit(pep, split="")[[1]]
map_one_mutation = function(pos, wt_residue, mut_residue, pep, gene, skip_mismatch, my_count) {
if (!pos %in% 1:length(pep)) {
cat("warning: proposed mutated position", pos, "not present in gene", gene, "of length", length(pep), "; skipping\n")
return(NULL)
}
skip_word = ifelse(skip_mismatch, "", "NOT")
if (pep[[pos]] != wt_residue) {
cat("warning: proposed wildtype residue",
wt_residue,
"not equal to expected wildtype residue",
pep[pos] ,"of", gene, "at position",
pos, "; ", skip_word," skipping\n")
if (skip_mismatch) {
return(NULL)
}
}
return(rep(pos, my_count))
}
#muts=l$mutations_input; pep=l$protein_sequence; gene=l$gene; skip_mismatch=T
map_all_mutations = function(muts, pep, gene, skip_mismatch) {
if (nrow(muts)==0){
return(NULL)
}
my_pep = strsplit(pep, split="")[[1]]
posi = unlist(lapply(1:nrow(muts), function(i) map_one_mutation(muts[i, "position"], muts[i, "wt_residue"], muts[i, "mut_residue"], my_pep, gene, skip_mismatch=skip_mismatch, muts[i, "count"])))
if (is.null(posi[[1]])) {
return(NULL)
}
lapply(1:length(my_pep), function(x) which(x==posi))
}
get_mutation_status = function(pos, active_site_pos, flank, mid_flank) {
if (is.null(active_site_pos[[1]])) {
return("")
}
stat = c()
if (active_site_pos[pos]>0) {
stat = c(stat, "DI")
}
proximal_flank_reg = intersect(setdiff((pos-mid_flank):(pos+mid_flank), pos), 1:length(active_site_pos))
if (flank>0 & any(active_site_pos[proximal_flank_reg]>0)) {
stat = c(stat, "N1")
}
distal_flank_reg = intersect(setdiff((pos-flank):(pos+flank), proximal_flank_reg), 1:length(active_site_pos))
if (flank>0 & any(active_site_pos[distal_flank_reg]>0)) {
stat = c(stat, "N2")
}
paste(stat, collapse=",")
}
count_flanking_active_sites_in_sequence = function(pos, active_sites, f1, f2) {
posi = c((pos-f2):(pos-f1), (pos+f1):(pos+f2))
posi = intersect(posi, 1:length(active_sites))
length(which(active_sites[posi]!=0))
}
glm1 = function(form, data, type="poisson") {
if (type=="nb") {
return(MASS::glm.nb(stats::as.formula(form), data=data))
} else {
return(stats::glm(stats::as.formula(form), family=stats::poisson, data=data))
}
}
# r = 1; active_site_regions = l$active_regions; mut_pos = l$mutations_per_position; active_site_pos = l$active_sites_in_sequence; dis = l$disorder
assess_one_region = function(r, active_site_regions, mut_pos, active_site_pos, dis, flank, mid_flank, simplified=FALSE, type="poisson") {
n_muts = sapply(mut_pos, length)
rr = active_site_regions == r
dfr = data.frame(rr, n_muts, dis=dis==1)
h0 = try(glm1("n_muts~dis", data=dfr, type=type), T)
if (simplified) {
h1 = try(glm1("n_muts~rr+dis", data=dfr, type=type), T)
} else {
jp_sites_nearby = n2_sites_nearby = n1_sites_nearby = rep(0, length(active_site_pos))
n2_sites_nearby[active_site_regions == r] = sapply(which(active_site_regions == r), count_flanking_active_sites_in_sequence, active_site_pos, mid_flank+1, flank)
n1_sites_nearby[active_site_regions == r] = sapply(which(active_site_regions == r), count_flanking_active_sites_in_sequence, active_site_pos, 1, mid_flank)
jp_sites_nearby[active_site_regions == r] = sapply(which(active_site_regions == r), count_flanking_active_sites_in_sequence, active_site_pos, 0, 0)
dfr = data.frame(rr, n_muts, n2_sites_nearby, n1_sites_nearby, jp_sites_nearby, dis=dis==1)
h1_scope = paste("n_muts~", paste(collapse="+", setdiff(colnames(dfr), c("n_muts"))))
h1 = NULL
if (type=="nb") {
h1 = try(MASS::stepAIC(MASS::glm.nb(n_muts~dis, data=dfr), trace=0, direction="forward", scope=stats::as.formula(h1_scope)), T)
} else {
h1 = try(MASS::stepAIC(stats::glm(n_muts~dis, data=dfr, family=stats::poisson), trace=0, direction="forward", scope=stats::as.formula(h1_scope)), T)
}
}
if (any(c(class(h0)[[1]], class(h1)[[1]])=="try-error")) {
return(data.frame(p=NA, low=NA, med=NA, high=NA, obs=NA, stringsAsFactors=FALSE))
}
p = stats::anova(h0, h1, test="Chisq")[2, ifelse(type=="nb", "Pr(Chi)", "Pr(>Chi)")]
h0_predicted_lambdas = stats::predict(h0, type="response")[rr]
if (type=="nb") {
exp_sampled = replicate(1000, sum(MASS::rnegbin(length(h0_predicted_lambdas), mu=h0_predicted_lambdas, theta=h0$theta), na.rm=T))
} else {
exp_sampled = replicate(1000, sum(stats::rpois(n=length(h0_predicted_lambdas), lambda=h0_predicted_lambdas)))
}
exp_mean = mean(exp_sampled)
exp_sd = stats::sd(exp_sampled)
res = data.frame(p, low=exp_mean-exp_sd, med=exp_mean, high=exp_mean+exp_sd, obs=sum(n_muts[rr]), stringsAsFactors=FALSE)
rm(h0,h1,dfr)
gc()
res
}
assess_all_regions = function(active_site_regions, mut_pos, active_site_pos, dis, flank, mid_flank, simplified=FALSE, all_sites_together=FALSE, type="poisson") {
if (all_sites_together) {
active_site_regions = (active_site_regions>0)+0
}
all_reg = setdiff(unique(active_site_regions), 0)
data.frame(region=all_reg, do.call("rbind", lapply(all_reg, assess_one_region, active_site_regions, mut_pos, active_site_pos, dis, flank, mid_flank, simplified, type=type)), stringsAsFactors=FALSE)
}
create_gene_record = function(gene, fasta, mut, pho, disorder, flank=7, mid_flank=2, simplified=FALSE, all_sites_together=FALSE, skip_mismatch=TRUE, type="poisson", enriched_only=TRUE) {
l = list()
l$gene = gene
l$protein_sequence = fasta[[gene]]
l$disorder = rep(0, nchar(l$protein_sequence))
if (!is.null(disorder[[gene]][[1]])) {
l$disorder = as.numeric(strsplit(disorder[[gene]], split="")[[1]])
}
l$mutations_input = mut[mut$gene==gene,,drop=FALSE]
l$mutations_input = l$mutations_input[l$mutations_input$position %in% 1:nchar(l$protein_sequence),,drop=F]
l$active_sites_input = pho[pho$gene==gene,,drop=FALSE]
l$active_sites_input = l$active_sites_input[l$active_sites_input$position %in% 1:nchar(l$protein_sequence),,drop=F]
if (simplified) {
flank = mid_flank = 0
}
l$active_regions = map_all_active_sites(l$active_sites_input, l$protein_sequence, l$gene, flank=flank)
l$active_sites_in_sequence = map_all_active_sites(l$active_sites_input, l$protein_sequence, l$gene, flank=0)
l$mutations_per_position = map_all_mutations(l$mutations_input, l$protein_sequence, l$gene, skip_mismatch=skip_mismatch)
if (nrow(l$mutations_input)>0) { l$mutations_input$status = "" }
if (nrow(l$mutations_input)>0) { l$mutations_input$active_region = 0 }
if (nrow(l$active_sites_input)>0) { l$active_sites_input$active_region = 0 }
if (!is.null(l$mutations_per_position[[1]])) {
l$mutations_input$status = sapply(l$mutations_input$position, get_mutation_status, l$active_sites_in_sequence, flank, mid_flank)
}
if (!is.null(l$active_sites_in_sequence[[1]])) {
regi = unique(setdiff(unique(l$active_regions), 0))
l$active_region_summary =
do.call(rbind, lapply(regi, function(i) data.frame(gene, reg=i, t(summary(which(l$active_regions==i))[c("Min.", "Max.")]))))
colnames(l$active_region_summary)[3:4] = c("pos0", "pos1")
}
if (!is.null(l$mutations_per_position[[1]]) & !is.null(l$active_sites_in_sequence[[1]])) {
if (length(l$disorder) != nchar(l$protein_sequence)) {
cat("warning: disorder length (", length(l$disorder), ") is different than sequence length (", nchar(l$protein_sequence), ") for gene", gene, "; skipping\n")
}
else {
l$region_mutation_significance =
assess_all_regions(l$active_regions, l$mutations_per_position, l$active_sites_in_sequence, l$disorder, flank, mid_flank, simplified, all_sites_together, type=type)
tot = prod(l$region_mutation_significance[l$region_mutation_significance$p<0.05, "p"], na.rm=T)
if (enriched_only) {
tot = prod(l$region_mutation_significance[l$region_mutation_significance$p<0.05 & l$region_mutation_significance$obs>l$region_mutation_significance$med, "p"], na.rm=T)
}
l$total_mutation_significance = data.frame(p=tot, fdr=tot)
l$active_sites_input$active_region = l$active_regions[l$active_sites_input$position]
l$mutations_input$active_region = l$active_regions[l$mutations_input$position]
}
}
cat(".")
gc()
l
}
merge_report = function(all_active_sites, all_active_regions, all_region_based_pval, all_active_mutations, flank) {
colnames(all_active_sites)[2] = "PTM_position"
all_active_regions$actreg = paste(all_active_regions$gene, all_active_regions$reg, sep=":")
all_region_based_pval$actreg = paste(all_region_based_pval$gene, all_region_based_pval$region, sep=":")
all_active_sites$actreg = paste(all_active_sites$gene, all_active_sites$active_region, sep=":")
all_active_mutations$actreg = paste(all_active_mutations$gene, all_active_mutations$active_region, sep=":")
mat1 = merge(all_active_mutations, all_active_regions, by="actreg")
mat2 = merge(mat1, all_active_sites, by="actreg")
colnames(all_region_based_pval)[colnames(all_region_based_pval)=="gene"] = "gene1"
mat3 = merge(mat2, all_region_based_pval, by="actreg")
exclude_cols =
c("actreg", "cancer_type", "gene.y", "reg", "pos0", "pos1", "gene.x", "active_region.y","gene.y","region", "low", "med", "high", "obs", "gene1")
mat3 = mat3[,!colnames(mat3) %in% exclude_cols]
colnames(mat3)[colnames(mat3)=="position"] = "mut_position"
colnames(mat3)[colnames(mat3)=="active_region.x"] = "active_region"
colnames(mat3)[colnames(mat3)=="p"] = "active_region_p"
# exclude psites that are further away from mutations than +/- 7
# however keep all direct sites as all active regions are annotated as DI when simplified=T
mat3 = mat3[mat3$status=="DI" | (mat3$PTM_position>=mat3$mut_position-flank & mat3$PTM_position<=mat3$mut_position+flank),]
rownames(mat3) = NULL
mat3
}
check_mutations_and_sites = function(seqs_to_check, muts_to_check, sites_to_check, genes_to_check) {
genes <- character()
for (i in 1:length(muts_to_check$gene)) {
if (muts_to_check$gene[i] %in% genes_to_check & !(muts_to_check$gene[i] %in% genes)) {
seq_to_check <- seqs_to_check[muts_to_check$gene[i]]
position <- muts_to_check$position[i]
if (muts_to_check$wt_residue[i] == substring(seq_to_check, position, position)) {
genes <- append(genes, muts_to_check$gene[i])
}
}
}
for (i in 1:length(sites_to_check$gene)) {
if (sites_to_check$gene[i] %in% genes) {
seq_to_check <- seqs_to_check[sites_to_check$gene[i]]
position <- sites_to_check$position[i]
length <- nchar(sites_to_check$residue[i])
if (sites_to_check$residue[i] == substring(seq_to_check, position, position + length - 1)) {
return(TRUE)
}
}
}
FALSE
}
#' Identification of active protein sites (post-translational modification sites, signalling domains, etc) with
#' specific and significant mutations.
#'
#' @param sequences character vector of protein sequences, names are protein IDs.
#' @param seq_disorder character vector of disorder in protein sequences, names are protein IDs and values are strings
#' 1/0 for disordered/ordered protein residues.
#' @param mutations data frame of mutations, with [gene, sample_id, position, wt_residue, mut_residue] as columns.
#' @param active_sites data frame of active sites, with [gene, position, residue, kinase] as columns. Kinase field may
#' be blank and is shown for informative purposes.
#' @param flank numeric for selecting region size around active sites considered important for site activity. Default
#' value is 7. Ignored in case of simplified analysis.
#' @param mid_flank numeric for splitting flanking region size into proximal (<=X) and distal (>X). Default value is
#' 2. Ignored in case of simplified analysis.
#' @param mc.cores numeric for indicating number of computing cores dedicated to computation. Default value is 1.
#' @param simplified true/false for selecting simplified analysis. Default value is FALSE. If TRUE, no flanking regions
#' are considered and only indicated sites are tested for mutations.
#' @param return_records true/false for returning a collection of gene records with more data regarding sites and
#' mutations. Default value is FALSE.
#' @param skip_mismatch true/false for skipping mutations whose reference protein residue does not match expected
#' residue from FASTA sequence file.
#' @param regression_type 'nb' for negative binomial, 'poisson' for poisson GLM. The latter is default.
#' @param enriched_only true/false to indicate whether only sites with enriched active site mutations will be included
#' in the final p-value estimation (TRUE is default). If FALSE, sites with less than expected mutations will be also
#' included.
#'
#' @return list with the following components:
#' @return all_active_mutations - table with mutations that hit or flank an active site. Additional columns of
#' interest include Status (DI - direct active mutation; N1 - proximal flanking mutation; N2 - distal flanking
#' mutation) and Active_region (region ID of active sites in that protein).
#' @return all_active_sites -
#' @return all_region_based_pval - p-values for regions of sites, statistics on observed mutations (obs) and expected
#' mutations (exp, low, high based on mean and s.d. from Poisson sampling). The field Region identifies region in
#' all_active_sites.
# @return all_gene_based_fdr - gene-based uncorrected and FDR-corrected p-values aggregated over multiple sites.
# @return gene_records - if return_records is TRUE, a list of gene-based records is returned (large size).
#' @references Systematic analysis of somatic mutations in phosphorylation signaling predicts novel cancer drivers
#' (2013, Molecular Systems Biology) by Juri Reimand and Gary Bader.
#' @author Juri Reimand <juri.reimand@@utoronto.ca>
#' @examples
#' data(ActiveDriver_data)
#' \donttest{
#' phos_results = ActiveDriver(sequences, sequence_disorder, mutations, phosphosites)
#' ovarian_mutations = mutations[grep("ovarian", mutations$sample_id),]
#' phos_results_ovarian = ActiveDriver(sequences, sequence_disorder, ovarian_mutations, phosphosites)
#' GBM_muts = mutations[grep("glioblastoma", mutations$sample_id),]
#' kin_rslt_GBM = ActiveDriver(sequences, sequence_disorder, GBM_muts, kinase_domains, simplified=TRUE)
#' }
#' kin_results = ActiveDriver(sequences, sequence_disorder, mutations, kinase_domains, simplified=TRUE)
#' @export
ActiveDriver = function(sequences, seq_disorder, mutations, active_sites, flank = 7, mid_flank = 2, mc.cores = 1, simplified = FALSE, return_records = FALSE, skip_mismatch = TRUE, regression_type = "poisson", enriched_only = TRUE) {
# first initiate mutation counts if needed
if(is.null(mutations$count)) {mutations$count=1}
# then check that no counts are zero
mutations = mutations[mutations$count>0,]
# replace factors with characters
mutations[] = lapply(mutations, as.character)
active_sites[] = lapply(active_sites, as.character)
# make sure positions are numeric
mutations$position = as.numeric(mutations$position)
active_sites$position = as.numeric(active_sites$position)
# ensure case is uniform
mutations$wt_residue = toupper(mutations$wt_residue)
mutations$mut_residue = toupper(mutations$mut_residue)
active_sites$residue = toupper(active_sites$residue)
sequences = toupper(sequences)
genes_to_test = Reduce(intersect, list(mutations$gene, active_sites$gene, names(sequences), names(seq_disorder)))
if (length(genes_to_test)<1) {
cat("Error: no genes matched in tables for mutations, active sites and sequences\n");
return(NULL)
}
# check if mutation and active site wt residues match with reference sequences
if (check_mutations_and_sites(sequences, mutations, active_sites, genes_to_test) == FALSE) {
cat("Error: wildtype residues do not match reference sequences\n")
return(NULL)
}
cat("genes:", length(genes_to_test))
gene_records = parallel::mclapply(genes_to_test, create_gene_record,
sequences, mutations, active_sites, seq_disorder, flank=flank, mid_flank=mid_flank,
mc.cores=mc.cores, simplified=simplified, skip_mismatch=skip_mismatch, type=regression_type, enriched_only=enriched_only, mc.preschedule=F)
names(gene_records) = sapply(gene_records, '[[', 'gene')
all_gene_based_fdr = do.call("rbind",
lapply(gene_records, function(gr)
if(!is.null(gr$total_mutation_significance[[1]]))
data.frame(gene=gr$gene, gr$total_mutation_significance, stringsAsFactors=FALSE)))
if (!is.null(all_gene_based_fdr[[1]])) {
all_gene_based_fdr$fdr = stats::p.adjust(all_gene_based_fdr$fdr, method="fdr")
# update fdr values in gene records
for (i in 1:nrow(all_gene_based_fdr)) {
gene_records[[as.character(all_gene_based_fdr[i, "gene"])]]$total_mutation_significance[,"fdr"] = all_gene_based_fdr[i,"fdr"]
}
}
all_region_based_pval = do.call("rbind",
lapply(gene_records, function(gr)
if(!is.null(gr$region_mutation_significance[[1]])) data.frame(gene=gr$gene, gr$region_mutation_significance, stringsAsFactors=FALSE)))
all_active_mutations = do.call("rbind", lapply(gene_records, '[[', 'mutations_input'))
all_active_mutations = all_active_mutations[all_active_mutations$status!="",]
all_active_sites = do.call("rbind", lapply(gene_records, '[[', 'active_sites_input'))
all_active_sites = all_active_sites[all_active_sites$active_region>0,,drop=F]
all_active_regions = do.call("rbind", lapply(gene_records, '[[', 'active_region_summary'))
rownames(all_active_mutations) = rownames(all_active_sites) = rownames(all_active_regions) = rownames(all_region_based_pval) = NULL
## assemble merged report
merged_report = merge_report(all_active_sites, all_active_regions, all_region_based_pval, all_active_mutations, flank)
results = list(all_active_mutations, all_active_sites, all_region_based_pval, all_gene_based_fdr, all_active_regions, merged_report)
names(results) = c("all_active_mutations", "all_active_sites", "all_region_based_pval", "all_gene_based_fdr", "all_active_regions", "merged_report")
if (return_records) {
results$gene_records = gene_records
}
rm(gene_records)
gc()
results
}
#' Example protein sequences for ActiveDriver
#'
#' A dataset containing the sequences of four proteins.
#'
#' @docType data
#' @keywords datasets
#' @name sequences
#' @usage data(ActiveDriver_data)
#' @format A named character vector with 4 elements
NULL
#' Example protein disorder for ActiveDriver
#'
#' A dataset containing the disorder of four proteins.
#'
#' @docType data
#' @keywords datasets
#' @name sequence_disorder
#' @usage data(ActiveDriver_data)
#' @format A named character vector with 4 elements
NULL
#' Example mutations for ActiveDriver
#'
#' A dataset describing mis-sense mutations (i.e., substitutions in proteins). The variables are as follows:
#'
#' \itemize{
#' \item gene. the mutated gene
#' \item sample_id. the sample where the mutation originates
#' \item position. the position in the protein sequence where the mutation occurs
#' \item wt_residue. the wild-type residue
#' \item mut_residue. the mutant residue
#' }
#'
#' @docType data
#' @keywords datasets
#' @name mutations
#' @usage data(ActiveDriver_data)
#' @format A data frame with 408 observations of 5 variables
NULL
#' Example phosphosites for ActiveDriver
#'
#' A dataset describing protein phosphorylation sites. The variables are as follows:
#'
#' \itemize{
#' \item gene. the gene symbol the phosphosite occurs in
#' \item position. the position in the protein sequence where the phosphosite occurs
#' \item residue. the phosphosite residue
#' \item kinase. the kinase that phosphorylates this site
#' }
#'
#' @docType data
#' @keywords datasets
#' @name phosphosites
#' @usage data(ActiveDriver_data)
#' @format A data frame with 131 observations of 4 variables
NULL
#' Example kinase domains for ActiveDriver
#'
#' A dataset describing kinase domains. The variables are as follows:
#'
#' \itemize{
#' \item gene. the gene symbol of the gene where the kinase domain occurs
#' \item position. the position in the protein sequence where the kinase domain begins
#' \item phos. TRUE
#' \item residue. the kinase domain residues
#' }
#'
#' @docType data
#' @keywords datasets
#' @name kinase_domains
#' @usage data(ActiveDriver_data)
#' @format A data frame with 1 observation of 4 variables
NULL
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.