#' mtSummary function
#'
#' @description Identification of mtDNA variations, output summary statistics,
#' annotation of heteroplasmic and/or homoplasmic variations.
#'
#' @param aaf a numeric matrix (16569 x N) provided by the user.
#' Rows correspond to loci and columns correspond to subjects.
#' It contains subject ID as the column names,
#' and the AAFs of all 16569 mtDNA loci for each subject.
#' It is generated from mtAAF function.
#' @param allele a character matrix (16569 x N) provided by the user.
#' Rows correspond to loci and columns correspond to subjects.
#' This matrix contains the alleles of each subject at each locus.
#' The matrix must contain subject ID as the column names.
#' "/" is used to delimited different allele calls in a locus.
#' @param freq a character matrix (16569 x N) provided by the user.
#' Rows correspond to loci and columns correspond to subjects.
#' This matrix contains the allele fractions of the corresponding allele matrix.
#' The matrix must contain subject ID as the column names.
#' "/" is used to delimited the allele fractions.
#' @param coverage a numeric matrix (16569 x N) provided by the user.
#' Rows correspond to loci and columns correspond to subjects.
#' This matrix contains the reads coverage of the 16569 mtDNA loci for each
#' subject. The matrix must contain the subject ID as the column names.
#' @param coverage.qc a number(default is 250) of threshold for the coverage.
#' If the coverage<coverage.qc, the allele call at that locus of the subject
#' will not be used.
#' @param thre.lower a number(default is 0.03) of lower bound of the threshold
#' defining heteroplasmic
#' and homoplasmic variations
#' @param thre.upper a number(default is 0.97) of upper bound of the threshold
#' defining heteroplasmic
#' and homoplasmic variations
#' @param loci one of:
#' 1. a vector(default is c(1:16569)) of mitochondrial DNA loci to specify
#' which loci should be used to identify the variations and annotate,
#' 2. a character string for the regions
#' (e.g. "coding" , "tRNA", "RNR1" , "RNR2",...)
#' @param type a character of indicator choosing to output annotation
#' to all variations,
#' heteroplasmic variations, or homoplasmic variations. “both” returns
#' annotation to all
#' variations (default), "heter" returns annotation to heteroplasmic
#' variations and "homo"
#' returns annotation to homoplasmic variations.
#' @param coverSummary logical(default is True). A user can specify to output
#' summary of mean coverage at each mtDNA loci (across all participants) and
#' for each individual (across all mtDNA loci).
#' @param varHist logical(default is True). A user can specify to output
#' histograms to visualize the
#' heteroplasmic and homoplasmic burden across participants and mtDNA loci.
#' @param annot.select A character vector of variation position, alternative
#' allele, corresponding gene and types of annotation scores to output based on
#' user's choice. The available choices are "Pos", "ref", "Gene", "TypeMutation",
#' "MissensMutation", "CodonPosition", "ProteinDomain", "mFOLD_dG",
#' "mFOLD_Initial", "mFOLD_rCRS.DG", "mFOLD_rCRS.Initial",
#' "mFOLD_AnticodonAminoAcidChange", "mFOLD_Location", "PolyPhen2",
#' "PolyPhen2_score", "SIFT", "SIFT_score", "PROVEAN", "PROVEAN_score",
#' "MutationAssessor", "MutationAssessor_score", "CADD", "CADD_score",
#' "CADD_phred_score", "PANTHER", "PANTHER_score", "PhD_SNP", "PhD_SNP_score",
#' "SNAP", "SNAP_score", "MutationTaster", "MutationTaster_score", "dbSNP_150_id",
#' "APOGEE_boost_consensus", "APOGEE_boost_mean_prob", "APOGEE_boost_mean"
#' @param path the path of the output annotation file. If not provided,
#' the annotation file will
#' output to the current working directory
#' @param study A string of study names. Default is Study.
#' @param anno A logical value (default is False) indicating whether output the
#' annotation results in a .csv file
#'
#' @return 1. Summary frequency: descriptive statistics of sequencing coverage
#' across individual
#' and across mtDNA loci; the total number of mtDNA loci with variations and
#' the number of
#' heteroplasmic/homoplasmic mtDNA loci in the study sample;
#' the min/Q1/mean/median/Q3/largest number
#' of homoplasmy/heteroplasmy carried by an individual.
#' 2. Plots: a scatter plot of the median coverage
#' across mtDNA loci; histograms to visualize the heteroplasmic and
#' homoplasmic burden across
#' participants and mtDNA loci based on user’s choice.
#' 3. Summary annotation for all of the
#' heteroplasmic and/or homoplasmic variations observed in
#' the study data based on user's choice.
#' @import graphics grDevices utils
#' @export
#' @examples
#'
#'\dontrun{
#' ## Read input data
#' allele_file <- "allele.csv"
#' freq_file <- "freq.csv"
#'
#' allele <- as.matrix( read.csv(file = allele_file, sep = ",") )
#' freq <- as.matrix( read.csv(file = freq_file, sep = ",") )
#'
#' aaf = mtAAF ( allele, freq)
#'
#' ## Summary for the aaf object
#' mtSummary(aaf, allele, freq, coverage)
#'}
mtSummary<-function(aaf, allele, freq, coverage,
coverage.qc=250,
thre.lower=0.03, thre.upper=0.97,
loci=c(1 : .mtLength),
type="both",
coverSummary=T, varHist=T,
annot.select=c("Pos", "ref", "Gene", "TypeMutation",
"MissensMutation", "CodonPosition",
"ProteinDomain", "dbSNP_150_id", "PolyPhen2",
"PolyPhen2_score",
"SIFT", "SIFT_score", "CADD", "CADD_score",
"CADD_phred_score"),
path="./",
study="Study",
anno=T) {
if(!all(is.character(allele), is.character(freq) ,
is.numeric(aaf), is.numeric(coverage)))
stop("the allele and freq shoud be character,
aaf and coverage should be numeric")
if(!all(sapply(list(dim(aaf), dim(allele), dim(freq)),
function(x) x == dim(coverage))))
stop("the coverage, allele, frequency and
AAF should have same dimension")
if(dim(aaf)[1] != .mtLength)
stop("the coverage, allele, frequency and aaf should have
16569 loci (rows)")
if(!all(setequal(colnames(allele), colnames(freq)),
setequal(colnames(allele), colnames(aaf)),
setequal(colnames(allele), colnames(coverage))))
stop("the coverage, allele, frequency and aaf should have same
subject IDs (column names)")
if(!(is.logical(coverSummary) & length(coverSummary)==1)){
stop("coverSummary must be logical value")
}
if(!(is.logical(varHist) & length(varHist)==1)){
stop("varHist must be logical value")
}
if(!(is.logical(anno) & length(anno)==1)){
stop("anno must be logical value")
}
if(!all(mode(loci) %in% c("numeric","character"), is.vector(loci)))
stop("loci must be numeric vector or character")
if(is.character(loci) & !all(loci %in% c("coding", "tRNA", "RNR1", "RNR2") &
length(loci) == 1))
stop("loci must be one of coding, tRNA, RNR1, RNR2 if it is character")
if(is.numeric(loci) & !all(loci %in% (1:.mtLength)))
stop("loci should be a subset of 1:16569")
if( ! dir.exists(path) )
stop( paste(" Output path", path, "does not exist.") )
if(!all(is.numeric(coverage.qc), length(coverage.qc)==1))
stop("coverage.qc must be numeric of length 1")
if(!all(is.numeric(c(thre.lower, thre.upper)),
length(thre.lower) == 1,
length(thre.upper) == 1))
stop("thre.lower and thre.upper must be numeric of length 1")
if(thre.lower >= thre.upper)
stop("thre.lower must be smaller than thre.upper")
if(!all(length(type) == 1 , type %in% c("both","heter","homo")))
stop("type must be one of both, heter and homo")
## Check if annot.select belongs to the colnames of annotation files
if(!all(annot.select %in% names(.AA), is.vector(annot.select)))
stop("annot.select must be a subset of variables of annotation files")
subjectID <- colnames(allele)
subjectID <- sort(subjectID)
allele <- allele[ , subjectID]
freq <- freq[ , subjectID]
coverage <- coverage[ , subjectID]
aaf <- aaf[ , subjectID]
if (is.character(loci) ){
if( loci == "coding"){
loci <- .loci.coding
}else if(loci == "tRNA"){
loci <- .loci.tRNA
}else if( loci == "RNR1"){
loci <- .loci.RNR1
}else if( loci == "RNR2"){
loci <- .loci.RNR2
}else {
stop("loci name must be one of: coding, tRNA, RNR1, RNR2")
}
}
aaf <- aaf[loci,]
allele <- allele[loci,]
rownames(allele) <- as.character(loci)
freq <- freq[loci,]
rownames(freq) <- as.character(loci)
coverage <- coverage[loci,]
rownames(coverage) <- as.character(loci)
# create an object of output of summary
mt_summary_obj <- NULL
if(coverSummary){
mt_summary_obj$coverLoci <- summary( rowMeans(coverage, na.rm=T) )
mt_summary_obj$coverSubjects <- summary( colMeans(coverage, na.rm=T) )
}
## categorize by AAF: AAF < thre.lower means no variation(coded as 0)
## thre.lower <= AAF <= thre.upper means heteroplasmic variation(coded as 1)
## thre.upper<AAF means homoplastic variation(coded as 2)
aaf_cat <- matrix(0, nrow=nrow(aaf), ncol=ncol(aaf))
aaf_cat[ aaf >= thre.lower & aaf <= thre.upper ] <- 1
aaf_cat[ aaf > thre.upper ] <- 2
rownames(aaf_cat) <- rownames(aaf)
colnames(aaf_cat) <- colnames(aaf)
aaf_cat[ coverage < coverage.qc ] <- NA
## The reads of these loci are not reliable, so these loci should be removed
loci_removed <- .mtLociNUMT # c(301,302,310,316,3107,16182)
loci_removed <- as.character(loci_removed)
loci_left <- setdiff(rownames(aaf_cat), loci_removed)
aaf_cat <- aaf_cat[loci_left,]
## only include mutations loci: at least one subject has mutation at that locus
mutation_collect <- aaf_cat
n_mutation <- rowSums(aaf_cat > 0, na.rm=T)
## only include the mutation(heter/homo) loci
mutation_collect <- mutation_collect[n_mutation > 0,]
## loci of variations(heter/homo)
loci_var <- as.numeric(rownames(mutation_collect))
mt_summary_obj$loci_var <- loci_var
n_mutation <- n_mutation[n_mutation > 0]
## output IDs of individuals
mt_summary_obj$IDs <- subjectID
## output summary statistics of hereoplasmic variations
## calculate the number of heteroplasmic mutations for each subject
heter_burden <- colSums(mutation_collect == 1, na.rm=T)
mt_summary_obj$heter_burden <- heter_burden
mt_summary_obj$heter_burden_sum <- summary(heter_burden)
## calculate the number of heteroplasmic mutations for each locus
heter_loci <- rowSums(aaf_cat == 1, na.rm=T)
mt_summary_obj$heter_loci <- heter_loci
mt_summary_obj$heter_loci_sum <- summary(heter_loci)
## loci with heteroplasmic variations
loci_heter <- as.numeric(names(heter_loci))[heter_loci > 0]
mt_summary_obj$loci_heter <- loci_heter
## calculate the total number of heteroplasmic variations
mt_summary_obj$heter_total <- sum(heter_burden)
## output summary statistics of homoplasmic variations
## calculate the number of homoplasmic variations for each subject
homo_burden <- colSums(mutation_collect == 2, na.rm=T)
mt_summary_obj$homo_burden <- homo_burden
mt_summary_obj$homo_burden_sum <- summary(homo_burden)
## calculate the number of homoplasmic mutations for each locus
homo_loci <- rowSums(aaf_cat == 2, na.rm=T)
mt_summary_obj$homo_loci <- homo_loci
mt_summary_obj$homo_loci_sum <- summary(homo_loci)
## loci with homoplasmic variations
loci_homo <- as.numeric(names(homo_loci))[homo_loci > 0]
mt_summary_obj$loci_homo <- loci_homo
## calculate the total number of homoplasmic variations
mt_summary_obj$homo_total <- sum(homo_burden)
## generate histograms of heter and homo burden of across subjects and across mtDNA loci
if(varHist){
## Open an output pdf file
pdf(file = paste0(path, "/mtHistograms.pdf"))
## histogram of heteroplasmic burden across subjects
h <- hist(heter_burden,
breaks = 100,
xlab ="Heteroplasmic burden score" ,
main ="Histogram of heteroplasmic burden score" )
h <- hist(heter_loci,
breaks = 100,
xlab ="Heteroplasmic variations of mtDNA loci" ,
main ="Histogram of heteroplasmic variations of mtDNA loci" )
h <- hist(homo_burden,
breaks = 100,
xlab ="Homoplasmic burden score" ,
main ="Histogram of homoplasmic burden score" )
h <- hist(homo_loci,
breaks = 100,
xlab ="Homoplasmic variations of mtDNA loci" ,
main ="Histogram of homoplasmic variations of mtDNA loci" )
dev.off()
}
if(anno){
## annotation for all heter and/or homo variations at each
## mutation loci based on user's choice
## if type=="both", annotate both heter/homo variations
if(type == "both"){
allele2 <- allele[as.character(loci_var),]
freq2 <- freq[as.character(loci_var),]
allele_both <- rep(NA,length(loci_var))
for(i in seq_along(loci_var)) {
var_index <- (mutation_collect[i,] > 0)
## identify the alleles at the variation locus
allele_all <- allele2[i,]
allele_var <- allele_all[var_index]
allele_var <- allele_var[ !is.na(allele_var)]
## identify the corresponding frequencies of
## the alleles at the variation loci
freq_all<-freq2[i,]
freq_var<-freq_all[var_index]
freq_var <- freq_var[ !is.na(freq_var)]
allele_all_var <- unlist(strsplit(allele_var,split="/"),
use.names = F)
freq_all_var <- as.numeric(unlist(strsplit(freq_var ,
split="/"),
use.names = F))
## only includes alleles which have frequency
## within the [thre.lower, 1] interval
allele_all_var <- allele_all_var[ freq_all_var >= thre.lower ]
## combine with the reference allele
if( sum(mutation_collect[i,] == 0, na.rm = T) > 0 ){
allele_all_var <- c( .mtRef[loci_var[i] ], allele_all_var)
}
allele_all_var <- allele_all_var[ !duplicated(allele_all_var) ]
allele_all_var <- paste(allele_all_var,collapse = '/')
allele_both[i] <- allele_all_var
}
head <- c("mtID", "ref_allele", "allele_all", "n_var",
"n_heter", "n_homo", "mut_allele", annot.select)
## Open file for writing
temp.file.name <- tempfile("var",fileext=c(".csv"))
file.conn <- file( temp.file.name, "w")
write (head, file= file.conn ,sep=",",ncolumns=length(head), append=T)
## this loop used to annotate all the mutations at each
## heter loci with the scores chosen(annot.select)
for (i in seq_along(loci_var)) {
pos <- loci_var[i]
ref_allele <- .mtRef[pos]
x <- allele_both[i]
x2 <- unlist(strsplit(x,split="/"), use.names = F)
if(length(x2) > 1){
pos.not <- which( !(x2 == ref_allele ))
if(length(pos.not)==1){
point <- x2[pos.not]
score = switch(point,
"A" = .AA[.AA$Pos == pos,annot.select],
"C" = .CC[.CC$Pos == pos,annot.select],
"G" = .GG[.GG$Pos == pos,annot.select],
"T" = .TT[.TT$Pos == pos,annot.select] )
if(point=="D" | point=="I"){
score <- .AA[.AA$Pos == pos,c("Pos", "ref", "Gene")]
NA_len <- length(annot.select)-length(score)
score <- c(score, rep(NA, NA_len))
}
cat(paste(pos,
ref_allele,
x,
n_mutation[i],
heter_loci[as.character(pos)],
homo_loci[as.character(pos)],
point,
paste(score, collapse=","), sep=","),
file=file.conn, fill=TRUE, append=TRUE)
}else if(length(pos.not)>1){
var_index <- (mutation_collect[i,] > 0)
## identify the alleles at the variation locus
allele_all <- allele2[i,]
allele_var <- allele_all[var_index]
allele_var <- allele_var[ !is.na(allele_var)]
## identify the corresponding frequencies of
## the alleles at the variation loci
freq_all<-freq2[i,]
freq_var<-freq_all[var_index]
freq_var <- freq_var[ !is.na(freq_var)]
allele_all_var <- unlist(strsplit(allele_var,split="/"),
use.names = F)
freq_all_var <- as.numeric(unlist(strsplit(freq_var ,
split="/"),
use.names = F))
## only includes alleles which have frequency
## within the [thre.lower, 1] interval
allele_all_var_heter <- allele_all_var[ freq_all_var >= thre.lower & freq_all_var<=thre.upper]
allele_all_var_homo <- allele_all_var[ freq_all_var > thre.upper ]
for (k in seq_along(pos.not)){
point <- x2[pos.not[k]]
score = switch(point,
"A" = .AA[.AA$Pos == pos,annot.select],
"C" = .CC[.CC$Pos == pos,annot.select],
"G" = .GG[.GG$Pos == pos,annot.select],
"T" = .TT[.TT$Pos == pos,annot.select])
if(point=="D" | point=="I"){
score <- .AA[.AA$Pos == pos,c("Pos", "ref", "Gene")]
NA_len <- length(annot.select)-length(score)
score <- c(score, rep(NA, NA_len))
}
n_heter1 <- sum(point == allele_all_var_heter)
n_homo1 <- sum(point == allele_all_var_homo)
n_var1 <- n_heter1 + n_homo1
cat(paste(pos,
ref_allele,
x,
n_var1,
n_heter1,
n_homo1,
point,
paste(score, collapse=","), sep=","),
file=file.conn, fill=TRUE, append=TRUE)
}
}
} else if(length(x2) == 1 & x2 != ref_allele){
score = switch(x2,
"A" = .AA[.AA$Pos == pos,annot.select],
"C" = .CC[.CC$Pos == pos,annot.select],
"G" = .GG[.GG$Pos == pos,annot.select],
"T" = .TT[.TT$Pos == pos,annot.select])
if(point=="D" | point=="I"){
score <- .AA[.AA$Pos == pos,c("Pos", "ref", "Gene")]
NA_len <- length(annot.select)-length(score)
score <- c(score, rep(NA, NA_len))
}
cat(paste(pos,
ref_allele,
x,
n_mutation[i],
heter_loci[as.character(pos)],
homo_loci[as.character(pos)],
x2,
paste(score, collapse=","), sep=","),
file=file.conn, fill=TRUE, append=TRUE)
}
}
close(file.conn)
## Copy file to the permanent location:
out.file.name <- paste0(path, study, "_annotation_variation.csv")
file.copy(temp.file.name, out.file.name , overwrite = T)
} else if(type == "heter"){
allele2 <- allele[as.character(loci_heter), ]
freq2 <- freq[as.character(loci_heter),]
heter_collect <- mutation_collect[loci_var %in% loci_heter,]
allele_heter <- rep(NA, length(loci_heter))
for(i in seq_along(loci_heter)){
heter_index <- (heter_collect[i,] == 1)
## identify the alleles at the variation locus
allele_all <- allele2[i,]
allele_var <- allele_all[heter_index]
allele_var <- allele_var[ !is.na(allele_var)]
## identify the corresponding frequencies of the alleles at the variation loci
freq_all <- freq2[i,]
freq_var <- freq_all[heter_index]
freq_var <- freq_var[ !is.na(freq_var)]
allele_all_var <- unlist(strsplit(allele_var,split="/"),
use.names = F)
freq_all_var <- as.numeric(unlist(strsplit(freq_var ,split="/"),
use.names = F))
## only includes alleles which have frequency within the [thre.lower, thre.upper] interval
allele_all_var <- allele_all_var[freq_all_var >= thre.lower &
freq_all_var <= thre.upper]
## combine with the reference allele
if(sum(heter_collect[i,] == 0, na.rm = T) > 0){
allele_all_var <- c(.mtRef[loci_heter[i]], allele_all_var)
}
allele_all_var <- allele_all_var[!duplicated(allele_all_var)]
allele_all_var <- paste(allele_all_var,collapse = '/')
allele_heter[i] <- allele_all_var
}
head <- c("mtID", "ref_allele", "allele_all", "n_heter", "mut_allele",
annot.select)
## Open file for writing
temp.file.name <- tempfile("heter", fileext=c(".csv"))
file.conn <- file( temp.file.name, "w")
write (head, file=file.conn, sep=",", ncolumns=length(head), append=T)
## this loop used to annote all the heter mutations at each heter loci
## with the scores chosen(annot.select)
for (i in seq_along(loci_heter)) {
pos <- loci_heter[i]
ref_allele <- .mtRef[pos]
x <- allele_heter[i]
x2 <- unlist(strsplit(x,split="/"), use.names = F)
if(length(x2) > 1){
pos.not <- which(!(x2 == ref_allele))
if(length(pos.not)==1){
point <- x2[pos.not]
score = switch(point,
"A" = .AA[.AA$Pos == pos,annot.select],
"C" = .CC[.CC$Pos == pos,annot.select],
"G" = .GG[.GG$Pos == pos,annot.select],
"T" = .TT[.TT$Pos == pos,annot.select])
if(point=="D" | point=="I"){
score <- .AA[.AA$Pos == pos,c("Pos", "ref", "Gene")]
NA_len <- length(annot.select)-length(score)
score <- c(score, rep(NA, NA_len))
}
cat(paste(pos,
ref_allele,
x,
heter_loci[as.character(pos)],
point,
paste(score, collapse=","), sep=","),
file=file.conn, fill=TRUE, append=TRUE)
}else if(length(pos.not)>1){
heter_index <- (heter_collect[i,] == 1)
## identify the alleles at the variation locus
allele_all <- allele2[i,]
allele_var <- allele_all[heter_index]
allele_var <- allele_var[ !is.na(allele_var)]
## identify the corresponding frequencies of the alleles at the variation loci
freq_all <- freq2[i,]
freq_var <- freq_all[heter_index]
freq_var <- freq_var[ !is.na(freq_var)]
allele_all_var <- unlist(strsplit(allele_var,split="/"),
use.names = F)
freq_all_var <- as.numeric(unlist(strsplit(freq_var ,split="/"),
use.names = F))
## only includes alleles which have frequency
## within the [thre.lower, thre.upper] interval
allele_all_var <- allele_all_var[freq_all_var >= thre.lower &
freq_all_var <= thre.upper]
for (k in seq_along(pos.not)){
point <- x2[pos.not[k]]
score = switch(point,
"A" = .AA[.AA$Pos == pos,annot.select],
"C" = .CC[.CC$Pos == pos,annot.select],
"G" = .GG[.GG$Pos == pos,annot.select],
"T" = .TT[.TT$Pos == pos,annot.select])
if(point=="D" | point=="I"){
score <- .AA[.AA$Pos == pos,c("Pos", "ref", "Gene")]
NA_len <- length(annot.select)-length(score)
score <- c(score, rep(NA, NA_len))
}
cat(paste(pos,
ref_allele,
x,
sum(point == allele_all_var),
point,
paste(score, collapse=","), sep=","),
file=file.conn, fill=TRUE, append=TRUE)
}
}
}else if(length(x2) == 1 & x2 != ref_allele){
score = switch(x2,
"A" = .AA[.AA$Pos == pos,annot.select],
"C" = .CC[.CC$Pos == pos,annot.select],
"G" = .GG[.GG$Pos == pos,annot.select],
"T" = .TT[.TT$Pos == pos,annot.select])
if(point=="D" | point=="I"){
score <- .AA[.AA$Pos == pos,c("Pos", "ref", "Gene")]
NA_len <- length(annot.select)-length(score)
score <- c(score, rep(NA, NA_len))
}
cat(paste(pos,
ref_allele,
x,
heter_loci[as.character(pos)],
x2,
paste(score, collapse=","), sep=","),
file=file.conn, fill=TRUE, append=TRUE)
}
}
close(file.conn)
## Copy file to the permanent location:
out.file.name <- paste0(path, study, "_annotation_heteroplasmy.csv")
file.copy(temp.file.name, out.file.name , overwrite = T)
} else if(type=="homo"){
allele2 <- allele[as.character(loci_homo),]
freq2 <- freq[as.character(loci_homo),]
homo_collect <- mutation_collect[loci_var %in% loci_homo,]
allele_homo <- rep(NA,length(loci_homo))
for(i in seq_along(loci_homo)){
homo_index <- (homo_collect[i,]==2)
## identify the alleles at the variation locus
allele_all <- allele2[i,]
allele_var <- allele_all[homo_index]
allele_var <- allele_var[!is.na(allele_var)]
## identify the corresponding freqencies of the alleles
## at the variation loci
freq_all <- freq2[i,]
freq_var <- freq_all[homo_index]
freq_var <- freq_var[!is.na(freq_var)]
allele_all_var <- unlist(strsplit(allele_var, split="/"))
freq_all_var <- as.numeric(unlist(strsplit(freq_var, split="/")))
## only includes alleles which have frequency within
##the [thre.lower, thre.upper] interval
allele_all_var<-allele_all_var[freq_all_var > thre.upper]
## combine with the reference allele
if(sum(homo_collect[i,] == 0, na.rm = T) > 0){
allele_all_var <- c(.mtRef[loci_homo[i]], allele_all_var)
}
allele_all_var <- allele_all_var[!duplicated(allele_all_var)]
allele_all_var <- paste(allele_all_var,collapse = '/')
allele_homo[i] <- allele_all_var
}
head <- c("mtID", "ref_allele", "allele_all", "n_homo", "mut_allele",
annot.select)
#Open file for writing
temp.file.name <- tempfile("homo",fileext=c(".csv"))
file.conn <- file( temp.file.name, "w")
write (head, file= file.conn, sep=",", ncolumns=length(head), append=T)
## this loop used to annote all the homo mutations at each
## homo loci with the scores chosen(annot.select)
for (i in seq_along(loci_homo)) {
pos <- loci_homo[i]
ref_allele <- .mtRef[pos]
x <- allele_homo[i]
x2 <- unlist(strsplit(x,split="/"))
if(length(x2)>1){
pos.not <- which(!(x2==ref_allele))
if(length(pos.not)==1){
point <- x2[pos.not]
score = switch(point,
"A" = .AA[.AA$Pos == pos,annot.select],
"C" = .CC[.CC$Pos == pos,annot.select],
"G" = .GG[.GG$Pos == pos,annot.select],
"T" = .TT[.TT$Pos == pos,annot.select])
if(point=="D" | point=="I"){
score <- .AA[.AA$Pos == pos,c("Pos", "ref", "Gene")]
NA_len <- length(annot.select)-length(score)
score <- c(score, rep(NA, NA_len))
}
cat(paste(pos,
ref_allele,
x,
homo_loci[as.character(pos)],
point,
paste(score, collapse=","), sep=","),
file=file.conn, fill=TRUE, append=TRUE)
}else if(length(pos.not)>1){
homo_index <- (homo_collect[i,]==2)
## identify the alleles at the variation locus
allele_all <- allele2[i,]
allele_var <- allele_all[homo_index]
allele_var <- allele_var[!is.na(allele_var)]
## identify the corresponding freqencies of the alleles
## at the variation loci
freq_all <- freq2[i,]
freq_var <- freq_all[homo_index]
freq_var <- freq_var[!is.na(freq_var)]
allele_all_var <- unlist(strsplit(allele_var, split="/"))
freq_all_var <- as.numeric(unlist(strsplit(freq_var, split="/")))
## only includes alleles which have frequency within
## the (thre.upper, 1] interval
allele_all_var<-allele_all_var[freq_all_var > thre.upper]
for (k in seq_along(pos.not)){
point <- x2[pos.not[k]]
score = switch(point,
"A" = .AA[.AA$Pos == pos,annot.select],
"C" = .CC[.CC$Pos == pos,annot.select],
"G" = .GG[.GG$Pos == pos,annot.select],
"T" = .TT[.TT$Pos == pos,annot.select])
if(point=="D" | point=="I"){
score <- .AA[.AA$Pos == pos,c("Pos", "ref", "Gene")]
NA_len <- length(annot.select)-length(score)
score <- c(score, rep(NA, NA_len))
}
cat(paste(pos,
ref_allele,
x,
sum(point==allele_all_var),
point,
paste(score, collapse=","), sep=","),
file=file.conn, fill=TRUE, append=TRUE)
}
}
} else if(length(x2)==1 & x2 != ref_allele){
score = switch(x2,
"A" = .AA[.AA$Pos == pos,annot.select],
"C" = .CC[.CC$Pos == pos,annot.select],
"G" = .GG[.GG$Pos == pos,annot.select],
"T" = .TT[.TT$Pos == pos,annot.select])
if(point=="D" | point=="I"){
score <- .AA[.AA$Pos == pos,c("Pos", "ref", "Gene")]
NA_len <- length(annot.select)-length(score)
score <- c(score, rep(NA, NA_len))
}
cat(paste(pos,
ref_allele,
x,
homo_loci[as.character(pos)],
x2,
paste(score, collapse=","), sep=","),
file=file.conn, fill=TRUE, append=TRUE)
}
}
close(file.conn)
## Copy file to the permanent location:
out.file.name <- paste0(path,study, "_annotation_homoplasmy.csv")
file.copy(temp.file.name, out.file.name , overwrite = T)
}
}
return(mt_summary_obj)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.