#' Generate overrepresented kmers from all kmer counts results
#' @param fseq the read result from seqTools
#' @param fseq_count kmer result generated from kmer function
#' @param k the length of the sequence looking for
#' @param name the path to the data
#' @param nc number of positions
#' @param nr number of reads
#' @importMethodsFrom seqTools
#' @export the overrepresented kmer table, with the kmer and its position
#'
#'
overrep_kmer <- function(fseq,fseq_count,k,name,nc,nr){
#fseq_count <- kmer("data/full.fq.gz",7)
colnames(fseq_count) = seq(1,nc-k+1,1)
# find marginal probabilities of ATGC
probA <- sum(sequence_content(fseq,"A"))/nr/nc
probG <- sum(sequence_content(fseq,"G"))/nr/nc
probC <- sum(sequence_content(fseq,"C"))/nr/nc
probT <- sum(sequence_content(fseq,"T"))/nr/nc
fseq_count$total = rowSums(fseq_count)
fseq_count$kmer = rownames(fseq_count)
#find the number of A, T, G, C in each kmer
for (i in 1:nrow(fseq_count)){
if (grepl("A",rownames(fseq_count[i,])) ==TRUE){
fseq_count$counta[i] <- length(gregexpr("A",rownames(fseq_count[i,]))[[1]])
}else {
fseq_count$counta[i] = 0
}
if (grepl("G",rownames(fseq_count[i,])) ==TRUE){
fseq_count$countg[i] <- length(gregexpr("G",rownames(fseq_count[i,]))[[1]])
}else {
fseq_count$countg[i] = 0
}
if (grepl("T",rownames(fseq_count[i,])) ==TRUE){
fseq_count$countt[i] <- length(gregexpr("T",rownames(fseq_count[i,]))[[1]])
}else {
fseq_count$countt[i] = 0
}
if (grepl("C",rownames(fseq_count[i,])) ==TRUE){
fseq_count$countc[i] <- length(gregexpr("C",rownames(fseq_count[i,]))[[1]])
}else {
fseq_count$countc[i] = 0
}
}
#calculate expecte kmer per read
fseq_count$expected <- ((probA^fseq_count$counta)*(probG^fseq_count$countg)*(probC^fseq_count$countc)*(probT^fseq_count$countt))*nr
fseq_count_copy <- subset(fseq_count,select = -c(counta,countg,countt,countc))
# calculate an log2(obs/exp) as indicator for further analysis
for (i in 1:(nc-k+1)){
fseq_count_copy[,i] = log2(fseq_count_copy[,i]/fseq_count_copy$expected)
}
#coalesce into a long vector, find the index of large value and detect the kmer they belong to
fseq_count_vector <- as.vector(as.matrix(subset(fseq_count_copy,select = -c(total,expected,kmer))))
index <- which(fseq_count_vector>6)
value <- fseq_count_vector[which(fseq_count_vector>6)]
#empty data frame
indexes <- data.frame(matrix(ncol=3,nrow = length(index)))
indexes$originalindex <- index
indexes$obsexp <- value
indexes$positionindex <- index%% 94
indexes$kmerindex <- rownames(fseq_count_copy)[index%/% 94+1*(indexes$positionindex!=0)]
indexes = subset(indexes,select = -c(X1,X2,X3))
write.csv(file="OverrepresentedKmers.csv",indexes)
return(indexes)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.