#' @title Screen target sequences for miR binding sites
#'
#' @description The function getMirSites() searches miRNA binding sites within
#' the circRNA sequences. The user can restrict the analisis only to a subset
#' of miRs. In this case, miR ids must go in miR.txt file. If the latter
#' is absent or empty, all miRs of the specified miRspeciesCode are considered
#' in the analysis.
#'
#' @param targets A list containing the target sequences to analyze.
#' This data frame can be generated with \code{\link{getCircSeqs}}.
#'
#' @param miRspeciesCode A string specifying the species code (3 letters) as
#' reported in miRBase db. E.g. to analyze the mouse microRNAs specify "mmu",
#' to analyze the human microRNAs specify "hsa". Type data(miRspeciesCode)
#' to see the available codes and the corresponding species reported
#' in miRBase 22 release. Default value is "hsa".
#'
#' @param miRBaseLatestRelease A logical specifying whether to download the
#' latest release of the mature sequences of the microRNAs from miRBase
#' (\url{http://www.mirbase.org/ftp.shtml}). If TRUE is specified then the
#' latest release is automatically downloaded. If FALSE is specified then a
#' file named mature.fa containing fasta format sequences of all mature miRNA
#' sequences previously downloaded by the user from mirBase must be present
#' in the working directory. Default value is TRUE.
#'
#' @param totalMatches An integer specifying the total number of matches that
#' have to be found between the seed region of the miR and the seed site of the
#' target sequence. If the total number of matches found is less than the
#' cut-off, the seed site is discarded. The maximun number of possible matches
#' is 7. Default value is 7.
#'
#' @param maxNonCanonicalMatches An integer specifying the max number of
#' non-canonical matches (G:U) allowed between the seed region of the miR and
#' the seed site of the target sequence. If the max non-canonical matches found
#' is greater than the cut-off, the seed site is discarded. Default value is 1.
#'
#' @param pathToMiRs A string containing the path to the miRs.txt file.
#' The file miRs.txt contains the microRNA ids from miRBase
#' specified by the user. It must have one column with header id. The first row
#' must contain the miR name starting with the ">", e.g >hsa-miR-1-3p. The
#' sequences of the miRs will be automatically retrieved from the mirBase latest
#' release or from the given mature.fa file, that should be present in the
#' working directory. By default pathToMiRs is set to NULL and the file it is
#' searched in the working directory. If miRs.txt is located in a different
#' directory then the path needs to be specified. If this file is absent or
#' empty, all miRs of the specified species are considered in the
#' miRNA analysis.
#'
#' @return A list.
#'
#' @examples
#' # Load a data frame containing detected back-spliced junctions
#' data("mergedBSJunctions")
#'
#' # Load short version of the gencode v19 annotation file
#' data("gtf")
#'
#' # Annotate the first back-spliced junctions
#' annotatedBSJs <- annotateBSJs(mergedBSJunctions[1, ], gtf)
#'
#' # Get genome
#' if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19", quietly = TRUE)){
#' genome <- BSgenome::getBSgenome("BSgenome.Hsapiens.UCSC.hg19")
#'
#' # Retrieve target sequences.
#' targets <- getCircSeqs(
#' annotatedBSJs,
#' gtf,
#' genome)
#'
#' # Screen target sequence for miR binding sites.
#' pathToMiRs <- system.file("extdata", "miRs.txt", package="circRNAprofiler")
#'
#' #miRsites <- getMiRsites(
#' # targets,
#' # miRspeciesCode = "hsa",
#' # miRBaseLatestRelease = TRUE,
#' # totalMatches = 6,
#' # maxNonCanonicalMatches = 1,
#' # pathToMiRs )
#' }
#'
#'
#' @importFrom IRanges reverse
#' @importFrom readr read_tsv
#' @importFrom utils read.table
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#' @importFrom seqinr swap
#' @importFrom stringr str_count
#' @importFrom utils data
#' @import dplyr
#' @export
getMiRsites <- function(targets, miRspeciesCode = "hsa",
miRBaseLatestRelease = TRUE, totalMatches = 7,
maxNonCanonicalMatches = 1, pathToMiRs = NULL) {
# Retrieve miR sequences from miRBase or from mature.fa file
microRNAs <- .getMiRseqs(miRBaseLatestRelease, miRspeciesCode, pathToMiRs)
if (length(targets) == 1 & names(targets)[[1]] == "circ") {
# Create list to store miR results
miRsites <- .createMiRsitesList(targets, microRNAs)
# Scan the target sequences, by using a sliding windows of 1
for (i in seq_along(miRsites$targets$id)) {
cat(paste0('\nAnalysing: ', miRsites$targets$id[i]))
analysisStart <- 30
circSeq <- miRsites$targets$seq[i]
circLen <- miRsites$targets$length[i]
for (j in seq_along(miRsites$microRNAs$id)) {
# miR sequence to analyze
mirSeq <- miRsites$microRNAs$seqRev[j]
# The analysis start at position 30 to retrieve upstream features
indexTargetSeq <- analysisStart
# Create createMiRsitesTempDF data frame
tempDF <- .createMiRsitesTempDF()
k <- 1
# CircRNAs seq is X nucleotides longer than the predicted length.
while (indexTargetSeq <= (circLen + (analysisStart - 1))) {
# FIND MATCHES WITH THE SEED REGION
seedMatches <- .getSeedMatches(circSeq,
indexTargetSeq,
mirSeq,
maxNonCanonicalMatches)
# Retrieve all info if seedMatches are found
if (nrow(seedMatches) != 0) {
if (seedMatches$maxTotalMatches >= totalMatches) {
tempDF[k, ] <- .createMiRsitesTempDF()
tempDF$totMatchesInSeed[k] <- seedMatches$maxTotalMatches
# check if there are continuous WC matches
tempDF$cwcMatchesInSeed[k] <-.checkMer(seedMatches)
# Keep t1 nucleotide info
tempDF$t1[k] <- .getT1nucleotide(circSeq, indexTargetSeq, mirSeq)
# Keep location info
tempDF$seedLocation[k] <- indexTargetSeq
# FIND MATCHES WITH THE CENTRAL REGION
centralMatches <- .getCentralMatches(circSeq, indexTargetSeq, mirSeq)
tempDF$totMatchesInCentral[k] <- centralMatches$maxTotalMatches
tempDF$cwcMatchesInCentral[k] <- centralMatches$maxContinuousMatches
# FIND MATCHES WITH THE COMPENSATORY REGION
compensatoryMatches <- .getCompensatoryMatches(circSeq, indexTargetSeq, mirSeq)
tempDF$totMatchesInCompensatory[k] <- compensatoryMatches$maxTotalMatches
tempDF$cwcMatchesInCompensatory[k] <- compensatoryMatches$maxContinuousMatches
# FOR LOCAL AU CONTENT around the seed region 2-8
tempDF$localAUcontent[k] <- .getAUcontent(circSeq, indexTargetSeq, mirSeq)
# Count seed site
tempDF$count[k] <- 1
k <- k + 1
}
}
indexTargetSeq <- indexTargetSeq + 1 # increase of 1
}
# Keep the following info if at least 1 seed match is found
if (sum(tempDF$count) >= 1) {
mirId <- miRsites$microRNAs$id[j]
rowId <- i
miRsites <- .fillMiRsites(miRsites, rowId, mirId, tempDF)
}
}
}
} else{
stop("target sequences not valid, only circRNA sequences are allowed.")
}
return(miRsites)
}
# Retrive miRNA sequences
.getMiRseqs <- function(miRBaseLatestRelease = TRUE,
miRspeciesCode = "hsa",
pathToMiRs = NULL) {
# 271 options are possible the argument miRspeciesCode
.checkMiRspeciesCode(miRspeciesCode)
# Read experiment information
options(readr.num_columns = 0)
# Retrieve miR sequences from miRBase or read from a mature.fa file
miRBase <- .getMiRsFromMiRBase(miRBaseLatestRelease)
# Get miRNAs ids to analyze
microRNAids <- .getMiRids(miRBase, pathToMiRs, miRspeciesCode)
# Fill the microRNAids data frame in the miRsites list
microRNAs <- data.frame(matrix(nrow = length(microRNAids), ncol = 4))
colnames(microRNAs) <- c("id", "length", "seq", "seqRev")
for (i in seq_along(microRNAs$id)) {
indexMir <- which(miRBase == microRNAids[i])
microRNAs$id[i] <- microRNAids[i]
microRNAs$seq[i] <- miRBase[indexMir + 1]
# The direction of the sequences of the microRNAs downloaded
# from miRBase are from 5' to 3'. To be able to find matching
# regions within the target sequences also reported in direction
# 5' to 3', the reverse of the mir sequences must be used.
microRNAs$seqRev[i] <- IRanges::reverse(miRBase[indexMir + 1])
microRNAs$length[i] <- nchar(microRNAs$seq[i])
}
return(microRNAs)
}
# Check miR species code
.checkMiRspeciesCode <- function(miRspeciesCode){
miRspeciesCodes <- NULL
data("miRspeciesCodes", package= "circRNAprofiler",envir = environment())
if (!miRspeciesCode %in% miRspeciesCodes$code) {
stop(
"miR species code not correct, type data(miRspeciesCode) to see
the available codes and the corresponding species."
)
}
}
#Retrieve mir sequences from miRBase or read from a mature.fa file
.getMiRsFromMiRBase <- function(miRBaseLatestRelease = TRUE){
if (miRBaseLatestRelease) {
url<-"ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz"
miRBase <-
tryCatch(
readr::read_tsv(url, col_names = FALSE), error = function(e) NULL)
if(is.null(miRBase)){
stop(
"Unable to establish a connection with mirbase.org.
Try later."
)
}
} else if (file.exists("mature.fa")) {
miRBase <-
utils::read.table(
"mature.fa",
header = FALSE,
stringsAsFactors = FALSE,
sep = "\t"
)
} else {
stop(
"mature.fa not present in the project folder.
Set miRBaseLatestRelease = TRUE to automatically download
the latest release of the mature sequences of the
microRNAs from miRBase"
)
}
colnames(miRBase)[1] <- "seq"
firstChar <- base::substr(miRBase$seq[1], 1, 1)
if (firstChar != ">") {
stop(
"The microRNAs in mature.fa must be reported in fasta
format. The first row must start with >miRBaseID then the
second row must contain the miR sequence."
)
}
miRBaseCleaned <- gsub(" .*", "", miRBase$seq)
return(miRBaseCleaned)
}
# Retrive miRNA to analyze
.getMiRids <- function(miRBase, pathToMiRs = NULL, miRspeciesCode){
# Read miRs.txt
miRsFromFile <- .readMiRs(pathToMiRs)
if (nrow(miRsFromFile) == 0) {
cat("miRs.txt is empty or absent. All miRNAs of the
specified species will be analyzed")
}
if (nrow(miRsFromFile) > 0) {
# The first line should start with ">"
firstChar <- base::substr(miRsFromFile$id[1], 1, 1)
if (firstChar != ">") {
stop("The microRNA ids in miRs.txt must start with " > ".
E.g. >miRid")
}
microRNAids <- miRBase[miRBase %in% miRsFromFile$id]
if (length(miRsFromFile$id) - length(microRNAids) > 0) {
notFound <- miRsFromFile$id[!(miRsFromFile$id %in% microRNAids)]
cat(paste("miRs not found:", paste(notFound, collapse = ", ")))
}
} else{
microRNAids <- miRBase[grep(miRspeciesCode, miRBase)]
}
return(microRNAids)
}
# Create list to store miR results
.createMiRsitesList <- function(targets, microRNAs) {
if (length(targets) == 1 & names(targets)[[1]] == "circ") {
miRsites <- vector("list", 12)
miRsites <- .nameMiRsitesListDF(miRsites)
# Fill the target dataframe in the miRsites list
miRsites$targets <- targets[[1]]
# Fill the microRNAs data frame in the miRsites list
miRsites$microRNAs <- microRNAs
# Dataframe to store the number of miR binding sites
miRsites$counts <- createMiRsitesInternalDF(miRsites, microRNAs)
colnames(miRsites$counts) <- c("id", microRNAs$id)
miRsites$counts$id <- miRsites$targets$id
# Dataframe to store the total matches
miRsites$totMatchesInSeed <- createMiRsitesInternalDF(miRsites, microRNAs)
colnames(miRsites$totMatchesInSeed) <- c("id", microRNAs$id)
miRsites$totMatchesInSeed$id <- miRsites$targets$id
# Dataframe to store the continuous WC matches
miRsites$cwcMatchesInSeed <- createMiRsitesInternalDF(miRsites, microRNAs)
colnames(miRsites$cwcMatchesInSeed) <- c("id", microRNAs$id)
miRsites$cwcMatchesInSeed$id <- miRsites$targets$id
# Dataframe to store the location of the seed match
miRsites$seedLocation <- createMiRsitesInternalDF(miRsites, microRNAs)
colnames(miRsites$seedLocation) <- c("id", microRNAs$id)
miRsites$seedLocation$id <- miRsites$targets$id
# Dataframe to store the info about the t1 nucleotide
miRsites$t1 <- createMiRsitesInternalDF(miRsites, microRNAs)
colnames(miRsites$t1) <- c("id", microRNAs$id)
miRsites$t1$id <- miRsites$targets$id
# Dataframe to store the total matches with the central region
miRsites$totMatchesInCentral <- createMiRsitesInternalDF(miRsites, microRNAs)
colnames(miRsites$totMatchesInCentral) <- c("id", microRNAs$id)
miRsites$totMatchesInCentral$id <- miRsites$targets$id
# Dataframe to store continuous WC matches with the central region
miRsites$cwcMatchesInCentral <- createMiRsitesInternalDF(miRsites, microRNAs)
colnames(miRsites$cwcMatchesInCentral) <- c("id", microRNAs$id)
miRsites$cwcMatchesInCentral$id <- miRsites$targets$id
# Dataframe to store total matches found with the compensatory region
miRsites$totMatchesInCompensatory <- createMiRsitesInternalDF(miRsites, microRNAs)
colnames(miRsites$totMatchesInCompensatory) <- c("id", microRNAs$id)
miRsites$totMatchesInCompensatory$id <- miRsites$targets$id
# Dataframe to store continuous WC matches with the compensatory region
miRsites$cwcMatchesInCompensatory <- createMiRsitesInternalDF(miRsites, microRNAs)
colnames(miRsites$cwcMatchesInCompensatory) <- c("id", microRNAs$id)
miRsites$cwcMatchesInCompensatory$id <- miRsites$targets$id
# Dataframe to store the percentage of A/U nucleotides
miRsites$localAUcontent <- createMiRsitesInternalDF(miRsites, microRNAs)
colnames(miRsites$localAUcontent) <- c("id", microRNAs$id)
miRsites$localAUcontent$id <- miRsites$targets$id
} else{
stop("target sequences not valid, only circRNA sequences are allowed.")
}
return(miRsites)
}
# get names miRsiteslist
.nameMiRsitesListDF <- function(miRsites){
names(miRsites)[1] <- "targets"
names(miRsites)[2] <- "microRNAs"
names(miRsites)[3] <- "counts"
names(miRsites)[4] <- "totMatchesInSeed"
names(miRsites)[5] <- "cwcMatchesInSeed"
names(miRsites)[6] <- "seedLocation"
names(miRsites)[7] <- "t1"
names(miRsites)[8] <- "totMatchesInCentral"
names(miRsites)[9] <- "cwcMatchesInCentral"
names(miRsites)[10] <- "totMatchesInCompensatory"
names(miRsites)[11] <- "cwcMatchesInCompensatory"
names(miRsites)[12] <- "localAUcontent"
return(miRsites)
}
# Create miRsites internal data frame
createMiRsitesInternalDF <- function(miRsites, microRNAs) {
intDF <- data.frame(matrix(
nrow = nrow(miRsites$targets),
ncol = length(microRNAs$id) + 1
))
return(intDF)
}
# Create createMiRsitesTempDF data frame
.createMiRsitesTempDF <- function() {
tempDF <-
data.frame(matrix(nrow = 1,
ncol = 10))
colnames(tempDF) <-
c(
"count",
"totMatchesInSeed",
"cwcMatchesInSeed",
"seedLocation",
"t1",
"totMatchesInCentral",
"cwcMatchesInCentral",
"totMatchesInCompensatory",
"cwcMatchesInCompensatory",
"localAUcontent"
)
tempDF$count <- 0
return(tempDF)
}
# The function getMatches() retrives the number of total matches,
# continuous canonical Watson and Crick matches and a non-canonical matches
# (G:U) between 2 given sequences of the same length. The analysis is
# performed by comparing the 2 sequences without gap and by introducing a
# gap in each position of one of the 2 sequences at the time.
.getMatches <- function(seq1, seq2, isSeed = TRUE) {
# Exclude the t1 nucleotide from the analysis if the seq2 is the seed
if (isSeed) {
seq1 <- base::substr(seq1, 1, nchar(seq2))
}
# Introduce a gap WITHIN the sequence.
seq1[2] <- paste0("-", seq1)
seq1Char <- unlist(base::strsplit(seq1[2], ""))
seq2[2] <- paste0("-", seq2)
seq2Char <- unlist(base::strsplit(seq2[2], ""))
i <- 1
# seq2Char and seq1Char have the same length
# We do not consider the gap in the last position since it correspond to a
# shift of 1 nucleotide. It will be analyzed when we increase the
# indexTargetSeq.
while (i < (length(seq2Char) - 1)) {
# Swap the gap position "-" within the sequence of the microRNA
swap(seq2Char[i], seq2Char[i + 1])
seq2[i + 2] <-
paste(seq2Char, collapse = "")
# Swap the gap position "-" within the sequence of the circRNA
swap(seq1Char[i], seq1Char[i + 1])
seq1[i + 2] <- paste(seq1Char, collapse = "")
i <- i + 1
}
# Compare the 2 sequences and store the results in matches data frame
matches <- .fillMatches(seq1, seq2)
return(matches)
}
# Create matches data frame
.createMatchesDF <- function(seq1, seq2){
# cwcm stands for continuous watson-crik matches tm stands for total matches
# ncm stands for non canonical matches
matches <- data.frame(matrix(nrow = 1 + ((length(
seq1
) - 2) + (length(
seq2
) - 2)), ncol = 3))
colnames(matches) <- c("cwcm", "tm", "ncm")
return(matches)
}
# Compare the 2 sequences and find the max number of total matches and the
# max number of continuous matches between the seed region and the seed site
# considering also the presence of a buldge within the seed region of the miR
# sequence or the target sequences. Store the information of each iteration
# matches data frame.
.fillMatches<- function(seq1, seq2){
matches <- .createMatchesDF(seq1, seq2)
# n is a non canonical pair, w is a watson- crick match, m is a mistmatch
comparedSeq <- .compareSequences(seq1[1], seq2[1], isGUMatch = FALSE)
continuousMatches <- max(nchar(base::strsplit(comparedSeq, "m")[[1]]))
matches$cwcm[1] <- continuousMatches
comparedSeq <- .compareSequences(seq1[1], seq2[1], isGUMatch = TRUE)
totalMatches <- stringr::str_count(comparedSeq, "w") +
stringr::str_count(comparedSeq, "n")
nonCanonicalpairs <- stringr::str_count(comparedSeq, "n")
matches$tm[1] <- totalMatches
matches$ncm[1] <- nonCanonicalpairs
# Find the max number of total matches and the max number of continuous
# matches considering also the presence of a buldge
i <- 2
k <- 2
while (i < length(seq1)) {
comparedSeq <- .compareSequences(seq1[2], seq2[i + 1], isGUMatch = FALSE)
continuousMatches <- max(nchar(base::strsplit(comparedSeq, "m")[[1]]))
matches$cwcm[k] <- continuousMatches
comparedSeq <- .compareSequences(seq1[2], seq2[i + 1], isGUMatch = TRUE)
totalMatches <- stringr::str_count(comparedSeq, "w") + stringr::str_count(comparedSeq, "n")
matches$tm[k] <- totalMatches
nonCanonicalpairs <- stringr::str_count(comparedSeq, "n")
matches$ncm[k] <- nonCanonicalpairs
i <- i + 1
k <- k + 1
}
# Find the max number of total matches and continuous matches considering
# also the presence of a buldge within the seed site of the target sequence.
i <- 2
while (i < length(seq2)) {
comparedSeq <- .compareSequences(seq2[2], seq1[i + 1], isGUMatch = FALSE)
continuousMatches <- max(nchar(base::strsplit(comparedSeq, "m")[[1]]))
matches$cwcm[k] <- continuousMatches
comparedSeq <- .compareSequences(seq1[2], seq2[i + 1], isGUMatch = TRUE)
totalMatches <- stringr::str_count(comparedSeq, "w") + stringr::str_count(comparedSeq, "n")
matches$tm[k] <- totalMatches
nonCanonicalpairs <- stringr::str_count(comparedSeq, "n")
matches$ncm[k] <- nonCanonicalpairs
i <- i + 1
k <- k + 1
}
return(matches)
}
# The function compareSequences() combines two equal-length
# strings that are assumed to be aligned into a single character string.
# The Watson-Crick matches are replaced with "w", the mismatches with "m",
# G:U matches are replaced either with "n" or with "w" based on the value of
# isGUMatch argument.
.compareSequences <- function(seq1, seq2, isGUMatch = TRUE) {
# n is a non canonical pair
# w is a watson- crick match
# m is a mistmatch
GU <- as.character()
if (isGUMatch) {
GU <- "n"
} else{
GU <- "m"
}
charsSeq1 <- unlist(base::strsplit(seq1, ""))
charsSeq2 <- unlist(base::strsplit(seq2, ""))
newSeq <- as.character()
for (i in seq_along(charsSeq1)) {
if ((charsSeq1[i] == "A" & charsSeq2[i] == "U") |
(charsSeq1[i] == "U" & charsSeq2[i] == "A") |
(charsSeq1[i] == "C" & charsSeq2[i] == "G") |
(charsSeq1[i] == "G" & charsSeq2[i] == "C")) {
newSeq[i] <- "w"
} else if ((charsSeq1[i] == "G" & charsSeq2[i] == "U") |
(charsSeq1[i] == "U" & charsSeq2[i] == "G")) {
newSeq[i] <- GU
} else {
newSeq[i] <- "m"
}
}
comparedSeq <- paste(newSeq, collapse = "")
return(comparedSeq)
}
# Find match between miR seed region and the miR seed region binding site
.getSeedMatches <-
function(circSeq,
indexTargetSeq,
mirSeq,
maxNonCanonicalMatches) {
seedSeq <-
base::substr(mirSeq, nchar(mirSeq) - 7, nchar(mirSeq) - 1)
circSeqForSeed <-
base::substr(circSeq,
indexTargetSeq,
indexTargetSeq + nchar(seedSeq))
seedMatches <-
.getMatches(circSeqForSeed,
seedSeq,
isSeed = TRUE)
# Filter based on the number of non-canonical matches
# allowed and keep the max total matches found
seedMatchesFiltered <- seedMatches %>%
dplyr::filter(ncm <= maxNonCanonicalMatches) %>%
dplyr::arrange(desc(tm)) %>%
dplyr::slice(1) %>%
dplyr::rename(maxTotalMatches = tm,
maxContinuousMatches = cwcm)
return(seedMatchesFiltered)
}
# check if there are continuous WC matches
# Write mer if there are at least 2 continuous
# wc matches
.checkMer <- function(seedMatches) {
cwcMatchesInSeed <- as.character()
if (seedMatches$maxContinuousMatches > 1) {
cwcMatchesInSeed <- paste(seedMatches$maxContinuousMatches,
"mer",
sep = "")
} else{
cwcMatchesInSeed <- seedMatches$maxContinuousMatches
}
return(cwcMatchesInSeed)
}
# Get t1 nucleotide
.getT1nucleotide <- function(circSeq, indexTargetSeq, mirSeq) {
seedSeq <-
base::substr(mirSeq, nchar(mirSeq) - 7, nchar(mirSeq) - 1)
circSeqForSeed <-
base::substr(circSeq,
indexTargetSeq,
indexTargetSeq + nchar(seedSeq))
t1 <-
base::substr(circSeqForSeed,
nchar(circSeqForSeed),
nchar(circSeqForSeed))
return(t1)
}
# Find match between miR central region and the sequence upstream the
# miR seed region binding site
.getCentralMatches <- function(circSeq, indexTargetSeq, mirSeq) {
circSeqForCentral <-
base::substr(circSeq,
indexTargetSeq - 4,
indexTargetSeq - 1)
centralSeq <-
base::substr(mirSeq,
nchar(mirSeq) - 11,
nchar(mirSeq) - 8)
centralMatches <-
.getMatches(circSeqForCentral,
centralSeq,
isSeed = FALSE)
# Filter based on the number of non-canonical
# matches allowed and keep the max total matches
# found
centralMatchesFiltered <- centralMatches %>%
dplyr::arrange(desc(tm)) %>%
dplyr::slice(1) %>%
dplyr::rename(maxTotalMatches = tm,
maxContinuousMatches = cwcm)
return(centralMatchesFiltered)
}
# Find match between miR compensatory region and the sequence upstream the
# miR seed region binding site
.getCompensatoryMatches <-
function(circSeq, indexTargetSeq, mirSeq) {
circSeqForCompensatory <-
base::substr(circSeq,
indexTargetSeq - 8,
indexTargetSeq - 5)
compensatorySeq <-
base::substr(mirSeq,
nchar(mirSeq) - 15,
nchar(mirSeq) - 12)
compensatoryMatches <-
.getMatches(circSeqForCompensatory,
compensatorySeq,
isSeed = FALSE)
# Filter based on the number fo non-canonical
# matches allowed and keep the max total matches
# found
compensatoryMatchesFiltered <-
compensatoryMatches %>%
dplyr::arrange(desc(tm)) %>%
dplyr::slice(1) %>%
dplyr::rename(maxTotalMatches = tm,
maxContinuousMatches = cwcm)
return(compensatoryMatchesFiltered)
}
# Find the local AU content around the miR seed region binding site
.getAUcontent <- function(circSeq, indexTargetSeq, mirSeq) {
seedSeq <-
base::substr(mirSeq, nchar(mirSeq) - 7, nchar(mirSeq) - 1)
upstreamRegion <-
base::substr(circSeq,
indexTargetSeq - 10,
indexTargetSeq - 1)
downstreamRegion <-
base::substr(circSeq,
indexTargetSeq + nchar(seedSeq[1]),
indexTargetSeq + nchar(seedSeq[1]) + 9)
localAUcontent <-
Biostrings::letterFrequency(Biostrings::RNAStringSet(paste0(upstreamRegion,
downstreamRegion)),
letters = "AU") / 20
return(localAUcontent)
}
# Fill dataframe in miRsites list
.fillMiRsites <- function(miRsites, rowId, mirId, tempDF) {
# Number of miR seed binding sites
miRsites$counts[rowId, mirId] <- sum(tempDF$count)
# Location of the miR seed binding sites
miRsites$seedLocation[rowId, mirId] <-
paste(tempDF$seedLocation, collapse = ",")
# Check t1 nucleotide
miRsites$t1[rowId, mirId] <- paste(tempDF$t1, collapse = ",")
# Number of total matches with miR seed region
miRsites$totMatchesInSeed[rowId, mirId] <-
paste(tempDF$totMatchesInSeed, collapse = ",")
# Number of continuous WC matches miR seed region
miRsites$cwcMatchesInSeed[rowId, mirId] <-
paste(tempDF$cwcMatchesInSeed, collapse = ",")
# Number of total matches with miR central region
miRsites$totMatchesInCentral[rowId, mirId] <-
paste(tempDF$totMatchesInCentral, collapse = ",")
# Number of continuous WC matches with miR central region
miRsites$cwcMatchesInCentral[rowId, mirId] <-
paste(tempDF$cwcMatchesInCentral, collapse = ",")
# Number of total matches with miR compensatory region
miRsites$totMatchesInCompensatory[rowId, mirId] <-
paste(tempDF$totMatchesInCompensatory,
collapse = ",")
# Number of of continuous WC matches with miR compensatory region
miRsites$cwcMatchesInCompensatory[rowId, mirId] <-
paste(tempDF$cwcMatchesInCompensatory,
collapse = ",")
# Content of AU nucleotide around the miR seed binding sites
miRsites$localAUcontent[rowId, mirId] <-
paste(tempDF$localAUcontent, collapse = ",")
return(miRsites)
}
#' @title Rearrange miR results
#'
#' @description The function rearrangeMiRres() rearranges the results of the
#' getMiRsites() function. Each element of the list contains the miR results
#' relative to one circRNA. For each circRNA only miRNAs for which at least 1
#' miRNA binding site is found are reported.
#'
#' @param miRsites A list containing the miR sites found in the RNA
#' target sequence. it can be generated with \code{\link{getMiRsites}}.
#'
#' @return A list.
#'
#' @examples
#' # Load data frame containing predicted back-spliced junctions
#' data("mergedBSJunctions")
#'
#' # Load short version of the gencode v19 annotation file
#' data("gtf")
#'
#' # Annotate the first back-spliced junctions
#' annotatedBSJs <- annotateBSJs(mergedBSJunctions[1, ], gtf)
#'
#' # Get genome
#' genome <- BSgenome::getBSgenome("BSgenome.Hsapiens.UCSC.hg19")
#'
#' # Retrieve target sequences.
#' targets <- getCircSeqs(
#' annotatedBSJs,
#' gtf,
#' genome)
#'
#' # Screen target sequence for miR binding sites.
#' pathToMiRs <- system.file("extdata", "miRs.txt",package="circRNAprofiler")
#'
#' # miRsites <- getMiRsites(
#' # targets,
#' # miRspeciesCode = "hsa",
#' # miRBaseLatestRelease = TRUE,
#' # totalMatches = 6,
#' # maxNonCanonicalMatches = 1,
#' # pathToMiRs)
#'
#' # Rearrange miR results
#' # rearragedMiRres <- rearrangeMiRres(miRsites)
#'
#'
#' @import dplyr
#' @importFrom magrittr %>%
#' @export
rearrangeMiRres <- function(miRsites) {
# Create an empty list of 9 data frames
rearragedMiRres <- vector("list", nrow(miRsites$targets))
for (i in seq_along(rearragedMiRres)) {
rearragedMiRres[[i]] <- vector("list", 2)
# First dataframe is filled with the target information
rearragedMiRres[[i]][[1]] <- data.frame(matrix(
nrow = 1,
ncol = length(.getTargetsColNames())
))
colnames(rearragedMiRres[[i]][[1]]) <- colnames(miRsites$targets)
rearragedMiRres[[i]][[1]] <- miRsites$targets[i,]
# Second dataframe is filled with the results
rearragedMiRres[[i]][[2]] <- data.frame(matrix(
nrow = nrow(miRsites$microRNAs),
ncol = 3 + length(names(miRsites)[3:9])
))
colnames(rearragedMiRres[[i]][[2]]) <-
c("miRid", "miRseq", "miRseqRev", names(miRsites)[3:9])
rearragedMiRres[[i]][[2]]$miRid <- miRsites$microRNAs$id
rearragedMiRres[[i]][[2]]$miRseq <-miRsites$microRNAs$seq
rearragedMiRres[[i]][[2]]$miRseqRev <- miRsites$microRNAs$seqRev
idM <- miRsites$microRNAs$id
counts <- as.character(miRsites$counts[i, idM])
counts[which(is.na(miRsites$counts[i, idM]))] <- 0
rearragedMiRres[[i]][[2]]$counts <- as.numeric(counts)
rearragedMiRres[[i]][[2]]$totMatchesInSeed <-
as.character(miRsites$totMatchesInSeed[i, idM])
rearragedMiRres[[i]][[2]]$cwcMatchesInSeed <-
as.character(miRsites$cwcMatchesInSeed[i, idM])
rearragedMiRres[[i]][[2]]$seedLocation <-
as.character(miRsites$seedLocation[i, idM])
rearragedMiRres[[i]][[2]]$t1 <- as.character(miRsites$t1[i, idM])
rearragedMiRres[[i]][[2]]$totMatchesInCentral <-
as.character(miRsites$totMatchesInCentral[i, idM])
rearragedMiRres[[i]][[2]]$cwcMatchesInCentral <-
as.character(miRsites$cwcMatchesInCentral[i, idM])
rearragedMiRres[[i]][[2]]$totMatchesInCompensatory <-
as.character(miRsites$totMatchesInCompensatory[i, idM])
rearragedMiRres[[i]][[2]]$cwcMatchesInCompensatory <-
as.character(miRsites$cwcMatchesInCompensatory[i, idM])
rearragedMiRres[[i]][[2]]$localAUcontent <-
as.character(miRsites$localAUcontent[i, idM])
#Remove miR with zero counts
rearragedMiRres[[i]][[2]] <- rearragedMiRres[[i]][[2]] %>%
dplyr::filter(counts != 0)
}
return(rearragedMiRres)
}
# If the function you are looking for is not here check supportFunction.R
# Functions in supportFunction.R are used by multiple functions.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.