R/importFilesPredictionTool.R

Defines functions .readPathToFile .getURcolNames importUroborus .getCMcolNames importCircMarker importOther .splitKNprediction .getKnColNames importKnife .getCEcolNames importCircExplorer2 .getNScolNames importNCLscan .getMScolNames importMapSplice

Documented in importCircExplorer2 importCircMarker importKnife importMapSplice importNCLscan importOther importUroborus

#' @title Import circRNAs detected by MapSplice2
#'
#' @description The function importMapSplice is specifically designed to read
#' and adapt the MapSplice2-v2.2.0 output file (circularRNAs.txt).
#' See \url{https://github.com/davidroberson/MapSplice2} for more details.
#'
#' @param pathToFile A character string specifying the path to the file
#' containing the detected circRNAs.
#'
#' @return A data frame.
#'
#' @keywords internal
#'
#' @examples
#' # Path to an example file containing circRNA detected by MapSplice2
#' pathToFile <- system.file("extdata", "mapsplice/circRNAs_001.txt",
#'     package="circRNAprofiler")
#'
#' # Inner function.
#' # Import circRNAs.
#' importMapSplice(pathToFile)
#'
#' @import dplyr
#' @importFrom magrittr %>%
#' @importFrom utils read.table
#' @importFrom rlang .data
#' @export
importMapSplice <- function(pathToFile) {
    options(readr.num_columns = 0)
    # Read a tab separated (\t) values
    importedPatientCircTable <- .readPathToFile(pathToFile, header = FALSE)
    # Get column names
    colNames <- .getMScolNames()
    colnames(importedPatientCircTable)[seq_along(colNames)] <- colNames

    # 1 STEP- Select the needed columns and rename them.
    # 2 STEP- The content of the columns chrom, strand and gene is cleaned from
    # unwanted characters (e.g. chr1~chr1 = chr1, ++ = +, Raph1, = Raph1 )
    adaptedPatientCircTable <- importedPatientCircTable %>%
        dplyr::select(
            gene = .data$annotated_gene_acceptor,
            .data$strand,
            .data$chrom,
            startUpBSE = .data$acceptor_start,
            # back-spliced junction coordinate
            endDownBSE = .data$doner_end,
            # back-spliced junction coordinate
            .data$coverage
        ) %>%
        dplyr::mutate(
            chrom = unlist(lapply(.data$chrom, function(x)
                base::strsplit(x, "~")[[1]][1])),
            strand = substring(importedPatientCircTable$strand, 2),
            gene = unlist(lapply(.data$gene, function(x)
                base::strsplit(x, ",")[[1]][1]))
        )%>%
        dplyr::mutate(
            chrom = ifelse(.data$chrom == 'chrMT', 'chrM', .data$chrom))

    # Generate a unique identifier
    id <- .getID(adaptedPatientCircTable)

    # Merge duplicated
    adaptedPatientCircTable <- adaptedPatientCircTable %>%
        dplyr::mutate(id = id) %>%
        dplyr::select(.data$id, everything()) %>%
        dplyr::group_by(
            .data$id,
            .data$gene,
            .data$strand,
            .data$chrom,
            .data$startUpBSE,
            .data$endDownBSE
        ) %>%
        dplyr::summarise(coverage = sum(.data$coverage))

    # Fix coordinates
    adaptedPatientCircTable <- .fixCoords(adaptedPatientCircTable)


    return(adaptedPatientCircTable)
}


# get MapSplice column names
.getMScolNames <- function() {
    # Create a character vector with names of the columns reported for the
    # output file (circular_RNAs.txt) generated by MapSplice2-v2.2.0
    # (see http://www.netlab.uky.edu/p/bioinfo/MapSplice)
    colNames <- c(
        "chrom", "doner_end", "acceptor_start", "id", "coverage", "strand",
        "rgb", "block_count", "block_size", "block_distance", "entropy",
        "flank_case", "flank_string", "min_mismatch", "max_mismatch",
        "ave_mismatch", "max_min_suffix", "max_min_prefix",
        "min_anchor_difference", "unique_read_count", "multi_read_count",
        "paired_read_count", "left_paired_read_count",
        "right_paired_read_count", "multiple_paired_read_count",
        "unique_paired_read_count", "single_read_count",
        "encompassing_read", "doner_start", "acceptor_end", "doner_iosforms",
        "acceptor_isoforms", "obsolete1", "obsolete2", "obsolete3",
        "obsolete4", "minimal_doner_isoform_length",
        "maximal_doner_isoform_length", "minimal_acceptor_isoform_length",
        "maximal_acceptor_isoform_length", "paired_reads_entropy",
        "mismatch_per_bp", "anchor_score", "max_doner_fragment",
        "max_acceptor_fragment", "max_cur_fragment", "min_cur_fragment",
        "ave_cur_fragment", "doner_encompass_unique",
        "doner_encompass_multiple", "acceptor_encompass_unique",
        "acceptor_encompass_multiple", "doner_match_to_normal",
        "acceptor_match_to_normal", "doner_seq", "acceptor_seq",
        "match_gene_strand", "annotated_type", "fusion_type", "gene_strand",
        "annotated_gene_donor", "annotated_gene_acceptor"
    )
    return(colNames)
}



#' @title Import circRNAs detected by NCLscan
#'
#' @description The function importNCLscan is specifically designed to read
#' and adapt the NCLscan v1.4 output file (e.g. MyProject.result). Only
#' intragenic circRNAs are kept in the analysis.
#' See \url{https://github.com/TreesLab/NCLscan} for more details.
#'
#' @param pathToFile A character string specifying the path to the file
#' containing the detected circRNAs.
#'
#' @return A data frame.
#'
#' @keywords internal
#'
#' @examples
#' # Path to an example file containing circRNAs detected by NCLscan
#' pathToFile <- system.file("extdata", "nclscan/circRNAs_001.txt",
#'     package="circRNAprofiler")
#'
#' # Inner function.
#' # Import circRNAs
#' importNCLscan(pathToFile)
#'
#' @import dplyr
#' @importFrom magrittr %>%
#' @importFrom utils read.table
#' @importFrom rlang .data
#' @export
importNCLscan <- function(pathToFile) {
    # Get column names
    colNames <- .getNScolNames()

    options(readr.num_columns = 0)

    # Read a tab separated (\t) values
    importedPatientCircTable <- .readPathToFile(pathToFile, header = FALSE)

    colnames(importedPatientCircTable) <- colNames

    # 1 STEP - keep Only intergenic circRNAs (type==1).
    # 2 STEP - Select the needed columns
    adaptedPatientCircTable <- importedPatientCircTable %>%
        dplyr::filter(.data$type == 1) %>%
        dplyr::select(
            .data$gene,
            .data$strand,
            .data$chrom,
            .data$startUpBSE,
            # back-spliced junction coordinate
            .data$endDownBSE,
            # back-spliced junction coordinate
            .data$coverage
        )%>%
        dplyr::mutate(
            chrom = ifelse(.data$chrom == 'chrMT', 'chrM', .data$chrom))

    # Generate a unique identifier
    id <- .getID(adaptedPatientCircTable)

    adaptedPatientCircTable <- adaptedPatientCircTable %>%
        dplyr::mutate(id = id) %>%
        dplyr::select(.data$id, everything())

    # Fix coordinates
    adaptedPatientCircTable <- .fixCoords(adaptedPatientCircTable)
    return(adaptedPatientCircTable)
}

# Get NCLscan column names
.getNScolNames <- function() {
    # Below are reported the content of each column of the output file
    # (MyProject.result) generated by NCLscan
    # (see https://github.com/TreesLab/NCLscan)

    #(1) Chromosome name of the donor side (5'ss)
    #(2) Junction coordinate of the donor side
    #(3) Strand of the donor side
    #(4) Chromosome name of the acceptor side (3'ss)
    #(5) Junction coordinate of the acceptor side
    #(6) Strand of the acceptor side
    #(7) Gene name of the donor side
    #(8) Gene name of the acceptor side
    #(9) Intragenic (1) or intergenic (0) case
    #(10) Total number of all support reads
    #(11) Total number of junc-reads
    #(12) Total number of span-reads

    colNames <- c(
        "chromDownBSE",
        "endDownBSE",
        "strandDownBSE",
        "chrom",
        "startUpBSE",
        "strand",
        "geneDownBSE",
        "gene",
        "type",
        "coverage",
        "juncReads",
        "spanReads"
    )
    return(colNames)
}


#' @title Import circRNAs detected by CircExplorer2
#'
#' @description The function importCircExplorer2 is specifically designed to
#' read and adapt the circExplorer2 v2.3.4 output file (circularRNA_full.txt).
#' See \url{https://github.com/YangLab/CIRCexplorer2.git} for more details.
#'
#' @param pathToFile A character string specifying the path to the file
#' containing the detected circRNAs.
#'
#' @return A data frame.
#'
#' @keywords internal
#'
#' @examples
#' # Path to an example file containing circRNAs detected by CIRCexplorer2
#' pathToFile <- system.file("extdata", "circexplorer2/circRNAs_001.txt",
#' package="circRNAprofiler")
#'
#' # Inner function.
#' # Import circRNAs.
#' importCircExplorer2(pathToFile)
#'
#' @import dplyr
#' @importFrom magrittr %>%
#' @importFrom utils read.table
#' @importFrom rlang .data
#' @export
importCircExplorer2 <- function(pathToFile) {

    # Get circExplorer column names
    colNames <- .getCEcolNames()

    options(readr.num_columns = 0)

    # Read a tab separated (\t) values
    importedPatientCircTable <- .readPathToFile(pathToFile, header = FALSE)

    colnames(importedPatientCircTable) <- colNames

    # 1 keep only exonic circRNAs (circType == "circRNA").
    # 2 STEP - Select the needed columns
    adaptedPatientCircTable <- importedPatientCircTable %>%
        dplyr::filter(.data$circType == "circRNA") %>%
        dplyr::select(
            gene = .data$geneName,
            .data$strand,
            .data$chrom,
            startUpBSE = .data$start,
            # back-spliced junction coordinate
            endDownBSE = .data$end,
            # back-spliced junction coordinate
            coverage = .data$readNumber
        )%>%
        dplyr::mutate(
            chrom = ifelse(.data$chrom == 'chrMT', 'chrM', .data$chrom))


    # Generate a unique identifier
    id <- .getID(adaptedPatientCircTable)

    adaptedPatientCircTable <- adaptedPatientCircTable %>%
        dplyr::mutate(id = id) %>%
        dplyr::select(.data$id, everything())

    # Fix coordinates
    adaptedPatientCircTable <- .fixCoords(adaptedPatientCircTable)

    return(adaptedPatientCircTable)
}

# Get circExplorer column names
.getCEcolNames <- function(){
    # Below are reported the content of each column of the output file
    # generated by circExplorer2
    # (see https://github.com/YangLab/CIRCexplorer2.git)

    # Field Description
    # chrom Chromosome
    # start Start of circular RNA
    # end End of circular RNA
    # name Circular RNA/Junction reads
    # score Flag of fusion junction realignment
    # strand + or - for strand
    # thickStart No meaning
    # thickEnd No meaning
    # itemRgb 0,0,0
    # exonCount Number of exons
    # exonSizes Exon sizes
    # exonOffsets Exon offsets
    # readNumber Number of junction reads
    # circType Type of circular RNA
    # geneName Name of gene
    # isoformName Name of isoform
    # index Index of exon or intron
    # flankIntron Left intron/Right intron

    colNames <- c(
        "chrom",
        "start",
        "end",
        "name",
        "score",
        "strand",
        "thickStart",
        "thickEnd",
        "itemRgb",
        "exonCount",
        "exonSizes",
        "exonOffsets",
        "readNumber",
        "circType",
        "geneName",
        "isoformName",
        "index",
        "flankIntron"
    )
    return(colNames)
}

#' @title Import circRNAs detected by KINFE
#'
#' @description The function importKnife is specifically designed to read and
#' adapt the KNIFE v1.5 output file (circJuncProbs.txt).
#' See \url{https://github.com/lindaszabo/KNIFE.git} for more details.
#'
#' @param pathToFile A character string specifying the path to the file
#' containing the detected circRNAs.
#'
#' @return A data frame.
#'
#' @keywords internal
#'
#' @examples
#' # Path to an example file containing circRNAs detected by KNIFE
#' pathToFile <- system.file("extdata", "knife/circRNAs_001.txt",
#'     package="circRNAprofiler")
#'
#' # Inner function.
#' # Import circRNAs.
#' importKnife(pathToFile)
#'
#' @import dplyr
#' @importFrom magrittr %>%
#' @importFrom utils read.table
#' @importFrom rlang .data
#' @export
importKnife <- function(pathToFile) {
    options(readr.num_columns = 0)

    # Read a tab separated (\t) values
    importedPatientCircTable <- .readPathToFile(pathToFile, header = TRUE)

    # Split and get knife prediction
    adaptedPatientCircTable <- .splitKNprediction(importedPatientCircTable)

    # Select the needed columns.
    adaptedPatientCircTable <- adaptedPatientCircTable %>%
        dplyr::select(
            gene = .data$gene1_symbol,
            .data$strand,
            chrom = .data$chr,
            startUpBSE = .data$splice_position1,
            # back-spliced junction coordinate
            endDownBSE = .data$splice_position2,
            # back-spliced junction coordinate
            coverage = .data$readNumber
        )%>%
        dplyr::mutate(
            chrom = ifelse(.data$chrom == 'chrMT', 'chrM', .data$chrom))
    # Generate a unique identifier
    id <- .getID(adaptedPatientCircTable)

    adaptedPatientCircTable <- adaptedPatientCircTable %>%
        dplyr::mutate(id = id) %>%
        dplyr::select(.data$id, everything())

    # Fix coordinates
    adaptedPatientCircTable <- .fixCoords(adaptedPatientCircTable)

    return(adaptedPatientCircTable)
}

.getKnColNames <- function(){
    # the first column of the data frame is composed as following:
    # junction: chr|gene1_symbol:splice_position|gene2_symbol:splice_position|
    # junction_type|strand.  see (https://github.com/lindaszabo/KNIFE.git)
    colNames <-  c(
            "chr",
            "gene1_symbol",
            "splice_position1",
            "splice_position2",
            "strand",
            "readNumber"
        )
    return(colNames)
}

# Split content first column in knife preditions
.splitKNprediction <- function(importedPatientCircTable){
    # The first column is splitted to retrieved the needed information.
    # An empty data frame is filled with the extracted information
    temp <-
        data.frame(matrix(
            nrow = nrow(importedPatientCircTable),
            ncol = 6
        ))

    colnames(temp) <- .getKnColNames()

    for (i in seq_along(importedPatientCircTable$junction)) {
        temp$chr[i] <-
            base::strsplit(importedPatientCircTable$junction[i], "\\|")[[1]][1]
        temp$gene1_symbol[i] <-
            base::strsplit(base::strsplit(importedPatientCircTable$junction[i], "\\|")[[1]][2],
                ":")[[1]][1]
        temp$splice_position1[i] <-
            base::strsplit(base::strsplit(importedPatientCircTable$junction[i], "\\|")[[1]][2],
                ":")[[1]][2]
        temp$splice_position2[i] <-
            base::strsplit(base::strsplit(importedPatientCircTable$junction[i], "\\|")[[1]][3],
                ":")[[1]][2]
        temp$strand[i] <-
            base::strsplit(base::strsplit(importedPatientCircTable$junction[i], "\\|")[[1]][5],
                ":")[[1]]
        temp$readNumber[i] <-
            importedPatientCircTable$total_reads[i]

    }
    return(temp)
}


#' @title Import circRNAs detected by an annotation-based circRNA
#' detection tool
#'
#' @description
#' The function importOther() is designed to read output file from a
#' annotation-based circRNA detection tool. The user after the detection of the
#' crcRNAs must format the output file, so that it has the following columns
#' with header: gene, strand, chrom, startUpBSE, endDownBSE and coverage.
#' If more columns are present they will be discared.
#'
#' @param pathToFile A character string specifying the path to the file
#' containing the detected circRNAs.
#'
#' @return A data frame.
#'
#' @keywords internal
#'
#' @examples
#' # Path to an example file containing circRNAs
#' pathToFile <- system.file("extdata", "other/circRNAs_001.txt",
#'     package="circRNAprofiler")
#'
#' # Inner function.
#' # Import circRNAs
#' importOther(pathToFile)
#'
#' @import dplyr
#' @importFrom magrittr %>%
#' @importFrom utils read.table
#' @importFrom rlang .data
#' @export
importOther <- function(pathToFile) {
    options(readr.num_columns = 0)
    # Read a tab separated (\t) values
    importedPatientCircTable <- .readPathToFile(pathToFile, header = TRUE)

    adaptedPatientCircTable <- importedPatientCircTable %>%
        dplyr::select(
            .data$gene,
            .data$strand,
            .data$chrom,
            .data$startUpBSE,
            # back-spliced junction
            .data$endDownBSE,
            # back-spliced junction
            .data$coverage
        )%>%
        dplyr::mutate(
            chrom = ifelse(.data$chrom == 'chrMT', 'chrM', .data$chrom))
    # Generate a unique identifier
    id <- .getID(adaptedPatientCircTable)

    adaptedPatientCircTable <- adaptedPatientCircTable %>%
        dplyr::mutate(id = id) %>%
        dplyr::select(.data$id, everything())

    return(adaptedPatientCircTable)
}



#' @title Import circRNAs detected by CircMarker
#'
#' @description The function importCircMarker is specifically designed to read
#' and adapt the CircMarker (July.24.2018) output file (Brief_sum.txt).
#' See \url{https://github.com/lxwgcool/CircMarker} for more details.
#'
#' @param pathToFile A character string specifying the path to the file
#' containing the detected circRNAs.
#'
#' @param gtf A data frame containing the formatted GTF file. This is generated
#' with \code{\link{formatGTF}}.
#'
#' @return A data frame.
#'
#' @keywords internal
#'
#' @examples
#' # Load short version of the gencode v19 annotation file
#' data("gtf")
#'
#' # Path to an example file containing circRNA detected by CircMarker
#' pathToFile <- system.file("extdata", "circmarker/circRNAs_001.txt",
#'     package="circRNAprofiler")
#'
#' # Inner function.
#' # Import circRNAs.
#' # Due to the short version of the gtf file gene names might miss in the
#' # returned output.
#' importCircMarker(pathToFile, gtf)
#'
#' @import dplyr
#' @importFrom magrittr %>%
#' @importFrom utils read.table
#' @importFrom rlang .data
#' @export
importCircMarker <- function(pathToFile, gtf) {
    # Read a tab separated (\t) values
    importedPatientCircTable <- .readPathToFile(pathToFile, header = FALSE, sep =  " ")
    colnames(importedPatientCircTable) <- .getCMcolNames()

    # Find gene
    m1 <- match(importedPatientCircTable$start, gtf$start)
    m2 <- match(importedPatientCircTable$start, gtf$end)
    indexGenes <- dplyr::coalesce(m1, m2)
    genes <- gtf$gene_name[indexGenes]

    # Select the needed columns.
    adaptedPatientCircTable <- importedPatientCircTable %>%
        dplyr::mutate(gene = genes) %>%
        dplyr::select(
            .data$gene,
            .data$strand,
            chrom = .data$chrom,
            startUpBSE = .data$start,
            # back-spliced junction coordinate
            endDownBSE = .data$end,
            # back-spliced junction coordinate
            .data$coverage
        )%>%
        dplyr::mutate(
            chrom = ifelse(.data$chrom == 'chrMT', 'chrM', .data$chrom))
    # Generate a unique identifier
    id <- .getID(adaptedPatientCircTable)

    # Merge duplicated
    adaptedPatientCircTable <- adaptedPatientCircTable %>%
        dplyr::mutate(id = id) %>%
        dplyr::select(.data$id, .data$gene, everything()) %>%
        dplyr::group_by(
            .data$id,
            .data$gene,
            .data$strand,
            .data$chrom,
            .data$startUpBSE,
            .data$endDownBSE
        ) %>%
        dplyr::summarise(coverage = sum(.data$coverage))
    # Fix coordinates
    adaptedPatientCircTable <- .fixCoords(adaptedPatientCircTable)
    return(adaptedPatientCircTable)
}


# Get circMArker column names
.getCMcolNames <- function() {
    colNames <- c("chrom", "start", "end", "coverage", "strand", "type")
    return(colNames)
}



#' @title Import circRNAs detected by UROBORUS
#'
#' @description The function importUroborus is specifically designed to reaad
#' and adapt the UROBORUS v2.0.0 output file (circRNA_list.txt).
#' See \url{https://github.com/WGLab/UROBORUS} for more details.
#'
#' @param pathToFile A character string specifying the path to the file
#' containing the detected circRNAs.
#'
#' @return A data frame.
#'
#' @keywords internal
#'
#' @examples
#' # Path to an example file containing circRNA detected by UROBORUS
#' pathToFile <- system.file("extdata", "uroborus/circRNAs_001.txt",
#'     package="circRNAprofiler")
#'
#' # Inner function.
#' # Import circRNAs.
#' importUroborus(pathToFile)
#'
#' @import dplyr
#' @importFrom magrittr %>%
#' @importFrom utils read.table
#' @importFrom rlang .data
#' @export
importUroborus <- function(pathToFile) {
    # A character vector with the column names is created
    colNames <- .getURcolNames()

    # Read a tab separated (\t) values
    importedPatientCircTable <- .readPathToFile(pathToFile, header = FALSE)

    colnames(importedPatientCircTable) <- colNames

    # Select the needed columns and rename them
    adaptedPatientCircTable <- importedPatientCircTable %>%
        dplyr::select(
            gene = .data$Parental_gene_name,
            .data$strand,
            chrom = .data$Chromosome,
            startUpBSE = .data$start_of_junction,
            # back-spliced junction coordinate
            endDownBSE = .data$end_of_junction,
            # back-spliced junction coordinate
            coverage = .data$read_counts
        )%>%
        dplyr::mutate(
            chrom = ifelse(.data$chrom == 'chrMT', 'chrM', .data$chrom))
    # Generate a unique identifier
    id <- .getID(adaptedPatientCircTable)

    adaptedPatientCircTable <- adaptedPatientCircTable %>%
        dplyr::mutate(id = id) %>%
        dplyr::select(.data$id, everything())

    # Fix coordinates
    adaptedPatientCircTable <- .fixCoords(adaptedPatientCircTable)
    return(adaptedPatientCircTable)
}


# get uroborus column names
.getURcolNames <- function(){
    # A character vector with the column names is created
    colNames <- c(
        "Chromosome",
        "start_of_junction",
        "end_of_junction",
        "strand",
        "Parental_gene_name",
        "genomic_distance",
        "read_counts",
        "matched_transcript_id"
    )
    return(colNames)
}

# Read pathToFile
.readPathToFile <- function(pathToFile, header = FALSE, sep = "\t") {
    # Read a tab separated (\t) values
    importedPatientCircTable <-
        utils::read.table(
            pathToFile,
            header = header,
            sep = sep,
            stringsAsFactors = FALSE
        )
    return(importedPatientCircTable)
}


# If the function you are looking for is not here check supportFunction.R
# Functions in supportFunction.R are used by multiple functions.

Try the circRNAprofiler package in your browser

Any scripts or data that you put into this service are public.

circRNAprofiler documentation built on March 6, 2021, 2 a.m.