R/importFilesPredictionTool.R

Defines functions .readPathToFile .getURcolNames importUroborus .getCMcolNames importCircMarker importOther .splitKNprediction .getKnColNames importKnife .getCEcolNames importCircExplorer2 .getNScolNames importNCLscan .getMScolNames importMapSplice

Documented in importCircExplorer2 importCircMarker importKnife importMapSplice importNCLscan importOther importUroborus

#' @title Import circRNAs detected by MapSplice2
#'
#' @description The function importMapSplice is specifically designed to read
#' and adapt the MapSplice2-v2.2.0 output file (circularRNAs.txt).
#' See \url{https://github.com/davidroberson/MapSplice2} for more details.
#'
#' @param pathToFile A character string specifying the path to the file
#' containing the detected circRNAs.
#'
#' @return A data frame.
#'
#' @keywords internal
#'
#' @examples
#' # Path to an example file containing circRNA detected by MapSplice2
#' pathToFile <- system.file("extdata", "mapsplice/circRNAs_001.txt",
#'     package="circRNAprofiler")
#'
#' # Inner function.
#' # Import circRNAs.
#' importMapSplice(pathToFile)
#'
#' @import dplyr
#' @importFrom magrittr %>%
#' @importFrom utils read.table
#' @importFrom rlang .data
#' @export
importMapSplice <- function(pathToFile) {
    # NOTE: this changes a session-wide option and does not restore it.
    options(readr.num_columns = 0)

    # Load the headerless, tab-separated table and label its leading columns
    # with the MapSplice2 column names.
    rawTable <- .readPathToFile(pathToFile, header = FALSE)
    msNames <- .getMScolNames()
    colnames(rawTable)[seq_along(msNames)] <- msNames

    # Local helper: keep only the text in front of the first separator
    # (e.g. "chr1~chr1" -> "chr1", "Raph1," -> "Raph1").
    firstToken <- function(values, separator) {
        unlist(lapply(values, function(value) {
            base::strsplit(value, separator)[[1]][1]
        }))
    }

    # Keep and rename the columns of interest; acceptor_start and doner_end
    # are the back-spliced junction coordinates.
    selected <- dplyr::select(
        rawTable,
        gene = annotated_gene_acceptor,
        strand,
        chrom,
        startUpBSE = acceptor_start,
        endDownBSE = doner_end,
        coverage
    )

    # Strip the unwanted characters: duplicated chromosome ("chr1~chr1"),
    # leading strand character ("++" -> "+") and trailing gene separator.
    cleaned <- dplyr::mutate(
        selected,
        chrom = firstToken(chrom, "~"),
        strand = substring(rawTable$strand, 2),
        gene = firstToken(gene, ",")
    )
    cleaned <- dplyr::mutate(
        cleaned,
        chrom = ifelse(chrom == "chrMT", "chrM", chrom)
    )

    # Identifier built by .getID() from the cleaned table
    circId <- .getID(cleaned)

    # Collapse duplicated circRNAs by summing their coverage
    merged <- cleaned %>%
        dplyr::mutate(id = circId) %>%
        dplyr::select(id, everything()) %>%
        dplyr::group_by(id, gene, strand, chrom, startUpBSE, endDownBSE) %>%
        dplyr::summarise(coverage = sum(coverage))

    # Coordinates are adjusted by .fixCoords() before returning
    .fixCoords(merged)
}


# get MapSplice column names
.getMScolNames <- function() {
    # Names of the columns of the MapSplice2-v2.2.0 output file
    # (circular_RNAs.txt), in file order.
    # (see http://www.netlab.uky.edu/p/bioinfo/MapSplice)
    c(
        # junction description
        "chrom", "doner_end", "acceptor_start", "id", "coverage", "strand",
        "rgb", "block_count", "block_size", "block_distance", "entropy",
        "flank_case", "flank_string",
        # alignment quality
        "min_mismatch", "max_mismatch", "ave_mismatch",
        "max_min_suffix", "max_min_prefix", "min_anchor_difference",
        # read counts
        "unique_read_count", "multi_read_count", "paired_read_count",
        "left_paired_read_count", "right_paired_read_count",
        "multiple_paired_read_count", "unique_paired_read_count",
        "single_read_count", "encompassing_read",
        # donor / acceptor details
        "doner_start", "acceptor_end", "doner_iosforms", "acceptor_isoforms",
        "obsolete1", "obsolete2", "obsolete3", "obsolete4",
        "minimal_doner_isoform_length", "maximal_doner_isoform_length",
        "minimal_acceptor_isoform_length", "maximal_acceptor_isoform_length",
        "paired_reads_entropy", "mismatch_per_bp", "anchor_score",
        # fragments
        "max_doner_fragment", "max_acceptor_fragment", "max_cur_fragment",
        "min_cur_fragment", "ave_cur_fragment",
        # encompassing reads
        "doner_encompass_unique", "doner_encompass_multiple",
        "acceptor_encompass_unique", "acceptor_encompass_multiple",
        "doner_match_to_normal", "acceptor_match_to_normal",
        "doner_seq", "acceptor_seq",
        # annotation
        "match_gene_strand", "annotated_type", "fusion_type", "gene_strand",
        "annotated_gene_donor", "annotated_gene_acceptor"
    )
}



#' @title Import circRNAs detected by NCLscan
#'
#' @description The function importNCLscan is specifically designed to read
#' and adapt the NCLscan v1.4 output file (e.g. MyProject.result). Only
#' intragenic circRNAs are kept in the analysis.
#' See \url{https://github.com/TreesLab/NCLscan} for more details.
#'
#' @param pathToFile A character string specifying the path to the file
#' containing the detected circRNAs.
#'
#' @return A data frame.
#'
#' @keywords internal
#'
#' @examples
#' # Path to an example file containing circRNAs detected by NCLscan
#' pathToFile <- system.file("extdata", "nclscan/circRNAs_001.txt",
#'     package="circRNAprofiler")
#'
#' # Inner function.
#' # Import circRNAs
#' importNCLscan(pathToFile)
#'
#' @import dplyr
#' @importFrom magrittr %>%
#' @importFrom utils read.table
#' @importFrom rlang .data
#' @export
importNCLscan <- function(pathToFile) {
    # NOTE: this changes a session-wide option and does not restore it.
    options(readr.num_columns = 0)

    # Load the headerless, tab-separated NCLscan result and name its columns
    nclTable <- .readPathToFile(pathToFile, header = FALSE)
    colnames(nclTable) <- .getNScolNames()

    # Keep only the intragenic events (type == 1), then retain the columns
    # of interest; startUpBSE and endDownBSE are the back-spliced junction
    # coordinates.
    intragenic <- dplyr::filter(nclTable, type == 1)
    trimmed <- dplyr::select(
        intragenic,
        gene,
        strand,
        chrom,
        startUpBSE,
        endDownBSE,
        coverage
    )
    trimmed <- dplyr::mutate(
        trimmed,
        chrom = ifelse(chrom == "chrMT", "chrM", chrom)
    )

    # Identifier built by .getID(); placed as the first column
    circId <- .getID(trimmed)
    withId <- dplyr::mutate(trimmed, id = circId)
    withId <- dplyr::select(withId, id, everything())

    # Coordinates are adjusted by .fixCoords() before returning
    .fixCoords(withId)
}

# Get NCLscan column names
.getNScolNames <- function() {
    # Column names assigned to the NCLscan output file (MyProject.result),
    # in file order (see https://github.com/TreesLab/NCLscan). The trailing
    # comment on each entry gives the content of the corresponding column.
    c(
        "chromDownBSE",   # (1) chromosome name of the donor side (5'ss)
        "endDownBSE",     # (2) junction coordinate of the donor side
        "strandDownBSE",  # (3) strand of the donor side
        "chrom",          # (4) chromosome name of the acceptor side (3'ss)
        "startUpBSE",     # (5) junction coordinate of the acceptor side
        "strand",         # (6) strand of the acceptor side
        "geneDownBSE",    # (7) gene name of the donor side
        "gene",           # (8) gene name of the acceptor side
        "type",           # (9) intragenic (1) or intergenic (0) case
        "coverage",       # (10) total number of all support reads
        "juncReads",      # (11) total number of junc-reads
        "spanReads"       # (12) total number of span-reads
    )
}


#' @title Import circRNAs detected by CircExplorer2
#'
#' @description The function importCircExplorer2 is specifically designed to
#' read and adapt the circExplorer2 v2.3.4 output file (circularRNA_full.txt).
#' See \url{https://github.com/YangLab/CIRCexplorer2.git} for more details.
#'
#' @param pathToFile A character string specifying the path to the file
#' containing the detected circRNAs.
#'
#' @return A data frame.
#'
#' @keywords internal
#'
#' @examples
#' # Path to an example file containing circRNAs detected by CIRCexplorer2
#' pathToFile <- system.file("extdata", "circexplorer2/circRNAs_001.txt",
#' package="circRNAprofiler")
#'
#' # Inner function.
#' # Import circRNAs.
#' importCircExplorer2(pathToFile)
#'
#' @import dplyr
#' @importFrom magrittr %>%
#' @importFrom utils read.table
#' @importFrom rlang .data
#' @export
importCircExplorer2 <- function(pathToFile) {

  # Expected column layout of the circExplorer2 output file
  colNames <- .getCEcolNames()

  # NOTE: this changes a session-wide option and does not restore it.
  options(readr.num_columns = 0)

  # Read the tab separated (\t) values. The file is read without header so
  # that both headerless files and files carrying a header row are accepted.
  importedPatientCircTable <- .readPathToFile(pathToFile, header = FALSE)

  # Raised whenever the layout of the file cannot be recognised. Failing
  # here gives a clear message instead of an obscure "object 'circType' not
  # found" error further down.
  layoutError <- function() {
    stop(
      "Column names do not match: circRNAprofiler was tested with the ",
      "circExplorer2 v2.3.4 output file (circularRNA_full.txt). ",
      "Expected column order: ", paste(colNames, collapse = " "),
      call. = FALSE
    )
  }

  expectedCols <- seq_along(colNames)
  if (ncol(importedPatientCircTable) < length(colNames) ||
      nrow(importedPatientCircTable) == 0) {
    layoutError()
  }

  # First row as plain character values, used to tell header from data
  firstRow <- as.character(unlist(
    importedPatientCircTable[1, expectedCols, drop = FALSE],
    use.names = FALSE
  ))
  circTypeValue <- firstRow[match("circType", colNames)]

  if (identical(firstRow, colNames)) {
    # The file carries a header row: use it for naming and drop it
    colnames(importedPatientCircTable)[expectedCols] <- colNames
    importedPatientCircTable <- importedPatientCircTable[-1, , drop = FALSE]
  } else if (circTypeValue %in% c("circRNA", "ciRNA")) {
    # Headerless file. The first record may be of either circular RNA type
    # reported by circExplorer2, not only "circRNA".
    colnames(importedPatientCircTable)[expectedCols] <- colNames
  } else {
    layoutError()
  }

  # 1 STEP - keep only exonic circRNAs (circType == "circRNA").
  # 2 STEP - Select the needed columns; start and end are the back-spliced
  # junction coordinates.
  # 3 STEP - Coerce coordinates and coverage to numeric: when a header row
  # was present every column was read as character.
  adaptedPatientCircTable <- importedPatientCircTable %>%
    dplyr::filter(circType == "circRNA") %>%
    dplyr::select(
      gene = geneName,
      strand,
      chrom,
      startUpBSE = start,
      endDownBSE = end,
      coverage = readNumber
    ) %>%
    dplyr::mutate(
      chrom = ifelse(chrom == 'chrMT', 'chrM', chrom)) %>%
    dplyr::mutate_at(c('startUpBSE', 'endDownBSE', 'coverage'), as.numeric)

  # Generate a unique identifier
  id <- .getID(adaptedPatientCircTable)

  adaptedPatientCircTable <- adaptedPatientCircTable %>%
    dplyr::mutate(id = id) %>%
    dplyr::select(id, everything())

  # Fix coordinates
  adaptedPatientCircTable <- .fixCoords(adaptedPatientCircTable)

  return(adaptedPatientCircTable)
}

# Get circExplorer column names
.getCEcolNames <- function() {
    # Column names of the output file generated by circExplorer2, in file
    # order (see https://github.com/YangLab/CIRCexplorer2.git). The trailing
    # comment on each entry gives the field description.
    c(
        "chrom",        # Chromosome
        "start",        # Start of circular RNA
        "end",          # End of circular RNA
        "name",         # Circular RNA/Junction reads
        "score",        # Flag of fusion junction realignment
        "strand",       # + or - for strand
        "thickStart",   # No meaning
        "thickEnd",     # No meaning
        "itemRgb",      # 0,0,0
        "exonCount",    # Number of exons
        "exonSizes",    # Exon sizes
        "exonOffsets",  # Exon offsets
        "readNumber",   # Number of junction reads
        "circType",     # Type of circular RNA
        "geneName",     # Name of gene
        "isoformName",  # Name of isoform
        "index",        # Index of exon or intron
        "flankIntron"   # Left intron/Right intron
    )
}

#' @title Import circRNAs detected by KNIFE
#'
#' @description The function importKnife is specifically designed to read and
#' adapt the KNIFE v1.5 output file (circJuncProbs.txt).
#' See \url{https://github.com/lindaszabo/KNIFE.git} for more details.
#'
#' @param pathToFile A character string specifying the path to the file
#' containing the detected circRNAs.
#'
#' @return A data frame.
#'
#' @keywords internal
#'
#' @examples
#' # Path to an example file containing circRNAs detected by KNIFE
#' pathToFile <- system.file("extdata", "knife/circRNAs_001.txt",
#'     package="circRNAprofiler")
#'
#' # Inner function.
#' # Import circRNAs.
#' importKnife(pathToFile)
#'
#' @import dplyr
#' @importFrom magrittr %>%
#' @importFrom utils read.table
#' @importFrom rlang .data
#' @export
importKnife <- function(pathToFile) {
    # NOTE: this changes a session-wide option and does not restore it.
    options(readr.num_columns = 0)

    # Load the tab-separated file (with header), then unpack its junction
    # column into separate fields via .splitKNprediction().
    knifeTable <- .readPathToFile(pathToFile, header = TRUE)
    parsed <- .splitKNprediction(knifeTable)

    # Keep and rename the columns of interest; splice_position1 and
    # splice_position2 are the back-spliced junction coordinates.
    renamed <- dplyr::select(
        parsed,
        gene = gene1_symbol,
        strand,
        chrom = chr,
        startUpBSE = splice_position1,
        endDownBSE = splice_position2,
        coverage = readNumber
    )
    renamed <- dplyr::mutate(
        renamed,
        chrom = ifelse(chrom == "chrMT", "chrM", chrom)
    )

    # Identifier built by .getID(); placed as the first column
    circId <- .getID(renamed)
    withId <- dplyr::mutate(renamed, id = circId)
    withId <- dplyr::select(withId, id, everything())

    # Coordinates are adjusted by .fixCoords() before returning
    .fixCoords(withId)
}

.getKnColNames <- function() {
    # Names given to the fields extracted from a KNIFE prediction. The first
    # column of the KNIFE table is composed as follows:
    # junction: chr|gene1_symbol:splice_position|gene2_symbol:splice_position|
    # junction_type|strand.  see (https://github.com/lindaszabo/KNIFE.git)
    c("chr", "gene1_symbol", "splice_position1", "splice_position2",
      "strand", "readNumber")
}

# Split the content of the first column of the KNIFE predictions
.splitKNprediction <- function(importedPatientCircTable) {
    # Unpack the KNIFE 'junction' column into one column per field.
    #
    # importedPatientCircTable: data frame holding the columns 'junction'
    #   (chr|gene1_symbol:splice_position|gene2_symbol:splice_position|
    #   junction_type|strand) and 'total_reads'.
    # Returns a data frame with one row per input row and the columns named
    #   by .getKnColNames(). The extracted fields are character vectors
    #   (splice positions included) and readNumber is a copy of total_reads.
    #   Fields that are absent from a junction string are NA.
    junction <- importedPatientCircTable$junction
    totalReads <- importedPatientCircTable$total_reads
    if (is.null(junction) || is.null(totalReads)) {
        # Without this check a missing 'junction' column would silently
        # produce a table filled with NA.
        stop(
            "The KNIFE table must contain the columns 'junction' and ",
            "'total_reads'",
            call. = FALSE
        )
    }

    # k-th element of every vector in a list; NA when it does not exist
    pick <- function(parts, k) {
        vapply(parts, function(p) p[k], character(1))
    }

    # Split every junction string once, instead of once per extracted field
    fields <- base::strsplit(as.character(junction), "|", fixed = TRUE)

    # Fields 2 and 3 have the form "gene_symbol:splice_position"
    upstream <- base::strsplit(pick(fields, 2), ":", fixed = TRUE)
    downstream <- base::strsplit(pick(fields, 3), ":", fixed = TRUE)
    strandField <- base::strsplit(pick(fields, 5), ":", fixed = TRUE)

    temp <- data.frame(
        chr = pick(fields, 1),
        gene1_symbol = pick(upstream, 1),
        splice_position1 = pick(upstream, 2),
        splice_position2 = pick(downstream, 2),
        strand = pick(strandField, 1),
        readNumber = totalReads,
        stringsAsFactors = FALSE
    )

    # Keep .getKnColNames() as the single source of the column names
    colnames(temp) <- .getKnColNames()
    temp
}


#' @title Import circRNAs detected by an annotation-based circRNA
#' detection tool
#'
#' @description
#' The function importOther() is designed to read the output file from an
#' annotation-based circRNA detection tool. After detecting the circRNAs, the
#' user must format the output file so that it has the following columns
#' with header: gene, strand, chrom, startUpBSE, endDownBSE and coverage.
#' If more columns are present they will be discarded.
#'
#' @param pathToFile A character string specifying the path to the file
#' containing the detected circRNAs.
#'
#' @return A data frame.
#'
#' @keywords internal
#'
#' @examples
#' # Path to an example file containing circRNAs
#' pathToFile <- system.file("extdata", "tool1/circRNAs_001.txt",
#'     package="circRNAprofiler")
#'
#' # Inner function.
#' # Import circRNAs
#' importOther(pathToFile)
#'
#' @import dplyr
#' @importFrom magrittr %>%
#' @importFrom utils read.table
#' @importFrom rlang .data
#' @export
importOther <- function(pathToFile) {
    # NOTE: this changes a session-wide option and does not restore it.
    options(readr.num_columns = 0)

    # Load the tab-separated file; its header provides the column names
    userTable <- .readPathToFile(pathToFile, header = TRUE)

    # Keep only the expected columns, any additional column is dropped;
    # startUpBSE and endDownBSE are the back-spliced junction coordinates.
    trimmed <- dplyr::select(
        userTable,
        gene,
        strand,
        chrom,
        startUpBSE,
        endDownBSE,
        coverage
    )
    trimmed <- dplyr::mutate(
        trimmed,
        chrom = ifelse(chrom == "chrMT", "chrM", chrom)
    )

    # Identifier built by .getID(); placed as the first column
    circId <- .getID(trimmed)
    withId <- dplyr::mutate(trimmed, id = circId)
    dplyr::select(withId, id, everything())
}



#' @title Import circRNAs detected by CircMarker
#'
#' @description The function importCircMarker is specifically designed to read
#' and adapt the CircMarker (July.24.2018) output file (Brief_sum.txt).
#' See \url{https://github.com/lxwgcool/CircMarker} for more details.
#'
#' @param pathToFile A character string specifying the path to the file
#' containing the detected circRNAs.
#'
#' @param gtf A data frame containing the formatted GTF file. This is generated
#' with \code{\link{formatGTF}}.
#'
#' @return A data frame.
#'
#' @keywords internal
#'
#' @examples
#' # Load short version of the gencode v19 annotation file
#' data("gtf")
#'
#' # Path to an example file containing circRNA detected by CircMarker
#' pathToFile <- system.file("extdata", "circmarker/circRNAs_001.txt",
#'     package="circRNAprofiler")
#'
#' # Inner function.
#' # Import circRNAs.
#' # Due to the short version of the gtf file gene names might miss in the
#' # returned output.
#' importCircMarker(pathToFile, gtf)
#'
#' @import dplyr
#' @importFrom magrittr %>%
#' @importFrom utils read.table
#' @importFrom rlang .data
#' @export
importCircMarker <- function(pathToFile, gtf) {
    # Load the headerless, space-separated file and name its columns
    markerTable <- .readPathToFile(pathToFile, header = FALSE, sep =  " ")
    colnames(markerTable) <- .getCMcolNames()

    # Look up the gene name: the circRNA start is searched first among the
    # gtf start coordinates and, when not found there, among the gtf end
    # coordinates.
    # NOTE(review): the lookup compares coordinates only and does not take
    # the chromosome into account - confirm this is intended.
    hitOnStart <- match(markerTable$start, gtf$start)
    hitOnEnd <- match(markerTable$start, gtf$end)
    geneNames <- gtf$gene_name[dplyr::coalesce(hitOnStart, hitOnEnd)]

    # Keep and rename the columns of interest; start and end are the
    # back-spliced junction coordinates.
    annotated <- dplyr::mutate(markerTable, gene = geneNames)
    trimmed <- dplyr::select(
        annotated,
        gene,
        strand,
        chrom = chrom,
        startUpBSE = start,
        endDownBSE = end,
        coverage
    )
    trimmed <- dplyr::mutate(
        trimmed,
        chrom = ifelse(chrom == "chrMT", "chrM", chrom)
    )

    # Identifier built by .getID()
    circId <- .getID(trimmed)

    # Collapse duplicated circRNAs by summing their coverage
    merged <- trimmed %>%
        dplyr::mutate(id = circId) %>%
        dplyr::select(id, gene, everything()) %>%
        dplyr::group_by(id, gene, strand, chrom, startUpBSE, endDownBSE) %>%
        dplyr::summarise(coverage = sum(coverage))

    # Coordinates are adjusted by .fixCoords() before returning
    .fixCoords(merged)
}


# Get CircMarker column names
.getCMcolNames <- function() {
    # Column names assigned, in file order, to the CircMarker output file
    c(
        "chrom",
        "start",
        "end",
        "coverage",
        "strand",
        "type"
    )
}



#' @title Import circRNAs detected by UROBORUS
#'
#' @description The function importUroborus is specifically designed to read
#' and adapt the UROBORUS v2.0.0 output file (circRNA_list.txt).
#' See \url{https://github.com/WGLab/UROBORUS} for more details.
#'
#' @param pathToFile A character string specifying the path to the file
#' containing the detected circRNAs.
#'
#' @return A data frame.
#'
#' @keywords internal
#'
#' @examples
#' # Path to an example file containing circRNA detected by UROBORUS
#' pathToFile <- system.file("extdata", "uroborus/circRNAs_001.txt",
#'     package="circRNAprofiler")
#'
#' # Inner function.
#' # Import circRNAs.
#' importUroborus(pathToFile)
#'
#' @import dplyr
#' @importFrom magrittr %>%
#' @importFrom utils read.table
#' @importFrom rlang .data
#' @export
importUroborus <- function(pathToFile) {
    # Load the headerless, tab-separated UROBORUS list and name its columns
    uroborusTable <- .readPathToFile(pathToFile, header = FALSE)
    colnames(uroborusTable) <- .getURcolNames()

    # Keep and rename the columns of interest; start_of_junction and
    # end_of_junction are the back-spliced junction coordinates.
    renamed <- dplyr::select(
        uroborusTable,
        gene = Parental_gene_name,
        strand,
        chrom = Chromosome,
        startUpBSE = start_of_junction,
        endDownBSE = end_of_junction,
        coverage = read_counts
    )
    renamed <- dplyr::mutate(
        renamed,
        chrom = ifelse(chrom == "chrMT", "chrM", chrom)
    )

    # Identifier built by .getID(); placed as the first column
    circId <- .getID(renamed)
    withId <- dplyr::mutate(renamed, id = circId)
    withId <- dplyr::select(withId, id, everything())

    # Coordinates are adjusted by .fixCoords() before returning
    .fixCoords(withId)
}


# get uroborus column names
.getURcolNames <- function() {
    # Column names assigned, in file order, to the UROBORUS output file
    c(
        "Chromosome", "start_of_junction", "end_of_junction", "strand",
        "Parental_gene_name", "genomic_distance", "read_counts",
        "matched_transcript_id"
    )
}

# Read pathToFile
.readPathToFile <- function(pathToFile, header = FALSE, sep = "\t") {
    # Read a delimited text file into a data frame.
    #
    # pathToFile: path of the file to read.
    # header: whether the first line holds the column names.
    # sep: field separator, a tab by default.
    # Character columns are kept as character (never converted to factors).
    utils::read.table(
        file = pathToFile,
        header = header,
        sep = sep,
        stringsAsFactors = FALSE
    )
}


# If the function you are looking for is not here check supportFunction.R
# Functions in supportFunction.R are used by multiple functions.
Aufiero/circRNAprofiler documentation built on Oct. 31, 2023, 1:18 a.m.