R/processCapRout.R

Defines functions processCapRout

Documented in processCapRout

#' @title processCapRout
#'
#' @description Creates context-separated bedGraph files of CapR output for
#' genome and transcriptome alignments.
#'
#' @param CapR_outfile Name of CapR output file. Required
#' @param output_prefix Prefix string to be appended to all output files.
#' Required.
#' @param chrom_size Name of chromosome size file. File must be in two-column
#' format without a header where first column is chromosome name and second
#' column is chromosome length, as from getChainChrSize. Required.
#' @param genome_gtf The name of the GTF/GFF file that contains all exome
#' annotations. Required
#' @param RNA_fragment RNA component of interest. Options depend on the gtf
#' file but often include "gene", "transcript", "exon", "CDS", "five_prime_utr",
#' and/or "three_prime_utr". Default "exon" for the whole exome.
#' @param chain The name of the chain file to be used. Format should be like
#' chain files derived from GRangesMappingToChainFile. Required
#'
#' @return writes bedGraph files of structure signal for each of the six
#' CapR contexts 1) mapped to the genome and 2) lifted-over to the transcriptome
#'
#' @examples
#' ## make chain file
#' load(system.file("extdata/transcript_list.Rda", package="nearBynding"))
#' gtf<-system.file("extdata/Homo_sapiens.GRCh38.chr4&5.gtf",
#'                 package="nearBynding")
#' GenomeMappingToChainFile(genome_gtf = gtf,
#'                         out_chain_name = "test.chain",
#'                         RNA_fragment = "three_prime_utr",
#'                         transcript_list = transcript_list,
#'                         alignment = "hg38")
#' ## get chromosome size file
#' getChainChrSize(chain = "test.chain",
#'                out_chr = "chr4and5_3UTR.size")
#'
#' processCapRout(CapR_outfile = system.file("extdata/chr4and5_3UTR.out",
#'                                          package="nearBynding"),
#'               chain = "test.chain",
#'               output_prefix = "chr4and5_3UTR",
#'               chrom_size = "chr4and5_3UTR.size",
#'               genome_gtf = gtf,
#'               RNA_fragment = "three_prime_utr")
#'
#' @importFrom magrittr '%>%'
#' @importFrom utils read.delim
#' @importFrom S4Vectors elementMetadata
#' @importFrom GenomicRanges makeGRangesFromDataFrame
#' @importFrom BiocGenerics strand type
#'
#' @export

processCapRout <- function(CapR_outfile,
                            output_prefix,
                            chrom_size,
                            genome_gtf,
                            RNA_fragment,
                            chain) {
    folded_transcripts_appended <- readLines(CapR_outfile)
    transcript_names <- folded_transcripts_appended[which(
        substr(folded_transcripts_appended, 1, 1) == ">"
    )]
    transcript_names <- lapply(transcript_names, function(x) {
        substr(x, 2, nchar(x))
    }) %>% unlist()
    bulge <- folded_transcripts_appended[which(
        substr(folded_transcripts_appended, 1, 5) == "Bulge"
    )]
    exterior <- folded_transcripts_appended[which(
        substr(folded_transcripts_appended, 1, 8) == "Exterior"
    )]
    hairpin <- folded_transcripts_appended[which(
        substr(folded_transcripts_appended, 1, 7) == "Hairpin"
    )]
    internal <- folded_transcripts_appended[which(
        substr(folded_transcripts_appended, 1, 8) == "Internal"
    )]
    multibranch <- folded_transcripts_appended[which(
        substr(folded_transcripts_appended, 1, 11) == "Multibranch"
    )]
    stem <- folded_transcripts_appended[which(
        substr(folded_transcripts_appended, 1, 4) == "Stem"
    )]
    cut_transcripts <- function(x) {
        transcript <- strsplit(x, " ") %>% unlist()
        transcript <- transcript[2:length(transcript)]
        return(transcript)
    }
    bulge_cut <- lapply(bulge, cut_transcripts) %>% unlist()
    exterior_cut <- lapply(exterior, cut_transcripts) %>% unlist()
    hairpin_cut <- lapply(hairpin, cut_transcripts) %>% unlist()
    internal_cut <- lapply(internal, cut_transcripts) %>% unlist()
    multibranch_cut <- lapply(multibranch, cut_transcripts) %>% unlist()
    stem_cut <- lapply(stem, cut_transcripts) %>% unlist()
    # double check that transcriptome length total (chr size file)
    # equals number of folded transcript positions
    chr_size <- read.delim(chrom_size, header = FALSE)
    if (sum(chr_size$V2) - length(bulge_cut) != 0) {
        stop("The length of the CapR output does not equal the length of
            chromosome size. Make sure that the input chromosome size
            corresponds to the folded CapR transcripts.")
    }
    gtf <- import(genome_gtf)
    gtf <- with(gtf, plyranges::filter(gtf, type == RNA_fragment))
    # write bedgraph file for each transcript
    # create bedGraph for each structure separately
    bg_shell <- as.data.frame(matrix(NA, ncol = 4, nrow = sum(chr_size$V2)))
    colnames(bg_shell) <- c("chr", "start", "end", "score")
    n <- 1
    for (t in transcript_names) {
        gtf_t <- gtf[(elementMetadata(gtf)[, "transcript_id"] == t)]
        bg_shell[n:(n + sum(gtf_t@ranges@width) - 1), "chr"] <-
            gtf_t@seqnames@values %>% as.character()
        if (gtf_t@strand@values[1] == "+") {
            for (unit in seq_len(length(gtf_t))) {
                gtf_t_unit <- gtf_t[unit]
                bg_shell[n:(n + gtf_t_unit@ranges@width - 1), "start"] <-
                    gtf_t_unit@ranges@start:(gtf_t_unit@ranges@start +
                        gtf_t_unit@ranges@width - 1)
                n <- n + gtf_t_unit@ranges@width
            }
        } else {
            for (unit in length(gtf_t):1) {
                gtf_t_unit <- gtf_t[unit]
                bg_shell[n:(n + gtf_t_unit@ranges@width - 1), "start"] <-
                    rev(gtf_t_unit@ranges@start:(gtf_t_unit@ranges@start +
                        gtf_t_unit@ranges@width - 1))
                n <- n + gtf_t_unit@ranges@width
            }
        }
    }
    bg_shell$end <- bg_shell$start
    if (nchar(bg_shell[1, "chr"]) < 3) {
        bg_shell$chr <- paste0("chr", bg_shell$chr)
    }
    # bulge
    df_bulge <- bg_shell
    df_bulge$score <- bulge_cut %>% as.numeric()
    GRanges_bulge <- makeGRangesFromDataFrame(df_bulge,
        keep.extra.columns = TRUE)
    export.bedGraph(GRanges_bulge, paste0(output_prefix,"_bulge.bedGraph"))
    liftOverToExomicBG(input = paste0(output_prefix,"_bulge.bedGraph"),
        chrom_size = chrom_size,
        chain = chain,
        output_bg = paste0(output_prefix, "_bulge_liftOver.bedGraph"))
    # exterior
    df_exterior <- bg_shell
    df_exterior$score <- exterior_cut %>% as.numeric()
    GRanges_exterior <- makeGRangesFromDataFrame(df_exterior,
        keep.extra.columns = TRUE)
    export.bedGraph(GRanges_exterior,paste0(output_prefix,"_exterior.bedGraph"))
    liftOverToExomicBG(input = paste0(output_prefix,"_exterior.bedGraph"),
        chrom_size = chrom_size,
        chain = chain,
        output_bg = paste0( output_prefix,"_exterior_liftOver.bedGraph"))
    # hairpin
    df_hairpin <- bg_shell
    df_hairpin$score <- hairpin_cut %>% as.numeric()
    GRanges_hairpin <- makeGRangesFromDataFrame(df_hairpin,
        keep.extra.columns = TRUE)
    export.bedGraph(GRanges_hairpin, paste0(output_prefix,"_hairpin.bedGraph"))
    liftOverToExomicBG(input = paste0( output_prefix,"_hairpin.bedGraph"),
        chrom_size = chrom_size,
        chain = chain,
        output_bg = paste0(output_prefix,"_hairpin_liftOver.bedGraph"))
    # internal
    df_internal <- bg_shell
    df_internal$score <- internal_cut %>% as.numeric()
    GRanges_internal <- makeGRangesFromDataFrame(df_internal,
        keep.extra.columns = TRUE)
    export.bedGraph(GRanges_internal,paste0(output_prefix,"_internal.bedGraph"))
    liftOverToExomicBG(input = paste0(output_prefix,"_internal.bedGraph"),
        chrom_size = chrom_size,
        chain = chain,
        output_bg = paste0(output_prefix,"_internal_liftOver.bedGraph"))
    # multibranch
    df_multibranch <- bg_shell
    df_multibranch$score <- multibranch_cut %>% as.numeric()
    GRanges_multibranch <- makeGRangesFromDataFrame(df_multibranch,
        keep.extra.columns = TRUE)
    export.bedGraph(GRanges_multibranch, paste0(output_prefix,
                                                    "_multibranch.bedGraph"))
    liftOverToExomicBG(input = paste0(output_prefix, "_multibranch.bedGraph"),
        chrom_size = chrom_size,
        chain = chain,
        output_bg = paste0(output_prefix,"_multibranch_liftOver.bedGraph"))
    # stem
    df_stem <- bg_shell
    df_stem$score <- stem_cut %>% as.numeric()
    GRanges_stem <- makeGRangesFromDataFrame(df_stem,
        keep.extra.columns = TRUE)
    export.bedGraph(GRanges_stem, paste0(output_prefix,"_stem.bedGraph"))
    liftOverToExomicBG(input = paste0(output_prefix,"_stem.bedGraph"),
        chrom_size = chrom_size,
        chain = chain,
        output_bg = paste0(output_prefix,"_stem_liftOver.bedGraph"))
}
vbusa1/nearBynding documentation built on Aug. 4, 2021, 4:08 p.m.