metagene_bins: Divide ORF into bins and return either gene by gene or a...

Description Usage Arguments Value Author(s) See Also Examples

Description

Divide the ORF in the bins and return the value either gene by gene or as cumulative mean or median.

Usage

1
metagene_bins(data, genedf, bins = 100, norms = "rna", simplify = "mean", cores = 1, plot = F)

Arguments

data

list returned by read_profiling_data() function

genedf

data frame that contains the gene name, and the nucleotide coordinates of start and stop locations as well as number of nucleotides and codon of the ORF. This can be the whole transcriptome or a subset of the genedf used by read_profiling_data

bins

number of bins to divide the ORF with defaults to 100

norms

character either "total", "rna" or "none"

simplify

character either "mean", "median", "none"

cores

number of threads to use in UNIX based systems. Defaults to 1.

plot

logical, whether to plot the results. See details.

Value

Returns a molten data frame. Each ORF is divided by the number of bins specified and the mean occupancy is calculated for each ORF. If norms is selected to be "rna" the ribosome profiling data is divided by the coverage from RNA-Seq. If norms is "total" then the fraction of reads mapping to each bin is calculated. "none" just returns average values. For the simplify argument if "none" is selected a molten data frame is returned where each gene and each bin is a row. This is done to make plotting easier. For other simplify options only one value per bin is returned.

Currently PSOCK multi threading is not supported. For Windows set cores to 1. For operating systems that support forking the number of threads that are used can be set by the cores argument. If the cores>1 then the data is processed by mclapply instead of lapply.

In the simplify argument if "mean" or "median" is selected a line plot is returned. If simplify is set to be "none" then plotting is done by geom_smooth which uses generalized additive models.

Author(s)

Alper Celik

See Also

metagene_cds

Examples

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
##---- Should be DIRECTLY executable !! ----
##-- ==>  Define data, use random,
##--	or do  help(data=index)  for the standard data sets.

## The function is currently defined as
function (data, genedf, bins = 100, norms = "rna", simplify = "mean",
    cores = 1, plot = F)
{
    summer <- function(nuc, df, measure) {
        a <- df[df$nucleotide == nuc, ]
        a <- sum(a[[measure]], na.rm = T)
        a
    }
    if (norms == "rna" & is.null(data[["rna"]])) {
        warning("No RNA-Seq Data Setting norms to total")
        norms = "total"
    }
    gene_list <- genedf
    norm_gene <- function(name, gene_list) {
        gene <- get_gene(gene = name, data = data, seq = NA,
            genedf = gene_list)
        if (is.null(gene) || dim(gene)[1] < 1) {
            NULL
        }
        else {
            coord <- genedf[genedf$gene == name, ]
            gene <- gene[gene$nucleotide >= 0 & gene$nucleotide <=
                (coord$end - coord$start + 1), ]
            if (dim(gene)[1] < 1) {
                NULL
            }
            else {
                counts <- sapply(unique(gene$nucleotide), summer,
                  gene, "freq")
                coverage <- sapply(unique(gene$nucleotide), summer,
                  gene, "coverage")
                if (norms == "none") {
                  a <- counts
                  a
                }
                else if (norms == "rna") {
                  a <- counts/coverage
                  a
                }
                else if (norms == "total") {
                  a <- counts/sum(counts, na.rm = T)
                  a
                }
            }
        }
    }
    binner <- function(gene_name) {
        gene <- norm_gene(name = gene_name, gene_list = gene_list)
        gene <- noinf(gene)
        gene_bins <- 0
        for (i in 1:bins) {
            lower <- floor((length(gene)/bins) * (i - 1))
            upper <- ceiling((length(gene)/bins) * (i))
            gene_bins[i] <- mean(gene[lower:upper], na.rm = T)
        }
        if (is.null(gene_bins)) {
            gene_bins <- rep(NA, bins)
            gene_bins
        }
        else {
            gene_bins
        }
    }
    genes <- genedf$gene
    if (cores > 1) {
        suppressWarnings(binned <- do.call("rbind", mclapply(genes,
            binner, mc.cores = cores, mc.cleanup = T)))
    }
    else {
        suppressWarnings(binned <- do.call("rbind", lapply(genes,
            binner)))
    }
    rownames(binned) <- genes
    if (simplify == "mean") {
        binned <- data.frame(bin = 1:bins, value = apply(binned,
            2, mean, na.rm = T))
        print(ggplot(binned, aes(x = bin, y = value)) + geom_line())
    }
    else if (simplify == "median") {
        binned <- data.frame(bin = 1:bins, value = apply(binned,
            2, median, na.rm = T))
        print(ggplot(binned, aes(x = bin, y = value)) + geom_line())
    }
    else if (simplify == "none") {
        binned <- reshape2::melt(binned)
        colnames(binned) <- c("gene", "bin", "value")
        if (plot) {
            print(ggplot(binned, aes(x = bin, y = value)) + geom_smooth())
        }
    }
    invisible(binned)
  }

celalp/ribofootprintR documentation built on May 12, 2019, 12:04 p.m.