metagene_cds: Get average ribosome footprint alignment values at the 5' and...

Usage Arguments Details Value Author(s) See Also Examples

Usage

1
metagene_cds(data, genedf, before_start = 50, after_start = 100, before_stop = 100, after_stop = 50, norms = "total", cores = 1, plot = F)

Arguments

data

list returned by read_profiling_data

genedf

data frame that contains the gene name, and the nucleotide coordinates of start and stop locations as well as number of nucleotides and codon of the ORF.

before_start

integer, the number of nucleotide to calculate before the start nucleotide, defaults to 50

after_start

integer, the number of nucleotide to calculate after the start nucleotide, defaults to 100

before_stop

integer, the number of nucleotide to calculate before the stop nucleotide, defaults to 100

after_stop

integer, the number of nucleotide to calculate before the stop nucleotide, defaults to 50

norms

character, normalization, character either "total", "rna" or "none". Defaults to "total"

cores

integer, number of threads to use. See details.

plot

logical, whether to plot or not, defaults to F

Details

If norms is set to "total"" returns the average fraction of reads that map to the region specified. If norms are set to "rna" then the number of reads that map to each codon are divided by the RNA-Seq coverage. "none" returns the average number of reads per codon.

Currently PSOCK multithreadding is not supported. For Windows set cores to 1. For operating systems that support forking the number of threads that are used can be set by the cores argument. If the cores>1 then the data is processed by mclapply instead of lapply.

Value

a data frame with normalized nucleotide values.

Author(s)

Alper Celik

See Also

metagene_bins

Examples

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
##---- Should be DIRECTLY executable !! ----
##-- ==>  Define data, use random,
##--	or do  help(data=index)  for the standard data sets.

## The function is currently defined as
function (data, genedf, before_start = 50, after_start = 100,
    before_stop = 100, after_stop = 50, norms = "total", cores = 1,
    plot = F)
{
    returner <- function(nucleotide, frame, df) {
        a <- df[df$nucleotide == nucleotide & df$frame == frame,
            ]
        b <- sum(a$norm, na.rm = T)
        b
    }
    if (is.null(data[["rna"]]) & norms == "rna") {
        warning("No RNA-Seq data setting norms to total")
        norms = "total"
    }
    gene_names <- genedf$gene
    get_cds <- function(gene_name, direction, norm = norms, genes = genedf) {
        gene <- get_gene(gene_name, data = data, seq = NA, genedf = genes)
        if (is.null(gene) == T) {
            NULL
        }
        else {
            coord <- genes[genes$gene == gene_name, ]
            if (direction == "five") {
                cdss <- data.frame(nucleotide = c((before_start *
                  -1):after_start))
                cdss <- left_join(cdss, gene, by = "nucleotide")
            }
            else if (direction == "three") {
                cdss <- data.frame(nucleotide = c((coord$end -
                  coord$start - before_stop):(coord$end - coord$start +
                  after_stop)))
                cdss <- left_join(cdss, gene, by = "nucleotide")
                cdss$nucleotide <- cdss$nucleotide - coord$end +
                  coord$start
            }
            if (norm == "total") {
                cdss$norm <- as.numeric(cdss$freq/sum(gene$freq,
                  na.rm = T))
            }
            else if (norm == "rna") {
                cdss$norm <- as.numeric(cdss$freq/cdss$coverage)
            }
            else if (norm == "none") {
                cdss$norm <- as.numeric(cdss$freq)
            }
            cdss$norm[!is.finite(cdss$norm)] <- NA
            cdss
        }
    }
    if (cores > 1) {
        fives <- do.call("rbind", mclapply(gene_names, get_cds,
            "five", mc.cores = cores))
    }
    else {
        fives <- do.call("rbind", lapply(gene_names, get_cds,
            "five"))
    }
    x <- unique(fives[, c(1, 3)])
    x <- na.omit(x)
    counts <- mdply(x, returner, fives)
    fives <- counts
    colnames(fives)[3] <- "value"
    fives$codon <- ceiling(fives$nucleotide/3)
    if (cores > 1) {
        threes <- do.call("rbind", mclapply(gene_names, get_cds,
            "three", mc.cores = cores))
    }
    else {
        threes <- do.call("rbind", lapply(gene_names, get_cds,
            "three"))
    }
    x <- unique(threes[, c(1, 3)])
    x <- na.omit(x)
    counts <- mdply(x, returner, threes)
    threes <- counts
    colnames(threes)[3] <- "value"
    threes$codon <- ceiling(threes$nucleotide)
    fives$location <- "from_start"
    threes$location <- "from_stop"
    cds <- rbind(fives, threes)
    cds
    fives$location <- "from_start"
    threes$location <- "from_stop"
    cds <- rbind(fives, threes)
    if (plot) {
        print(ggplot(cds, aes(x = codon, y = value, fill = frame)) +
            geom_bar(stat = "identity", position = "dodge") +
            facet_grid(. ~ location, scales = "free_x"))
    }
    invisible(cds)
  }

celalp/ribofootprintR documentation built on May 12, 2019, 12:04 p.m.