rrp: Calculate relative ribosome positioning

Description Usage Arguments Details Value Author(s) References Examples

Description

Calculate where within the orf the cumulative location of ribosomal reads are.

Usage

1
rrp(data, genedf, norms = "total", cores = 1, plot = T, percentage = 50, frame = F, absolute = F, ignore_start = T)

Arguments

data

ribodata, the output of read_profiling_data

genedf

data.frame, stating the relative locations of start and stop locations in the transcriptome. Can be manually generated or through generate_transcriptome function

norms

string, "total" for fraction of reads per gene or "rna" for first getting ribo/rna-seq ratio. Default "total"

cores

integer, number of threads to run. only on UNIX systems see details, defaults to 1

plot

logical, should the function produce a density plot defaults to T

percentage

float or integer, the percentage of the reads in the cumulative function. Default 50. The function converts the percentage to fraction if the value is greater than 1

frame

F or integer 0,1,2 should the function use reads coming from a specific frame, default NA uses all reads.

absolute

Logical, should the relative or absoulte positions be returned. Defaults to F

ignore_start

logical, should the calculations start from the first codon or the second one. This might be useful for libraries that are prepared with cycloheximide

Details

Since the data is discerete the way this function goes around calculating the percent specified by the user is as follows: The cumulative sum is calculated for each non NA data point and it's corresponding nucleotide position is found.

Value

Invisible returns a data frame. One row per gene with nucleotide and codon location (relative or absolute depending on settings), gene name, and the percentile locations around the percent input. The weighted average (1/ distance from percent) of he largest read location that's smaller than the precent specified and smallest read location larger than the percent specified is calculated. I would be interested in suggestion for estimating an exact cdf per gene.

Author(s)

Alper Celik

References

This was first concieved by Richard Baker Ph.D. at UMASS Medical School. This is my version of the calculations that were done in his original perl script.

Examples

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
##---- Should be DIRECTLY executable !! ----
##-- ==>  Define data, use random,
##--	or do  help(data=index)  for the standard data sets.

## The function is currently defined as
function (data, genedf, norms = "total", cores = 1, plot = T,
    percentage = 50, frame = F, absolute = F, ignore_start = T)
{
    gene_names <- genedf$gene
    if (percentage > 1) {
        percentage <- percentage/100
    }
    if (is.null(data$rna) & norms == "rna") {
        warning("No RNA data setting norms to total")
        norms = "total"
    }
    get_rrp <- function(gene_name, percentage) {
        gene <- get_gene(gene_name, data = data, genedf = genedf)
        if (is.null(gene)) {
            NULL
        }
        else {
            coord <- genedf[genedf$gene == gene_name, ]
            if (ignore_start) {
                gene <- gene[gene$nucleotide > 3 & gene$nucleotide <
                  coord$end - coord$start, ]
            }
            else {
                gene <- gene[gene$nucleotide > 0 & gene$nucleotide <
                  coord$end - coord$start, ]
            }
            gene <- gene[order(gene$nucleotide), ]
            if (dim(gene)[1] == 0) {
                NULL
            }
            else {
                if (frame) {
                  gene <- gene[gene$frame == frame, ]
                }
                if (norms == "total") {
                  sums <- cumsum(na.omit(gene$freq))
                  tots <- sum(na.omit(gene$freq))
                  sums <- sums/tots
                  gene$cumulative <- NA
                  gene$cumulative[!is.na(gene$freq)] <- sums
                  lower <- gene[max(which(gene$cumulative < percentage)),
                    c("nucleotide", "codon", "cumulative")]
                  upper <- gene[min(which(gene$cumulative > percentage)),
                    c("nucleotide", "codon", "cumulative")]
                  nuc <- weighted.mean(x = c(lower$nucleotide,
                    upper$nucleotide), w = c(percentage - lower$cumulative,
                    upper$cumulative - percentage))
                  cod <- weighted.mean(x = c(lower$codon, upper$codon),
                    w = c(1/(percentage - lower$cumulative),
                      1/(upper$cumulative - percentage)))
                  rrp <- data.frame(nucleotide = nuc, codon = cod,
                    gene = gene_name, lower = lower$cumulative,
                    upper = upper$cumulative)
                  if (absolute) {
                    invisible(rrp)
                  }
                  else {
                    rrp$nucleotide <- rrp$nucleotide/coord$nnuc
                    rrp$codon <- rrp$codon/coord$ncodons
                    invisible(rrp)
                  }
                }
                else {
                  norms <- gene$freq/gene$coverage
                  sums <- cumsum(na.omit(norms))
                  tots <- sum(na.omit(gene$freq))
                  sums <- sums/tots
                  gene$cumulative <- NA
                  gene$cumulative[!is.na(gene$freq)] <- sums
                  nuc <- weighted.mean(x = c(lower$nucleotide,
                    upper$nucleotide), w = c(percentage - lower$cumulative,
                    upper$cumulative - percentage))
                  cod <- weighted.mean(x = c(lower$codon, upper$codon),
                    w = c(percentage - lower$cumulative, upper$cumulative -
                      percentage))
                  rrp <- data.frame(nucleotide = nuc, codon = cod,
                    gene = gene_name, lower = lower$cumulative,
                    upper = upper$cumulative)
                  if (absolute) {
                    rrp$gene <- gene_name
                    invisible(rrp)
                  }
                  else {
                    rrp$nucleotide <- rrp$nucleotide/coord$nnuc
                    rrp$codon <- rrp$codon/coord$ncodons
                    rrp$gene <- gene_name
                    invisible(rrp)
                  }
                }
            }
        }
    }
    if (cores > 1) {
        rel_ribo <- do.call("rbind", mclapply(gene_names, get_rrp,
            percentage, mc.cores = cores))
    }
    else {
        rel_ribo <- do.call("rbind", lapply(gene_names, get_rrp,
            percentage))
    }
    if (plot) {
        if (absolute) {
            print(ggplot(rel_ribo, aes(x = nucleotide)) + geom_density())
        }
        else {
            print(ggplot(rel_ribo, aes(x = nucleotide)) + geom_density() +
                xlab("fraction ORF"))
        }
    }
    invisible(rel_ribo)
  }

celalp/ribofootprintR documentation built on May 12, 2019, 12:04 p.m.