occupancy: Get ribosome occupancy at codon or amino acid level

Usage Arguments Details Value Note Author(s) See Also Examples

Usage

1
occupancy(data, seq, genedf, codon = T, norms = "rna", gene_list = F, frame = NA, cores = 1, separate_start = T)

Arguments

data

list returned by read_profiling_data

seq

A DNAStringSet of the FASTA file used for alignment.

genedf

data frame that contains the gene name, and the nucleotide coordinates of start and stop locations as well as number of nucleotites and codon of the ORF.

codon

logical, whether to return codon level or amino acid level occupancy, defaults to TRUE

norms

normalization, character either "total", "rna" or "none". Defaults to "total"

gene_list

logical, whether to aggregate all genes in genedf or return gene by gene values

frame

integer, 0,1,2 or NA indicating whether only reads from a specific frame should be counted. If set to NA all reads are counted.

cores

integer, number of threads to use. See details

separate_start

whether to tread start codon separately from other methionines, defaults to T, codon needs to be set to T

Details

Normalizations are similar to other functions. If "none" is selected then the total number of reads are returned. If "rna" is selected then the the average of the three nucleotides of the codon is used for normalization. If "total" is used then the fraction of reads that correspond to each codon or amino acid is returned.

Currently PSOCK multithreadding is not supported. For Windows set cores to 1. For operating systems that support forking the number of threads that are used can be set by the cores argument. If the cores>1 then the data is processed by mclapply instead of lapply.

Value

a data frame containing the number of reads (or normalized values) with the number of codons or amino acids in the selection is returned. if gene_list=T then a list is returned where each element is a data frame with occupancy values and number of codons or amino acids.

Note

warnings are suppressed because some pseudogenes or putative proteins do not have annotated ORF lengths are not divisible by 3.

Author(s)

Alper Celik

See Also

totals

Examples

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
##---- Should be DIRECTLY executable !! ----
##-- ==>  Define data, use random,
##--	or do  help(data=index)  for the standard data sets.

## The function is currently defined as
function (data, seq, genedf, codon = T, norms = "rna", gene_list = F,
    frame = NA, cores = 1, separate_start = T)
{
    codon_summer <- function(gene, class, measure) {
        if (class == "codon") {
            if (measure == "ribo") {
                sums <- sapply(unique(gene[[class]]), function(x) {
                  sum(gene[which(gene[[class]] == x), ][["counts"]])
                })
            }
            else if (measure == "rna") {
                sums <- sapply(unique(gene[[class]]), function(x) {
                  sum(gene[which(gene[[class]] == x), ][["coverage"]])
                })
            }
        }
        else if (class == "aa") {
            if (measure == "ribo") {
                sums <- sapply(unique(gene[[class]]), function(x) {
                  sum(gene[which(gene[[class]] == x), ][["counts"]])
                })
            }
            else if (measure == "rna") {
                sums <- sapply(unique(gene[[class]]), function(x) {
                  sum(gene[which(gene[[class]] == x), ][["coverage"]])
                })
            }
        }
        sums
    }
    summer <- function(nuc, df) {
        a <- df[df$nucleotide == nuc, ]
        a <- sum(a[["freq"]], na.rm = T)
        a
    }
    gene_names <- genedf$gene
    get_occupancy <- function(gene_name, seq = seq, data = data) {
        coord <- genedf[genedf$gene == gene_name, ]
        gene <- get_gene(gene = gene_name, data = data, seq = seq,
            genedf = genedf)
        if (!is.na(frame)) {
            gene <- gene[gene$frame == frame, ]
            gene <- gene[!is.na(gene$nucleotide), ]
        }
        if (is.null(gene)) {
            NULL
        }
        else {
            counts <- sapply(unique(gene$nucleotide), summer,
                gene)
            gene <- as.data.frame(cbind(unique(gene[, 1:2]),
                counts))
            gene <- gene[gene$nucleotide >= 1 & gene$nucleotide <=
                (coord$end - coord$start + 1), ]
            if (dim(gene)[1] < 3) {
                NULL
            }
            else {
                gene$coverage <- rollmean(gene$coverage, 3, fill = NA,
                  align = "left")
                gene <- gene[(1:length(gene$nucleotide/3) * 3 -
                  2), ]
                cods <- as.data.frame(codons(seq[[gene_name]][coord$start:coord$end]))[,
                  1]
                aas <- as.data.frame(translate(seq[[gene_name]][coord$start:coord$end]))[,
                  1]
            }
            if (dim(gene)[1] < length(cods)) {
                gene$codon <- cods[floor(gene$nucleotide/3) +
                  1]
                gene$aa <- aas[floor(gene$nucleotide/3) + 1]
                gene$aa <- as.character(gene$aa)
            }
            else if (dim(gene)[1] >= length(cods)) {
                gene <- gene[1:length(cods), ]
                gene$codon <- cods
                gene$aa <- aas
                gene$aa <- as.character(gene$aa)
            }
            if (separate_start) {
                gene$codon[1] <- "start"
            }
            if (codon) {
                occup <- data.frame(codon = unique(gene$codon),
                  ribo = codon_summer(gene, "codon", "ribo"),
                  rna = codon_summer(gene, "codon", "rna"))
                n_obs <- as.data.frame(plyr::count(gene$codon))
                colnames(n_obs) <- c("codon", "n_obs")
                occup <- plyr::join(x = occup, y = n_obs, by = "codon",
                  type = "full")
                occup
            }
            else {
                occup <- data.frame(aa = unique(gene$aa), ribo = codon_summer(gene,
                  "aa", "ribo"), rna = codon_summer(gene, "aa",
                  "rna"))
                n_obs <- as.data.frame(plyr::count(gene$aa))
                colnames(n_obs) <- c("aa", "n_obs")
                occup <- plyr::join(x = occup, y = n_obs, by = "aa",
                  type = "full")
                occup
            }
            occup
        }
    }
    if (gene_list) {
        if (cores > 1) {
            suppressWarnings(occups <- mclapply(gene_names, get_occupancy,
                seq, data, mc.cores = cores))
            invisible(occups)
        }
        else {
            suppressWarnings(occups <- lapply(gene_names, get_occupancy,
                seq, data))
            invisible(occups)
        }
    }
    else {
        if (cores > 1) {
            suppressWarnings(occups <- do.call("rbind", mclapply(gene_names,
                get_occupancy, seq, data, mc.cores = cores)))
        }
        else {
            suppressWarnings(occups <- do.call("rbind", lapply(gene_names,
                get_occupancy, seq, data)))
        }
        get_sums <- function(val, x, col) {
            a <- x[which(x[, 1] == val), col]
            a <- sum(a, na.rm = T)
        }
        if (is.null(data[["rna"]]) & norms == "rna") {
            warning("No RNA-Seq Data Setting norms to total")
            norms = "total"
        }
        if (norms == "rna") {
            occups$norm <- occups$ribo/occups$rna
            torem <- grep("Inf", occups$norm)
            torem <- c(grep("NaN", occups$norm), torem)
            torem <- unique(torem)
            if (length(torem > 0)) {
                occups <- occups[-torem, ]
            }
            unq <- unique(occups[, 1])
            value <- lapply(unq, get_sums, occups, 5)
            n_obs <- lapply(unq, get_sums, occups, 4)
            b <- data.frame(feature = unq, value = unlist(value),
                n_obs = unlist(n_obs))
            invisible(b)
        }
        else if (norms == "total") {
            unq <- unique(occups[, 1])
            value <- lapply(unq, get_sums, occups, 2)
            n_obs <- lapply(unq, get_sums, occups, 4)
            b <- data.frame(feature = unq, value = unlist(value),
                n_obs = unlist(n_obs))
            b$value <- b$value/sum(b$value)
            invisible(b)
        }
        else if (norms == "none") {
            unq <- unique(occups[, 1])
            value <- lapply(unq, get_sums, occups, 2)
            n_obs <- lapply(unq, get_sums, occups, 4)
            b <- data.frame(feature = unq, value = unlist(value),
                n_obs = unlist(n_obs))
            b <- data.frame(feature = unq, value = unlist(a))
            invisible(b)
        }
    }
  }

celalp/ribofootprintR documentation built on May 12, 2019, 12:04 p.m.