R/classify_exons.R

Defines functions classify_exons

Documented in classify_exons

#' @title Classify exons as single, first, inner or last exons.
#'
#' @description Takes a data frame of GENCODE GTF annotations and adds an
#'   `EXON_CLASSIFICATION` column stating the position of every exon within
#'   its transcript: `"single_exons"` (the transcript has exactly one exon
#'   row), `"first_exons"` (`exon_number` is 1), `"last_exons"`
#'   (`exon_number` equals the number of exon rows of the transcript) or
#'   `"inner_exons"` (anything in between). Rows whose `type` is not
#'   `"exon"`, and exon rows whose `transcript_id` is missing or whose
#'   position cannot be determined, get `NA`.
#'
#'   For backward compatibility the result is also stored in the global
#'   environment under the name `classified_exons_df`.
#' @param x A data frame of GTF annotations with at least the columns
#'   `type`, `transcript_id` and `exon_number`.
#' @return Invisibly, `x` with an additional character column
#'   `EXON_CLASSIFICATION` describing exon positions within a transcript.
#'   Row order and all existing columns of `x` are left unchanged.
#' @examples \dontrun{
#' # You don't have to run this
#' load_gtf("gencode.v27.lncRNAs.gtf")
#' classify_exons(gencode.v27.lncRNAs.gtf)
#' }
#' @export
classify_exons <- function(x) {
  if (!is.data.frame(x)) {
    stop("`x` must be a data frame of GTF annotations.", call. = FALSE)
  }
  required_cols <- c("type", "transcript_id", "exon_number")
  missing_cols <- setdiff(required_cols, names(x))
  if (length(missing_cols) > 0) {
    stop(
      "`x` is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # Only exon rows are classified; every other feature type stays NA.
  is_exon <- !is.na(x$type) & x$type == "exon"
  tx_id <- as.character(x$transcript_id[is_exon])
  # Go through character first so that a factor column yields the exon
  # numbers themselves rather than the internal factor level codes.
  exon_no <- as.numeric(as.character(x$exon_number[is_exon]))

  # Number of exon rows per transcript, recycled onto every exon row.
  # table() drops missing ids, so those rows end up with an NA count.
  exon_counts <- table(tx_id)
  n_exons <- as.integer(exon_counts)[match(tx_id, names(exon_counts))]

  message("Extracting single exons")
  is_single <- !is.na(n_exons) & n_exons == 1L
  message("Single exons: ", sum(is_single))

  # Position within a multi-exon transcript needs a usable exon number.
  is_multi <- !is.na(n_exons) & n_exons > 1L & !is.na(exon_no)

  message("Extracting first exons")
  is_first <- is_multi & exon_no == 1
  message("First exons: ", sum(is_first))

  message("Extracting last exons")
  # The last exon is the one whose number equals the transcript's exon count.
  is_last <- is_multi & !is_first & exon_no == n_exons
  message("Last exons: ", sum(is_last))

  is_inner <- is_multi & !is_first & !is_last
  message("Inner exons: ", sum(is_inner))

  exon_class <- rep(NA_character_, length(tx_id))
  exon_class[is_single] <- "single_exons"
  exon_class[is_first] <- "first_exons"
  exon_class[is_last] <- "last_exons"
  exon_class[is_inner] <- "inner_exons"
  message("Total exons: ", sum(!is.na(exon_class)))

  # Build the full-length column first so zero-row inputs work as well.
  classification <- rep(NA_character_, nrow(x))
  classification[is_exon] <- exon_class
  result <- x
  result$EXON_CLASSIFICATION <- classification

  # Kept for backward compatibility: existing callers expect the result to
  # appear as `classified_exons_df` in the global environment.
  assign("classified_exons_df", result, envir = .GlobalEnv)
  invisible(result)
}
monahton/GencodeInterrogator documentation built on Dec. 24, 2019, 1:31 p.m.