R/extract_introns.R

Defines functions extract_introns

Documented in extract_introns

#' @title Extract introns coordinates from GENCODE gtf file as a dataframe.
#'
#' @description This function takes a gtf file from GENCODE and returns a dataframe in the R Global Environment containing introns coordinates and their position in the transcript.
#' @usage extract_introns(x)
#' @param x The name of the downloaded gtf file from GENCODE website
#' @export
#' @keywords
#' @seealso
#' @return A dataframe of intron coordinates from the gtf file selected
#' examples \dontrun {
#' # You don't have to run this
#' load_gtf("gencode.v27.lncRNAs.gtf")
#' extract_introns(gencode.v27.lncRNAs.gtf)
#’}
extract_introns <- function(x) {
  a <- subset(x, x$type=="exon")
  a1 <- classify_exons(a)
  b <- subset(a1, select = c("transcript_id", "exon_number"))
  c <- as.data.frame(table(b$transcript_id))
  d <- subset(c, Freq== '1')
  d1 <- nrow(d)
  print(paste0("Single exons: ", d1))
  e <- dplyr::anti_join(a1,d, by = c("transcript_id" = "Var1"))
  colnames(c) <- c("transcript_id", "exon_count")

  x <- sort(unique(as.integer(c$exon_count)))
  x2 <- max(x)
  x3 <- sort(unique(as.integer(e$exon_number)))
  x4 <- setdiff(union(x, x3), intersect(x, x3))
  y <- NULL
  for(i in x) {
    if (i==x2) {
      next
    }
    zz <- subset(e, e$EXON_CLASSIFICATION!="last_exons" & e$exon_number==i)
    zz$intron_start <- ifelse(zz$strand=="+", zz$end + 1, zz$start - 1)
    zz2 <- subset(e,e$exon_number==i+1)
    zz2$intron_end <- ifelse(zz2$strand=="+", zz2$start - 1, zz2$end + 1)
    zz3 <- subset(zz2, select = "intron_end")
    zz4 <- cbind(zz,zz3)
    zz4$intron_number <- paste("intron", i, sep = "")
    y <- rbind(y,zz4)
  }

  m <- NULL
  for(i in x4) {
    zz <- subset(e, e$exon_number==i)
    zz$intron_start <- ifelse(zz$strand=="+", zz$end + 1, zz$start - 1)
    zz2 <- subset(e,e$exon_number==i+1)
    zz2$intron_end <- ifelse(zz2$strand=="+", zz2$start - 1, zz2$end + 1)
    zz3 <- subset(zz2, select = "intron_end")
    zz4 <- cbind(zz,zz3)
    zz4$intron_number <- paste("intron", i, sep = "")
    m <- rbind(m,zz4)
  }
  n <- rbind(y,m)
  n$type <- "intron"
  n1 <- subset(n, n$strand=="+")
  n2 <- subset(n, n$strand=="-")
  n3 <- n2$intron_start
  n2$intron_start <- n2$intron_end
  n2$intron_end <- n3
  n4 <- rbind(n1,n2)
  n5 <- subset(n4, select = c("seqnames", "intron_start", "intron_end", "width"))
  n5$width <- n5$intron_end-n5$intron_start+1
  n6 <- n4[,-which(names(n4) %in% c("seqnames", "start", "end", "width", "exon_number", "exon_id", "EXON_CLASSIFICATION", "intron_start", "intron_end"))]
  n7 <- cbind(n5,n6)
  n8 <- n7 %>% group_by(gene_id) %>% arrange(seqnames, intron_start)
  assign(deparse(substitute(introns_df)), n8, envir = .GlobalEnv)
}
monahton/GencodeInterrogator documentation built on Dec. 24, 2019, 1:31 p.m.