# R/biotypeOperations.R
#
# Defines functions: filterBiotype, extractBiotype
#
# Documented in: extractBiotype, filterBiotype

#' Get the biotype of the non-coding genes. It is suitable for the
#' GENCODE gtf files
#'
#' Every non-comment record of the annotation is scanned for its
#' \code{gene_id} attribute and for its gene biotype attribute
#' (\code{gene_type} in GENCODE files, \code{gene_biotype} in Ensembl
#' files).
#'
#' @param gtfFile Path of the input gtf file which contains biotype
#'     information. The gtf file must be provided from the Ensembl
#'     or Gencode site. For space efficiency, gtf files should be
#'     in a zip format.
#'
#' @return A data frame with one row per annotation record and two
#'   character columns: \code{V1} holds the gene id and \code{V2} the
#'   gene biotype. A value is \code{NA} when the record does not carry
#'   the corresponding attribute.
#'
#' @import readr
#'
#' @examples
#' fileImport<-system.file("extdata", "temp.gtf", package = "NoRCE")
#' gtf <- extractBiotype(gtfFile = fileImport)
#'
#' @export
#'
extractBiotype <- function(gtfFile) {
  # Read raw records instead of a parsed table: the attribute column mixes
  # tabs, spaces and semicolons, so a whitespace-splitting table reader
  # cannot be relied on to keep a record intact in a single column.
  records <- readr::read_lines(gtfFile)

  # Drop header/comment lines (starting with '#') and blank lines.
  records <- records[!grepl("^[[:space:]]*(#|$)", records)]

  # Return the (unquoted) value of one attribute for every record, or NA
  # where the attribute is missing. The key must start a field, i.e. follow
  # the beginning of the line, a tab or a semicolon.
  attributeValue <- function(key) {
    pattern <- paste0("(?:^|[;\t])\\s*", key, "\\s+\"?([^\";]*)\"?")
    found <- regmatches(records, regexec(pattern, records, perl = TRUE))
    vapply(
      found,
      function(m) {
        if (length(m) >= 2L) trimws(m[2L]) else NA_character_
      },
      character(1)
    )
  }

  # Column names V1/V2 and their order (id, biotype) are what positional
  # consumers such as filterBiotype() expect.
  data.frame(
    V1 = attributeValue("gene_id"),
    V2 = attributeValue("gene_(?:bio)?type"),
    stringsAsFactors = FALSE
  )
}

#' Extract the genes that have user provided biotypes. This method is useful
#' when input gene list is mixed or when research of the interest is only
#' focused on specific group of genes.
#'
#' @param gtfFile Path of the input gtf file; it is handed to the
#'     extractBiotype function to obtain the gene id and biotype table
#' @param biotypes Selected biotypes for the genes
#'
#' @return Data frame with a single column \code{gene} holding the ids
#'   (version suffix removed) of the genes that have one of the given
#'   biotypes. The data frame has zero rows when no gene matches.
#'
#' @examples
#' biotypes <- c('unprocessed_pseudogene','transcribed_unprocessed_pseudogene')
#' fileImport<-system.file("extdata", "temp.gtf", package = "NoRCE")
#' extrResult <- filterBiotype(fileImport, biotypes)
#'
#' @export
filterBiotype <- function(gtfFile, biotypes) {
  gtf <- extractBiotype(gtfFile = gtfFile)
  ids <- as.character(gtf[[1]])

  # Collect the ids biotype by biotype so the result follows the order in
  # which the biotypes were requested; which() skips NA comparisons.
  perBiotype <- lapply(biotypes, function(biotype) {
    hits <- which(gtf == biotype, arr.ind = TRUE)
    ids[hits[, 1]]
  })
  selected <- unique(as.character(unlist(perBiotype, use.names = FALSE)))

  # Strip the version suffix (everything from the first dot on). Unlike
  # splitting on dots and keeping every other piece, this is also correct
  # for ids without a version and for ids containing several dots.
  data.frame(
    gene = sub("[.].*$", "", selected),
    stringsAsFactors = FALSE
  )
}

# Try the NoRCE package in your browser
#
# Any scripts or data that you put into this service are public.
#
# NoRCE documentation built on Nov. 8, 2020, 7:17 p.m.