# R/tcgaTableGenerator.R

# Defines functions tcgaTableGenerator

# Documented in tcgaTableGenerator

#' @title Generate a gene/microRNA expression matrix from per-sample sequencing data files downloaded from the GDC Data Portal.
#'
#' @description \code{tcgaTableGenerator} reads the \code{id} column of the
#' \code{MANIFEST.txt} file found in \code{dataDir}, loads one expression file
#' from each \code{dataDir/<id>} sub-directory and combines them into a single
#' matrix. Values are aligned by feature (row) name, using the feature order of
#' the first file.
#'
#' @param dataDir The directory of downloaded sequencing data files as well as the MANIFEST.txt file.
#' @param dataType A string, 'microRNA' for microRNA-seq data or 'mRNA' for RNA-seq data.
#'   For 'mRNA' the file whose name contains \code{FPKM.txt.gz} is read and its
#'   first data column is used; for 'microRNA' the file whose name contains
#'   \code{.mirbase21.mirnas.} is read and its
#'   \code{reads_per_million_miRNA_mapped} column is used.
#'
#' @return A gene/microRNA expression data matrix, with rows referring to
#'   genes/microRNAs and columns to samples (columns are named by the manifest ids).
#'   An error is raised if \code{dataType} is not one of the two supported
#'   values, if the manifest is missing or lists no ids, if a sub-directory does
#'   not contain exactly one matching file, or if the files do not all describe
#'   the same set of features.
#'
#' @export tcgaTableGenerator
#'
#' @examples
#' \dontrun{
#' tcgaTableGenerator(dataDir = './TCGA_RNAseq_LUAD', dataType = 'mRNA')
#' tcgaTableGenerator(dataDir = './TCGA_miRNAseq_LUAD', dataType = 'microRNA')
#' }
tcgaTableGenerator <- function(dataDir, dataType){

    # Fail early on an unsupported data type instead of silently reading nothing.
    validTypes <- c('mRNA', 'microRNA')
    if(!is.character(dataType) || length(dataType) != 1L || !(dataType %in% validTypes)){
        stop("'dataType' must be either 'mRNA' or 'microRNA'", call. = FALSE)
    }

    manifestFile <- file.path(dataDir, 'MANIFEST.txt')
    if(!file.exists(manifestFile)){
        stop('MANIFEST.txt not found in ', dataDir, call. = FALSE)
    }
    mani.v <- as.character(utils::read.table(manifestFile, header = TRUE)[, 'id'])
    # Drop missing ids and ids equal to the literal two-character string \N
    # (presumably a null marker in the manifest -- confirm against GDC output).
    mani <- mani.v[!is.na(mani.v) & mani.v != '\\N']
    if(length(mani) == 0L){
        stop('no file ids found in ', manifestFile, call. = FALSE)
    }

    # Literal substring that identifies the expression file of each sample.
    if(dataType == 'mRNA'){
        filePattern <- 'FPKM.txt.gz'
    } else {
        filePattern <- '.mirbase21.mirnas.'
    }

    # One named numeric vector (feature id -> value) per manifest entry.
    exp.l <- vector(length = length(mani), mode = 'list')
    for(i in seq_along(mani)){
        dataDir_i <- file.path(dataDir, mani[i])
        fileNames <- list.files(dataDir_i)
        # Match on the bare file name with fixed = TRUE so that the dots are
        # literal and the directory part of the path can never match.
        hits <- fileNames[grepl(filePattern, fileNames, fixed = TRUE)]
        if(length(hits) != 1L){
            stop('expected exactly one file matching "', filePattern, '" in ',
                 dataDir_i, ', found ', length(hits), call. = FALSE)
        }
        f <- file.path(dataDir_i, hits)

        if(dataType == 'mRNA'){
            tab <- utils::read.table(gzfile(f), sep = "\t", row.names = 1)
            values <- tab[, 1]
        } else {
            tab <- utils::read.table(f, sep = "\t", row.names = 1, header = TRUE)
            values <- tab[, 'reads_per_million_miRNA_mapped']
        }
        names(values) <- row.names(tab)
        exp.l[[i]] <- values
        print(paste0('read in file ', i))
    }

    # The first file defines the feature set and the row order of the result.
    featureIds <- names(exp.l[[1]])
    exp.m <- matrix(nrow = length(featureIds), ncol = length(exp.l),
                    dimnames = list(featureIds, mani))
    for(i in seq_along(exp.l)){
        values <- exp.l[[i]]
        if(length(values) != length(featureIds) || !all(featureIds %in% names(values))){
            stop('file for id "', mani[i],
                 '" does not contain the same features as the first file',
                 call. = FALSE)
        }
        # Index by name so that a different row order cannot misalign values.
        exp.m[, i] <- values[featureIds]
    }
    exp.m
}
# YC3/mirNet documentation built on Sept. 3, 2020, 3:25 a.m.