# Combine the raw data files (one per sample) into a single merged table.
# Sample format of each file "testN-KReadsPerGene.out.tab":
# N_unmapped 105984 105984 105984
# N_multimapping 171224 171224 171224
# N_noFeature 177533 3433150 277677
# N_ambiguous 319796 9239 136511
# ENSG00000223972 0 0 0
# ENSG00000227232 10 0 10
# ENSG00000278267 0 0 0
# ...
#
#
# The desired format of the merged data is:
# test1.1.1 test1.1.2 test1.1.3 test1.2.1 test1.2.2 ...
# N_unmapped 105984 105984 105984
# N_multimapping 171224 171224 171224
# N_noFeature 177533 3433150 277677
# N_ambiguous 319796 9239 136511
# ENSG00000223972 0 0 0
# ENSG00000227232 10 0 10
# ENSG00000278267 0 0 0
# ...
#
# Row names indicate metadata like the number of unmapped segments,
# and Ensembl gene designations. The designations are translated
# to human-readable form and appended as a final column in the merged
# data, taking the first symbol (if any) for each designator, using
# the "org.Hs.eg.db" package:
# ... SYMBOL
# N_unmapped ... NA
# N_multimapping ... NA
# N_noFeature ... NA
# N_ambiguous ... NA
# ENSG00000223972 ... DDX11L1
# ENSG00000227232 ... WASH7P
# ENSG00000278267 ... MIR6859-1
# ...
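# One-time setup with the legacy installer (older Bioconductor releases):
# source("http://bioconductor.org/biocLite.R")
# biocLite("org.Hs.eg.db") # for converting Ensembl IDs to gene symbols
# On current Bioconductor releases use instead:
# BiocManager::install("org.Hs.eg.db")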
library("org.Hs.eg.db")
# Reframe one per-sample data file with the desired row and column names.
#
# Args:
#   fileBaseDir: Directory that contains the file.
#   fileName:    Bare file name of the form
#                '<sample>-<replicate>ReadsPerGene.out.tab',
#                e.g. 'test6-4ReadsPerGene.out.tab'.
#
# Returns:
#   A data frame whose row names are the values of the file's first column
#   and whose remaining columns are renamed '<sample>.<replicate>.<k>',
#   with k = 1, 2, ... numbering the data columns from left to right.
processFile <- function(fileBaseDir, fileName) {
  fullFileName <- file.path(fileBaseDir, fileName)
  # Identifiers are only ever used as row names, so keep them as strings.
  counts <- read.table(fullFileName, stringsAsFactors = FALSE)
  # Reframe the data, taking row names from the first column. drop = FALSE
  # keeps a data frame even when only one data column is present.
  countsWithRownames <- data.frame(counts[, -1, drop = FALSE],
                                   row.names = counts[, 1])
  # Generate 'test6.4.' from 'test6-4ReadsPerGene.out.tab'. Parsing around
  # the '-' separator (rather than cutting at fixed character positions) also
  # handles multi-character ids such as 'test10-12ReadsPerGene.out.tab'.
  nameParts <- regmatches(
    fileName,
    regexec("^([^-]+)-(.+)ReadsPerGene", fileName)
  )[[1]]
  if (length(nameParts) == 3) {
    sampleId <- nameParts[2]
    replicateId <- nameParts[3]
  } else {
    # Name does not follow the expected pattern: fall back to the fixed
    # character positions (1-5 and 7) used for 'testN-K...' names.
    sampleId <- substring(fileName, 1, 5)
    replicateId <- substring(fileName, 7, 7)
  }
  newColBaseName <- paste0(sampleId, ".", replicateId, ".")
  # Rename the columns as test6.4.1, test6.4.2, test6.4.3, ...
  colnames(countsWithRownames) <- paste0(
    newColBaseName,
    seq_len(ncol(countsWithRownames))
  )
  countsWithRownames
}
# Merge reframed data from all input files, and add a 'symbol' column.
#
# Args:
#   baseDir: Directory that contains every file listed in 'files'.
#   files:   Character vector of file names; each one is read and reframed
#            with processFile().
#
# Returns:
#   A data frame with the columns of all reframed files side by side, in the
#   order given by 'files', plus a final 'symbol' column obtained by looking
#   up each row name as an Ensembl gene id in org.Hs.eg.db (first symbol when
#   several exist). Row names without a mapping, such as the N_* summary
#   rows, are expected to get NA -- see the mapIds() documentation.
mergeFiles <- function(baseDir, files) {
  if (length(files) == 0) {
    stop("mergeFiles() needs at least one input file", call. = FALSE)
  }
  # Use the 'files' argument; the caller's global variable must not leak in.
  processedData <- lapply(files, processFile, fileBaseDir = baseDir)
  # cbind() pairs rows purely by position, so every file has to list the same
  # identifiers in the same order; otherwise counts would be attached to the
  # wrong gene without any warning.
  referenceRows <- row.names(processedData[[1]])
  rowsMatch <- vapply(
    processedData,
    function(reframed) identical(row.names(reframed), referenceRows),
    logical(1)
  )
  if (!all(rowsMatch)) {
    stop("Row names differ from those of '", files[[1]], "' in: ",
         paste(files[!rowsMatch], collapse = ", "),
         call. = FALSE)
  }
  merged <- do.call(cbind.data.frame, processedData)
  merged$symbol <- mapIds(org.Hs.eg.db,
                          keys = row.names(merged),
                          column = "SYMBOL",
                          keytype = "ENSEMBL",
                          multiVals = "first")
  merged
}
# Directory holding the per-sample ReadsPerGene files. Spaces in an R string
# literal are written as-is: "\ " is not a recognized escape sequence and
# stops the script from parsing.
dataDir <- "/Volumes/G-DRIVE mobile USB-C/Analysis/Testrun/STAR/results"
# Input files 'test1-1ReadsPerGene.out.tab' ... 'test6-4ReadsPerGene.out.tab':
# six samples with four files each, ordered by sample and then by file number
# (test1-1, test1-2, test1-3, test1-4, test2-1, ...).
inputFiles <- sprintf("test%d-%dReadsPerGene.out.tab",
                      rep(1:6, each = 4),
                      rep(1:4, times = 6))
# Build the merged table for all input files.
test <- mergeFiles(dataDir, inputFiles)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.