Nothing
# Creating Hisat2 index
# Creat Hisat2 index for further use
# Parameters: 11
CreateHisat2Index <- function (path.prefix,
genome.name,
sample.pattern,
splice.site.info = TRUE,
exon.info = TRUE,
Hisat2.Index.num.parallel.threads,
Hisat2.Index.large.index,
Hisat2.Index.local.ftab.chars,
Hisat2.Index.local.off.rate,
Hisat2.Index.ftab.chars,
Hisat2.Index.off.rate) {
if (isTRUE(CheckHisat2(print=FALSE))){
if (!is.logical(splice.site.info) || !is.logical(exon.info)) {
stop("(\u2718) Please make sure the type of ",
"'splice.site.info' and 'exon.info' are logical.\n")
} else {
# Check file progress
check.results <- ProgressGenesFiles(path.prefix,
genome.name,
sample.pattern,
print=TRUE)
message("\n\u2618\u2618\u2618 Index Creation :\n")
message("************** Creating Hisat2 Index **************\n")
if (check.results$gtf.file.logic.df && check.results$fa.file.logic.df){
command.list <- c()
command.list <- c(command.list, "* Creating Hisat2 Index : ")
current.path <- getwd()
setwd(paste0(path.prefix, "gene_data/indices/"))
if (isTRUE(splice.site.info)) {
whole.command <- paste0(path.prefix, "gene_data/ref_genes/",
genome.name, ".gtf > ", genome.name, ".ss")
main.command <- "extract_splice_sites.py"
message("Input command : ",
paste(main.command, whole.command), "\n")
command.list <- c(command.list,
paste(" command :", main.command, whole.command))
command.result <- system2(command = main.command,
args = whole.command)
# break is return is not 0 (means success!)
if (command.result != 0 ) {
on.exit(setwd(current.path))
message("(\u2718) '", main.command, "' is failed !!")
stop("'", main.command, "' ERROR")
}
message("\n")
}
if (isTRUE(exon.info)) {
whole.command <- paste0(path.prefix, 'gene_data/ref_genes/',
genome.name, '.gtf > ', genome.name, '.exon')
main.command <- "extract_exons.py"
message("Input command : ",
paste(main.command, whole.command), "\n")
command.list <- c(command.list,
paste(" command :", main.command, whole.command))
command.result <- system2(command = main.command,
args = whole.command)
if (command.result != 0 ) {
on.exit(setwd(current.path))
message("(\u2718) '", main.command, "' is failed !!")
stop("'", main.command, "' ERROR")
}
message("\n")
}
if (isTRUE(splice.site.info) && isTRUE(exon.info)) {
if (isTRUE(Hisat2.Index.large.index)) {
Hisat2.Index.large.index.value <- "--large-index"
} else {
Hisat2.Index.large.index.value <- ""
}
whole.command <- paste(paste(Hisat2.Index.large.index.value,
'-p', Hisat2.Index.num.parallel.threads,
'--localftabchars', Hisat2.Index.local.ftab.chars,
'--localoffrate', Hisat2.Index.local.off.rate,
'--ftabchars', Hisat2.Index.ftab.chars,
'--offrate', Hisat2.Index.off.rate,
'--ss', paste0(genome.name, '.ss'),
"--exon", paste0(genome.name, '.exon'),
paste0(path.prefix,
'gene_data/ref_genome/',
genome.name, '.fa'),
paste0(genome.name, '_tran')))
main.command <- "hisat2-build"
message("Input command : ", paste(main.command, whole.command), "\n")
command.list <- c(command.list,
paste(" command :", main.command, whole.command))
command.result <- system2(command = main.command,
args = whole.command)
if (command.result != 0 ) {
on.exit(setwd(current.path))
message("(\u2718) '", main.command, "' is failed !!")
stop("'", main.command, "' ERROR")
}
message("\n")
} else if (isTRUE(splice.site.info) && !isTRUE(exon.info)) {
whole.command <- paste(paste('--ss', paste0(genome.name, '.ss'),
paste0(path.prefix,
'gene_data/ref_genome/',
genome.name, '.fa'),
paste0(genome.name, '_tran')))
main.command <- "hisat2-build"
message("Input command : ", paste(main.command, whole.command), "\n")
command.list <- c(command.list,
paste(" command :", main.command, whole.command))
command.result <- system2(command = main.command,
args = whole.command)
if (command.result != 0 ) {
on.exit(setwd(current.path))
message("(\u2718) '", main.command, "' is failed !!")
stop("'", main.command, "' ERROR")
}
message("\n")
} else if (!splice.site.info && exon.info) {
whole.command <- paste(paste('--exon', paste0(genome.name, '.exon'),
paste0(path.prefix,
'gene_data/ref_genome/',
genome.name, '.fa'),
paste0(genome.name, '_tran')))
main.command <- "hisat2-build"
message("Input command : ", paste(main.command, whole.command), "\n")
command.list <- c(command.list,
paste(" command :", main.command, whole.command))
command.result <- system2(command = main.command,
args = whole.command)
if (command.result != 0 ) {
on.exit(setwd(current.path))
message("(\u2718) '", main.command, "' is failed !!")
stop("'", main.command, "' ERROR")
}
message("\n")
} else {
whole.command <- paste(paste(paste0(path.prefix,
'gene_data/ref_genome/',
genome.name, '.fa'),
paste0(genome.name, '_tran')))
main.command <- "hisat2-build"
message("Input command : ", paste(main.command, whole.command), "\n")
command.list <- c(command.list,
paste(" command :", main.command, whole.command))
command.result <- system2(command = main.command,
args = whole.command)
if (command.result != 0 ) {
on.exit(setwd(current.path))
message("(\u2718) '", main.command, "' is failed !!")
stop("'", main.command, "' ERROR")
}
message("\n")
}
command.list <- c(command.list, "\n")
fileConn <- paste0(path.prefix, "RNASeq_results/COMMAND.txt")
write(command.list, fileConn, append = TRUE)
on.exit(setwd(current.path))
message("'", path.prefix, "gene_data/indices/",
genome.name, "_tran.*.ht2' has been created.\n\n")
} else {
on.exit(setwd(current.path))
stop("(\u2718) '", genome.name, ".gtf' or",
" '", genome.name, ".fa'is missing.\n\n")
}
}
}
}
# hisat2 alignment default
# Parameters: 42
Hisat2AlignmentDefault <- function(path.prefix,
fastq.gz.type,
genome.name,
sample.pattern,
independent.variable,
case.group,
control.group,
Hisat2.Alignment.num.parallel.threads,
Hisat2.Alignment.skip,
Hisat2.Alignment.trim5,
Hisat2.Alignment.trim3,
Hisat2.Alignment.n.ceil.1.function.type,
Hisat2.Alignment.n.ceil.2.constant.term,
Hisat2.Alignment.n.ceil.3.coefficient,
Hisat2.Alignment.mp.MX,
Hisat2.Alignment.mp.MN,
Hisat2.Alignment.sp.MX,
Hisat2.Alignment.sp.MN,
Hisat2.Alignment.np,
Hisat2.Alignment.rdg.1,
Hisat2.Alignment.rdg.2,
Hisat2.Alignment.rfg.1,
Hisat2.Alignment.rfg.2,
Hisat2.Alignment.score.min.1.function.type,
Hisat2.Alignment.score.min.2.constant.term,
Hisat2.Alignment.score.min.3.coefficient,
Hisat2.Alignment.pen.cansplice,
Hisat2.Alignment.penc.noncansplice,
Hisat2.Alignment.pen.canintronlen.1.function.type,
Hisat2.Alignment.pen.canintronlen.2.constant.term,
Hisat2.Alignment.pen.canintronlen.3.coefficient,
Hisat2.Alignment.pen.noncanintronlen.1.function.type,
Hisat2.Alignment.pen.noncanintronlen.2.constant.term,
Hisat2.Alignment.pen.noncanintronlen.3.coefficient,
Hisat2.Alignment.min.intronlen,
Hisat2.Alignment.max.intronlen,
Hisat2.Alignment.rna.strandness,
Hisat2.Alignment.k,
Hisat2.Alignment.max.seeds,
Hisat2.Alignment.secondary,
Hisat2.Alignment.minins,
Hisat2.Alignment.maxins,
Hisat2.Alignment.seed) {
if (isTRUE(CheckHisat2(print=FALSE))) {
check.results <- ProgressGenesFiles(path.prefix,
genome.name,
sample.pattern,
print=TRUE)
message("\n\u2618\u2618\u2618 Alignment :\n")
message("************** Hisat2 Alignment **************\n")
if (check.results$ht2.files.number.df != 0 &&
check.results$fastq.gz.files.number.df != 0){
command.list <- c()
command.list <- c(command.list, "* Hisat2 Alignment : ")
# Map reads to each alignment
current.path <- getwd()
setwd(paste0(path.prefix, "gene_data/"))
if (fastq.gz.type == "SE") {
deleteback <- gsub("1.fastq.gz$", replacement = "",
check.results$fastq.gz.files.df)
sample.table.r.value <- gsub(paste0("[A-Z, a-z]*[0-9]*_"),
replacement = "",
deleteback)
} else if (fastq.gz.type == "PE") {
deleteback <- gsub("[1-2]*.fastq.gz$", replacement = "",
check.results$fastq.gz.files.df)
sample.table.r.value <- gsub(paste0("[A-Z, a-z]*[0-9]*_"),
replacement = "",
deleteback)
}
if (isTRUE(length(unique(sample.table.r.value)) != 1)){
on.exit(setwd(current.path))
stop("(\u2718) Inconsistent formats. Please check files are all",
"'XXX_*.fastq.gz'\n\n")
} else {
sample.table <- table(gsub(paste0("_[1-2]*.fastq.gz$"),
replacement = "",
check.results$fastq.gz.files.df))
iteration.num <- length(sample.table)
sample.name <- names(sample.table)
sample.value <- as.vector(sample.table)
if (fastq.gz.type == "SE") {
alignment.result <- data.frame(matrix(0, ncol = 0, nrow = 4))
} else if (fastq.gz.type == "PE") {
alignment.result <- data.frame(matrix(0, ncol = 0, nrow = 5))
}
total.map.rates <- c()
for( i in seq_len(iteration.num)){
current.sub.command <- ""
total.sub.command <- ""
if (fastq.gz.type == "SE") {
for ( j in seq_len(sample.value[i])){
current.sub.command <- paste(paste0("-U"),
paste0("raw_fastq.gz/",
sample.name[i], "_",
sample.table.r.value[1],
j, ".fastq.gz"))
total.sub.command <- paste(total.sub.command, current.sub.command)
}
} else if (fastq.gz.type == "PE") {
for ( j in seq_len(sample.value[i])){
current.sub.command <- paste(paste0("-", j),
paste0("raw_fastq.gz/",
sample.name[i], "_",
sample.table.r.value[1],
j, ".fastq.gz"))
total.sub.command <- paste(total.sub.command, current.sub.command)
}
}
if (Hisat2.Alignment.rna.strandness == "FR") {
Hisat2.Alignment.rna.strandness.value = "--rna-strandness FR"
} else if (Hisat2.Alignment.rna.strandness == "RF") {
Hisat2.Alignment.rna.strandness.value = "--rna-strandness RF"
} else if (Hisat2.Alignment.rna.strandness == "None") {
Hisat2.Alignment.rna.strandness.value = ""
} else {
stop("'rna.strandness' variable is out of range. It should be 'FR' or 'RF', 'None'.")
}
if (isTRUE(Hisat2.Alignment.secondary)) {
Hisat2.Alignment.secondary.value = "--secondary"
} else {
Hisat2.Alignment.secondary.value = ""
}
whole.command <- paste("--skip", Hisat2.Alignment.skip,
"--trim5", Hisat2.Alignment.trim5,
"--trim3", Hisat2.Alignment.trim3,
"--n-ceil",
paste0(Hisat2.Alignment.n.ceil.1.function.type,
',', Hisat2.Alignment.n.ceil.2.constant.term,
',', Hisat2.Alignment.n.ceil.3.coefficient),
"--mp",
paste0(Hisat2.Alignment.mp.MX, ',', Hisat2.Alignment.mp.MN),
"--sp",
paste0(Hisat2.Alignment.sp.MX, ',', Hisat2.Alignment.sp.MN),
"--np", Hisat2.Alignment.np,
"--rdg",
paste0(Hisat2.Alignment.rdg.1, ',', Hisat2.Alignment.rdg.2),
"--rfg",
paste0(Hisat2.Alignment.rfg.1, ',', Hisat2.Alignment.rfg.2),
"--score-min",
paste0(Hisat2.Alignment.score.min.1.function.type,
',', Hisat2.Alignment.score.min.2.constant.term,
',', Hisat2.Alignment.score.min.3.coefficient),
"--pen-cansplice", Hisat2.Alignment.pen.cansplice,
"--pen-noncansplice", Hisat2.Alignment.penc.noncansplice,
"--pen-canintronlen",
paste0(Hisat2.Alignment.pen.canintronlen.1.function.type,
',', Hisat2.Alignment.pen.canintronlen.2.constant.term,
',', Hisat2.Alignment.pen.canintronlen.3.coefficient),
"--pen-noncanintronlen",
paste0(Hisat2.Alignment.pen.noncanintronlen.1.function.type,
',', Hisat2.Alignment.pen.noncanintronlen.2.constant.term,
',', Hisat2.Alignment.pen.noncanintronlen.3.coefficient),
"--min-intronlen", Hisat2.Alignment.min.intronlen,
"--max-intronlen", Hisat2.Alignment.max.intronlen,
Hisat2.Alignment.rna.strandness.value,
"-k", Hisat2.Alignment.k,
"--max-seeds", Hisat2.Alignment.max.seeds,
Hisat2.Alignment.secondary.value,
"--minins", Hisat2.Alignment.minins,
"--maxins", Hisat2.Alignment.maxins,
"--seed", Hisat2.Alignment.seed, "--time",
"-p", Hisat2.Alignment.num.parallel.threads,"--dta -x",
paste0("indices/", genome.name, "_tran"),
total.sub.command, "-S",
paste0("raw_sam/", sample.name[i],".sam") )
if (i != 1) message("\n")
main.command <- "hisat2"
message("Input command : ", paste(main.command, whole.command), "\n")
command.list <- c(command.list,
paste(" command :", main.command, whole.command))
command.result <- system2(command = main.command,
args = whole.command,
stderr = TRUE, stdout = TRUE)
print(command.result)
if (fastq.gz.type == "SE") {
total.reads <- gsub(" reads; of these:", "", command.result[4])
# aligned concordantly exactly 1 time
zero_time <- as.numeric(gsub(" \\([0-9]*.[0-9]*%) aligned 0 times", "", command.result[6]))
one_time <- as.numeric(gsub(" \\([0-9]*.[0-9]*%) aligned exactly 1 time", "", command.result[7]))
more_one_time <- as.numeric(gsub(" \\([0-9]*.[0-9]*%) aligned >1 times", "", command.result[8]))
total.map.rate <- as.numeric(gsub("% overall alignment rate", "", command.result[9]))
total.map.rates <- c(total.map.rates, total.map.rate)
one.result <- c(total.reads, zero_time, one_time, more_one_time)
alignment.result[[sample.name[i]]] <- one.result
if (length(command.result) == 0) {
on.exit(setwd(current.path))
message("(\u2718) '", main.command, "' is failed !!")
stop("'", main.command, "' ERROR")
}
} else if (fastq.gz.type == "PE") {
# Total reads
total.reads <- gsub(" reads; of these:", "", command.result[4])
# aligned concordantly exactly 1 time
concordantly_1_time <- as.numeric(gsub(" \\([0-9]*.[0-9]*%) aligned concordantly exactly 1 time", "", command.result[7]))
# aligned concordantly >1 times
concordantly_more_1_times <- as.numeric(gsub(" \\([0-9]*.[0-9]*%) aligned concordantly >1 times", "", command.result[8]))
# aligned dicordantly 1 time
dicordantly_1_time <- as.numeric(gsub(" \\([0-9]*.[0-9]*%) aligned discordantly 1 time", "", command.result[11]))
# aligned 0 times concordantly or discordantly
not_dicordantly_concordantly <- as.numeric(gsub(" pairs aligned 0 times concordantly or discordantly; of these:", "", command.result[13]))
# Total mapping rate
total.map.rate <- as.numeric(gsub("% overall alignment rate", "", command.result[18]))
total.map.rates <- c(total.map.rates, total.map.rate)
one.result <- c(total.reads, concordantly_1_time, concordantly_more_1_times, dicordantly_1_time, not_dicordantly_concordantly)
alignment.result[[sample.name[i]]] <- one.result
if (length(command.result) == 0) {
on.exit(setwd(current.path))
message("(\u2718) '", main.command, "' is failed !!")
stop("'", main.command, "' ERROR")
}
}
}
if (fastq.gz.type == "SE") {
row.names(alignment.result) <- c("total_reads", "map_0_time", "map_1_time", "more_than_1_times")
alignment.result[] <- lapply(alignment.result, as.character)
alignment.result[] <- lapply(alignment.result, as.numeric)
} else if (fastq.gz.type == "PE") {
row.names(alignment.result) <- c("total_reads", "concordantly_1", "concordantly_more_1", "dicordantly_1", "not_both")
alignment.result[] <- lapply(alignment.result, as.character)
alignment.result[] <- lapply(alignment.result, as.numeric)
}
if(!dir.exists(paste0(path.prefix, "RNASeq_results/Alignment_Report/"))){
dir.create(paste0(path.prefix, "RNASeq_results/Alignment_Report/"))
}
trans.df <- data.frame(t(alignment.result))
write.csv(trans.df,
file = paste0(path.prefix,
"RNASeq_results/",
"Alignment_Report/Alignment_report_reads.csv"),
row.names = TRUE)
if (fastq.gz.type == "SE") {
trans.df.portion <- data.frame(t(alignment.result))
trans.df.portion$map_0_time <- round(trans.df.portion$map_0_time / trans.df.portion$total_reads, 4)
trans.df.portion$map_1_time <- round(trans.df.portion$map_1_time / trans.df.portion$total_reads, 4)
trans.df.portion$more_than_1_times <- round(trans.df.portion$more_than_1_times / trans.df.portion$total_reads, 4)
} else if (fastq.gz.type == "PE") {
trans.df.portion <- data.frame(t(alignment.result))
trans.df.portion$concordantly_1 <- round(trans.df.portion$concordantly_1 / trans.df.portion$total_reads, 4)
trans.df.portion$concordantly_more_1 <- round(trans.df.portion$concordantly_more_1 / trans.df.portion$total_reads, 4)
trans.df.portion$dicordantly_1 <- round(trans.df.portion$dicordantly_1 / trans.df.portion$total_reads, 4)
trans.df.portion$not_both <- round(trans.df.portion$not_both / trans.df.portion$total_reads, 4)
}
write.csv(trans.df.portion,
file = paste0(path.prefix,
"RNASeq_results/",
"Alignment_Report/Alignment_report_proportion.csv"),
row.names = TRUE)
names(total.map.rates) <- sample.name
total.map.rates <- data.frame(total.map.rates)
write.csv(total.map.rates,
file = paste0(path.prefix,
"RNASeq_results/",
"Alignment_Report/Overall_Mapping_rate.csv"),
row.names = TRUE)
message("\n")
command.list <- c(command.list, "\n")
fileConn <- paste0(path.prefix, "RNASeq_results/COMMAND.txt")
write(command.list, fileConn, append = TRUE)
on.exit(setwd(current.path))
}
} else {
on.exit(setwd(current.path))
stop("(\u2718) '", genome.name, "_tran.*.ht2' ",
"or 'XXX_*.fastq.gz' is missing.\n\n")
}
}
}
# Creating STAR index
# Creat STAR index for further use
# Parameters: 8
CreateSTARIndex <- function (path.prefix,
genome.name,
sample.pattern,
STAR.Index.num.parallel.threads,
STAR.Index.sjdbOverhang.Read.length,
STAR.Index.genomeSAindexNbases,
STAR.Index.genomeChrBinNbits,
STAR.Index.genomeSAsparseD) {
if (isTRUE(CheckSTAR(print=FALSE))){
check.results <- ProgressGenesFiles(path.prefix,
genome.name,
sample.pattern,
print=TRUE)
message("\n\u2618\u2618\u2618 Index Creation :\n")
message("************** Creating STAR Index **************\n")
if (check.results$gtf.file.logic.df && check.results$fa.file.logic.df){
command.list <- c()
command.list <- c(command.list, "* Creating STAR Index : ")
current.path <- getwd()
setwd(paste0(path.prefix, "gene_data/indices/"))
whole.command <- paste("--genomeSAindexNbases", STAR.Index.genomeSAindexNbases,
"--genomeChrBinNbits", STAR.Index.genomeChrBinNbits,
"--genomeSAsparseD", STAR.Index.genomeSAsparseD,
"--runThreadN", STAR.Index.num.parallel.threads,
"--runMode", "genomeGenerate",
"--genomeDir",
paste0(path.prefix, "gene_data/indices/"),
"--genomeFastaFiles",
paste0(path.prefix, 'gene_data/ref_genome/',
genome.name, '.fa'),
"--sjdbGTFfile",
paste0(path.prefix, "gene_data/ref_genes/",
genome.name, ".gtf"),
"--sjdbOverhang", strtoi(STAR.Index.sjdbOverhang.Read.length)-1)
main.command <- "STAR"
message("Input command : ", paste(main.command, whole.command), "\n")
command.list <- c(command.list,
paste(" command :", main.command, whole.command))
command.result <- system2(command = main.command,
args = whole.command)
if (command.result != 0 ) {
on.exit(setwd(current.path))
message("(\u2718) '", main.command, "' is failed !!")
stop("'", main.command, "' ERROR")
}
message("\n")
command.list <- c(command.list, "\n")
fileConn <- paste0(path.prefix, "RNASeq_results/COMMAND.txt")
write(command.list, fileConn, append = TRUE)
on.exit(setwd(current.path))
message("'", path.prefix, "gene_data/indices/",
genome.name, "STAR indices has been created.\n\n")
} else {
on.exit(setwd(current.path))
stop("(\u2718) '", genome.name, ".gtf' or",
" '", genome.name, ".fa'is missing.\n\n")
}
}
}
# STAR alignment default
# Parameters: 53
STARAlignmentDefault <- function(path.prefix,
fastq.gz.type,
genome.name,
sample.pattern,
STAR.Alignment.num.parallel.threads,
STAR.Alignment.genomeLoad,
STAR.Alignment.readMapNumber,
STAR.Alignment.clip3pNbases,
STAR.Alignment.clip5pNbases,
STAR.Alignment.clip3pAdapterSeq,
STAR.Alignment.clip3pAdapterMMp,
STAR.Alignment.clip3pAfterAdapterNbases,
STAR.Alignment.limitGenomeGenerateRAM,
STAR.Alignment.limitIObufferSize,
STAR.Alignment.limitOutSAMoneReadBytes,
STAR.Alignment.limitOutSJoneRead,
STAR.Alignment.limitOutSJcollapsed,
STAR.Alignment.limitBAMsortRAM,
STAR.Alignment.outReadsUnmapped,
STAR.Alignment.outQSconversionAdd,
STAR.Alignment.outSAMprimaryFlag,
STAR.Alignment.outSAMmapqUnique,
STAR.Alignment.scoreGap,
STAR.Alignment.scoreGapNoncan,
STAR.Alignment.scoreGapGCAG,
STAR.Alignment.scoreGapATAC,
STAR.Alignment.scoreGenomicLengthLog2scale,
STAR.Alignment.scoreDelOpen,
STAR.Alignment.scoreDelBase,
STAR.Alignment.scoreInsOpen,
STAR.Alignment.scoreInsBase,
STAR.Alignment.scoreStitchSJshift,
STAR.Alignment.seedSearchStartLmax,
STAR.Alignment.seedSearchStartLmaxOverLread,
STAR.Alignment.seedSearchLmax,
STAR.Alignment.seedMultimapNmax,
STAR.Alignment.seedPerReadNmax,
STAR.Alignment.seedPerWindowNmax,
STAR.Alignment.seedNoneLociPerWindow,
STAR.Alignment.alignIntronMin,
STAR.Alignment.alignIntronMax,
STAR.Alignment.alignMatesGapMax,
STAR.Alignment.alignSJoverhangMin,
STAR.Alignment.alignSJDBoverhangMin,
STAR.Alignment.alignSplicedMateMapLmin,
STAR.Alignment.alignSplicedMateMapLminOverLmate,
STAR.Alignment.alignWindowsPerReadNmax,
STAR.Alignment.alignTranscriptsPerWindowNmax,
STAR.Alignment.alignTranscriptsPerReadNmax,
STAR.Alignment.alignEndsType,
STAR.Alignment.winAnchorMultimapNmax,
STAR.Alignment.winBinNbits,
STAR.Alignment.winAnchorDistNbins,
STAR.Alignment.winFlankNbins) {
if (isTRUE(CheckSTAR(print=FALSE))) {
check.results <- ProgressGenesFiles(path.prefix,
genome.name,
sample.pattern,
print=TRUE)
message("\n\u2618\u2618\u2618 Alignment :\n")
message("************** STAR Alignment **************\n")
samples.star.dir <- dir.create(file.path(paste0(path.prefix,
"gene_data/raw_star")),
showWarnings = FALSE) == 0
if (!isTRUE(samples.star.dir)) {
message(" (\u2714) : Create '", path.prefix,
"gene_data/raw_star/'.\n")
} else {
message(" (\u26A0) : '", path.prefix,
"gene_data/raw_star/' has already be created.\n")
}
if (check.results$ht2.files.number.df != 0 &&
check.results$fastq.gz.files.number.df != 0){
command.list <- c()
command.list <- c(command.list, "* STAR Alignment : ")
# Map reads to each alignment
current.path <- getwd()
setwd(paste0(path.prefix, "gene_data/"))
deleteback <- gsub("[1-2]*.fastq.gz$", replacement = "",
check.results$fastq.gz.files.df)
sample.table.r.value <- gsub(paste0("[A-Z, a-z]*[0-9]*_"),
replacement = "",
deleteback)
if (isTRUE(length(unique(sample.table.r.value)) != 1)){
on.exit(setwd(current.path))
stop("(\u2718) Inconsistent formats. Please check files are all",
"'XXX_*.fastq.gz'\n\n")
} else {
sample.table <- table(gsub(paste0("_[1-2]*.fastq.gz$"),
replacement = "",
check.results$fastq.gz.files.df))
iteration.num <- length(sample.table)
sample.name <- names(sample.table)
sample.value <- as.vector(sample.table)
alignment.result <- data.frame(matrix(0, ncol = 0, nrow = 8))
total.map.rates <- c()
for( i in seq_len(iteration.num)){
current.sub.command <- ""
total.sub.command <- ""
for ( j in seq_len(sample.value[i])){
current.sub.command <- paste(paste0("raw_fastq.gz/",
sample.name[i], "_",
sample.table.r.value[1],
j, ".fastq.gz"))
total.sub.command <- paste(total.sub.command, current.sub.command)
}
samples.star.dir.in <- dir.create(file.path(paste0(path.prefix,
"gene_data/raw_star/",
sample.name[i])),
showWarnings = FALSE) == 0
samples.star.dir.output <- paste0(path.prefix, "gene_data/raw_star/",
sample.name[i], "/")
if (!isTRUE(samples.star.dir.in)) {
message(" (\u2714) : Create '", samples.star.dir.output, ".\n")
} else {
message(" (\u26A0) : '", samples.star.dir.output,
" has already be created.\n")
}
# --genomeLoad : NoSharedMemory
# --readMapNumber : -1
# --clip3pNbases : 0
# --clip5pNbases : 0
# --clip3pAdapterSeq : -
# --clip3pAdapterMMp : 0.1
# --clip3pAfterAdapterNbases : 0
# --limitGenomeGenerateRAM : 31000000000
# --limitIObufferSize : 150000000
# --limitOutSAMoneReadBytes : 100000
# --limitOutSJoneRead : 1000
# --limitOutSJcollapsed : 1000000
# --limitBAMsortRAM : 0
# --outReadsUnmapped : None
# --outQSconversionAdd : 0
# --outSAMprimaryFlag : OneBestScore
# --outSAMmapqUnique : 255
# --scoreGap : 0
# --scoreGapNoncan : -8
# --scoreGapGCAG : -4
# --scoreGapATAC : -8
# --scoreGenomicLengthLog2scale : -0.25
# --scoreDelOpen : -2
# --scoreDelBase : -2
# --scoreInsOpen : -2
# --scoreInsBase : -2
# --scoreStitchSJshift : 1
# --seedSearchStartLmax : 50
# --seedSearchStartLmaxOverLread : 1.0
# --seedSearchLmax : 0
# --seedMultimapNmax : 10000
# --seedPerReadNmax : 1000
# --seedPerWindowNmax : 50
# --seedNoneLociPerWindow : 10
# --alignIntronMin : 21
# --alignIntronMax : 0
# --alignMatesGapMax : 0
# --alignSJoverhangMin : 5
# --alignSJDBoverhangMin : 3
# --alignSplicedMateMapLmin : 0
# --alignSplicedMateMapLminOverLmate : 0.66
# --alignWindowsPerReadNmax : 10000
# --alignTranscriptsPerWindowNmax : 100
# --alignTranscriptsPerReadNmax : 10000
# --alignEndsType : Local
# --winAnchorMultimapNmax : 50
# --winBinNbits : 16
# --winAnchorDistNbins : 9
# --winFlankNbins : 4
whole.command <- paste("--runThreadN", STAR.Alignment.num.parallel.threads,
"--runMode", "alignReads",
"--genomeLoad", STAR.Alignment.genomeLoad,
"--readMapNumber", STAR.Alignment.readMapNumber,
"--clip3pNbases", STAR.Alignment.clip3pNbases,
"--clip5pNbases", STAR.Alignment.clip5pNbases,
"--clip3pAdapterSeq", STAR.Alignment.clip3pAdapterSeq,
"--clip3pAdapterMMp", STAR.Alignment.clip3pAdapterMMp,
"--clip3pAfterAdapterNbases", STAR.Alignment.clip3pAfterAdapterNbases,
"--limitGenomeGenerateRAM", STAR.Alignment.limitGenomeGenerateRAM,
"--limitIObufferSize", STAR.Alignment.limitIObufferSize,
"--limitOutSAMoneReadBytes", STAR.Alignment.limitOutSAMoneReadBytes,
"--limitOutSJoneRead", STAR.Alignment.limitOutSJoneRead,
"--limitOutSJcollapsed", STAR.Alignment.limitOutSJcollapsed,
"--limitBAMsortRAM", STAR.Alignment.limitBAMsortRAM,
"--outReadsUnmapped", STAR.Alignment.outReadsUnmapped,
"--outQSconversionAdd", STAR.Alignment.outQSconversionAdd,
"--outSAMprimaryFlag", STAR.Alignment.outSAMprimaryFlag,
"--outSAMmapqUnique", STAR.Alignment.outSAMmapqUnique,
"--scoreGap", STAR.Alignment.scoreGap,
"--scoreGapNoncan", STAR.Alignment.scoreGapNoncan,
"--scoreGapGCAG", STAR.Alignment.scoreGapGCAG,
"--scoreGapATAC", STAR.Alignment.scoreGapATAC,
"--scoreGenomicLengthLog2scale", STAR.Alignment.scoreGenomicLengthLog2scale,
"--scoreDelOpen", STAR.Alignment.scoreDelOpen,
"--scoreDelBase", STAR.Alignment.scoreDelBase,
"--scoreInsOpen", STAR.Alignment.scoreInsOpen,
"--scoreInsBase", STAR.Alignment.scoreInsBase,
"--scoreStitchSJshift", STAR.Alignment.scoreStitchSJshift,
"--seedSearchStartLmax", STAR.Alignment.seedSearchStartLmax,
"--seedSearchStartLmaxOverLread", STAR.Alignment.seedSearchStartLmaxOverLread,
"--seedSearchLmax", STAR.Alignment.seedSearchLmax,
"--seedMultimapNmax", STAR.Alignment.seedMultimapNmax,
"--seedPerReadNmax", STAR.Alignment.seedPerReadNmax,
"--seedPerWindowNmax", STAR.Alignment.seedPerWindowNmax,
"--seedNoneLociPerWindow", STAR.Alignment.seedNoneLociPerWindow,
"--alignIntronMin", STAR.Alignment.alignIntronMin,
"--alignIntronMax", STAR.Alignment.alignIntronMax,
"--alignMatesGapMax", STAR.Alignment.alignMatesGapMax,
"--alignSJoverhangMin", STAR.Alignment.alignSJoverhangMin,
"--alignSJDBoverhangMin", STAR.Alignment.alignSJDBoverhangMin,
"--alignSplicedMateMapLmin", STAR.Alignment.alignSplicedMateMapLmin,
"--alignSplicedMateMapLminOverLmate", STAR.Alignment.alignSplicedMateMapLminOverLmate,
"--alignWindowsPerReadNmax", STAR.Alignment.alignWindowsPerReadNmax,
"--alignTranscriptsPerWindowNmax", STAR.Alignment.alignTranscriptsPerWindowNmax,
"--alignTranscriptsPerReadNmax", STAR.Alignment.alignTranscriptsPerReadNmax,
"--alignEndsType", STAR.Alignment.alignEndsType,
"--winAnchorMultimapNmax", STAR.Alignment.winAnchorMultimapNmax,
"--winBinNbits", STAR.Alignment.winBinNbits,
"--winAnchorDistNbins", STAR.Alignment.winAnchorDistNbins,
"--winFlankNbins", STAR.Alignment.winFlankNbins,
"--genomeDir",
paste0(path.prefix, "gene_data/indices/"),
"--readFilesIn", total.sub.command,
"--outFileNamePrefix", samples.star.dir.output,
"--readFilesCommand", "'zcat <'")
if (i != 1) message("\n")
main.command <- "STAR"
message("Input command : ", paste(main.command, whole.command), "\n")
command.list <- c(command.list,
paste(" command :", main.command, whole.command))
command.result <- system2(command = main.command,
args = whole.command,
stderr = TRUE, stdout = TRUE)
Log.final.out <- paste0(path.prefix, "gene_data/raw_star/", sample.name[i], "/Log.final.out")
Log.final.out.read.in <- read.delim(Log.final.out, header = FALSE, sep = "\t", dec = ".")
Log.final.out.read.in.num <- suppressWarnings(as.numeric(levels(Log.final.out.read.in["V2"][[1]])[as.integer(Log.final.out.read.in["V2"][[1]])]))
print(Log.final.out)
print(Log.final.out.read.in)
print(Log.final.out.read.in.num)
# Total reads
total.reads <- Log.final.out.read.in.num[5]
# Uniquely_mapped
uniquely_mapping <- Log.final.out.read.in.num[8]
# mapped to multiple loci
multi_mapping_multiple_loci <- Log.final.out.read.in.num[23]
# mapped to too many loci
multi_mapping_many_loci <- Log.final.out.read.in.num[25]
# reads unmapped: too many mismatches
unmapped_many_mismatches <- Log.final.out.read.in.num[28]
# reads unmapped: too short
unmapped_short <- Log.final.out.read.in.num[30]
# reads unmapped: other
unmapped_other <- Log.final.out.read.in.num[32]
# number of chimeric reads
chimeric_reads <- Log.final.out.read.in.num[35]
# Total mapping rate
total.map.rate <- round(((uniquely_mapping + multi_mapping_multiple_loci + multi_mapping_many_loci) / total.reads) * 100, digits = 2)
total.map.rates <- c(total.map.rates, total.map.rate)
one.result <- c(total.reads, uniquely_mapping,
multi_mapping_multiple_loci, multi_mapping_many_loci,
unmapped_many_mismatches, unmapped_short,
unmapped_other, chimeric_reads)
alignment.result[[sample.name[i]]] <- one.result
if (length(command.result) == 0) {
on.exit(setwd(current.path))
message("(\u2718) '", main.command, "' is failed !!")
stop("'", main.command, "' ERROR")
}
file.symlink(paste0(path.prefix, "gene_data/raw_star/", sample.name[i], "/Aligned.out.sam"),
paste0(path.prefix, "gene_data/raw_sam/", sample.name[i], ".sam"))
}
row.names(alignment.result) <- c("total_reads", "uniquely_mapping",
"multi_mapping_multiple_loci",
"multi_mapping_many_loci",
"unmapped_too_many_mismatches",
"unmapped_too_short",
"unmapped_other",
"chimeric_reads")
alignment.result[] <- lapply(alignment.result, as.character)
alignment.result[] <- lapply(alignment.result, as.numeric)
if(!dir.exists(paste0(path.prefix, "RNASeq_results/Alignment_Report/"))){
dir.create(paste0(path.prefix, "RNASeq_results/Alignment_Report/"))
}
trans.df <- data.frame(t(alignment.result))
write.csv(trans.df,
file = paste0(path.prefix,
"RNASeq_results/",
"Alignment_Report/Alignment_report_reads.csv"),
row.names = TRUE)
trans.df.portion <- data.frame(t(alignment.result))
trans.df.portion$uniquely_mapping <- round(trans.df.portion$uniquely_mapping / trans.df.portion$total_reads, 4)
trans.df.portion$multi_mapping_multiple_loci <- round(trans.df.portion$multi_mapping_multiple_loci / trans.df.portion$total_reads, 4)
trans.df.portion$multi_mapping_many_loci <- round(trans.df.portion$multi_mapping_many_loci / trans.df.portion$total_reads, 4)
trans.df.portion$unmapped_too_many_mismatches <- round(trans.df.portion$unmapped_too_many_mismatches / trans.df.portion$total_reads, 4)
trans.df.portion$unmapped_too_short <- round(trans.df.portion$unmapped_too_short / trans.df.portion$total_reads, 4)
trans.df.portion$unmapped_other <- round(trans.df.portion$unmapped_other / trans.df.portion$total_reads, 4)
trans.df.portion$chimeric_reads <- round(trans.df.portion$chimeric_reads / trans.df.portion$total_reads, 4)
write.csv(trans.df.portion,
file = paste0(path.prefix,
"RNASeq_results/",
"Alignment_Report/Alignment_report_proportion.csv"),
row.names = TRUE)
names(total.map.rates) <- sample.name
total.map.rates <- data.frame(total.map.rates)
write.csv(total.map.rates,
file = paste0(path.prefix,
"RNASeq_results/",
"Alignment_Report/Overall_Mapping_rate.csv"),
row.names = TRUE)
message("\n")
command.list <- c(command.list, "\n")
fileConn <- paste0(path.prefix, "RNASeq_results/COMMAND.txt")
write(command.list, fileConn, append = TRUE)
on.exit(setwd(current.path))
}
} else {
on.exit(setwd(current.path))
stop("(\u2718) 'indices/*' ",
"or 'XXX_*.fastq.gz' is missing.\n\n")
}
}
}
# use 'Rsamtools' to sort and convert the SAM files to BAM
# Parameters: 6
RSamtoolsToBam <- function(SAMtools.or.Rsamtools,
Samtools.Bam.num.parallel.threads,
path.prefix,
genome.name,
sample.pattern,
Rsamtools.nCores){
check.results <- ProgressGenesFiles(path.prefix,
genome.name,
sample.pattern,
print=TRUE)
message("\n\u2618\u2618\u2618 'SAM' to 'BAM' :\n")
message("************** ",
"Rsamtools converting '.sam' to '.bam' **************\n")
command.list <- c()
if (check.results$sam.files.number.df != 0){
if (SAMtools.or.Rsamtools == "SAMtools") {
check.sam <- CheckSamtools()
if (check.sam) {
command.list <- c(command.list,
"* SAMtools Converting '.sam' to '.bam' : ")
sample.table <- table(gsub(paste0(".sam$"),
replacement = "",
check.results$sam.files.df))
sample.name <- names(sample.table)
for( i in sample.name){
whole.command <- paste("sort -@", Samtools.Bam.num.parallel.threads,
"-o", paste0(path.prefix, "gene_data/raw_bam/", i, ".bam"),
paste0(path.prefix, "gene_data/raw_sam/", i, ".sam"))
if (i != 1) message("\n")
main.command <- "samtools"
message("Input command :", paste(main.command, whole.command), "\n")
command.list <- c(command.list,
paste(" command :", main.command, whole.command))
command.result <- system2(command = main.command, args = whole.command)
if (command.result != 0 ) {
message("(\u2718) '", main.command, "' is failed !!")
stop("'", main.command, "' ERROR")
}
}
} else {
message("(\u2718) : 'SAMtools.or.Rsamtools' parameter can't be set to 'SAMtools'",
" because 'samtools' command is not found in R shell through ",
"'system2()'. Please make sure 'samtools' is available",
" on your device or change 'SAMtools.or.Rsamtools' parameter ",
"to 'Rsamtools'!!\n\n")
}
} else if (SAMtools.or.Rsamtools == "Rsamtools") {
command.list <- c(command.list,
"* Rsamtools Converting '.sam' to '.bam' : ")
sample.table <- table(gsub(paste0(".sam$"),
replacement = "",
check.results$sam.files.df))
iteration.num <- length(sample.table)
sample.name <- names(sample.table)
sam.files <- lapply(sample.name, function(x) {paste0(path.prefix,
"gene_data/raw_sam/",
x, ".sam")})
bam.files <- lapply(sample.name,
function(x) {paste0(path.prefix,
"gene_data/raw_bam/",x)})
command.list <- c(command.list,
paste0(" Running 'asBam' in Rsamtools in parallel:",
Rsamtools.nCores, "\n"))
mcmapply(FUN = function(input.sam,output.bam) {
message(" Input SAM file: '", input.sam, "'\n")
message(" Creating BAM file: '", output.bam, ".bam'\n")
# Remove tmp file
# tmp_file <- tempfile()
# unlink(list.files(dirname(dirname(tmp_file)),
# pattern = "Rtmp*", full.names = TRUE),
# recursive = TRUE, force = TRUE)
Rsamtools::asBam(file = input.sam,
destination = output.bam,
overwrite = TRUE)
message(" Ouput BAM file: '", output.bam, ".bam' is created\n")
}, sam.files, bam.files, mc.cores = Rsamtools.nCores)
# command.list <- c(command.list, output.log)
# iteration.num <- length(sample.table)
# sample.name <- names(sample.table)
# sample.value <- as.vector(sample.table)
# for( i in seq_len(iteration.num)){
# input.sam <- paste0(path.prefix,
# "gene_data/raw_sam/",
# sample.name[i], ".sam")
# output.bam <- paste0(path.prefix,
# "gene_data/raw_bam/",
# sample.name[i])
# message(" Input SAM file: '", input.sam, "'\n")
# message(" Creating BAM file: '", output.bam, ".bam'\n")
# ba.file <- Rsamtools::asBam(file = input.sam,
# destination = output.bam,
# overwrite = TRUE,
# maxMemory = Rsamtools.maxMemory)
# message(" Ouput BAM file: '", output.bam, ".bam' is created\n")
# command.list <- c(command.list, ba.file)
# }
}
message("\n")
command.list <- c(command.list, "\n")
fileConn <- paste0(path.prefix, "RNASeq_results/COMMAND.txt")
write(command.list, fileConn, append = TRUE)
} else {
stop(c("(\u2718) 'XXX.sam' is missing.\n\n"))
}
}
# stringtie assemble and quantify expressed genes and transcripts
# Parameters: 9
StringTieAssemble <- function(path.prefix,
genome.name,
sample.pattern,
Stringtie.Assembly.num.parallel.threads,
Stringtie.Assembly.f,
Stringtie.Assembly.m,
Stringtie.Assembly.c,
Stringtie.Assembly.g,
Stringtie.Assembly.M) {
if (isTRUE(CheckStringTie(print=FALSE))) {
check.results <- ProgressGenesFiles(path.prefix,
genome.name,
sample.pattern,
print=TRUE)
message("\n\u2618\u2618\u2618 Assembly :\n")
message("************** Stringtie assembly **************\n")
if (check.results$bam.files.number.df != 0){
command.list <- c()
command.list <- c(command.list, "* Stringtie assembly : ")
current.path <- getwd()
setwd(paste0(path.prefix, "gene_data/"))
sample.name <- sort(gsub(paste0(".bam$"),
replacement = "",
check.results$bam.files.df))
iteration.num <- length(sample.name)
for( i in seq_len(iteration.num)){
whole.command <- paste("-p", Stringtie.Assembly.num.parallel.threads,
"-f", Stringtie.Assembly.f, "-m", Stringtie.Assembly.m,
"-c", Stringtie.Assembly.c, "-g", Stringtie.Assembly.g,
"-M", Stringtie.Assembly.M,
"-G", paste0("ref_genes/", genome.name, ".gtf"),
"-o", paste0("raw_gtf/", sample.name[i], ".gtf"),
"-l", sample.name[i],
paste0("raw_bam/", sample.name[i], ".bam"))
if (i != 1) message("\n")
main.command <- "stringtie"
message("Input command :", paste(main.command, whole.command), "\n")
command.list <- c(command.list,
paste(" command :", main.command, whole.command))
command.result <- system2(command = main.command, args = whole.command)
if (command.result != 0 ) {
on.exit(setwd(current.path))
message("(\u2718) '", main.command, "' is failed !!")
stop("'", main.command, "' ERROR")
}
}
message("\n")
command.list <- c(command.list, "\n")
fileConn <- paste0(path.prefix, "RNASeq_results/COMMAND.txt")
write(command.list, fileConn, append = TRUE)
on.exit(setwd(current.path))
} else {
on.exit(setwd(current.path))
stop("(\u2718) '", genome.name, ".gtf' or 'XXX.bam' is missing.\n\n")
}
}
}
# stringtie merge transcripts from all samples
# Parameters: 4
StringTieMergeTrans <- function(path.prefix,
genome.name,
sample.pattern,
Stringtie.Merge.num.parallel.threads) {
if (isTRUE(CheckStringTie(print=FALSE))) {
check.results <- ProgressGenesFiles(path.prefix,
genome.name,
sample.pattern,
print=TRUE)
message("\n\u2618\u2618\u2618 Transcript Merging :\n")
message("************** ",
"Stringtie merging transcripts **************\n")
if ( isTRUE(check.results$gtf.file.logic.df) &&
check.results$gtf.files.number.df != 0){
command.list <- c()
command.list <- c(command.list, "* Stringtie Merging Transcripts : ")
current.path <- getwd()
setwd(paste0(path.prefix, "gene_data/"))
if(!dir.exists(paste0(path.prefix, "gene_data/merged/"))){
dir.create(paste0(path.prefix, "gene_data/merged/"))
}
sample.table <- table(check.results$gtf.files.df)
iteration.num <- length(sample.table)
sample.name <- names(sample.table)
sample.value <- as.vector(sample.table)
write.content <- paste0("raw_gtf/",sample.name[1])
for (i in 2:iteration.num){
write.content <- c(write.content, paste0("raw_gtf/" ,sample.name[i]))
}
write.file<-file("merged/mergelist.txt")
writeLines(write.content, write.file)
close(write.file)
whole.command <- paste("--merge -p", Stringtie.Merge.num.parallel.threads,
"-G", paste0("ref_genes/", genome.name, ".gtf"),
"-o", "merged/stringtie_merged.gtf",
"merged/mergelist.txt")
main.command <- "stringtie"
message("Input command :", paste(main.command, whole.command), "\n")
command.list <- c(command.list,
paste(" command :", main.command, whole.command))
command.result <- system2(command = main.command, args = whole.command)
if (command.result != 0 ) {
on.exit(setwd(current.path))
message("(\u2718) '", main.command, "' is failed !!")
stop("'", main.command, "' ERROR")
}
message("\n")
command.list <- c(command.list, "\n")
fileConn <- paste0(path.prefix, "RNASeq_results/COMMAND.txt")
write(command.list, fileConn, append = TRUE)
on.exit(setwd(current.path))
} else {
on.exit(setwd(current.path))
stop("(\u2718) :'", genome.name, ".gtf' or 'XXX.gtf' is missing.\n\n")
}
}
}
# stringtie estimate transcript abundances and create table count for Ballgown
# Parameters: 4
StringTieToBallgown <- function(path.prefix,
genome.name,
sample.pattern,
Stringtie.2.Ballgown.num.parallel.threads) {
if (isTRUE(CheckStringTie(print=FALSE))) {
check.results <- ProgressGenesFiles(path.prefix,
genome.name,
sample.pattern,
print=TRUE)
message("\n\u2618\u2618\u2618 Ballgown Table count Creation :\n")
message("************** ",
"Stringtie creating table count for Ballgown **************\n")
if ((check.results$bam.files.number.df != 0) &&
isTRUE(check.results$stringtie_merged.gtf.file.df)){
command.list <- c()
command.list <- c(command.list,
"* Stringtie Creating Table count for Ballgown : ")
current.path <- getwd()
setwd(paste0(path.prefix, "gene_data/"))
sample.table <- table(gsub(paste0(".bam$"),
replacement = "",
check.results$bam.files.df))
iteration.num <- length(sample.table)
sample.name <- names(sample.table)
sample.value <- as.vector(sample.table)
for( i in seq_len(iteration.num)){
# '-e' only estimate the abundance of given reference transcripts
# (requires -G)
whole.command <- paste("-e -B -p", Stringtie.2.Ballgown.num.parallel.threads,
"-G", "merged/stringtie_merged.gtf",
"-o", paste0("ballgown/", sample.name[i],
"/", sample.name[i], ".gtf"),
"-A", paste0("gene_abundance/", sample.name[i],
"/", sample.name[i], ".tsv"),
paste0("raw_bam/", sample.name[i], ".bam"))
main.command <- 'stringtie'
if (i != 1) message("\n")
message("Input command :", paste(main.command, whole.command), "\n")
command.list <- c(command.list,
paste(" command :", main.command, whole.command))
command.result <- system2(command = main.command, args = whole.command)
if (command.result != 0 ) {
on.exit(setwd(current.path))
message("(\u2718) '", main.command, "' is failed !!")
stop("'", main.command, "' ERROR")
}
}
message("\n")
command.list <- c(command.list, "\n")
fileConn <- paste0(path.prefix, "RNASeq_results/COMMAND.txt")
write(command.list, fileConn, append = TRUE)
on.exit(setwd(current.path))
} else {
on.exit(setwd(current.path))
stop("(\u2718) 'stringtie_merged.gtf' or 'XXX.bam' is missing.\n\n")
}
}
}
# Examine how the transcripts compare with the reference annotation
# Parameters: 3
GffcompareRefSample <- function(path.prefix,
genome.name,
sample.pattern) {
if (isTRUE(CheckGffcompare(print=FALSE))){
check.results <- ProgressGenesFiles(path.prefix,
genome.name,
sample.pattern,
print=TRUE)
message("\n\u2618\u2618\u2618 Merged `GTF` & Reference Comparison :\n")
message("************** ",
"Gffcompare comparing transcripts between ",
"merged and reference **************\n")
if ( isTRUE(check.results$stringtie_merged.gtf.file.df) &&
isTRUE(check.results$gtf.file.logic.df)){
command.list <- c()
command.list <- c(command.list, "* Gffcompare Comparing ",
"Transcripts between Merged and Reference : ")
current.path <- getwd()
setwd(paste0(path.prefix, "gene_data/"))
whole.command <- paste("-r", paste0("ref_genes/", genome.name, ".gtf"),
"-G -o merged/merged merged/stringtie_merged.gtf")
main.command <- "gffcompare"
message("Input command : ", paste(main.command, whole.command), "\n")
command.list <- c(command.list,
paste(" command :", main.command, whole.command))
command.result <- system2(command = main.command, args = whole.command)
if (command.result != 0 ) {
on.exit(setwd(current.path))
message("(\u2718) '", main.command, "' is failed !!")
stop("'", main.command, "' ERROR")
}
message("\n")
command.list <- c(command.list, "\n")
fileConn <- paste0(path.prefix, "RNASeq_results/COMMAND.txt")
write(command.list, fileConn, append = TRUE)
on.exit(setwd(current.path))
} else {
on.exit(setwd(current.path))
stop("(\u2718) '", genome.name,
".gtf' or 'stringtie_merged.gtf' is missing.\n\n")
}
}
}
# converting stringtie ballogwn preprocessed data to count table
# Parameters: 6
PreDECountTable <- function(path.prefix,
sample.pattern,
python.variable.answer,
python.variable.version,
python.2to3,
print=TRUE) {
# ftp server :
# 'prepDE.py' is from
# ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/downloads/hisat2-2.1.0-source.zip
message("\n\u2618\u2618\u2618 Raw Reads Count Creation :\n")
message("************** Reading prepDE.py ************\n")
if(!dir.exists(paste0(path.prefix, "gene_data/reads_count_matrix/"))){
dir.create(file.path(paste0(path.prefix, 'gene_data/reads_count_matrix/')),
showWarnings = FALSE)
}
url <- "https://ccb.jhu.edu/software/stringtie/dl/prepDE.py"
command.list <- c()
command.list <- c(command.list, "* Installing 'prepDE.py' : ")
message(path.prefix, "gene_data/reads_count_matrix\n")
current.path <- getwd()
setwd(paste0(path.prefix, "gene_data/reads_count_matrix/"))
file.download <- getURL(url, download.file,
paste0(path.prefix,
"gene_data/reads_count_matrix/prepDE.py"))
message("Using R function : 'download.file()' is called. \n")
command.list <- c(command.list,
" Using R function : 'download.file()' is called.")
command.list <- c(command.list, "\n")
if (file.download != 0 ) {
on.exit(setwd(current.path))
message("(\u2718) '", main.command, "' is failed !!")
stop("'", main.command, "' ERROR")
}
message("'", path.prefix,
"gene_data/reads_count_matrix/prepDE.py' has been installed.\n\n")
message("************** Creating 'sample_lst.txt' file ************\n")
sample.files <- list.files(paste0(path.prefix, "gene_data/ballgown/"),
pattern = sample.pattern)
write.content <- paste0(sample.files[1], " ", path.prefix,
"gene_data/ballgown/", sample.files[1] ,
"/", sample.files[1], ".gtf")
for(i in 2:length(sample.files)){
write.content <- c(write.content,
paste0(sample.files[i], " ", path.prefix,
"gene_data/ballgown/", sample.files[i],
"/",sample.files[i], ".gtf"))
}
write.file<-file(paste0(path.prefix,
"gene_data/reads_count_matrix/sample_lst.txt"))
writeLines(write.content, write.file)
close(write.file)
message("'", path.prefix,
"gene_data/reads_count_matrix/sample_lst.txt' has been created\n\n")
message("************** ",
"Creating gene and transcript raw count file ",
"************\n")
# have to check python !!!
if (python.variable.answer) {
message("(\u2714) : Python is available on your device!\n")
message(" Python version : ",
reticulate::py_config()$version, "\n")
if(python.variable.version >= 3) {
## If Python3 ==> check whether 2to3 variable is valid!
if (isTRUE(python.2to3)) {
message("(\u270D) : Converting 'prepDE.py' ",
"from python2 to python3 \n\n")
whole.command <- paste0("-W ", path.prefix,
"gene_data/reads_count_matrix/prepDE.py ",
"--no-diffs")
main.command <- "2to3"
message("Input command :",
paste(main.command, whole.command), "\n\n")
command.list <- c(command.list, "* Coverting prepDE.py to Python3 : ")
command.list <- c(command.list,
paste(" command :", main.command, whole.command))
command.list <- c(command.list, "\n")
command.result <- system2(command = main.command, args = whole.command)
if (command.result != 0 ) {
on.exit(setwd(current.path))
message("(\u2718) '", main.command, "' is failed !!")
stop("'", main.command, "' ERROR")
}
} else {
on.exit(setwd(current.path))
message("(\u26A0) 2to3 command is not available on your device !\n\n'")
return(TRUE)
}
}
message("\n(\u2714) : Creating gene and transcript raw count files now !\n")
whole.command <- paste0(path.prefix,
"gene_data/reads_count_matrix/prepDE.py -i ",
path.prefix,
"gene_data/reads_count_matrix/sample_lst.txt")
main.command <- "python"
message("Input command :", paste(main.command, whole.command), "\n\n")
command.list <- c(command.list,
"* Creating Gene and Transcript Raw Count File : ")
command.list <- c(command.list,
paste(" command :", main.command, whole.command))
command.result <- system2(command = main.command,
args = whole.command,
wait = TRUE)
if (command.result != 0 ) {
on.exit(setwd(current.path))
message("(\u2718) '", main.command, "' is failed !!")
stop("'", main.command, "' ERROR")
}
message("'", path.prefix,
"gene_data/reads_count_matrix/",
"gene_count_matrix.csv' has been created\n")
message("'", path.prefix,
"gene_data/reads_count_matrix/",
"transcript_count_matrix.csv' has been created\n\n")
command.list <- c(command.list, "\n")
fileConn <- paste0(path.prefix, "RNASeq_results/COMMAND.txt")
write(command.list, fileConn, append = TRUE)
on.exit(setwd(current.path))
return(TRUE)
} else {
on.exit(setwd(current.path))
message("(\u26A0) Python is not available on your device!! ",
"Please install python to run ",
"python script 'prepDE.py'. ",
"Raw reads count table creation is skipped!!\n\n'")
return(TRUE)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.