Introduction to ngstk

# Chunk defaults: prefix printed output with "#>" and set collapse = TRUE
knitr::opts_chunk$set(comment = "#>", collapse = TRUE)


ngstk is an R package that facilitates the analysis of NGS data, such as visualization, conversion of data formats for web service input, and other purposes.

You can learn some usage of ngstk through this tutorial.


Data format conversion

We defined a rule to facilitate data format conversion. It is good practice to save the meta information of all output columns. Moreover, the other input data should establish a connection with the output columns according to the requirements.

The following example configuration file can be used to convert the output data of iseq, a pipeline to analyze genetic variants from NGS data, to the input format of the web service ProteinPaint, a tool to visualize mutation data.

Title = "Proteinpaint configuration file"

# muts2pp is a function that can convert mutation data to Proteinpaint input format
colnames = ["gene", "refseq", "chromosome", "start", "aachange", "class", "disease", "sample"]
handler_lib = "default_handlers"
mhandler_lib = "default_mhandlers"
gene = "Gene symbol, e.g TP53, PTEN"
refseq = "Transcript of refSeq or Ensemble, e.g NM_000546, ENST00000635293"
chromosome = "Chromosome, e.g. chr1, chr2"
start = "Chromosome start location of a mutation site, e.g. 153249385"
aachange = "Amino acid level change of gene mutation, e.g. p.R347C, p.L615delinsDL"
class = "Mutation type, e.g. nonsense, proteinIns, proteinDel, frameshift"
disease = "Disease name or sample group name, e.g. B-ALL, T-ALL, G1, G2"
sample = "Sample name"

alias = ["gene", "symbol"]

alias = ["refseq", ""]

alias = ["chromosome"]

alias = ["start"]

alias = ["aachange", "amino.acid.change"]
extract_pattern = "p[.]+.*$"

alias = ["mutation_type", "mutation.type"]
raw = ["nonframeshift ins", "nonframeshift del",
       "frameshift ins", "frameshift del", "stoploss", "nonsense",
new = ["proteinIns", "proteinDel", "frameshift", "frameshift",
       "nonsense", "nonsense", "splice"]
na_replace = "splice"

muts2pp, muts2mutation_mapper, muts2oncoprinter and fusions2pp are the example functions based on the established rules.

# Read the iseq demo mutation table shipped with ngstk
# (tab-separated, first row is the header)
demo_file <- system.file(
  "extdata", "demo/proteinpaint/muts2pp_iseq.txt",
  package = "ngstk"
)
input_data <- read.table(
  demo_file,
  sep = "\t", header = TRUE, stringsAsFactors = FALSE
)
# Append a constant `disease` column and make sure it is stored as character
disease <- "T-ALL"
input_data <- data.frame(input_data, disease)
input_data$disease <- as.character(input_data$disease)

# Convert mutations data to proteinpaint input
result <- muts2pp(input_data, input_type = "iseq")
# Convert mutations data to cbioportal input
result <- muts2mutation_mapper(input_data, input_type = "iseq")
result <- muts2oncoprinter(input_data, input_type = "iseq")

# Read the fusioncatcher demo table shipped with ngstk
# (tab-separated, first row is the header)
demo_file <- system.file(
  "extdata", "demo/proteinpaint/fusions2pp_fusioncatcher.txt",
  package = "ngstk"
)
input_data <- read.table(
  demo_file,
  sep = "\t", header = TRUE, stringsAsFactors = FALSE
)
# Append constant `disease` and `sampletype` columns; keep disease as character
disease <- "B-ALL"
sampletype <- "diagnose"
input_data <- data.frame(input_data, disease, sampletype)
input_data$disease <- as.character(input_data$disease)
# Convert fusions data to proteinpaint input
result <- fusions2pp(input_data, input_type = "fusioncatcher")

merge_table_files is another utility function, used to merge multiple table files.

# Write two small tab-separated tables to temporary files that share the
# "_abcd" suffix, so they can be selected explicitly or by pattern
a <- data.frame(col1 = 1:6, col2 = 2:7)
b <- data.frame(col1 = 6:11, col2 = 1:6)
file_a <- paste0(tempfile(), "_abcd")
file_b <- paste0(tempfile(), "_abcd")
write.table(a, file_a, sep = "\t", row.names = FALSE)
write.table(b, file_b, sep = "\t", row.names = FALSE)
input_files <- c(file_a, file_b)
# Merge an explicit vector of input files
x1 <- merge_table_files(input_files = input_files)
# Merge the files in a directory whose names match a pattern
x2 <- merge_table_files(files_dir = tempdir(), pattern = ".*_abcd$")
# Same call with `outfn` supplied; the file is read back below
outfn <- tempfile()
x3 <- merge_table_files(
  files_dir = tempdir(), pattern = ".*_abcd$", outfn = outfn
)
head(read.table(outfn, sep = "\t", header = TRUE))

Data filtration

Data filtration or subsetting is an important step to clean the data or run a specific analysis. A series of data filtration functions will be established and fixed so that they can be re-used in the future.

# Read the fusioncatcher demo table shipped with ngstk
demo_file <- system.file(
  "extdata", "demo/proteinpaint/fusions2pp_fusioncatcher.txt",
  package = "ngstk"
)
input_data <- read.table(
  demo_file,
  sep = "\t", header = TRUE, stringsAsFactors = FALSE
)
# Get data subset according to the defined rule.
# NOTE(review): gene_5 / gene_3 presumably give the column positions of the
# 5' and 3' fusion partners, and each *_match_flag selects one matching
# mode -- confirm against the fusions_filter documentation.
mhandler_extra_params <- list(
  gene_5 = 1, gene_3 = 2, any_gene = "TCF3",
  fusions_any_match_flag = TRUE
)
result_1 <- fusions_filter(
  input_data,
  mhandler_extra_params = mhandler_extra_params
)

mhandler_extra_params <- list(
  gene_3 = 2, right_gene = "GYPA",
  fusions_right_match_flag = TRUE
)
result_2 <- fusions_filter(
  input_data,
  mhandler_extra_params = mhandler_extra_params
)

mhandler_extra_params <- list(
  gene_5 = 1, left_gene = "GYPA",
  fusions_left_match_flag = TRUE
)
result_3 <- fusions_filter(
  input_data,
  mhandler_extra_params = mhandler_extra_params
)

mhandler_extra_params <- list(
  gene_5 = 1, gene_3 = 2, left_gene = "GYPE", right_gene = "GYPA",
  fusions_full_match_flag = TRUE
)
result_4 <- fusions_filter(
  input_data,
  mhandler_extra_params = mhandler_extra_params
)

mhandler_extra_params <- list(
  gene_5 = 1, gene_3 = 2, left_gene = "GYPE", right_gene = "GYPA",
  fusions_anyfull_match_flag = TRUE
)
result_5 <- fusions_filter(
  input_data,
  mhandler_extra_params = mhandler_extra_params
)

Log related

In the data analysis process, we usually need to check the creation time and modification time of some result files, e.g. whether the creation or modification times of all files are identical. ngstk provides get_files_mtime and get_files_ctime to handle the file time check step. The default time check function checks whether the times of all files are identical, and you can use your own function instead.

# Create two empty temporary files whose timestamps will be inspected
file_a <- tempfile()
file_b <- tempfile()
time_files <- c(file_a, file_b)
file.create(time_files)
# get_files_mtime: default call, then with return_check and return_mtime
# switched off in turn
x <- get_files_mtime(input_files = time_files)
x <- get_files_mtime(input_files = time_files, return_check = FALSE)
x <- get_files_mtime(input_files = time_files, return_mtime = FALSE)
# get_files_ctime: default call, then with return_check switched off
x <- get_files_ctime(input_files = time_files)
x <- get_files_ctime(input_files = time_files, return_check = FALSE)

# time stamp

Split data

Splitting data is an optional step if you want to process the data stream in parallel. ngstk provides split_row_data and split_col_data to split data.frame and data.table objects.

# Row-wise split: the same 39-row data frame requested in 2, then 3 sections
x1 <- data.frame(col1 = seq_len(39), col2 = seq_len(39))
x <- split_row_data(x1, sections = 2)
x <- split_row_data(x1, sections = 3)
# Column-wise split: transpose a 10-row data frame, then request 3 sections
x1 <- data.frame(col1 = seq_len(10), col2 = 11:20)
x1.t <- t(x1)
x <- split_col_data(x1.t, sections = 3)
# split file
# Only the 10000-row input file is written here; no split call follows
dat <- data.frame(col1 = seq_len(10000))
outfn <- tempfile()
write.table(x = dat, file = outfn, sep = "\t")

Filename Process

# Directory with the demo files shipped with ngstk
files_dir <- system.file("extdata", "demo/format", package = "ngstk")
# NOTE(review): list.files() treats `pattern` as a regular expression, while
# "*.txt" is written like a glob -- confirm this is the intended match.
pattern <- "*.txt"
# Show the matching files
list.files(path = files_dir, pattern = pattern)
# Same directory and pattern, with the prefix "hg38_" passed to format_filenames
x <- format_filenames(
  files_dir = files_dir,
  pattern = pattern,
  prefix = "hg38_"
)

Command line utils functions

# Collect command line bins files in R package
rbin("ngstk", tempdir())

# Print sub commands
# Build the option list, the sub-command descriptions and the header texts,
# then pass them all to opt_parser().
# NOTE(review): make_option() is presumably optparse::make_option -- confirm
# that the package providing it is attached before this chunk runs.
option_list <- list(
  make_option(
    c("-l", "--list-all-subcmds"),
    action = "store_true",
    default = FALSE,
    help = "Print all supported subcmds of ngsjs."
  )
)
subcmds_list <- list(
  subcmd1 = "Use method 1 to plot boxplot",
  subcmd2 = "Use method 2 to plot boxplot"
)
description <- "Method to plot boxplot"
usage <- "usage: %prog [options] [params]"
opt_parser_obj <- opt_parser(
  subcmds_list = subcmds_list,
  option_list = option_list,
  description = description,
  usage = usage
)

# Print the command line message
# You can define the message order using the
# parameter help_order = c("description", "usage", "options", "subcmds", "epilogue")

Download functions

# Use future package to parallel download urls with logs
urls <- c(paste0('',
 par_download(urls, sprintf('%s/%s', tempdir(), basename(urls)))



Try the ngstk package in your browser

Any scripts or data that you put into this service are public.

ngstk documentation built on May 2, 2019, 9:19 a.m.