# Vignette setup: prefix printed output with "#>" and collapse each chunk's
# source and output into a single block.
knitr::opts_chunk$set(comment = "#>", collapse = TRUE)

# Attach ngstk so the examples below can call its functions unqualified.
library(ngstk)
ngstk is an R package that facilitates the analysis of NGS data, for example visualization, conversion of data formats for web-service input, and other purposes.
You can learn some usage of ngstk through this tutorial.
We defined a set of rules to facilitate data format conversion. It is good practice to store the meta information of all output columns. The input data must then be mapped to those output columns as required.
The following example configuration file can be used to convert the output of iseq (a pipeline for analyzing genetic variants from NGS data) into the input format of the web service ProteinPaint (a tool for visualizing mutation data).
Title = "Proteinpaint configuration file"

# muts2pp is a function that can convert mutation data to Proteinpaint input format

# Names of the output columns, plus the handler libraries to use.
[muts2pp.meta.defined_cols]
colnames = ["gene", "refseq", "chromosome", "start", "aachange", "class", "disease", "sample"]
handler_lib = "default_handlers"
mhandler_lib = "default_mhandlers"

# Human-readable description of each output column.
[muts2pp.meta.defined_cols.description]
gene = "Gene symbol, e.g TP53, PTEN"
refseq = "Transcript of refSeq or Ensemble, e.g NM_000546, ENST00000635293"
chromosome = "Chromosome, e.g. chr1, chr2"
start = "Chromosome start location of a mutation site, e.g. 153249385"
aachange = "Amino acid level change of gene mutation, e.g. p.R347C, p.L615delinsDL"
class = "Mutation type, e.g. nonsense, proteinIns, proteinDel, frameshift"
disease = "Disease name or sample group name, e.g. B-ALL, T-ALL, G1, G2"
sample = "Sample name"

# Per-column rules for the "iseq" input format.
# NOTE(review): `alias` presumably lists the input column names accepted for
# the output column -- confirm against the handler implementation.
[muts2pp.format.iseq.gene]
alias = ["gene", "symbol"]

[muts2pp.format.iseq.refseq]
alias = ["refseq", "transcription.id"]

[muts2pp.format.iseq.chromosome]
alias = ["chromosome"]

[muts2pp.format.iseq.start]
alias = ["start"]

# NOTE(review): `extract_pattern` looks like a regular expression that keeps
# the "p.<change>" part of the value -- confirm.
[muts2pp.format.iseq.aachange]
alias = ["aachange", "amino.acid.change"]
extract_pattern = "p[.]+.*$"

# NOTE(review): `raw` and `new` have the same length; presumably each `raw`
# value is replaced by the `new` value at the same position, and `na_replace`
# is used for missing values -- confirm.
[muts2pp.format.iseq.class]
alias = ["mutation_type", "mutation.type"]
raw = ["nonframeshift ins", "nonframeshift del", "frameshift ins", "frameshift del", "stoploss", "nonsense", "splice"]
new = ["proteinIns", "proteinDel", "frameshift", "frameshift", "nonsense", "nonsense", "splice"]
na_replace = "splice"
`muts2pp`, `muts2mutation_mapper`, `muts2oncoprinter` and `fusions2pp` are example functions built on these rules.
# Read the demo iseq mutation table shipped with ngstk.
demo_file <- system.file("extdata", "demo/proteinpaint/muts2pp_iseq.txt",
  package = "ngstk"
)
input_data <- read.table(demo_file,
  sep = "\t", header = TRUE,
  stringsAsFactors = FALSE
)

# Add a constant disease column; as.character() keeps it a character vector
# even where data.frame() would create a factor (R < 4.0).
disease <- "T-ALL"
input_data <- data.frame(input_data, disease)
input_data$disease <- as.character(input_data$disease)

# Convert mutations data to proteinpaint input
result <- muts2pp(input_data, input_type = "iseq")
head(result)

# Convert mutations data to cbioportal input
result <- muts2mutation_mapper(input_data, input_type = "iseq")
head(result)
result <- muts2oncoprinter(input_data, input_type = "iseq")
head(result)

# Read the demo fusioncatcher table shipped with ngstk.
demo_file <- system.file("extdata",
  "demo/proteinpaint/fusions2pp_fusioncatcher.txt",
  package = "ngstk"
)
input_data <- read.table(demo_file,
  sep = "\t", header = TRUE,
  stringsAsFactors = FALSE
)

# Add constant disease and sampletype columns.
disease <- "B-ALL"
sampletype <- "diagnose"
input_data <- data.frame(input_data, disease, sampletype)
input_data$disease <- as.character(input_data$disease)

# Convert fusions data to proteinpaint input
result <- fusions2pp(input_data, input_type = "fusioncatcher")
head(result)
`merge_table_files` is another utility function; it merges multiple table files.
# Write two small tab-separated tables to temporary files whose names share
# the "_abcd" suffix.
a <- data.frame(col1 = 1:6, col2 = 2:7)
b <- data.frame(col1 = 6:11, col2 = 1:6)
file_a <- paste0(tempfile(), "_abcd")
file_b <- paste0(tempfile(), "_abcd")
write.table(a, file_a, sep = "\t", row.names = FALSE)
write.table(b, file_b, sep = "\t", row.names = FALSE)

# Merge an explicit vector of files.
input_files <- c(file_a, file_b)
x1 <- merge_table_files(input_files = input_files)
head(x1)

# Merge the files in a directory whose names match a regular expression.
x2 <- merge_table_files(files_dir = tempdir(), pattern = ".*_abcd$")
head(x2)

# Same merge, with `outfn` given; the result is then read back from that file.
outfn <- tempfile()
x3 <- merge_table_files(
  files_dir = tempdir(), pattern = ".*_abcd$",
  outfn = outfn
)
head(read.table(outfn, sep = "\t", header = TRUE))
Filtering or subsetting data is an important step for cleaning data or running a specific analysis. A series of data-filtering functions will be established and stabilized so that they can be reused in the future.
# Read the demo fusioncatcher table shipped with ngstk.
demo_file <- system.file("extdata",
  "demo/proteinpaint/fusions2pp_fusioncatcher.txt",
  package = "ngstk"
)
input_data <- read.table(demo_file,
  sep = "\t", header = TRUE,
  stringsAsFactors = FALSE
)

# Get data subset according the defined rule
# NOTE(review): gene_5 / gene_3 are presumably the column indices of the 5'
# and 3' partner genes -- confirm against the fusions_filter() documentation.

# Example 1: any_gene with fusions_any_match_flag.
mhandler_extra_params <- list(
  gene_5 = 1, gene_3 = 2, any_gene = "TCF3",
  fusions_any_match_flag = TRUE
)
result_1 <- fusions_filter(input_data,
  mhandler_extra_params = mhandler_extra_params
)
head(result_1)

# Example 2: right_gene with fusions_right_match_flag.
mhandler_extra_params <- list(
  gene_3 = 2, right_gene = "GYPA",
  fusions_right_match_flag = TRUE
)
result_2 <- fusions_filter(input_data,
  mhandler_extra_params = mhandler_extra_params
)
head(result_2)

# Example 3: left_gene with fusions_left_match_flag.
mhandler_extra_params <- list(
  gene_5 = 1, left_gene = "GYPA",
  fusions_left_match_flag = TRUE
)
result_3 <- fusions_filter(input_data,
  mhandler_extra_params = mhandler_extra_params
)
head(result_3)

# Example 4: left_gene and right_gene with fusions_full_match_flag.
mhandler_extra_params <- list(
  gene_5 = 1, gene_3 = 2, left_gene = "GYPE", right_gene = "GYPA",
  fusions_full_match_flag = TRUE
)
result_4 <- fusions_filter(input_data,
  mhandler_extra_params = mhandler_extra_params
)
head(result_4)

# Example 5: left_gene and right_gene with fusions_anyfull_match_flag.
mhandler_extra_params <- list(
  gene_5 = 1, gene_3 = 2, left_gene = "GYPE", right_gene = "GYPA",
  fusions_anyfull_match_flag = TRUE
)
result_5 <- fusions_filter(input_data,
  mhandler_extra_params = mhandler_extra_params
)
head(result_5)
During data analysis we often need to check the creation time and modification time of result files, e.g. whether all files were created or changed at the same time. ngstk provides `get_files_mtime` and `get_files_ctime` for this file-time check. The default check function tests whether the times of all files are identical; you can also supply your own function.
# Create two empty temporary files to inspect.
file_a <- tempfile()
file_b <- tempfile()
file.create(c(file_a, file_b))

# Modification times: default call, then with return_check = FALSE, then with
# return_mtime = FALSE.
x <- get_files_mtime(input_files = c(file_a, file_b))
x
x <- get_files_mtime(input_files = c(file_a, file_b), return_check = FALSE)
x
x <- get_files_mtime(input_files = c(file_a, file_b), return_mtime = FALSE)
x

# Creation/change times: default call, then with return_check = FALSE.
x <- get_files_ctime(input_files = c(file_a, file_b))
x
x <- get_files_ctime(input_files = c(file_a, file_b), return_check = FALSE)
x

# time stamp
time_stamp()
Splitting data is an optional step if you want to process a data stream in parallel. ngstk provides `split_row_data` and `split_col_data` to split data.frame and data.table objects.
# Split a 39-row data.frame by rows into 2 and then 3 sections.
x1 <- data.frame(col1 = 1:39, col2 = 1:39)
x1
x <- split_row_data(x1, sections = 2)
x
x <- split_row_data(x1, sections = 3)
x

# Transpose a 10-row data.frame and split the result by columns.
x1 <- data.frame(col1 = 1:10, col2 = 11:20)
x1.t <- t(x1)
x <- split_col_data(x1.t, sections = 3)
x

# split file
dat <- data.frame(col1 = 1:10000)
outfn <- tempfile()
write.table(dat, outfn, sep = "\t")
split_row_file(outfn)
# Directory of demo files shipped with ngstk.
files_dir <- system.file("extdata", "demo/format", package = "ngstk")

# list.files() interprets `pattern` as a regular expression, not a shell glob,
# so match the ".txt" extension explicitly instead of using "*.txt".
# NOTE(review): assumes format_filenames() treats `pattern` the same way as
# list.files() -- confirm.
pattern <- "\\.txt$"
list.files(files_dir, pattern)

# Format the matching file names, passing "hg38_" as the prefix.
x <- format_filenames(
  files_dir = files_dir, pattern = pattern,
  prefix = "hg38_"
)
x
# Collect command line bins files in R package
rbin("ngstk", tempdir())

# Print sub commands
# NOTE(review): make_option() and print_help() look like optparse functions;
# presumably they are made available by attaching ngstk -- confirm.
option_list <- list(
  make_option(
    c("-l", "--list-all-subcmds"),
    action = "store_true",
    default = FALSE,
    help = "Print all supported subcmds of ngsjs."
  )
)
subcmds_list <- list(
  subcmd1 = "Use method 1 to plot boxplot",
  subcmd2 = "Use method 2 to plot boxplot"
)
description <- "Method to plot boxplot"
usage <- "usage: %prog [options] [params]"
opt_parser_obj <- opt_parser(
  subcmds_list = subcmds_list,
  option_list = option_list,
  description = description,
  usage = usage
)

# Print the command line message.
# You can define the message order with the parameter
# help_order = c("description", "usage", "options", "subcmds", "epilogue")
print_help(opt_parser_obj)
# Use future package to parallel download urls with logs
urls <- c(
  paste0(
    "https://raw.githubusercontent.com/",
    "Miachol/ftp/master/files/images/bioinstaller/maftools3.png"
  ),
  paste0(
    "https://raw.githubusercontent.com/",
    "Miachol/ftp/master/files/images/bioinstaller/maftools4.png"
  )
)

# One destination per URL inside the session temp directory; file.path() is
# vectorised and joins with "/" exactly like the former sprintf("%s/%s", ...).
destfiles <- file.path(tempdir(), basename(urls))
par_download(urls, destfiles)
# Call set_colors() with each of three theme names so every result is printed.
set_colors("default")
set_colors("proteinpaint_mutations")
set_colors("proteinpaint_chromHMM_state")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.