#' Import featureCounts output table into R.
#'
#' Skips the first comment line.
#'
#' @param sample_dir Path to the directory containing the count files.
#' @param sample_names Vector of sample names.
#' @param counts_suffix Suffix of the counts file.
#' @param sub_dir If TRUE, count files are nested in subfolders named after sample names, otherwise counts files
#' are directly in sample_dir.
#' @return Counts matrix where the first two columns are gene_id and feature length.
#' @author Kaur Alasoo
#' @export
loadCounts <- function(sample_dir, sample_names, counts_suffix = ".counts.txt", sub_dir = TRUE){
#Load featureCounts output into R
matrix = c()
for (i in c(1:length(sample_names))){
if (sub_dir == TRUE){
path = file.path(sample_dir, sample_names[i], paste(sample_names[i], counts_suffix, sep = ""))
} else {
path = file.path(sample_dir, paste(sample_names[i], counts_suffix, sep = ""))
}
print(sample_names[i])
table = readr::read_tsv(path, skip = 1, col_types = "cccccii")
print(head(table))
if (i == 1){
matrix = table[,c(1,6,7)]
}
else{
matrix = cbind(matrix, table[,7])
}
}
colnames(matrix) = c("gene_id", "length", sample_names)
return(matrix)
}
loadExonCounts <- function(sample_dir, sample_names, counts_suffix = ".exon_counts.txt", sub_dir = TRUE){
#Load featureCounts output into R
matrix = c()
for (i in c(1:length(sample_names))){
if (sub_dir == TRUE){
path = file.path(sample_dir, sample_names[i], paste(sample_names[i], counts_suffix, sep = ""))
} else {
path = file.path(sample_dir, paste(sample_names[i], counts_suffix, sep = ""))
}
print(sample_names[i])
table = readr::read_tsv(path, skip = 1, col_types = "cccccii")
print(head(table))
if (i == 1){
matrix = table[,c(1,3,4,6,7)]
}
else{
matrix = cbind(matrix, table[,7])
}
}
colnames(matrix) = c("gene_id", "start", "end", "length", sample_names)
matrix = dplyr::tbl_df(matrix)
return(matrix)
}
loadIntronEventCounts <- function(sample_dir, sample_names, counts_suffix = ".intron_events.txt", sub_dir = TRUE){
#Load featureCounts output into R
matrix = c()
for (i in c(1:length(sample_names))){
if (sub_dir == TRUE){
path = file.path(sample_dir, sample_names[i], paste(sample_names[i], counts_suffix, sep = ""))
} else {
path = file.path(sample_dir, paste(sample_names[i], counts_suffix, sep = ""))
}
print(sample_names[i])
table = readr::read_tsv(path, col_names = FALSE, col_types = "ccc")
if (i == 1){
matrix = table
}
else{
matrix = cbind(matrix, table[,3])
}
}
colnames(matrix) = c("gene_id", "event_id", sample_names)
return(matrix)
}
loadVerifyBamID <- function(sample_names, sample_dir, suffix = ".verifyBamID.bestSM", sub_dir = TRUE){
matrix = c()
for (i in c(1:length(sample_names))){
if (sub_dir == TRUE){
path = file.path(sample_dir, sample_names[i], paste(sample_names[i], suffix, sep = ""))
} else {
path = file.path(sample_dir, paste(sample_names[i], suffix, sep = ""))
}
table = read.table(path, comment.char = "", sep ="\t", header = TRUE) %>%
dplyr::select(CHIP_ID, FREEMIX) %>%
dplyr::mutate(sample_id = sample_names[i]) %>%
dplyr::rename(genotype_id = CHIP_ID, freemix = FREEMIX)
matrix = rbind(matrix, table)
}
return(matrix)
}
loadChrCounts <- function(sample_dir, sample_names, counts_suffix = ".chr_counts", sub_dir = TRUE){
#Import read counts per chromosome for each sample (used for estimating MT fraction in ATAC-Seq)
matrix = c()
for (i in c(1:length(sample_names))){
if (sub_dir == TRUE){
path = file.path(sample_dir, sample_names[i], paste(sample_names[i], counts_suffix, sep = ""))
} else {
path = file.path(sample_dir, paste(sample_names[i], counts_suffix, sep = ""))
}
print(path)
table = read.table(path, header = FALSE, stringsAsFactors = FALSE)
colnames(table) = c(sample_names[i], "chr_name")
if (i == 1){
matrix = table[,c(2,1)]
print(head(matrix))
}
else{
matrix = dplyr::full_join(matrix, table, by = "chr_name")
print(head(matrix))
}
}
return(matrix)
}
loadFeaturCountsSummary <- function(sample_dir, sample_names, counts_suffix = ".counts.txt.summary"){
#Import featureCounts summary files
matrix = c()
for (i in c(1:length(sample_names))){
path = file.path(sample_dir, sample_names[i], paste(sample_names[i], counts_suffix, sep = ""))
print(path)
table = read.table(path, header = TRUE, stringsAsFactors = FALSE)
colnames(table) = c("status", sample_names[i])
if (i == 1){
matrix = table[,c(1,2)]
print(head(matrix))
}
else{
matrix = dplyr::full_join(matrix, table, by = "status")
print(head(matrix))
}
}
rownames(matrix) = matrix$status
matrix = matrix[,-1]
matrix = as.data.frame(t(matrix))
matrix = dplyr::mutate(matrix, sample_id = rownames(matrix)) %>% dplyr::select(sample_id, everything())
return(matrix)
}
loadMarkDuplicates <- function(sample_dir, sample_names, counts_suffix = ".MarkDuplicates.txt", sub_dir = TRUE){
#Import MarkDuplicates summary files
matrix = c()
for (i in c(1:length(sample_names))){
if (sub_dir == TRUE){
path = file.path(sample_dir, sample_names[i], paste(sample_names[i], counts_suffix, sep = ""))
} else {
path = file.path(sample_dir, paste(sample_names[i], counts_suffix, sep = ""))
}
print(path)
data = readr::read_tsv(path, skip = 6, col_names = FALSE)
data = data[-2,]
table = as.data.frame(t(data),stringsAsFactors = FALSE)
colnames(table) = c("statistic", sample_names[i])
if (i == 1){
matrix = table[,c(1,2)]
print(head(matrix))
}
else{
matrix = dplyr::full_join(matrix, table, by = "statistic")
print(head(matrix))
}
}
rownames(matrix) = matrix$statistic
matrix = matrix[,-1]
matrix = as.data.frame(t(matrix), stringsAsFators = FALSE)
matrix = dplyr::mutate(matrix, sample_id = rownames(matrix)) %>% dplyr::select(sample_id, everything())
return(matrix)
}
#' Import narrowPeak files into a list of GRanges objects.
#'
#' All narrowPeak files are assumed to be nested in subdirectories named adter sample names.
#' Based on rtracklayer::import.bed, but adds additional columns present only in narrowPeak files.
#'
#' @param sample_dir Path to the directory containing the narroPeak files.
#' @param sample_names Vector of sample names.
#' @param peaks_suffix Suffix of the narrowPeak files.
#' @return List of GRanges objects corresponding to peak calls from each sample.
#' @param sub_dir If TRUE, count files are nested in subfolders named after sample names, otherwise counts files
#' are directly in sample_dir.
#' @author Kaur Alasoo
#' @export
loadNarrowPeaks <- function(sample_dir, sample_names, peaks_suffix = "_peaks.narrowPeak", sub_dir = TRUE){
#Import narrowPeak files into a list
result = list()
extraCols_narrowPeak <- c(signalValue = "numeric", pValue = "numeric", qValue = "numeric", peak = "integer")
for (i in c(1:length(sample_names))){
if (sub_dir == TRUE){
path = file.path(sample_dir, sample_names[i], paste(sample_names[i], peaks_suffix, sep = ""))
} else {
path = file.path(sample_dir, paste(sample_names[i], peaks_suffix, sep = ""))
}
print(path)
peaks = rtracklayer::import(path, format = "BED", extraCols = extraCols_narrowPeak)
result[[sample_names[i]]] = peaks
}
return(result)
}
#' Import broadPeak files into a list of GRanges objects.
#'
#' All broadPeak files are assumed to be nested in subdirectories named adter sample names.
#' Based on rtracklayer::import.bed, but adds additional columns present only in broadPeak files.
#'
#' @param sample_dir Path to the directory containing the broadPeak files.
#' @param sample_names Vector of sample names.
#' @param peaks_suffix Suffix of the broadPeak files.
#' @param sub_dir If TRUE, count files are nested in subfolders named after sample names, otherwise counts files
#' are directly in sample_dir.
#' @return List of GRanges objects corresponding to peak calls from each sample.
#' @author Kaur Alasoo
#' @export
loadBroadPeaks <- function(sample_dir, sample_names, peaks_suffix = "_peaks.broadPeak", sub_dir = TRUE){
#Import broakPeak files into a list
result = list()
extraCols_broadPeak <- c(signalValue = "numeric", pValue = "numeric", qValue = "numeric")
for (i in c(1:length(sample_names))){
if (sub_dir == TRUE){
path = file.path(sample_dir, sample_names[i], paste(sample_names[i], peaks_suffix, sep = ""))
} else {
path = file.path(sample_dir, paste(sample_names[i], peaks_suffix, sep = ""))
}
print(path)
peaks = rtracklayer::import(path, format = "BED", extraCols = extraCols_broadPeak)
result[[sample_names[i]]] = peaks
}
return(result)
}
#' Import genotypes in GDS file to a matrix
#'
#' Open GDS file (created with SNPRelate::snpgdsVCF2GDS) into R and convert it into a matrix of
#' variant positions and matrix of allele dosage (0,1,2)
#'
#' @param file Path to the GDS file.
#' @return List containing snpspos and genotypes matrix.
#' @author Kaur Alasoo
#' @export
gdsToMatrix <- function(gds_file){
#Extract genotypes
gds <- GWASTools::GdsGenotypeReader(gds_file)
genotypes = GWASTools::getGenotype(gds)
sample_ids = GWASTools::getVariable(gds, "sample.id")
snp_rs_ids = GWASTools::getVariable(gds, "snp.rs.id")
snp_ids = GWASTools::getVariable(gds, "snp.id")
#Invent id for snps that do not have an rs id
new_snp_ids = paste("snp",snp_ids[snp_rs_ids == ""], sep = "")
snp_rs_ids[snp_rs_ids == ""] = new_snp_ids
colnames(genotypes) = sample_ids
rownames(genotypes) = snp_rs_ids
#Extract SNP coordinates
snpspos = dplyr::data_frame(snpid = snp_rs_ids,
chr = GWASTools::getVariable(gds, "snp.chromosome"),
pos = GWASTools::getVariable(gds, "snp.position"))
GWASTools::close(gds)
return(list(snpspos = snpspos, genotypes = genotypes))
}
#' Import output files from bedCountFragmentLengths.py script into R.
#'
#' All fragment length files are assumed to be nested in subdirectories named after sample names.
#'
#' @param sample_dir Path to the directory containing the fragment length files.
#' @param sample_names Vector of sample names.
#' @param file_suffix Suffix of the fragment lengths files.
#' @return List of GRanges objects corresponding to peak calls from each sample.
#' @author Kaur Alasoo
#' @export
loadFragmentLengths <- function(sample_dir, sample_names, file_suffix = ".fragment_lengths.txt", sub_dir = TRUE){
result = c()
for (i in c(1:length(sample_names))){
if (sub_dir == TRUE){
path = file.path(sample_dir, sample_names[i], paste(sample_names[i], file_suffix, sep = ""))
} else {
path = file.path(sample_dir, paste(sample_names[i], file_suffix, sep = ""))
}
table = read.table(path) %>% dplyr::tbl_df()
colnames(table) = c("count", "fragment_length")
sample_table = dplyr::mutate(table, sample_id = sample_names[i])
if (i == 1){
result = sample_table
}
else{
result = rbind(result, sample_table)
}
}
return(result)
}
#' A general function to quickly import tabix indexed tab-separated files into data_frame
#'
#' @param tabix_file Path to tabix-indexed text file
#' @param param A instance of GRanges, RangedData, or RangesList
#' provide the sequence names and regions to be parsed. Passed onto Rsamtools::scanTabix()
#' @param ... Additional parameters to be passed on to readr::read_delim()
#'
#' @return List of data_frames, one for each entry in the param GRanges object.
#' @export
scanTabixDataFrame <- function(tabix_file, param, ...){
tabix_list = Rsamtools::scanTabix(tabix_file, param = param)
df_list = lapply(tabix_list, function(x,...){
if(length(x) > 0){
if(length(x) == 1){
#Hack to make sure that it also works for data frames with only one row
#Adds an empty row and then removes it
result = paste(paste(x, collapse = "\n"),"\n",sep = "")
result = readr::read_delim(result, delim = "\t", ...)[1,]
}else{
result = paste(x, collapse = "\n")
result = readr::read_delim(result, delim = "\t", ...)
}
} else{
#Return NULL if the nothing is returned from tabix file
result = NULL
}
return(result)
}, ...)
return(df_list)
}
#' Import variant information extracted from VCF file into R
#'
#' The variant information text file can be generated from the VCF using the following
#' bcftools command:
#' bcftools query -f '\%CHROM\\t\%POS\\t\%ID\\t\%REF\\t\%ALT\\t\%TYPE\\t\%AC\\t\%AN\\n' path/to/vcf_file.vcf.gz | bgzip > path/to/variant_infromation_file.txt.gz
#'
#' @param path Path to the the variant information text file.
#' @export
importVariantInformation <- function(path){
info_col_names = c("chr","pos","snp_id","ref","alt","type","AC","AN")
into_col_types = "ciccccii"
snp_info = readr::read_delim(path, delim = "\t", col_types = into_col_types, col_names = info_col_names)
snp_info = dplyr::mutate(snp_info, indel_length = pmax(nchar(alt), nchar(ref))) %>%
dplyr::mutate(is_indel = ifelse(indel_length > 1, TRUE, FALSE)) %>%
dplyr::mutate(MAF = pmin(AC/AN, 1-(AC/AN)))
return(snp_info)
}
#' Import a specific region from a tabix-indexed GWAS summary stats file
tabixFetchGWASSummary <- function(granges, summary_path){
gwas_col_names = c("snp_id", "chr", "pos", "effect_allele", "MAF",
"p_nominal", "beta", "OR", "log_OR", "se", "z_score", "trait", "PMID", "used_file")
gwas_col_types = c("ccicdddddddccc")
gwas_pvalues = scanTabixDataFrame(summary_path, granges, col_names = gwas_col_names, col_types = gwas_col_types)
return(gwas_pvalues)
}
#' Import full GWAS summary stats file
importGWASSummary <- function(summary_path){
gwas_col_names = c("snp_id", "chr", "pos", "effect_allele", "MAF",
"p_nominal", "beta", "OR", "log_OR", "se", "z_score", "trait", "PMID", "used_file")
gwas_col_types = c("ccicdddddddccc")
gwas_pvals = readr::read_tsv(summary_path,
col_names = gwas_col_names, col_types = gwas_col_types, skip = 1)
return(gwas_pvals)
}
importMotifDisruptions <- function(path){
motif_colnames = c("gene_id","snp_id","snp_count",".row","start","strand","motif_id","ref_abs_score","ref_rel_score",
"ref_match","alt_abs_score","alt_rel_score","alt_match","rel_diff","max_rel_score")
motif_disruptions = readr::read_delim(path, delim = "\t", col_names = motif_colnames)
return(motif_disruptions)
}
loadSalmonCounts <- function(sample_dir, sample_names, counts_suffix = ".ensembl85.quant.sf.gz",
sub_dir = TRUE, column_name = "TPM", ...){
#Load featureCounts output into R
matrix = c()
for (i in c(1:length(sample_names))){
if (sub_dir == TRUE){
path = file.path(sample_dir, sample_names[i], paste0(sample_names[i], counts_suffix))
} else {
path = file.path(sample_dir, paste0(sample_names[i], counts_suffix))
}
print(sample_names[i])
columns = c("Name","Length","EffectiveLength","TPM","NumReads")
table = readr::read_delim(path, delim = "\t", col_names = columns, col_types = "cdddd", ...)
#Calculate relative length
if(column_name == "RelativeLength"){
table = dplyr::mutate(table, RelativeLength = EffectiveLength/Length)
}
if (i == 1){
matrix = table[,c("Name",column_name)]
}
else{
matrix = dplyr::bind_cols(matrix, table[,column_name])
}
}
colnames(matrix) = c("transcript_id", sample_names)
return(matrix)
}
salmonSummarizedExperiment <- function(sample_metadata, transcript_metadata, sample_dir, counts_suffix, sub_dir = FALSE, ...){
#Load salmon lengths and merge with transcript metadata
salmon_lengths = loadSalmonCounts(sample_dir, sample_metadata$sample_id[1], counts_suffix, sub_dir, column_name = "Length", ...)
colnames(salmon_lengths)[2] = "salmon_length"
transcript_meta = dplyr::left_join(salmon_lengths, transcript_metadata, by = "transcript_id") %>%
dplyr::select(transcript_id, gene_id, gene_name, everything()) %>%
as.data.frame()
rownames(transcript_meta) = transcript_meta$transcript_id
#Import assays
tpms = loadSalmonCounts(sample_dir, sample_metadata$sample_id, counts_suffix, sub_dir, column_name = "TPM", ...) %>%
tibbleToNamedMatrix()
counts = loadSalmonCounts(sample_dir, sample_metadata$sample_id, counts_suffix, sub_dir, column_name = "NumReads", ...) %>%
tibbleToNamedMatrix()
relLengths = loadSalmonCounts(sample_dir, sample_metadata$sample_id, counts_suffix, sub_dir, column_name = "RelativeLength", ...) %>%
tibbleToNamedMatrix()
#Calculate TPM ratios
#Set up the TPM data frame for computatiob
gene_transcript_map = dplyr::select(transcript_metadata, transcript_id, gene_id)
tpm_df = dplyr::mutate(as.data.frame(tpms), transcript_id = rownames(tpms)) %>%
tbl_df() %>%
dplyr::left_join(gene_transcript_map, by = "transcript_id") %>%
dplyr::select(-transcript_id) %>%
dplyr::select(gene_id, everything())
#Calculate total expression per gene
gene_total_tpms = purrr::slice_rows(tpm_df, "gene_id") %>%
purrr::by_slice(~colSums(.) %>%
t() %>%
as.data.frame(), .collate = "rows")
#Make matrix of gene expression values for each transcript
tx_gene_expression = dplyr::left_join(gene_transcript_map, gene_total_tpms, by = "gene_id")
tx_gene_matrix = dplyr::select(tx_gene_expression, -gene_id, -transcript_id) %>% as.matrix()
rownames(tx_gene_matrix) = tx_gene_expression$transcript_id
#calculate TPM ratios and add them to the SummarizedExperiment
tpm_ratios = tpms/tx_gene_matrix[rownames(tpms),]
#Construct SummarizedExperiemnt
se = SummarizedExperiment::SummarizedExperiment(assays =
list(counts = counts, tpms = tpms, relLengths = relLengths, tpm_ratios = tpm_ratios),
colData = sample_metadata,
rowData = transcript_meta)
return(se)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.