#! R
#' DESeq2 module taking metadata_csv nad raw count data frame to run DE analysis
#'
#' @param count_data object in matrix format (integers) with rownames as genes and colnames as sample IDs
#' @param anno_tb tibble of ensembl_gene_id - external_gene_id mappings for annotation
#' @param tpm_tb tibble of tpms to append to results for output
#' @param tag string used to prefix output
#' @param metadata_csv path to file holding metadata
#' @param metadata_design string of design using colnames of metadata_csv
#' @param control_reference string indicating the 'control' intgroup for last element of design
#' @param output_dir path to where output goes
#' @param ens_version ENSEMBL version to use
#' @param org_prefix organism prefix from ENSEMBL
#' @param delim_samples character to delimit sample IDs (default: ".")
#' @return nz_fc_co raw count object with no lines summing to zeros
#' @importFrom magrittr '%>%'
#' @export
DESeq2_module <- function(count_data, anno_tb = NULL, tpm_tb = NULL, tag = NULL, metadata_csv = NULL, metadata_design = NULL, control_reference = NULL, output_dir = NULL, delim_samples = "\\."){
print("Running: DESeq2_module()")
##warnigns for undefined param
if(is.null(tag)){
stop("Please provide a tag to name your outputs")
}
if(is.null(metadata_csv)){
stop("Please specify a metadata_csv file in CSV format")
}
if(is.null(metadata_design)){
print("Please specify a metadata_design for DESeq2")
print("NB this should be format: first + second + ... + last")
print("NBB that 'last' in design is condition of interest in DE analysis")
stop("NBBB that all elements of the design must be names of columns in 'metadata' file")
}
# if(is.null(control_reference)){
# print("Please specify a control_reference from a column in metadata_csv for DESeq2")
# stop("NB that this should be one of the 'intgroup' levels")
# }
if(is.null(output_dir)){
print("No directory specified for output, the current dir will be used")
output_dir <- "./DEseq2"
} else {
output_dir <- paste0(output_dir, "/DEseq2")
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
}
##read in condition data
cond <- readr::read_csv(metadata_csv)
cond_df <- as.data.frame(cond)
rownames(cond_df) <- cond_df[,1]
##overlap w/count_data and sort
cond_df <- cond_df[rownames(cond_df) %in% colnames(count_data),]
cond_df <- cond_df[sort(rownames(cond_df)),]
##break design into components
design_vec <- unlist(lapply(metadata_design, function(f){gsub(" ", "", strsplit(f, "\\+")[[1]])}))
##take only cols of interest and define major condition
cond_df <- cond_df[,colnames(cond_df) %in% c("sample", design_vec)]
CONDITION <- rev(design_vec)[1]
##create factors
for(x in 1:length(colnames(cond_df))){
cond_df[,x] <- factor(cond_df[,x])
}
##change reference level if specified
if(!is.null(control_reference)){
cond_df[,CONDITION] <- relevel(cond_df[,CONDITION], ref = control_reference)
}
##DESeq2DataSet object
dds <- DESeq2::DESeqDataSetFromMatrix(countData = count_data,
colData = cond_df,
design = formula(paste0("~ ", metadata_design)))
##run DESeq2, make results
ddseq <- DESeq2::DESeq(dds)
##make all contrasts of CONDITION, then set into named list
combn_mat <- t(combn(levels(cond_df[,CONDITION]),2))
combns <- apply(combn_mat, 1, function(f){paste(f, collapse="-")})
DESeq2_res_list <- lapply(combns, function(f){
ress <- na.omit(DESeq2::results(ddseq, contrast = c(CONDITION, strsplit(f, "-")[[1]])))
ress_tb <- tibble::as_tibble(as.data.frame(ress), rownames="ensembl_gene_id")
if(!is.null(anno_tb)){
if(class(anno_tb)[1] != "tbl_df"){
anno_tb <- tibble::as_tibble(anno_tb)
}
if(any(colnames(anno_tb) %in% "target_id")==TRUE){
anno_tb <- anno_tb %>% dplyr::select(-target_id)
}
ress_tb <- dplyr::left_join(anno_tb, ress_tb) %>%
na.omit() %>%
dplyr::arrange(padj) %>%
tibble::as_tibble() %>%
dplyr::distinct()
}
if(!is.null(tpm_tb)){
num_cols <- unlist(lapply(tpm_tb[1,], is.numeric))
colnames(tpm_tb)[num_cols] <- paste0(colnames(tpm_tb)[num_cols], "_tpm")
ress_tb <- dplyr::left_join(ress_tb, tpm_tb) %>%
na.omit() %>%
dplyr::arrange(padj) %>%
tibble::as_tibble() %>%
dplyr::distinct()
}
readr::write_tsv(ress_tb, paste0(output_dir, "/", tag, ".res.", f, ".DESeq2.tsv"))
return(ress_tb)
})
names(DESeq2_res_list) <- unlist(lapply(combns, function(f){
paste(paste0(CONDITION, strsplit(f, "-")[[1]]), collapse = "-")
}))
saveRDS(DESeq2_res_list, file = paste0(output_dir, "/", tag, ".DESeq2_res_list.rds"))
##plots
vsd <- DESeq2::vst(dds, blind = TRUE)
sampleDists <- dist(t(SummarizedExperiment::assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
colors <- grDevices::colorRampPalette( rev(RColorBrewer::brewer.pal(9, "Blues")) )(256)
hc <- hclust(sampleDists)
pdf(paste0(output_dir, "/", tag, ".heatmap.pdf"))
heatmap(sampleDistMatrix, Rowv=as.dendrogram(hc),
symm=TRUE, col=colors,
margins=c(2,10), labCol=FALSE, cexRow = 0.8 )
dev.off()
##PCA
lapply(design_vec, function(f){
bmpcaplot <- BMplotPCA(vsd, intgroup=c(f), anno_tb = anno_tb, pc_limit = 5)
pdf(paste0(output_dir, "/", tag, ".", f, ".PCA_PC_loadings.pdf"), onefile = TRUE)
lapply(bmpcaplot[[1]], print)
print(bmpcaplot[[2]])
print(bmpcaplot[[3]])
dev.off()
})
}
#' Plotting PCA function
#' @param x variance stabilized DESeq2 object
#' @param intgroup which colname from metadata_csv to be output on plot
#' @param ntop top n genes to use by variance
#' @param anno_tb tibble of ensembl_gene_id - external_gene_id mappings for annotation
#' @param pc_limit integer percent variance accounted by PC for inclusion in plots (default: 10%)
#' @return list of ggplot2 objects for printing (PCA, PCs, loadings)
#' @export
BMplotPCA <- function(x, intgroup = NULL, ntop = 15000, anno_tb, pc_limit = 10) {
rv <- matrixStats::rowVars(SummarizedExperiment::assay(x))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop,
length(rv)))]
pca <- prcomp(t(SummarizedExperiment::assay(x)[select, ]))
sdPc <- apply(pca$x, 2, sd)
percentVar <- sdPc^2/sum(sdPc^2)
sdRot <- apply(pca$rotation, 2, sd)
percentVarRot <- sdRot^2/sum(sdRot^2)
if (!all(intgroup %in% names(SummarizedExperiment::colData(x)))) {
stop("The argument 'intgroup' should specify columns of colData(xx)")
}
intgroup.df <- as.data.frame(SummarizedExperiment::colData(x)[, intgroup, drop = FALSE])
group <- factor(apply(intgroup.df, 1, paste, collapse = " : "))
pca_plot <- function(d, group, intgroup.df, pcv){
if(nlevels(group)>6){
ggp <- ggplot2::ggplot(data = d,
ggplot2::aes_string(x = "PC1",
y = colnames(d)[2],
group = intgroup)) +
ggplot2::scale_shape_manual(values = 1:dim(unique(intgroup.df))[1]) +
ggplot2::labs(paste0("PCA plot using ", colnames(intgroup.df)), x = paste0("PC1: ", round(pcv[1] * 100), "% variance"),y = paste0(colnames(d)[2], ": ", round(pcv[2] * 100), "% variance")) +
ggrepel::geom_text_repel(label = rownames(d),
colour = "black",
size = 2,
fontface = "bold") +
ggplot2::geom_point(ggplot2::aes_string(shape = intgroup,
colour = intgroup,
fill = intgroup),
size = 3) +
ggplot2::ggtitle(paste0("PCA plot using ", colnames(intgroup.df)),
subtitle = paste0(colnames(d)[1], " vs. ", colnames(d)[2]))
}
if(nlevels(group)<=6){
ggp <- ggplot2::ggplot(data = d, ggplot2::aes_string(x = "PC1",
y = colnames(d)[2],
group = intgroup)) +
# ggplot2::aes(x = PC1, y = PC2, group = group, shape = group, colour = group)) +
ggplot2::geom_point(ggplot2::aes_string(shape = intgroup,
colour = intgroup,
fill = intgroup),
size = 3) +
ggplot2::scale_shape_discrete(solid = T) +
ggplot2::xlab(paste0("PC1: ", round(pcv[1] * 100), "% variance")) +
ggplot2::ylab(paste0(colnames(d)[2], ": ", round(pcv[2] * 100), "% variance")) +
ggrepel::geom_text_repel(label = rownames(d),
colour = "black",
size = 2,
fontface = "bold") +
# ggplot2::annotate("text",x=pca$x[,1], y = pca$x[,2]-0.4, label = colnames(x), cex = 1.6) +
ggplot2::ggtitle(paste0("PCA plot using ", colnames(intgroup.df)),
subtitle = paste0(colnames(d)[1], " vs. ", colnames(d)[2]))
}
return(ggp)
}
##lapply to make all PC > pc_limit included
pcv_u <- percentVar[percentVar > pc_limit/100]
ggps <- lapply(2:length(pcv_u), function(f){
d <- data.frame(PC1 = pca$x[, 1], PC2 = pca$x[, f], intgroup.df, names = colnames(x))
colnames(d)[colnames(d) == "PC2"] <- paste0("PC", f)
ggp <- pca_plot(d, group, intgroup.df, pcv = percentVar[c(1,f)])
return(ggp)
})
##scree plot showing contributions of PCs
pcv <- data.frame(PC = colnames(pca$x), percent_variance = percentVar)
pcv <- pcv[order(pcv$percent_variance, decreasing = TRUE),]
levels(pcv$PC) <- pcv$PC
ggs <- ggplot2::ggplot(data = pcv, ggplot2::aes(x = PC, y = percent_variance )) +
ggplot2::geom_col() +
ggplot2::ggtitle("Proprotion of Variances of Principle Components") +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90))
##loading top 5, bottom 5 from 10 top PCs
top_pc <- pcv[1:10,][!is.na(pcv[1:10, "PC"]),]
top_pc$percent_variance <- paste0(round(top_pc$percent_variance, digits = 3)*100, "%")
pca_rot <- pca$rotation[,top_pc$PC]
fives_list <- lapply(colnames(pca_rot), function(f){
fc <- c(tail(sort(pca_rot[,f]),5), head(sort(pca_rot[,f]),5))
nfc <- dplyr::filter(.data = anno_tb, ensembl_gene_id %in% names(fc))
names(fc) <- unlist(lapply(names(fc), function(ff){
dist_nfc <- dplyr::filter(.data = nfc, ensembl_gene_id %in% ff) %>%
dplyr::select(external_gene_name) %>% dplyr::distinct() %>% unlist()
paste(dist_nfc, collapse = ",")
}))
return(data.frame(gene = names(fc), loading = fc, PC = f))
})
loadings_plot <- do.call(rbind,fives_list) %>%
dplyr::left_join(top_pc) %>%
dplyr::mutate(PC = paste0(PC, "\n(", percent_variance, ")"))
ggl <- ggplot2::ggplot(data = loadings_plot,
ggplot2::aes_string(x = "PC", y = "loading", label = "gene")) +
ggplot2::geom_point(ggplot2::aes_string(colour = "loading"),
size = 3) +
ggplot2::scale_shape_discrete(solid = T) +
ggrepel::geom_text_repel(colour = "black",
size = 2,
fontface = "bold") +
ggplot2::ggtitle("Loadings plot, top 10 PCs by variance, top/bottom 5 genes") +
ggplot2::xlab("PC (% variance accounted for)")
return(list(ggps, ggs, ggl))
}
#' Parse information from STAR run to find used GTF file
#' @importFrom magrittr '%>%'
#' @export
genesGTF <- function(output_dir = OUTDR){
##get genes.gtf input
genesgtf_file <- system("find work/stage -name genes.gtf | grep -v STAR", intern=T)
system(paste0("cat ", genesgtf_file, " | perl -ane 'if(($F[1] eq \"ensembl\") && ($F[2] eq \"CDS\")){print $_;}' > ", output_dir, "/gene.ensembl.CDS.gtf"))
cds_gtf_idnm <- read_table2(paste0(output_dir, "/gene.ensembl.CDS.gtf")) %>%
dplyr::select(14,16)
##parse CDS from genes.gtf
splitFun <- function(x){unlist(lapply(x, function(f){strsplit(f, '\\"')[[1]][2]}))}
cds_gtf_nm <- tibble(ensembl_gene_id = splitFun(unlist(cds_gtf_idnm[,1])),
external_gene_name = splitFun(unlist(cds_gtf_idnm[,2]))) %>%
distinct() %>%
arrange(ensembl_gene_id)
return(cds_gtf_nm)
}
#' Take two inputs of datasets in biomaRt::useMart and return the tibble of the matched orthologs
#'
#' @param human_genome is the genome against which we want orthologs (set as human but changeble with unknown results)
#' @param org_prefix string indicating the name of the organism which is suffixed with '_gene_ensembl' for biomaRt
#' @param ens_version string indicating version of ENSEMBL to use
#' @return ensid2gene2orth a tibble of orthologs in ensembl - gene name - ortholog
#' @importFrom magrittr '%>%'
#' @export
biomaRt_anno_orth <- function(human_genome = "hsapiens_gene_ensembl", org_prefix = NULL, ens_version = NULL){
##need a way to access versions
##if you know you need a specific version for acces to a specific genome version
##then specify version; otehrwise defaults to latest by below
if(is.null(ens_version)){
ens_version <- rev(strsplit(biomaRt::listMarts()[1,2], " ")[[1]])[1]
}
##specifies the HOST connect (URL)
HOST <- as_tibble(listEnsemblArchives()) %>%
dplyr::filter(version %in% ens_version) %>%
dplyr::select(url) %>% unlist()
##access the two genomes
mart1 <- biomaRt::useMart(biomart = "ensembl", dataset = human_genome, host = HOST)
mart2 <- biomaRt::useMart(biomart = "ensembl", dataset = paste0(org_prefix, "_gene_ensembl"), host = HOST)
##first genome genes with homologs for second genome
ensid2gene1 <- as_tibble(biomaRt::getBM(attributes=c("ensembl_gene_id", "external_gene_name", paste0(org_prefix, "_homolog_ensembl_gene")), mart = mart1))
##second genome, with renaming to allow join with above
renames <- c(paste0(org_prefix,"_homolog_ensembl_gene"),
paste0(org_prefix,"_homolog_external_name"))
ensid2gene2 <- as_tibble(biomaRt::getBM(attributes=c("ensembl_gene_id", "external_gene_name"), mart = mart2))
ensid2gene2 <- dplyr::select(.data = ensid2gene2, !!renames[1] := ensembl_gene_id, !!renames[2] := external_gene_name)
##join to get human and ortholog IDs
ensid2gene2orth <- left_join(ensid2gene1, ensid2gene2, by=renames[1]) %>%
na.omit()
return(ensid2gene2orth)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.