#' @title Hierarchical cluster analysis
#' @description Hierarchical cluster analysis using several methods such as
#' ward.D", "ward.D2", "single", "complete", "average" (= UPGMA),
#' "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).
#' @param tabDF is a dataframe or numeric matrix, each row represents a gene,
#' each column represents a sample come from TCGAPrepare.
#' @param method is method to be used for generic cluster such as 'hclust'
#' or 'consensus'
#' @param methodHC is method to be used for Hierarchical cluster.
#' @import stats
#' @export
#' @return object of class hclust if method selected is 'hclust'.
#' If method selected is 'Consensus' returns a list of length maxK
#' (maximum cluster number to evaluate.). Each element is a list containing
#' consensusMatrix (numerical matrix), consensusTree (hclust), consensusClass
#' (consensus class assignments). ConsensusClusterPlus also produces images.
TCGAanalyze_Clustering <- function(
tabDF,
method,
methodHC = "ward.D2"
) {
if (!requireNamespace("ConsensusClusterPlus", quietly = TRUE)) {
stop("ConsensusClusterPlus is needed. Please install it.", call. = FALSE)
}
if (method == "hclust") {
ans <- hclust(ddist <- dist(tabDF), method = methodHC)
}
if (method == "consensus") {
sHc <- hclust(ddist <- dist(tabDF), method = methodHC)
ans <- ConsensusClusterPlus::ConsensusClusterPlus(
ddist,
maxK = 7,
pItem = 0.9,
reps = 1000,
title = "mc_consensus_k7_1000",
clusterAlg = "hc",
innerLinkage = "ward.D2",
finalLinkage = "complete",
plot = 'pdf',
writeTable = TRUE
)
}
return(ans)
}
#' @title Array Array Intensity correlation (AAIC) and correlation boxplot to define outlier
#' @description TCGAanalyze_Preprocessing perform Array Array Intensity correlation (AAIC).
#' It defines a square symmetric matrix of spearman correlation among samples.
#' According this matrix and boxplot of correlation samples by samples it is possible
#' to find samples with low correlation that can be identified as possible outliers.
#' @param object gene expression of class RangedSummarizedExperiment from TCGAprepare
#' @param cor.cut is a threshold to filter samples according their spearman correlation in
#' samples by samples. default cor.cut is 0
#' @param filename Filename of the image file
#' @param width Image width
#' @param height Image height
#' @param datatype is a string from RangedSummarizedExperiment assay
#' @importFrom grDevices dev.list
#' @importFrom SummarizedExperiment assays
#' @export
#' @return Plot with array array intensity correlation and boxplot of correlation samples by samples
TCGAanalyze_Preprocessing <- function(
object,
cor.cut = 0,
datatype = names(assays(object))[1],
filename = NULL,
width = 1000,
height = 1000
) {
# This is a work around for raw_counts and raw_count
if(missing(object))
stop("Please set object argument")
if(!is(object,"RangedSummarizedExperiment"))
stop("Input object should be a RangedSummarizedExperiment")
if (grepl("raw_counts", datatype) & any(grepl("raw_counts", names(assays(object)))))
datatype <- names(assays(object))[grepl("raw_counts", names(assays(object)))]
if (!any(grepl(datatype, names(assays(object)))))
stop(
paste0(
datatype,
" not found in the assay list: ",
paste(names(assays(object)), collapse = ", "),
"\n Please set the correct datatype argument."
)
)
if (!(is.null(dev.list()["RStudioGD"]))) {
dev.off()
}
if (is.null(filename)) filename <- "PreprocessingOutput.png"
png(filename, width = width, height = height)
par(oma = c(10, 10, 10, 10))
ArrayIndex <- as.character(1:length(colData(object)$barcode))
pmat_new <- matrix(0, length(ArrayIndex), 4)
colnames(pmat_new) <- c("Disease", "platform", "SampleID", "Study")
rownames(pmat_new) <- as.character(colData(object)$barcode)
pmat_new <- as.data.frame(pmat_new)
pmat_new$Disease <- as.character(colData(object)$definition)
pmat_new$platform <- "platform"
pmat_new$SampleID <- as.character(colData(object)$barcode)
pmat_new$Study <- "study"
tabGroupCol <- cbind(pmat_new, Color = matrix(0, nrow(pmat_new), 1))
for (i in seq_along(unique(tabGroupCol$Disease))) {
tabGroupCol[which(tabGroupCol$Disease == tabGroupCol$Disease[i]), "Color"] <-
rainbow(length(unique(tabGroupCol$Disease)))[i]
}
pmat <- pmat_new
phenodepth <- min(ncol(pmat), 3)
order <- switch(
phenodepth + 1,
ArrayIndex,
order(pmat[, 1]),
order(pmat[, 1], pmat[, 2]),
order(pmat[, 1], pmat[, 2], pmat[, 3])
)
arraypos <- (1:length(ArrayIndex)) * (1 / (length(ArrayIndex) - 1)) - (1 / (length(ArrayIndex) - 1))
arraypos2 = seq(1:length(ArrayIndex) - 1)
for (i in 2:length(ArrayIndex)) {
arraypos2[i - 1] <- (arraypos[i] + arraypos[i - 1]) / 2
}
layout(matrix(c(1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 3, 3, 3, 4), 4, 4, byrow = TRUE))
c <- cor(assay(object, datatype)[, order], method = "spearman")
image(c,
xaxt = "n",
yaxt = "n",
#xlab = "Array Samples",
#ylab = "Array Samples",
main = "Array-Array Intensity Correlation after RMA"
)
for (i in 1:length(names(table(tabGroupCol$Color)))) {
currentCol <- names(table(tabGroupCol$Color))[i]
pos.col <- arraypos[which(tabGroupCol$Color == currentCol)]
lab.col <- colnames(c)[which(tabGroupCol$Color == currentCol)]
#axis(1, labels = lab.col , at = pos.col, col = currentCol,lwd = 6,las = 2)
axis(
2,
labels = lab.col ,
at = pos.col,
col = currentCol,
lwd = 6,
las = 2
)
}
m <- matrix(pretty(c, 10), nrow = 1, ncol = length(pretty(c, 10)))
image(
m,
xaxt = "n",
yaxt = "n",
ylab = "Correlation Coefficient"
)
axis(2,
labels = as.list(pretty(c, 10)),
at = seq(0, 1, by = (1 / (
length(pretty(c, 10)) - 1
))))
abline(
h = seq(
(1 / ( length(pretty(c, 10)) - 1)) / 2,
1 - (1 / ( length(pretty(c, 10)) - 1)),
by = (1 / ( length(pretty(c, 10)) - 1))
)
)
box()
boxplot(
c,
outline = FALSE,
las = 2,
lwd = 6,
# names = NULL,
col = tabGroupCol$Color,
main = "Boxplot of correlation samples by samples after normalization"
)
dev.off()
samplesCor <- rowMeans(c)
message("Number of outliers: ", sum(samplesCor < cor.cut))
objectWO <- assay(object, datatype)[, names(samplesCor)[samplesCor > cor.cut]]
return(objectWO)
}
#' @title survival analysis (SA) univariate with Kaplan-Meier (KM) method.
#' @description TCGAanalyze_SurvivalKM perform an univariate Kaplan-Meier (KM) survival analysis (SA).
#' It performed Kaplan-Meier survival univariate using complete follow up with all days
#' taking one gene a time from Genelist of gene symbols.
#' For each gene according its level of mean expression in cancer samples,
#' defining two thresholds for quantile
#' expression of that gene in all samples (default ThreshTop=0.67,ThreshDown=0.33) it is possible
#' to define a threshold of intensity of gene expression to divide the samples in 3 groups
#' (High, intermediate, low).
#' TCGAanalyze_SurvivalKM performs SA between High and low groups using following functions
#' from survival package
#' \enumerate{
#' \item survival::Surv
#' \item survival::survdiff
#' \item survival::survfit
#' }
#' @param clinical_patient is a data.frame using function 'clinic' with information
#' related to barcode / samples such as bcr_patient_barcode, days_to_death ,
#' days_to_last_follow_up , vital_status, etc
#' @param dataGE is a matrix of Gene expression (genes in rows, samples in cols) from TCGAprepare
#' @param Genelist is a list of gene symbols where perform survival KM.
#' @param Survresult is a parameter (default = FALSE) if is TRUE will show KM plot and results.
#' @param ThreshTop is a quantile threshold to identify samples with high expression of a gene
#' @param ThreshDown is a quantile threshold to identify samples with low expression of a gene
#' @param p.cut p.values threshold. Default: 0.05
#' @param group1 a string containing the barcode list of the samples in in control group
#' @param group2 a string containing the barcode list of the samples in in disease group
#' @export
#' @return table with survival genes pvalues from KM.
#' @examples
#' # Selecting only 20 genes for example
#' dataBRCAcomplete <- log2(dataBRCA[1:20,] + 1)
#'
#' # clinical_patient_Cancer <- GDCquery_clinic("TCGA-BRCA","clinical")
#' clinical_patient_Cancer <- data.frame(
#' bcr_patient_barcode = substr(colnames(dataBRCAcomplete),1,12),
#' vital_status = c(rep("alive",3),"dead",rep("alive",2),rep(c("dead","alive"),2)),
#' days_to_death = c(NA,NA,NA,172,NA,NA,3472,NA,786,NA),
#' days_to_last_follow_up = c(3011,965,718,NA,1914,423,NA,5,656,1417)
#' )
#'
#' group1 <- TCGAquery_SampleTypes(colnames(dataBRCAcomplete), typesample = c("NT"))
#' group2 <- TCGAquery_SampleTypes(colnames(dataBRCAcomplete), typesample = c("TP"))
#'
#' tabSurvKM <- TCGAanalyze_SurvivalKM(
#' clinical_patient = clinical_patient_Cancer,
#' dataGE = dataBRCAcomplete,
#' Genelist = rownames(dataBRCAcomplete),
#' Survresult = FALSE,
#' p.cut = 0.4,
#' ThreshTop = 0.67,
#' ThreshDown = 0.33,
#' group1 = group1, # Control group
#' group2 = group2
#' ) # Disease group
#'
#' # If the groups are not specified group1 == group2 and all samples are used
#' \dontrun{
#' tabSurvKM <- TCGAanalyze_SurvivalKM(
#' clinical_patient_Cancer,
#' dataBRCAcomplete,
#' Genelist = rownames(dataBRCAcomplete),
#' Survresult = TRUE,
#' p.cut = 0.2,
#' ThreshTop = 0.67,
#' ThreshDown = 0.33
#' )
#' }
TCGAanalyze_SurvivalKM <- function(
clinical_patient,
dataGE,
Genelist,
Survresult = FALSE,
ThreshTop = 0.67,
ThreshDown = 0.33,
p.cut = 0.05,
group1,
group2
) {
check_package("survival")
# Check which genes we really have in the matrix
Genelist <- intersect(rownames(dataGE), Genelist)
# Split gene expression matrix btw the groups
dataCancer <- dataGE[Genelist, group2, drop = FALSE]
dataNormal <- dataGE[Genelist, group1, drop = FALSE]
colnames(dataCancer) <- substr(colnames(dataCancer), 1, 12)
cfu <- clinical_patient[clinical_patient[, "bcr_patient_barcode"] %in% substr(colnames(dataCancer), 1, 12), ]
if ("days_to_last_followup" %in% colnames(cfu))
colnames(cfu)[grep("days_to_last_followup", colnames(cfu))] <- "days_to_last_follow_up"
cfu <- as.data.frame(
subset(
cfu,
select = c(
"bcr_patient_barcode",
"days_to_death",
"days_to_last_follow_up",
"vital_status"
)
))
# Set alive death to inf
if (length(grep("alive", cfu$vital_status, ignore.case = TRUE)) > 0)
cfu[grep("alive", cfu$vital_status, ignore.case = TRUE), "days_to_death"] <- "-Inf"
# Set dead follow up to inf
if (length(grep("dead", cfu$vital_status, ignore.case = TRUE)) > 0)
cfu[grep("dead", cfu$vital_status, ignore.case = TRUE), "days_to_last_follow_up"] <- "-Inf"
cfu <- cfu[!(is.na(cfu[, "days_to_last_follow_up"])), ]
cfu <- cfu[!(is.na(cfu[, "days_to_death"])), ]
followUpLevel <- FALSE
cfu$days_to_death <- as.numeric(as.character(cfu$days_to_death))
cfu$days_to_last_follow_up <- as.numeric(as.character(cfu$days_to_last_follow_up))
rownames(cfu) <- cfu[, "bcr_patient_barcode"] #mod1
cfu <- cfu[!(is.na(cfu[, "days_to_last_follow_up"])), ]
cfu <- cfu[!(is.na(cfu[, "days_to_death"])), ]
cfu_complete <- cfu
ngenes <- nrow(as.matrix(rownames(dataNormal)))
# Evaluate each gene
tabSurv_Matrix <- plyr::adply(
.data = 1:length(rownames(dataNormal)),
.margins = 1,
.fun = function(i){
mRNAselected <- as.matrix(rownames(dataNormal))[i]
mRNAselected_values <- dataCancer %>% dplyr::filter(rownames(dataCancer) == mRNAselected) %>% as.numeric
mRNAselected_values_normal <- dataNormal %>% dplyr::filter(rownames(dataNormal) == mRNAselected) %>% as.numeric
if (all(mRNAselected_values == 0)) return(NULL) # All genes are 0
tabSurv_Matrix <- data.frame("mRNA" = mRNAselected)
# Get Thresh values for cancer expression
mRNAselected_values_ordered <- sort(mRNAselected_values, decreasing = TRUE)
mRNAselected_values_ordered_top <- as.numeric(quantile(as.numeric(mRNAselected_values_ordered), ThreshTop)[1])
mRNAselected_values_ordered_down <- as.numeric(quantile(as.numeric(mRNAselected_values_ordered), ThreshDown)[1])
mRNAselected_values_newvector <- mRNAselected_values
if (!is.na(mRNAselected_values_ordered_top)) {
# How many samples do we have
numberOfSamples <- length(mRNAselected_values_ordered)
# High group (above ThreshTop)
lastelementTOP <- max(which(
mRNAselected_values_ordered > mRNAselected_values_ordered_top
))
# Low group (below ThreshDown)
firstelementDOWN <- min(
which(
mRNAselected_values_ordered <= mRNAselected_values_ordered_down
)
)
samples_top_mRNA_selected <- names(mRNAselected_values_ordered[1:lastelementTOP])
samples_down_mRNA_selected <- names(mRNAselected_values_ordered[firstelementDOWN:numberOfSamples])
# Which samples are in the intermediate group (above ThreshLow and below ThreshTop)
samples_UNCHANGED_mRNA_selected <- names(mRNAselected_values_newvector[
which((mRNAselected_values_newvector) > mRNAselected_values_ordered_down & mRNAselected_values_newvector < mRNAselected_values_ordered_top
)])
cfu_onlyTOP <- cfu_complete[cfu_complete[, "bcr_patient_barcode"] %in% samples_top_mRNA_selected, ]
cfu_onlyDOWN <- cfu_complete[cfu_complete[, "bcr_patient_barcode"] %in% samples_down_mRNA_selected, ]
cfu_onlyUNCHANGED <- cfu_complete[cfu_complete[, "bcr_patient_barcode"] %in% samples_UNCHANGED_mRNA_selected, ]
cfu_ordered <- NULL
cfu_ordered <- rbind(cfu_onlyTOP, cfu_onlyDOWN)
cfu <- cfu_ordered
ttime <- as.numeric(cfu[, "days_to_death"])
sum(status <- ttime > 0) # morti
deads_complete <- sum(status <- ttime > 0)
ttime_only_top <- cfu_onlyTOP[, "days_to_death"]
deads_top <- sum(ttime_only_top > 0)
if (dim(cfu_onlyDOWN)[1] >= 1) {
ttime_only_down <- cfu_onlyDOWN[, "days_to_death"]
deads_down <- sum(ttime_only_down > 0)
} else {
deads_down <- 0
}
tabSurv_Matrix[1, "Cancer Deaths"] <- deads_complete
tabSurv_Matrix[1, "Cancer Deaths with Top"] <- deads_top
tabSurv_Matrix[1, "Cancer Deaths with Down"] <- deads_down
tabSurv_Matrix[1, "Mean Normal"] <- mean(as.numeric(mRNAselected_values_normal))
dataCancer_onlyTop_sample <- dataCancer[, samples_top_mRNA_selected, drop = FALSE]
dataCancer_onlyTop_sample_mRNASelected <- dataCancer_onlyTop_sample[rownames(dataCancer_onlyTop_sample) == mRNAselected, ]
dataCancer_onlyDown_sample <- dataCancer[, samples_down_mRNA_selected, drop = FALSE]
dataCancer_onlyDown_sample_mRNASelected <- dataCancer_onlyDown_sample[rownames(dataCancer_onlyDown_sample) == mRNAselected, ]
tabSurv_Matrix[1, "Mean Tumor Top"] <- mean(as.numeric(dataCancer_onlyTop_sample_mRNASelected))
tabSurv_Matrix[1, "Mean Tumor Down"] <- mean(as.numeric(dataCancer_onlyDown_sample_mRNASelected))
ttime[!status] <- as.numeric(cfu[!status, "days_to_last_follow_up"])
ttime[which(ttime == -Inf)] <- 0
ttime <- survival::Surv(ttime, status)
rownames(ttime) <- rownames(cfu)
legendHigh <- paste(mRNAselected, "High")
legendLow <- paste(mRNAselected, "Low")
tabSurv_pvalue <- tryCatch({
tabSurv <- survival::survdiff(ttime ~ c(rep(
"top", nrow(cfu_onlyTOP)
), rep(
"down", nrow(cfu_onlyDOWN)
)))
tabSurv_chis <- unlist(tabSurv)$chisq
tabSurv_pvalue <-
as.numeric(1 - pchisq(abs(tabSurv$chisq), df = 1))
}, error = function(e) {
return(Inf)
})
tabSurv_Matrix[1, "pvalue"] <- tabSurv_pvalue
if (Survresult == TRUE) {
titlePlot <- paste("Kaplan-Meier Survival analysis, pvalue = ",tabSurv_pvalue)
plot(
survival::survfit(ttime ~ c(
rep("low", nrow(cfu_onlyTOP)), rep("high", nrow(cfu_onlyDOWN))
)),
col = c("green", "red"),
main = titlePlot,
xlab = "Days",
ylab = "Survival"
)
legend(
100,
1,
legend = c(legendLow, legendHigh),
col = c("green", "red"),
text.col = c("green", "red"),
pch = 15
)
print(tabSurv)
}
} #end if
tabSurv_Matrix
},.progress = "time") #end for
tabSurv_Matrix[tabSurv_Matrix == "-Inf"] <- 0
tabSurvKM <- tabSurv_Matrix
# Filtering by selected pvalue < 0.01
tabSurvKM <- tabSurvKM[tabSurvKM$mRNA != 0, ]
tabSurvKM <- tabSurvKM[tabSurvKM$pvalue < p.cut, ]
tabSurvKM <- tabSurvKM[!duplicated(tabSurvKM$mRNA), ]
rownames(tabSurvKM) <- tabSurvKM$mRNA
tabSurvKM <- tabSurvKM[, -1]
tabSurvKM <- tabSurvKM[order(tabSurvKM$pvalue, decreasing = FALSE), ]
colnames(tabSurvKM) <- gsub("Cancer", "Group2", colnames(tabSurvKM))
colnames(tabSurvKM) <- gsub("Tumor", "Group2", colnames(tabSurvKM))
colnames(tabSurvKM) <- gsub("Normal", "Group1", colnames(tabSurvKM))
return(tabSurvKM)
}
#' @title Filtering mRNA transcripts and miRNA selecting a threshold.
#' @description
#' TCGAanalyze_Filtering allows user to filter mRNA transcripts and miRNA,
#' samples, higher than the threshold defined quantile mean across all samples.
#' @param tabDF is a dataframe or numeric matrix, each row represents a gene,
#' each column represents a sample come from TCGAPrepare
#' @param method is method of filtering such as 'quantile', 'varFilter', 'filter1', 'filter2'
#' @param qnt.cut is threshold selected as mean for filtering
#' @param var.func is function used as the per-feature filtering statistic.
#' See genefilter documentation
#' @param var.cutoff is a numeric value. See genefilter documentation
#' @param eta is a parameter for filter1. default eta = 0.05.
#' @param foldChange is a parameter for filter2. default foldChange = 1.
#' @export
#' @return A filtered dataframe or numeric matrix where each row represents a gene,
#' each column represents a sample
#' @examples
#' dataNorm <- TCGAbiolinks::TCGAanalyze_Normalization(dataBRCA, geneInfo)
#' dataNorm <- TCGAanalyze_Normalization(tabDF = dataBRCA,
#' geneInfo = geneInfo,
#' method = "geneLength")
#' dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm, method = "quantile", qnt.cut = 0.25)
TCGAanalyze_Filtering <- function(
tabDF,
method,
qnt.cut = 0.25,
var.func = IQR,
var.cutoff = 0.75,
eta = 0.05,
foldChange = 1
) {
if (method == "quantile") {
GeneThresh <- as.numeric(quantile(rowMeans(tabDF,na.rm = TRUE), qnt.cut))
geneFiltered <- names(which(rowMeans(tabDF) > GeneThresh))
tabDF_Filt <- tabDF[geneFiltered,]
}
if (method == "varFilter") {
check_package("genefilter")
tabDF_Filt <- genefilter::varFilter(
tabDF,
var.func = IQR,
var.cutoff = 0.75,
filterByQuantile = TRUE
)
}
if (method == "filter1") {
normCounts <- tabDF
geData <- t(log(1 + normCounts, 2))
filter <-
apply(geData, 2, function(x)
sum(quantile(x, probs = c(1 - eta, eta)) * c(1, -1)))
tabDF_Filt <- geData[, which(filter > foldChange)]
}
if (method == "filter2") {
geData <- tabDF
filter <-
apply(geData, 2, function(x)
prod(quantile(x, probs = c(1 - eta, eta)) - 10) < 0)
tabDF_Filt <- geData[, which(filter)]
}
return(tabDF_Filt)
}
#' @title normalization mRNA transcripts and miRNA using EDASeq package.
#' @description
#' TCGAanalyze_Normalization allows user to normalize mRNA transcripts and miRNA,
#' using EDASeq package.
#'
#' Normalization for RNA-Seq Numerical and graphical
#' summaries of RNA-Seq read data. Within-lane normalization procedures
#' to adjust for GC-content effect (or other gene-level effects) on read counts:
#' loess robust local regression, global-scaling, and full-quantile normalization
#' (Risso et al., 2011). Between-lane normalization procedures to adjust for
#' distributional differences between lanes (e.g., sequencing depth):
#' global-scaling and full-quantile normalization (Bullard et al., 2010).
#'
#' For istance returns all mRNA or miRNA with mean across all
#' samples, higher than the threshold defined quantile mean across all samples.
#'
#' TCGAanalyze_Normalization performs normalization using following functions
#' from EDASeq
#' \enumerate{
#' \item EDASeq::newSeqExpressionSet
#' \item EDASeq::withinLaneNormalization
#' \item EDASeq::betweenLaneNormalization
#' \item EDASeq::counts
#' }
#' @param tabDF Rnaseq numeric matrix, each row represents a gene,
#' each column represents a sample
#' @param geneInfo Information matrix of 20531 genes about geneLength and gcContent.
#' Two objects are provided: TCGAbiolinks::geneInfoHT,TCGAbiolinks::geneInfo
#' @param method is method of normalization such as 'gcContent' or 'geneLength'
#' @export
#' @return Rnaseq matrix normalized with counts slot holds the count data as a matrix
#' of non-negative integer count values, one row for each observational unit (gene or the like),
#' and one column for each sample.
#' @examples
#' dataNorm <- TCGAbiolinks::TCGAanalyze_Normalization(dataBRCA, geneInfo)
TCGAanalyze_Normalization <- function(
tabDF,
geneInfo,
method = "geneLength"
) {
check_package("EDASeq")
# Check if we have a SE, we need a gene expression matrix
if (is(tabDF, "SummarizedExperiment")) tabDF <- assay(tabDF)
geneInfo <- geneInfo[!is.na(geneInfo[, 1]), ]
geneInfo <- as.data.frame(geneInfo)
geneInfo$geneLength <- as.numeric(as.character(geneInfo$geneLength))
geneInfo$gcContent <- as.numeric(as.character(geneInfo$gcContent))
if(any(grepl("\\|",rownames(tabDF)))){
tmp <- as.character(rownames(tabDF))
geneNames <- stringr::str_split(tmp, "\\|",simplify = T)
tmp <- which(geneNames[, 1] == "?")
geneNames[tmp, 1] <- geneNames[tmp, 2]
tmp <- table(geneNames[, 1])
tmp <- which(geneNames[, 1] %in% names(tmp[which(tmp > 1)]))
geneNames[tmp, 1] <- paste(geneNames[tmp, 1], geneNames[tmp, 2], sep = ".")
tmp <- table(geneNames[, 1])
rownames(tabDF) <- geneNames[, 1]
} else if(any(grepl("ENSG",rownames(tabDF)))){
rownames(tabDF) <- gsub("\\.[0-9]*","",rownames(tabDF))
}
if (method == "gcContent") {
rawCounts <- tabDF
commonGenes <- intersect(rownames(geneInfo), rownames(rawCounts))
geneInfo <- geneInfo[commonGenes, ]
rawCounts <- rawCounts[commonGenes, ]
timeEstimated <- format(ncol(tabDF) * nrow(tabDF) / 80000, digits = 2)
message(
messageEstimation <- paste(
"I Need about ",
timeEstimated,
"seconds for this Complete Normalization Upper Quantile",
" [Processing 80k elements /s] "
)
)
ffData <- as.data.frame(geneInfo)
rawCounts <- floor(rawCounts)
message("Step 1 of 4: newSeqExpressionSet ...")
tmp <- EDASeq::newSeqExpressionSet(as.matrix(rawCounts), featureData = ffData)
#fData(tmp)[, "gcContent"] <- as.numeric(geneInfo[, "gcContent"])
message("Step 2 of 4: withinLaneNormalization ...")
tmp <- EDASeq::withinLaneNormalization(
tmp, "gcContent",
which = "upper",
offset = TRUE
)
message("Step 3 of 4: betweenLaneNormalization ...")
if(any(is.na(EDASeq::normCounts(tmp)))) {
tmp <- tmp[rowSums(is.na(EDASeq::normCounts(tmp))) == 0,]
}
tmp <- EDASeq::betweenLaneNormalization(
tmp,
which = "upper",
offset = TRUE
)
normCounts <- log(rawCounts[rownames(tmp),] + .1) + EDASeq::offst(tmp)
normCounts <- floor(exp(normCounts) - .1)
message("Step 4 of 4: .quantileNormalization ...")
tmp <- t(.quantileNormalization(t(normCounts)))
tabDF_norm <- floor(tmp)
}
if (method == "geneLength") {
tabDF <- tabDF[!duplicated(rownames(tabDF)), !duplicated(colnames(tabDF))]
tabDF <- tabDF[rownames(tabDF) %in% rownames(geneInfo), ]
#tabDF <- tabDF[rowMeans(tabDF) > 1,]
tabDF <- tabDF[which(rowSums(tabDF == 0) < ncol(tabDF)),]
tabDF <- as.matrix(tabDF)
geneInfo <- geneInfo[rownames(geneInfo) %in% rownames(tabDF),]
geneInfo <- geneInfo[!duplicated(rownames(geneInfo)),]
toKeep <- which(geneInfo[, "geneLength"] != 0)
geneInfo <- geneInfo[toKeep,]
tabDF <- tabDF[toKeep,]
geneInfo <- as.data.frame(geneInfo)
tabDF <- round(tabDF)
commonGenes <- intersect(rownames(tabDF), rownames(geneInfo))
tabDF <- tabDF[commonGenes, ]
geneInfo <- geneInfo[commonGenes, ]
timeEstimated <- format(ncol(tabDF) * nrow(tabDF) / 80000, digits = 2)
message(
messageEstimation <- paste(
"I Need about ",
timeEstimated,
"seconds for this Complete Normalization Upper Quantile",
" [Processing 80k elements /s] "
)
)
message("Step 1 of 4: newSeqExpressionSet ...")
tabDF_norm <- EDASeq::newSeqExpressionSet(
tabDF,
featureData = geneInfo
)
message("Step 2 of 4: withinLaneNormalization ...")
tabDF_norm <- EDASeq::withinLaneNormalization(
tabDF_norm,
"geneLength",
which = "upper",
offset = FALSE
)
message("Step 3 of 4: betweenLaneNormalization ...")
if(any(is.na(EDASeq::normCounts(tabDF_norm)))) {
tabDF_norm <- tabDF_norm[rowSums(is.na(EDASeq::normCounts(tabDF_norm))) == 0,]
}
tabDF_norm <- EDASeq::betweenLaneNormalization(
tabDF_norm,
which = "upper",
offset = FALSE
)
message("Step 4 of 4: exprs ...")
tabDF_norm <- EDASeq::counts(tabDF_norm)
}
# In case NA's were produced to all rows
if(any(rowSums(is.na(tabDF_norm)) == ncol(tabDF_norm))){
tabDF_norm <- tabDF_norm[rowSums(is.na(tabDF_norm)) != ncol(tabDF_norm),]
}
return(tabDF_norm)
}
#' @title Differential expression analysis (DEA) using edgeR or limma package.
#' @description
#' TCGAanalyze_DEA allows user to perform Differentially expression analysis (DEA),
#' using edgeR package or limma to identify differentially expressed genes (DEGs).
#' It is possible to do a two-class analysis.
#'
#' TCGAanalyze_DEA performs DEA using following functions from edgeR:
#' \enumerate{
#' \item edgeR::DGEList converts the count matrix into an edgeR object.
#' \item edgeR::estimateCommonDisp each gene gets assigned the same dispersion estimate.
#' \item edgeR::exactTest performs pair-wise tests for differential expression between two groups.
#' \item edgeR::topTags takes the output from exactTest(), adjusts the raw p-values using the
#' False Discovery Rate (FDR) correction, and returns the top differentially expressed genes.
#' }
#' TCGAanalyze_DEA performs DEA using following functions from limma:
#' \enumerate{
#' \item limma::makeContrasts construct matrix of custom contrasts.
#' \item limma::lmFit Fit linear model for each gene given a series of arrays.
#' \item limma::contrasts.fit Given a linear model fit to microarray data, compute estimated coefficients and standard errors for a given set of contrasts.
#' \item limma::eBayes Given a microarray linear model fit, compute moderated t-statistics, moderated F-statistic, and log-odds of differential expression by empirical Bayes moderation of the standard errors towards a common value.
#' \item limma::toptable Extract a table of the top-ranked genes from a linear model fit.
#' }
#' @param mat1 numeric matrix, each row represents a gene,
#' each column represents a sample with Cond1type
#' @param mat2 numeric matrix, each row represents a gene,
#' each column represents a sample with Cond2type
#' @param metadata Add metadata
#' @param Cond1type a string containing the class label of the samples in mat1
#' (e.g., control group)
#' @param Cond2type a string containing the class label of the samples in mat2
#' (e.g., case group)
#' @param pipeline a string to specify which package to use ("limma" or "edgeR")
#' @param method is 'glmLRT' (1) or 'exactTest' (2) used for edgeR
#' (1) Fit a negative binomial generalized log-linear model to
#' the read counts for each gene
#' (2) Compute genewise exact tests for differences in the means between
#' two groups of negative-binomially distributed counts.
#' @param fdr.cut is a threshold to filter DEGs according their p-value corrected
#' @param logFC.cut is a threshold to filter DEGs according their logFC
#' @param batch.factors a vector containing strings to specify options for batch correction. Options are "Plate", "TSS", "Year", "Portion", "Center", and "Patients"
#' @param ClinicalDF a dataframe returned by GDCquery_clinic() to be used to extract year data
#' @param paired boolean to account for paired or non-paired samples. Set to TRUE for paired case
#' @param log.trans boolean to perform log cpm transformation. Set to TRUE for log transformation
#' @param trend boolean to perform limma-trend pipeline. Set to TRUE to go through limma-trend
#' @param MAT matrix containing expression set as all samples in columns and genes as rows. Do not provide if mat1 and mat2 are used
#' @param contrast.formula string input to determine coefficients and to design contrasts in a customized way
#' @param Condtypes vector of grouping for samples in MAT
#' @param voom boolean to perform voom transformation for limma-voom pipeline. Set to TRUE for voom transformation
#' @export
#' @examples
#' dataNorm <- TCGAbiolinks::TCGAanalyze_Normalization(dataBRCA, geneInfo)
#' dataFilt <- TCGAanalyze_Filtering(tabDF = dataBRCA, method = "quantile", qnt.cut = 0.25)
#' samplesNT <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("NT"))
#' samplesTP <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("TP"))
#' dataDEGs <- TCGAanalyze_DEA(
#' mat1 = dataFilt[,samplesNT],
#' mat2 = dataFilt[,samplesTP],
#' Cond1type = "Normal",
#' Cond2type = "Tumor"
#' )
#'
#' @return table with DEGs containing for each gene logFC, logCPM, pValue,and FDR, also for each contrast
TCGAanalyze_DEA <- function (
mat1,
mat2,
metadata = TRUE,
Cond1type,
Cond2type,
pipeline = "edgeR",
method = "exactTest",
fdr.cut = 1,
logFC.cut = 0,
batch.factors = NULL,
ClinicalDF = data.frame(),
paired = FALSE,
log.trans = FALSE,
voom = FALSE,
trend = FALSE,
MAT = data.frame(),
contrast.formula = "",
Condtypes = c()
){
if(pipeline == "limma") check_package("limma")
if(pipeline == "edgeR") check_package("edgeR")
table.code <- c(
"TP",
"TR",
"TB",
"TRBM",
"TAP",
"TM",
"TAM",
"THOC",
"TBM",
"NB",
"NT",
"NBC",
"NEBV",
"NBM",
"CELLC",
"TRB",
"CELL",
"XP",
"XCL"
)
names(table.code) <- c(
"01",
"02",
"03",
"04",
"05",
"06",
"07",
"08",
"09",
"10",
"11",
"12",
"13",
"14",
"20",
"40",
"50",
"60",
"61"
)
if (nrow(MAT) == 0) {
TOC <- cbind(mat1, mat2)
Cond1num <- ncol(mat1)
Cond2num <- ncol(mat2)
} else {
TOC <- MAT
}
if (metadata == TRUE & all(grepl("TCGA",colnames(TOC)))){
my_IDs <- get_IDs(TOC)
Plate <- factor(my_IDs$plate)
Condition <- factor(my_IDs$condition)
TSS <- factor(my_IDs$tss)
Portion <- factor(my_IDs$portion)
Center <- factor(my_IDs$center)
Patients <- factor(my_IDs$patient)
}
# this makes non-sense for non-TCGA data
if (paired == TRUE) {
matched.query <- TCGAquery_MatchedCoupledSampleTypes(
my_IDs$barcode,
table.code[unique(my_IDs$sample)]
)
my_IDs <- subset(my_IDs, barcode == matched.query)
TOC <- TOC[, (names(TOC) %in% matched.query)]
}
if (nrow(ClinicalDF) > 0) {
names(ClinicalDF)[names(ClinicalDF) == "bcr_patient_barcode"] <- "patient"
ClinicalDF$age_at_diag_year <- floor(clinical$age_at_diagnosis / 365)
ClinicalDF$diag_year <- ClinicalDF$age_at_diag_year + clinical$year_of_birth
diag_yearDF <- ClinicalDF[, c("patient", "diag_year")]
my_IDs <- merge(my_IDs, ClinicalDF, by = "patient")
Year <- as.factor(my_IDs$diag_year)
}
options <- c("Plate", "TSS", "Year", "Portion", "Center", "Patients")
if (length(batch.factors) == 0) {
message("Batch correction skipped since no factors provided")
} else {
for (o in batch.factors) {
if (o %in% options == FALSE)
stop(paste0(o, " is not a valid batch correction factor"))
if (o == "Year" & nrow(ClinicalDF) == 0)
stop(
"batch correction using diagnosis year needs clinical info. Provide Clinical Data in arguments"
)
}
}
additiveformula <- paste(batch.factors, collapse = "+")
message("----------------------- DEA -------------------------------")
if (nrow(MAT) == 0) {
message("o ",Cond1num," samples in Cond1type ",Cond1type)
message("o ",Cond2num," samples in Cond2type ",Cond2type)
message("o ", nrow(TOC), " features as miRNA or genes ")
} else {
message("o ", nrow(TOC), " features as miRNA or genes ")
}
message("This may take some minutes...")
colnames(TOC) <- paste0("s", 1:ncol(TOC))
if (length(Condtypes) > 0) {
tumorType <- factor(x = Condtypes, levels = unique(Condtypes))
} else {
tumorType <- factor(
x = rep(c(Cond1type, Cond2type), c(Cond1num, Cond2num)),
levels = c(Cond1type, Cond2type)
)
}
if (length(batch.factors) == 0 & length(Condtypes) > 0) {
if (pipeline == "edgeR")
design <- model.matrix( ~ tumorType)
else
design <- model.matrix( ~ 0 + tumorType)
} else if (length(batch.factors) == 0 & length(Condtypes) == 0) {
if (pipeline == "edgeR")
design <- model.matrix( ~ tumorType)
else
design <- model.matrix( ~ 0 + tumorType)
} else if (length(batch.factors) > 0 & length(Condtypes) == 0) {
if (pipeline == "edgeR")
formula <- paste0("~tumorType+", additiveformula)
else
formula <- paste0("~0+tumorType+", additiveformula)
design <- model.matrix(eval(parse(text = formula)))
} else if (length(batch.factors) > 0 & length(Condtypes) > 0) {
if (pipeline == "edgeR") {
formula <- paste0("~tumorType+", additiveformula)
if (length(Condtypes) > 2)
formula <- paste0("~0+tumorType+", additiveformula)
} else {
formula <- paste0("~0+tumorType+", additiveformula)
}
design <- model.matrix(eval(parse(text = formula)))
}
if (pipeline == "edgeR") {
if (method == "exactTest") {
DGE <- edgeR::DGEList(TOC, group = rep(c(Cond1type, Cond2type), c(Cond1num, Cond2num)))
disp <- edgeR::estimateCommonDisp(DGE)
tested <- edgeR::exactTest(disp, pair = c(Cond1type, Cond2type))
logFC_table <- tested$table
tableDEA <- edgeR::topTags(tested, n = nrow(tested$table))$table
tableDEA <- tableDEA[tableDEA$FDR <= fdr.cut,]
tableDEA <- tableDEA[abs(tableDEA$logFC) >= logFC.cut,]
}
else if (method == "glmLRT") {
if (length(unique(tumorType)) == 2) {
aDGEList <- edgeR::DGEList(counts = TOC, group = tumorType)
aDGEList <- edgeR::estimateGLMCommonDisp(aDGEList, design)
aDGEList <- edgeR::estimateGLMTagwiseDisp(aDGEList, design)
aGlmFit <- edgeR::glmFit(
aDGEList,
design,
dispersion = aDGEList$tagwise.dispersion,
prior.count.total = 0
)
aGlmLRT <- edgeR::glmLRT(aGlmFit, coef = 2)
tableDEA <- cbind(aGlmLRT$table, FDR = p.adjust(aGlmLRT$table$PValue, "fdr"))
tableDEA <- tableDEA[tableDEA$FDR < fdr.cut, ]
tableDEA <- tableDEA[abs(tableDEA$logFC) > logFC.cut, ]
if (all(grepl("ENSG", rownames(tableDEA)))){
tableDEA <- cbind(tableDEA, map.ensg(genes = rownames(tableDEA))[, c("gene_name","gene_type")])
}
} else if (length(unique(tumorType)) > 2) {
aDGEList <- edgeR::DGEList(counts = TOC, group = tumorType)
colnames(design)[1:length(levels(tumorType))] <- levels(tumorType)
prestr = "makeContrasts("
poststr = ",levels=design)"
commandstr = paste(prestr, contrast.formula, poststr, sep = "")
commandstr = paste0("limma::", commandstr)
cont.matrix <- eval(parse(text = commandstr))
aDGEList <- edgeR::estimateGLMCommonDisp(aDGEList, design)
aDGEList <- edgeR::estimateGLMTagwiseDisp(aDGEList, design)
aGlmFit <- edgeR::glmFit(
aDGEList,
design,
dispersion = aDGEList$tagwise.dispersion,
prior.count.total = 0
)
tableDEA <- list()
for (mycoef in colnames(cont.matrix)) {
message(paste0("DEA for", " :", mycoef))
aGlmLRT <- edgeR::glmLRT(aGlmFit, contrast = cont.matrix[, mycoef])
tt <- aGlmLRT$table
tt <- cbind(tt, FDR = p.adjust(aGlmLRT$table$PValue,"fdr"))
tt <- tt[(tt$FDR < fdr.cut & abs(as.numeric(tt$logFC)) > logFC.cut),]
tableDEA[[as.character(mycoef)]] <- tt
if (all(grepl("ENSG", rownames(tableDEA[[as.character(mycoef)]]))))
tableDEA[[as.character(mycoef)]] <-
cbind(tableDEA[[as.character(mycoef)]],
map.ensg(genes = rownames(tableDEA[[as.character(mycoef)]]))[,2:3])
}
}
} else {
stop(
paste0(
method,
" is not a valid DEA method option. Choose 'exactTest' or 'glmLRT' "
)
)
}
} else if (pipeline == "limma") {
if (log.trans == TRUE){
logCPM <- edgeR::cpm(TOC, log = TRUE, prior.count = 3)
} else {
logCPM <- TOC
}
if (voom == TRUE) {
message("Voom Transformation...")
logCPM <- limma::voom(logCPM, design)
}
if (length(unique(tumorType)) == 2) {
colnames(design)[1:2] <- c(Cond1type, Cond2type)
contr <- paste0(Cond2type, "-", Cond1type)
cont.matrix <- limma::makeContrasts(contrasts = contr, levels = design)
fit <- limma::lmFit(logCPM, design)
fit <- limma::contrasts.fit(fit, cont.matrix)
if (trend == TRUE) {
fit <- limma::eBayes(fit, trend = TRUE)
} else {
fit <- limma::eBayes(fit, trend = FALSE)
}
tableDEA <- limma::topTable(
fit,
coef = 1,
adjust.method = "fdr",
number = nrow(TOC)
)
limma::volcanoplot(fit, highlight = 10)
index <- which(tableDEA[, 4] < fdr.cut)
tableDEA <- tableDEA[index,]
neg_logFC.cut <- -1 * logFC.cut
index <- which(abs(as.numeric(tableDEA$logFC)) > logFC.cut)
tableDEA <- tableDEA[index,]
}
else if (length(unique(tumorType)) > 2) {
DGE <- edgeR::DGEList(TOC, group = tumorType)
colnames(design)[1:length(levels(tumorType))] <- levels(tumorType)
prestr = "makeContrasts("
poststr = ",levels=colnames(design))"
commandstr = paste(prestr, contrast.formula, poststr, sep = "")
commandstr = paste0("limma::", commandstr)
cont.matrix <- eval(parse(text = commandstr))
fit <- limma::lmFit(logCPM, design)
fit <- limma::contrasts.fit(fit, cont.matrix)
if (trend == TRUE){
fit <- limma::eBayes(fit, trend = TRUE)
} else {
fit <- limma::eBayes(fit, trend = FALSE)
}
tableDEA <- list()
for (mycoef in colnames(cont.matrix)) {
tableDEA[[as.character(mycoef)]] <- limma::topTable(
fit,
coef = mycoef,
adjust.method = "fdr",
number = nrow(MAT)
)
message(paste0("DEA for", " :", mycoef))
tempDEA <- tableDEA[[as.character(mycoef)]]
index.up <- which(tempDEA$adj.P.Val < fdr.cut & abs(as.numeric(tempDEA$logFC)) > logFC.cut)
tableDEA[[as.character(mycoef)]] <- tempDEA[index.up,]
if (all(grepl("ENSG", rownames(tableDEA[[as.character(mycoef)]]))))
tableDEA[[as.character(mycoef)]] <-
cbind(tableDEA[[as.character(mycoef)]],
map.ensg(genes = rownames(tableDEA[[as.character(mycoef)]]))[, 2:3])
}
}
} else {
stop(paste0(
pipeline,
" is not a valid pipeline option. Choose 'edgeR' or 'limma'"
))
}
message("----------------------- END DEA -------------------------------")
return(tableDEA)
}
#' @title Batch correction using ComBat and Voom transformation using limma package.
#' @description
#' TCGAbatch_correction allows user to perform a Voom correction on gene expression data and have it ready for DEA.
#' One can also use ComBat for batch correction for exploratory analysis. If batch.factor or adjustment argument is "Year"
#' please provide clinical data. If no batch factor is provided, the data will be voom corrected only
#'
#' TCGAanalyze_DEA performs DEA using following functions from sva and limma:
#' \enumerate{
#' \item limma::voom Transform RNA-Seq Data Ready for Linear Modelling.
#' \item sva::ComBat Adjust for batch effects using an empirical Bayes framework.
#' }
#' @param tabDF numeric matrix, each row represents a gene,
#' each column represents a sample
#' @param batch.factor a string containing the batch factor to use for correction.
#' Options are "Plate", "TSS", "Year", "Portion", "Center"
#' @param adjustment vector containing strings for factors to adjust for using ComBat.
#' Options are "Plate", "TSS", "Year", "Portion", "Center"
#' @param UnpublishedData if TRUE perform a batch correction after adding new data
#' @param ClinicalDF a dataframe returned by GDCquery_clinic() to be used to extract year data
#' @param AnnotationDF a dataframe with column Batch indicating different batches of the samples in the tabDF
#' @export
#' @return data frame with ComBat batch correction applied
TCGAbatch_Correction <- function (
tabDF,
batch.factor = NULL,
adjustment = NULL,
ClinicalDF = data.frame(),
UnpublishedData = FALSE,
AnnotationDF = data.frame()
){
check_package("sva")
# code for non-TCGA samples
if (UnpublishedData == TRUE) {
if(!"Batch" %in% colnames(AnnotationDF)) {
stop("AnnotationDF should have a Batch column")
} else {
batch.factor <- as.factor(AnnotationDF$Batch)
}
if(!"Condition" %in% colnames(AnnotationDF)) {
mod <- model.matrix(~as.factor(Condition),data = AnnotationDF)
} else {
mod <- NULL
}
batch_corr <- sva::ComBat(
dat = tabDF,
batch = batch.factor,
mod = mod,
par.prior = TRUE,
prior.plots = TRUE
)
}
if (UnpublishedData == FALSE) {
if (length(batch.factor) == 0 & length(adjustment) == 0)
message("batch correction will be skipped")
else if (batch.factor %in% adjustment) {
stop(paste0("Cannot adjust and correct for the same factor|"))
}
my_IDs <- get_IDs(tabDF)
if (length(batch.factor) > 0 || length(adjustment) > 0)
if ((nrow(ClinicalDF) > 0 & batch.factor == "Year") ||
("Year" %in% adjustment == TRUE & nrow(ClinicalDF) >
0)) {
names(ClinicalDF)[names(ClinicalDF) == "bcr_patient_barcode"] <-
"patient"
ClinicalDF$age_at_diag_year <-
floor(ClinicalDF$age_at_diagnosis / 365)
ClinicalDF$diag_year <-
ClinicalDF$age_at_diag_year +
ClinicalDF$year_of_birth
diag_yearDF <- ClinicalDF[, c("patient", "diag_year")]
Year <- merge(my_IDs, diag_yearDF, by = "patient")
Year <- Year$diag_year
Year <- as.factor(Year)
}
else if (nrow(ClinicalDF) == 0 & batch.factor == "Year") {
stop("Cannot extract Year data. Clinical data was not provided")
}
Plate <- as.factor(my_IDs$plate)
Condition <- as.factor(my_IDs$condition)
TSS <- as.factor(my_IDs$tss)
Portion <- as.factor(my_IDs$portion)
Sequencing.Center <- as.factor(my_IDs$center)
design.mod.combat <- model.matrix( ~ Condition)
options <- c("Plate", "TSS", "Year", "Portion", "Sequencing Center")
if (length(batch.factor) > 1)
stop("Combat can only correct for one batch variable. Provide one batch factor")
if (batch.factor %in% options == FALSE)
stop(paste0(o, " is not a valid batch correction factor"))
for (o in adjustment) {
if (o %in% options == FALSE)
stop(paste0(o, " is not a valid adjustment factor"))
}
adjustment.data <- c()
for (a in adjustment) {
if (a == "Sequencing Center")
a <- Sequencing.Center
adjustment.data <-
cbind(eval(parse(text = a)), adjustment.data)
}
if (batch.factor == "Sequencing Center")
batch.factor <- Sequencing.Center
batchCombat <- eval(parse(text = batch.factor))
if (length(adjustment) > 0) {
adjustment.formula <- paste(adjustment, collapse = "+")
adjustment.formula <- paste0("+", adjustment.formula)
adjustment.formula <- paste0("~Condition", adjustment.formula)
print(adjustment.formula)
model <- data.frame(batchCombat, row.names = colnames(tabDF))
design.mod.combat <- model.matrix(
eval(parse(text = adjustment.formula)),
data = model
)
}
print(unique(batchCombat))
batch_corr <- sva::ComBat(
dat = tabDF,
batch = batchCombat,
mod = design.mod.combat,
par.prior = TRUE,
prior.plots = TRUE
)
}
return(batch_corr)
}
##Function to take raw counts by removing rows filtered after norm and filter process###
#' @title Use raw count from the DataPrep object which genes are removed by normalization and filtering steps.
#' @description function to keep raw counts after filtering and/or normalizing.
#' @param DataPrep DataPrep object returned by TCGAanalyze_Preprocessing()
#' @param DataFilt Filtered data frame containing samples in columns and genes in rows after normalization and/or filtering steps
#' @examples
#' \dontrun{
#' dataPrep_raw <- UseRaw_afterFilter(dataPrep, dataFilt)
#' }
#' @export
#' @return Filtered return object similar to DataPrep with genes removed after normalization and filtering process.
UseRaw_afterFilter <- function(DataPrep, DataFilt) {
rownames(DataPrep) <-
lapply(rownames(DataPrep), function(x)
gsub("[[:punct:]]\\d*", "", x))
filtered.list <- setdiff(rownames(DataPrep), rownames(DataFilt))
Res <- DataPrep[!rownames(DataPrep) %in% filtered.list,]
return(Res)
}
#' @title Adding information related to DEGs genes from DEA as mean values in two conditions.
#' @description
#' TCGAanalyze_LevelTab allows user to add information related to DEGs genes from
#' Differentially expression analysis (DEA) such as mean values and in two conditions.
#' @param FC_FDR_table_mRNA Output of dataDEGs filter by abs(LogFC) >=1
#' @param typeCond1 a string containing the class label of the samples
#' in TableCond1 (e.g., control group)
#' @param typeCond2 a string containing the class label of the samples
#' in TableCond2 (e.g., case group)
#' @param TableCond1 numeric matrix, each row represents a gene, each column
#' represents a sample with Cond1type
#' @param TableCond2 numeric matrix, each row represents a gene, each column
#' represents a sample with Cond2type
#' @param typeOrder typeOrder
#' @export
#' @return table with DEGs, log Fold Change (FC), false discovery rate (FDR),
#' the gene expression level
#' for samples in Cond1type, and Cond2type, and Delta value (the difference
#' of gene expression between the two
#' conditions multiplied logFC)
#' @examples
#' dataNorm <- TCGAbiolinks::TCGAanalyze_Normalization(dataBRCA, geneInfo)
#' dataFilt <- TCGAanalyze_Filtering(tabDF = dataBRCA, method = "quantile", qnt.cut = 0.25)
#' samplesNT <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("NT"))
#' samplesTP <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("TP"))
#' dataDEGs <- TCGAanalyze_DEA(
#' dataFilt[,samplesNT],
#' dataFilt[,samplesTP],
#' Cond1type = "Normal",
#' Cond2type = "Tumor"
#' )
#' dataDEGsFilt <- dataDEGs[abs(dataDEGs$logFC) >= 1,]
#' dataTP <- dataFilt[,samplesTP]
#' dataTN <- dataFilt[,samplesNT]
#' dataDEGsFiltLevel <- TCGAanalyze_LevelTab(
#' FC_FDR_table_mRNA = dataDEGsFilt,
#' typeCond1 = "Tumor",
#' typeCond2 = "Normal",
#' TableCond1 = dataTP,
#' TableCond2 = dataTN
#' )
TCGAanalyze_LevelTab <- function(
FC_FDR_table_mRNA,
typeCond1,
typeCond2,
TableCond1,
TableCond2,
typeOrder = TRUE
) {
TableLevel <- data.frame(
"mRNA" = rownames(FC_FDR_table_mRNA),
"logFC" = FC_FDR_table_mRNA$logFC,
"FDR" = FC_FDR_table_mRNA$FDR,
"Delta" = FC_FDR_table_mRNA$logFC * rowMeans(TableCond1[rownames(FC_FDR_table_mRNA),],na.rm = TRUE)
)
TableLevel[[typeCond1]] <- rowMeans(TableCond1[TableLevel$mRNA,],na.rm = TRUE)
TableLevel[[typeCond2]] <- rowMeans(TableCond2[TableLevel$mRNA,],na.rm = TRUE)
TableLevel <- TableLevel[order(as.numeric(TableLevel[, "Delta"]), decreasing = typeOrder),]
rownames(TableLevel) <- TableLevel[, "mRNA"]
if (all(grepl("ENSG", rownames(TableLevel))))
TableLevel <- cbind(TableLevel, map.ensg(genes = rownames(TableLevel))[, 2:3])
return(TableLevel)
}
#' @title Enrichment analysis for Gene Ontology (GO) [BP,MF,CC] and Pathways
#' @description
#' Researchers, in order to better understand the underlying biological
#' processes, often want to retrieve a functional profile of a set of genes
#' that might have an important role. This can be done by performing an
#' enrichment analysis.
#'
#'We will perform an enrichment analysis on gene sets using the TCGAanalyze_EAcomplete
#'function. Given a set of genes that are
#'up-regulated under certain conditions, an enrichment analysis will find
#'identify classes of genes or proteins that are #'over-represented using
#'annotations for that gene set.
#' @param TFname is the name of the list of genes or TF's regulon.
#' @param RegulonList List of genes such as TF's regulon or DEGs where to find enrichment.
#' @export
#' @return Enrichment analysis GO[BP,MF,CC] and Pathways complete table enriched by genelist.
#' @examples
#' Genelist <- c("FN1","COL1A1")
#' ansEA <- TCGAanalyze_EAcomplete(TFname="DEA genes Normal Vs Tumor",Genelist)
#' \dontrun{
#' Genelist <- rownames(dataDEGsFiltLevel)
#' system.time(ansEA <- TCGAanalyze_EAcomplete(TFname="DEA genes Normal Vs Tumor",Genelist))
#' }
TCGAanalyze_EAcomplete <- function(TFname, RegulonList) {
# This is a verification of the input
# in case the List is like Gene|ID
# we will get only the Gene
if (all(grepl("\\|", RegulonList))) {
RegulonList <- strsplit(RegulonList, "\\|")
RegulonList <- unlist(lapply(RegulonList, function(x)
x[1]))
}
print(
paste(
"I need about ",
"1 minute to finish complete ",
"Enrichment analysis GO[BP,MF,CC] and Pathways... "
)
)
ResBP <- TCGAanalyze_EA(TFname, RegulonList, DAVID_BP_matrix,
EAGenes, GOtype = "DavidBP")
print("GO Enrichment Analysis BP completed....done")
ResMF <- TCGAanalyze_EA(TFname, RegulonList, DAVID_MF_matrix,
EAGenes, GOtype = "DavidMF")
print("GO Enrichment Analysis MF completed....done")
ResCC <- TCGAanalyze_EA(TFname, RegulonList, DAVID_CC_matrix,
EAGenes, GOtype = "DavidCC")
print("GO Enrichment Analysis CC completed....done")
ResPat <- TCGAanalyze_EA(TFname, RegulonList, listEA_pathways,
EAGenes, GOtype = "Pathway")
print("Pathway Enrichment Analysis completed....done")
ans <-
list(
ResBP = ResBP,
ResMF = ResMF,
ResCC = ResCC,
ResPat = ResPat
)
return(ans)
}
#' @title Enrichment analysis of a gene-set with GO [BP,MF,CC] and pathways.
#' @description
#' The rational behind a enrichment analysis ( gene-set, pathway etc) is to compute
#' statistics of whether the overlap between the focus list (signature) and the gene-set
#' is significant. ie the confidence that overlap between the list is not due to chance.
#' The Gene Ontology project describes genes (gene products) using terms from
#' three structured vocabularies: biological process, cellular component and molecular function.
#' The Gene Ontology Enrichment component, also referred to as the GO Terms" component, allows
#' the genes in any such "changed-gene" list to be characterized using the Gene Ontology terms
#' annotated to them. It asks, whether for any particular GO term, the fraction of genes
#' assigned to it in the "changed-gene" list is higher than expected by chance
#' (is over-represented), relative to the fraction of genes assigned to that term in the
#' reference set.
#' In statistical terms it perform the analysis tests the null hypothesis that,
#' for any particular ontology term, there is no difference in the proportion of genes
#' annotated to it in the reference list and the proportion annotated to it in the test list.
#' We adopted a Fisher Exact Test to perform the EA.
#' @param GeneName is the name of gene signatures list
#' @param TableEnrichment is a table related to annotations of gene symbols such as
#' GO[BP,MF,CC] and Pathways. It was created from DAVID gene ontology on-line.
#' @param RegulonList is a gene signature (lisf of genes) in which perform EA.
#' @param GOtype is type of gene ontology Biological process (BP), Molecular Function (MF),
#' Cellular componet (CC)
#' @param FDRThresh pvalue corrected (FDR) as threshold to selected significant
#' BP, MF,CC, or pathways. (default FDR < 0.01)
#' @param EAGenes is a table with informations about genes
#' such as ID, Gene, Description, Location and Family.
#' @param GeneSymbolsTable if it is TRUE will return a table with GeneSymbols in common GO or pathways.
# @export
#' @import stats
#' @return Table with enriched GO or pathways by selected gene signature.
#' @examples
#' \dontrun{
#' EAGenes <- get("EAGenes")
#' RegulonList <- rownames(dataDEGsFiltLevel)
#' ResBP <- TCGAanalyze_EA(
#' GeneName="DEA genes Normal Vs Tumor",
#' RegulonList = RegulonList,
#' TableEnrichment = DAVID_BP_matrix,
#' EAGenes = EAGenes,
#' GOtype = "DavidBP"
#' )
#'}
TCGAanalyze_EA <- function (
GeneName,
RegulonList,
TableEnrichment,
EAGenes,
GOtype,
FDRThresh = 0.01,
GeneSymbolsTable = FALSE
) {
topPathways <- nrow(TableEnrichment)
topPathways_tab <- matrix(0, 1, topPathways)
topPathways_tab <- as.matrix(topPathways_tab)
rownames(topPathways_tab) <- GeneName
rownames(EAGenes) <- toupper(rownames(EAGenes))
EAGenes <- EAGenes[!duplicated(EAGenes[, "ID"]),]
rownames(EAGenes) <- EAGenes[, "ID"]
allgene <- EAGenes[, "ID"]
current_pathway_from_EA <- as.matrix(TableEnrichment[, GOtype])
TableNames <- gsub("David",
"",
paste("Top ", GOtype, " n. ",
1:topPathways, " of ", topPathways, sep = ""))
colnames(topPathways_tab) <- TableNames
topPathways_tab <- as.data.frame(topPathways_tab)
table_pathway_enriched <- matrix(1, nrow(current_pathway_from_EA), 8)
colnames(table_pathway_enriched) <- c(
"Pathway",
"GenesInPathway",
"Pvalue",
"FDR",
"CommonGenesPathway",
"PercentPathway",
"PercentRegulon",
"CommonGeneSymbols"
)
table_pathway_enriched <- as.data.frame(table_pathway_enriched)
for (i in 1:nrow(current_pathway_from_EA)) {
table_pathway_enriched[i, "Pathway"] <-
as.character(current_pathway_from_EA[i,])
if (nrow(TableEnrichment) == 589) {
genes_from_current_pathway_from_EA <-
GeneSplitRegulon(TableEnrichment[TableEnrichment[GOtype] ==
as.character(current_pathway_from_EA[i,]),][,
"Molecules"], ",")
}
else {
genes_from_current_pathway_from_EA <-
GeneSplitRegulon(TableEnrichment[TableEnrichment[GOtype] ==
as.character(current_pathway_from_EA[i,]),][,
"Molecules"], ", ")
}
genes_common_pathway_TFregulon <- as.matrix(intersect(
toupper(RegulonList),
toupper(genes_from_current_pathway_from_EA)
))
if (length(genes_common_pathway_TFregulon) != 0) {
current_pathway_commongenes_num <-
length(genes_common_pathway_TFregulon)
seta <- allgene %in% RegulonList
setb <- allgene %in% genes_from_current_pathway_from_EA
ft <- fisher.test(seta, setb)
FisherpvalueTF <- ft$p.value
table_pathway_enriched[i, "Pvalue"] <-
as.numeric(FisherpvalueTF)
if (FisherpvalueTF < 0.01) {
current_pathway_commongenes_percent <- paste(
"(",
format((
current_pathway_commongenes_num / length(genes_from_current_pathway_from_EA)
) *
100,
digits = 2
), "%)"
)
current_pathway_commongenes_num_with_percent <-
gsub(
" ",
"",
paste(
current_pathway_commongenes_num,
current_pathway_commongenes_percent,
"pv=",
format(FisherpvalueTF, digits = 2)
)
)
table_pathway_enriched[i, "CommonGenesPathway"] <-
length(genes_common_pathway_TFregulon)
table_pathway_enriched[i, "CommonGeneSymbols"] <-
paste0(genes_common_pathway_TFregulon, collapse = ";")
table_pathway_enriched[i, "GenesInPathway"] <-
length(genes_from_current_pathway_from_EA)
table_pathway_enriched[i, "PercentPathway"] <-
as.numeric(table_pathway_enriched[i,
"CommonGenesPathway"]) /
as.numeric(table_pathway_enriched[i,
"GenesInPathway"]) * 100
table_pathway_enriched[i, "PercentRegulon"] <-
as.numeric(table_pathway_enriched[i,
"CommonGenesPathway"]) /
length(RegulonList) *
100
}
}
}
table_pathway_enriched <-
table_pathway_enriched[order(table_pathway_enriched[,
"Pvalue"], decreasing = FALSE),]
table_pathway_enriched <-
table_pathway_enriched[table_pathway_enriched[,
"Pvalue"] < 0.01,]
table_pathway_enriched[, "FDR"] <-
p.adjust(table_pathway_enriched[,
"Pvalue"], method = "fdr")
table_pathway_enriched <-
table_pathway_enriched[table_pathway_enriched[,
"FDR"] < FDRThresh,]
table_pathway_enriched <-
table_pathway_enriched[order(table_pathway_enriched[,
"FDR"], decreasing = FALSE),]
table_pathway_enriched_filt <-
table_pathway_enriched[table_pathway_enriched$FDR < FDRThresh, ]
if (nrow(table_pathway_enriched) > 0) {
tmp <- table_pathway_enriched
tmp <- paste(
tmp[, "Pathway"],
"; FDR= ",
format(tmp[,
"FDR"], digits = 3),
"; (ng=",
round(tmp[, "GenesInPathway"]),
"); (ncommon=",
format(tmp[, "CommonGenesPathway"],
digits = 2),
")",
sep = ""
)
tmp <- as.matrix(tmp)
topPathways_tab <-
topPathways_tab[, 1:nrow(table_pathway_enriched),
drop = FALSE]
topPathways_tab[1,] <- tmp
}
else {
topPathways_tab <- NA
}
if (GeneSymbolsTable == FALSE) {
return(topPathways_tab)
}
if (GeneSymbolsTable == TRUE) {
return(table_pathway_enriched_filt)
}
}
#' @title Differentially expression analysis (DEA) using limma package.
#' @description Differentially expression analysis (DEA) using limma package.
#' @param FC.cut write
#' @param AffySet A matrix-like data object containing log-ratios or log-expression values
#' for a series of arrays, with rows corresponding to genes and columns to samples
#' @examples
#' \dontrun{
#' to add example
#' }
#' @export
#' @return List of list with tables in 2 by 2 comparison
#' of the top-ranked genes from a linear model fitted by DEA's limma
TCGAanalyze_DEA_Affy <- function(AffySet, FC.cut = 0.01) {
if (!requireNamespace("Biobase", quietly = TRUE)) {
stop("Biobase package is needed for this function to work. Please install it.",
call. = FALSE)
}
if (!requireNamespace("limma", quietly = TRUE)) {
stop("limma package is needed for this function to work. Please install it.",
call. = FALSE)
}
Pdatatable <- Biobase::phenoData(AffySet)
f <- factor(Pdatatable$Disease)
groupColors <- names(table(f))
tmp <- matrix(0, length(groupColors), length(groupColors))
colnames(tmp) <- groupColors
rownames(tmp) <- groupColors
tmp[upper.tri(tmp)] <- 1
sample_tab <- Pdatatable
f <- factor(Pdatatable$Disease)
design <- model.matrix( ~ 0 + f)
colnames(design) <- levels(f)
fit <- limma::lmFit(AffySet, design) ## fit is an object of class MArrayLM.
groupColors <- names(table(Pdatatable$Disease))
CompleteList <- vector("list", sum(tmp))
k <- 1
for (i in 1:length(groupColors)) {
col1 <- colnames(tmp)[i]
for (j in 1:length(groupColors)) {
col2 <- rownames(tmp)[j]
if (i != j) {
if (tmp[i, j] != 0) {
Comparison <- paste(col2, "-", col1, sep = "")
if (i == 4 &&
j == 6) {
Comparison <- paste(col1, "-", col2, sep = "")
}
if (i == 5 &&
j == 6) {
Comparison <- paste(col1, "-", col2, sep = "")
}
print(paste(i, j, Comparison, "to do..."))
cont.matrix <- limma::makeContrasts(I = Comparison, levels = design)
fit2 <- limma::contrasts.fit(fit, cont.matrix)
fit2 <- limma::eBayes(fit2)
sigI <- limma::topTable(
fit2,
coef = 1,
adjust.method = "BH",
sort.by = "B",
p.value = 0.05,
lfc = FC.cut,
number = 50000
)
sigIbis <- sigI[order(abs(as.numeric(sigI$logFC)), decreasing = TRUE), ]
names(CompleteList)[k] <- gsub("-", "_", Comparison)
CompleteList[[k]] <- sigIbis
k <- k + 1
}
}
}
}
return(CompleteList)
}
#' @title Generate network
#' @description TCGAanalyze_analyseGRN perform gene regulatory network.
#' @param TFs a vector of genes.
#' @param normCounts is a matrix of gene expression with genes in rows and samples in columns.
#' @param kNum the number of nearest neighbors to consider to estimate the mutual information.
#' Must be less than the number of columns of normCounts.
#' @export
#' @return an adjacent matrix
TCGAanalyze_analyseGRN <- function(TFs, normCounts, kNum) {
if (!requireNamespace("parmigene", quietly = TRUE)) {
stop(
"parmigene package is needed for this function to work. Please install it.",
call. = FALSE
)
}
MRcandidates <- intersect(rownames(normCounts), TFs)
# Mutual information between TF and genes
sampleNames <- colnames(normCounts)
geneNames <- rownames(normCounts)
messageMI_TFgenes <- paste(
"Estimation of MI among [",
length(MRcandidates),
" TRs and ",
nrow(normCounts),
" genes].....",
sep = ""
)
timeEstimatedMI_TFgenes1 <- length(MRcandidates) * nrow(normCounts) / 1000
timeEstimatedMI_TFgenes <- format(timeEstimatedMI_TFgenes1 * ncol(normCounts) / 17000, digits = 2)
messageEstimation <- print(
paste(
"I Need about ",
timeEstimatedMI_TFgenes,
"seconds for this MI estimation. [Processing 17000k elements /s] "
)
)
miTFGenes <- parmigene::knnmi.cross(normCounts[MRcandidates,], normCounts, k = kNum)
return(miTFGenes)
}
#' @title Generate pathview graph
#' @description TCGAanalyze_Pathview pathway based data integration and visualization.
#' @param dataDEGs dataDEGs
#' @param pathwayKEGG pathwayKEGG
#' @export
#' @return an adjacent matrix
#' @examples
#' \dontrun{
#' dataDEGs <- data.frame(mRNA = c("TP53","TP63","TP73"), logFC = c(1,2,3))
#' TCGAanalyze_Pathview(dataDEGs)
#' }
TCGAanalyze_Pathview <-
function(dataDEGs, pathwayKEGG = "hsa05200") {
if (!requireNamespace("clusterProfiler", quietly = TRUE)) {
stop("clusterProfiler needed for this function to work. Please install it.",
call. = FALSE)
}
if (!requireNamespace("pathview", quietly = TRUE)) {
stop("pathview needed for this function to work. Please install it.",
call. = FALSE)
}
# Converting Gene symbol to gene ID
eg = as.data.frame(
clusterProfiler::bitr(
dataDEGs$mRNA,
fromType = "SYMBOL",
toType = "ENTREZID",
OrgDb = "org.Hs.eg.db"
)
)
eg <- eg[!duplicated(eg$SYMBOL), ]
dataDEGs <- dataDEGs[dataDEGs$mRNA %in% eg$SYMBOL, ]
dataDEGs <- dataDEGs[order(dataDEGs$mRNA, decreasing = FALSE), ]
eg <- eg[order(eg$SYMBOL, decreasing = FALSE), ]
dataDEGs$GeneID <- eg$ENTREZID
dataDEGsFiltLevel_sub <-
subset(dataDEGs, select = c("GeneID", "logFC"))
genelistDEGs <- as.numeric(dataDEGsFiltLevel_sub$logFC)
names(genelistDEGs) <- dataDEGsFiltLevel_sub$GeneID
hsa05200 <- pathview::pathview(
gene.data = genelistDEGs,
pathway.id = pathwayKEGG,
species = "hsa",
limit = list(gene = as.integer(max(
abs(genelistDEGs)
)))
)
}
#' @title infer gene regulatory networks
#' @description TCGAanalyze_networkInference taking expression data as input,
#' this will return an adjacency matrix of interactions
#' @param data expression data, genes in columns, samples in rows
#' @param optionMethod inference method, chose from aracne, c3net, clr and mrnet
#' @export
#' @return an adjacent matrix
TCGAanalyze_networkInference <-
function(data, optionMethod = "clr") {
# Converting Gene symbol to gene ID
if (optionMethod == "c3net") {
if (!requireNamespace("c3net", quietly = TRUE)) {
stop(
"c3net package is needed for this function to work. Please install it.",
call. = FALSE
)
}
net <- c3net::c3net(t(data))
} else {
if (!requireNamespace("minet", quietly = TRUE)) {
stop(
"minet package is needed for this function to work. Please install it.",
call. = FALSE
)
}
net <- minet::minet(data, method = optionMethod)
}
return(net)
}
#' Creates a plot for GAIA output (all significant aberrant regions.)
#' @description
#' This function is a auxiliary function to visualize GAIA output
#' (all significant aberrant regions.)
#' @param calls A matrix with the following columns: Chromossome, Aberration Kind
#' Region Start, Region End, Region Size and score
#' @param threshold Score threshold (orange horizontal line in the plot)
#' @export
#' @importFrom graphics abline axis legend plot points
#' @return A plot with all significant aberrant regions.
#' @examples
#' call <- data.frame("Chromossome" = rep(9,100),
#' "Aberration Kind" = rep(c(-2,-1,0,1,2),20),
#' "Region Start [bp]" = 18259823:18259922,
#' "Region End [bp]" = 18259823:18259922,
#' "score" = rep(c(1,2,3,4),25))
#' gaiaCNVplot(call,threshold = 0.01)
#' call <- data.frame("Chromossome" = rep(c(1,9),50),
#' "Aberration Kind" = rep(c(-2,-1,0,1,2),20),
#' "Region Start [bp]" = 18259823:18259922,
#' "Region End [bp]" = 18259823:18259922,
#' "score" = rep(c(1,2,3,4),25))
#' gaiaCNVplot(call,threshold = 0.01)
gaiaCNVplot <- function (calls, threshold = 0.01) {
Calls <- calls[order(calls[, grep("start", colnames(calls), ignore.case = TRUE)]), ]
Calls <- Calls[order(Calls[, grep("chr", colnames(calls), ignore.case = TRUE)]), ]
rownames(Calls) <- NULL
Chromo <- Calls[, grep("chr", colnames(calls), ignore.case = TRUE)]
Gains <- apply(Calls, 1, function(x)
ifelse(x[grep("aberration", colnames(calls), ignore.case = TRUE)] == 1, x["score"], 0))
Losses <- apply(Calls, 1, function(x)
ifelse(x[grep("aberration", colnames(calls), ignore.case = TRUE)] == 0, x["score"], 0))
plot(
Gains,
ylim = c(-max(Calls[, "score"] + 2), max(Calls[, "score"] + 2)),
type = "h",
col = "red",
xlab = "Chromosome",
ylab = "Score",
xaxt = "n"
)
points(-(Losses), type = "h", col = "blue")
# Draw origin line
abline(h = 0, cex = 4)
# Draw threshold lines
abline(
h = -log10(threshold),
col = "orange",
cex = 4,
main = "test"
)
abline(
h = log10(threshold),
col = "orange",
cex = 4,
main = "test"
)
uni.chr <- unique(Chromo)
temp <- rep(0, length(uni.chr))
for (i in 1:length(uni.chr)) {
temp[i] <- max(which(uni.chr[i] == Chromo))
}
for (i in 1:length(temp)) {
abline(v = temp[i],
col = "black",
lty = "dashed")
}
nChroms <- length(uni.chr)
begin <- c()
for (d in 1:nChroms) {
chrom <- sum(Chromo == uni.chr[d])
begin <- append(begin, chrom)
}
temp2 <- rep(0, nChroms)
for (i in 1:nChroms) {
if (i == 1) {
temp2[1] <- (begin[1] * 0.5)
}
else if (i > 1) {
temp2[i] <- temp[i - 1] + (begin[i] * 0.5)
}
}
uni.chr[uni.chr == 23] <- "X"
uni.chr[uni.chr == 24] <- "Y"
for (i in 1:length(temp)) {
axis(1,
at = temp2[i],
labels = uni.chr[i],
cex.axis = 1)
}
legend(
x = 1,
y = max(Calls[, "score"] + 2),
y.intersp = 0.8,
c("Amp"),
pch = 15,
col = c("red"),
text.font = 3
)
legend(
x = 1,
y = -max(Calls[, "score"] + 0.5),
y.intersp = 0.8,
c("Del"),
pch = 15,
col = c("blue"),
text.font = 3
)
}
#' Get a matrix of interactions of genes from biogrid
#' @description
#' Using biogrid database, it will create a matrix of gene interactions.
#' If columns A and row B has value 1, it means the gene A and gene B interacts.
#' @param tmp.biogrid Biogrid table
#' @export
#' @param names.genes List of genes to filter from output. Default: consider all genes
#' @return A matrix with 1 for genes that interacts, 0 for no interaction.
#' @examples
#' names.genes.de <- c("PLCB1","MCL1","PRDX4","TTF2","TACC3", "PARP4","LSM1")
#' tmp.biogrid <- data.frame("Official.Symbol.Interactor.A" = names.genes.de,
#' "Official.Symbol.Interactor.B" = rev(names.genes.de))
#' net.biogrid.de <- getAdjacencyBiogrid(tmp.biogrid, names.genes.de)
#' \dontrun{
#' file <- paste0("http://thebiogrid.org/downloads/archives/",
#' "Release%20Archive/BIOGRID-3.4.133/BIOGRID-ALL-3.4.133.tab2.zip")
#' downloader::download(file,basename(file))
#' unzip(basename(file),junkpaths =TRUE)
#' tmp.biogrid <- read.csv(gsub("zip","txt",basename(file)),
#' header=TRUE, sep="\t", stringsAsFactors=FALSE)
#' names.genes.de <- c("PLCB1","MCL1","PRDX4","TTF2","TACC3", "PARP4","LSM1")
#' net.biogrid.de <- getAdjacencyBiogrid(tmp.biogrid, names.genes.de)
#' }
getAdjacencyBiogrid <- function(tmp.biogrid, names.genes = NULL) {
it.a <- grep("Symbol", colnames(tmp.biogrid), value = TRUE)[1]
it.b <- grep("Symbol", colnames(tmp.biogrid), value = TRUE)[2]
if (is.null(names.genes)) {
names.genes <-
sort(union(unique(tmp.biogrid[, it.a]), unique(tmp.biogrid[, it.b])))
ind <- seq(1, nrow(tmp.biogrid))
} else {
ind.A <- which(tmp.biogrid[, it.a] %in% names.genes)
ind.B <- which(tmp.biogrid[, it.b] %in% names.genes)
ind <- intersect(ind.A, ind.B)
}
mat.biogrid <- matrix(
0,
nrow = length(names.genes),
ncol = length(names.genes),
dimnames = list(names.genes, names.genes)
)
for (i in ind) {
mat.biogrid[tmp.biogrid[i, it.a], tmp.biogrid[i, it.b]] <-
mat.biogrid[tmp.biogrid[i, it.b], tmp.biogrid[i, it.a]] <- 1
}
diag(mat.biogrid) <- 0
return(mat.biogrid)
}
#' @title Get GDC primary tumors samples with both DNA methylation (HM450K) and Gene expression data
#' @description
#' For a given TCGA project it gets the primary tumors samples (barcode) with both
#' DNA methylation and Gene expression data from GDC database
#' @param project A GDC project
#' @param n Number of samples to return. If NULL return all (default)
#' @return A vector of barcodes
#' @export
#' @examples
#' # Get ACC samples with both DNA methylation (HM450K) and gene expression aligned to hg19
#' samples <- matchedMetExp("TCGA-UCS")
matchedMetExp <- function(
project,
n = NULL
) {
# get primary solid tumor samples: DNA methylation
message("Download DNA methylation information")
met450k <- GDCquery(
project = project,
data.category = "DNA Methylation",
platform = "Illumina Human Methylation 450",
sample.type = c("Primary Tumor")
)
# get primary solid tumor samples: RNAseq
message("Download gene expression information")
exp <- GDCquery(
project = project,
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "STAR - Counts"
)
# Get patients with samples in both platforms
met450k_tp <- met450k$results[[1]]$cases
exp_tp <- exp$results[[1]]$cases
patients <-
substr(exp_tp, 1, 15)[
substr(exp_tp, 1, 15) %in% substr(met450k_tp, 1, 15)
] |> unique()
if (!is.null(n)) {
patients <- patients[1:n] # get only n samples
}
return(patients)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.