options(stringsAsFactors = FALSE)
##Functions for cluster frequency analysis
getTotalCellNumbers = function(file, wd) {
setwd(wd)
table = read.csv(file, header=TRUE)
files = list.files(pattern = "*.clustered.txt$")
indecies = rep.int(0, length(files))
for (sample in 1:length(table[,1])) {
sampleName = table[sample,1]
if (!(all(grepl(sampleName, files)==FALSE))) {
order = which(grepl(sampleName, files)==TRUE)
indecies[order] = sample
}
}
if (!all(indecies != 0)) {
print("There is a missing index")
}
totalCellNumbers = table[indecies,2]
return(totalCellNumbers)
}
getSampleIDs = function(group1, group2, wd) {
setwd(wd)
files = list.files(pattern = "*clustered.txt$")
sampleID = rep.int(0, length(files))
for (variable in 1:length(group1)){
group = group1[variable]
indecies = grep(group, files)
sampleID[indecies] = 1
}
for (variable in 1:length(group2)){
group = group2[variable]
indecies = grep(group, files)
sampleID[indecies] = 2
}
if (!all(sampleID!=0)) {
print("There is an unassigned file")
}
return(sampleID)
}
getFrequencies = function(wd, totalCellNumbers) {
setwd(wd)
clusteredFiles = as.matrix(list.files(pattern = "*.clustered.txt$"))
extractPopSizes = function(f) {
newFile = read.table(f, header = TRUE, stringsAsFactors = FALSE, sep="\t")
popSizes = as.matrix(newFile$popsize)
return(popSizes)
}
popSizeMatrix = apply(clusteredFiles, 1, extractPopSizes)
if (totalCellNumbers[1] == "NA") {
freqMatrix = t(t(popSizeMatrix) / colSums(popSizeMatrix))
} else {
freqMatrix = t(t(popSizeMatrix) / totalCellNumbers)
}
rownames(freqMatrix) = 1:length(freqMatrix[,1])
colnames(freqMatrix) = clusteredFiles
write.csv(freqMatrix, file = paste(wd, "frequencyMatrix.csv", sep="/"))
return(freqMatrix)
}
##This function extracts the column names from the first "clustered.txt" file and
##stores them. This is to avoid changes caused during read-write to punctuation.
extractColnames = function(wd) {
return(read.table(list.files(pattern = "*clustered.txt$")[1], nrows = 1, sep = "\t"))
}
##This funciton reads in all files in the working directory, extracts the frequency for
##each cluster and stores this in a matrix. Then this matrix is passed into SAM along
##with the sampleID codes to build the model of different features by sample type.
run_SAM_analysis = function(freqMatrix, sampleID, nperms) {
if (length(unique(sampleID)) == 2) {
family = "Two class unpaired"
} else {
family = "Multiclass"
}
fullFreqMatrix = as.data.frame(freqMatrix)
row_without_values = apply(fullFreqMatrix, 1, function(row) all(row==0 || row==1))
row_without_values_indecies = which(row_without_values == TRUE)
if (length(row_without_values_indecies) > 0) {
freqMatrix = fullFreqMatrix[-row_without_values_indecies,]
} else {
freqMatrix = fullFreqMatrix
}
freqMatrix = as.matrix(freqMatrix) ## Fix for new version of R
samResults = SAM(x=freqMatrix,y=sampleID,resp.type=family,
genenames=rownames(freqMatrix), geneid=rownames(freqMatrix), nperms=nperms)
return(samResults)
}
##This function manually calculates the fold chnage between groups for plotting
##regardless of significance cutoff
calculate_FoldChange = function(freqMatrix, group1, group2) {
result_FC <- numeric()
group1_mean <- rowMeans(freqMatrix[,which(grepl(group1, colnames(freqMatrix)))])
group2_mean <- rowMeans(freqMatrix[,which(grepl(group2, colnames(freqMatrix)))])
if(any(group1_mean == 0)) {
index1 <- which(group1_mean == 0)
group1_mean[index1] = 1/nrow(freqMatrix)
group2_mean[index1] = group2_mean[index1] + 1/nrow(freqMatrix)
}
if(any(group2_mean == 0)) {
index2 <- which(group2_mean == 0)
group2_mean[index2] = 1/nrow(freqMatrix)
group1_mean[index2] = group1_mean[index2] + 1/nrow(freqMatrix)
}
result_FC <- group2_mean/group1_mean
result_FC[is.na(result_FC)] <- 1; result_FC[is.infinite(result_FC)] <- 1
result_FC <- round(log2(result_FC), digits = 3)
return(result_FC)
}
##This funciton checks each cluster to see if it is in the list of
##higher or lower populations and saves the transformed q-values as a matrix
getSignifMatrix = function(model, num_clusters, qValue_cutoff, include_foldC) {
getClusterSignif = function(cluster) {
if (is.element(cluster, pop_higher$`Gene Name`)) {
entry = which(pop_higher$`Gene Name` == cluster)
if (as.numeric(as.character(pop_higher$`q-value(%)`[entry])) <= qValue_cutoff) {
return((100 - as.numeric(as.character(pop_higher$`q-value(%)`[entry]))) / 100)
} else {
return(0.5)
}
} else if (is.element(cluster, pop_lower$`Gene Name`)) {
entry = which(pop_lower$`Gene Name` == cluster)
if (as.numeric(as.character(pop_lower$`q-value(%)`[entry])) <= qValue_cutoff) {
return((as.numeric(as.character(pop_lower$`q-value(%)`[entry]))) / 100)
} else {
return(0.5)
}
} else {
return(0.5)
}
}
getClusterFoldChange = function(cluster) {
if (is.element(cluster, pop_higher$`Gene Name`)) {
entry = which(pop_higher$`Gene Name` == cluster)
if (as.numeric(as.character(pop_higher$`q-value(%)`[entry])) <= qValue_cutoff) {
return(as.numeric(as.character(pop_higher$`Fold Change`[entry])))
} else {
return(1)
}
} else if (is.element(cluster, pop_lower$`Gene Name`)) {
entry = which(pop_lower$`Gene Name` == cluster)
if (as.numeric(as.character(pop_lower$`q-value(%)`[entry])) <= qValue_cutoff) {
return(as.numeric(as.character(pop_lower$`Fold Change`[entry])))
} else {
return(1)
}
} else {
return(1)
}
}
##If there is only one significant cluster, SAM will return a vector.
##If there are multiple, SAM will return a matrix.
if (is.null(model$siggenes.table$genes.up)) {
pop_higher = c()
} else if (is.vector(model$siggenes.table$genes.up)) {
pop_higher = as.data.frame(t(model$siggenes.table$genes.up), stringsAsFactors = FALSE)
} else if (is.matrix(model$siggenes.table$genes.up)) {
pop_higher = as.data.frame(model$siggenes.table$genes.up, stringsAsFactors = FALSE)
}
if (is.null(model$siggenes.table$genes.lo)) {
pop_lower = c()
} else if (is.vector(model$siggenes.table$genes.lo)) {
pop_lower = as.data.frame(t(model$siggenes.table$genes.lo), stringsAsFactors = FALSE)
} else if (is.matrix(model$siggenes.table$genes.lo)) {
pop_lower = as.data.frame(model$siggenes.table$genes.lo, stringsAsFactors = FALSE)
}
cluster_signif = mat.or.vec(num_clusters, 3)
colnames(cluster_signif) = c("cellType", "Significance", "SAMFoldChange")
cluster_signif[,1] = 1:num_clusters
clusters = as.matrix(1:num_clusters)
cluster_signif[,2] = apply(clusters, 1, getClusterSignif)
if(include_foldC) {
cluster_signif[,3] = apply(clusters, 1, getClusterFoldChange)
if(any(is.infinite(cluster_signif[,3]))) {cluster_signif[which(is.infinite(cluster_signif[,3])),3] = 1}
} else {cluster_signif = cluster_signif[,-3]}
return(cluster_signif)
}
##This function appends the q-value matrix to each "clustered.txt" file in the wd.
append_freq_signif = function(wd, colNames, signif_matrix, include_foldC = FALSE, set_max_Val = 2) {
remove_freqsignif_columns(wd)
setwd(wd)
files = as.matrix(list.files(pattern = "*clustered.txt$"))
bind_signif_by_file = function(newFile) {
currFile = read.table(newFile, header = TRUE, sep = "\t")
colnames(currFile) = as.matrix(colNames)
if(include_foldC) {
appendedFile = cbind(currFile, signif_matrix[,-1])
colnames(appendedFile)[grep("Significance", colnames(appendedFile))] = "frequencySignif"
colnames(appendedFile)[grep("SAMFoldChange", colnames(appendedFile))] = "frequencyFoldChange"
colnames(appendedFile)[grep("All_Log2_FoldChange", colnames(appendedFile))] = "frequency_ALLFoldChange"
#Log2 Normalize SAM output
appendedFile$frequencyFoldChange = log2(as.numeric(appendedFile$frequencyFoldChange))
# Normalize Fold Changes
set_min_Val = -set_max_Val
FCs <- c("frequencyFoldChange","frequency_ALLFoldChange")
for (i in 1:2){
FC_dat <- appendedFile[,which(grepl(FCs[i], colnames(appendedFile)))]
if(any(FC_dat > set_max_Val)) {FC_dat[which(FC_dat > set_max_Val)] = set_max_Val}
if(any(FC_dat < set_min_Val)) {FC_dat[which(FC_dat < set_min_Val)] = set_min_Val}
normalized = (FC_dat-set_min_Val)/(set_max_Val-set_min_Val)
if(any(normalized>1)) {normalized[which(normalized>1)] = 0.99999} ##represents greatest increase, 1 used for black landmarks
if(any(normalized==1)) {normalized[which(normalized==1)] = 0.99999}
appendedFile[,which(grepl(FCs[i], colnames(appendedFile)))] = normalized
}
} else{
appendedFile = cbind(currFile, signif_matrix[,grep("Significance", colnames(signif_matrix))])
colnames(appendedFile)[ncol(appendedFile)] = "frequencySignif"
}
write.table(appendedFile, file = newFile, row.names = F, sep = "\t", quote = F)
}
apply(files, 1, bind_signif_by_file)
}
##Run the cluster frequency significance analysis
analyze_cluster_frequencies = function(wd, group1, group2, qValue_cutoff, nperms, total_cells_in_file, total_cell_numbers_csv, include_foldC,set_max_Val) {
totalCellNumbers = c()
if (total_cells_in_file == TRUE) {
totalCellNumbers = "NA"
} else {
totalCellNumbers = getTotalCellNumbers(file = total_cell_numbers_csv, wd = wd)
}
freqMatrix = getFrequencies(wd = wd, totalCellNumbers = totalCellNumbers)
sampleID = getSampleIDs(group1, group2, wd)
model = run_SAM_analysis(freqMatrix, sampleID = sampleID, nperms = nperms)
getFC <- calculate_FoldChange(freqMatrix, group1, group2)
signif_matrix = getSignifMatrix(model, num_clusters = length(freqMatrix[,1]), qValue_cutoff = qValue_cutoff, include_foldC)
colNames = extractColnames(wd)
signif_matrix <- as.data.frame(cbind(signif_matrix,as.numeric(getFC)))
colnames(signif_matrix)[ncol(signif_matrix)] = "All_Log2_FoldChange"
append_freq_signif(wd, colNames = colNames, signif_matrix = signif_matrix, include_foldC = include_foldC, set_max_Val = set_max_Val)
write.csv(signif_matrix, paste(wd, "freqSignif.csv", sep="/"), row.names = FALSE)
return(list.files(pattern = "*.clustered.txt$"))
}
remove_freqsignif_columns = function(wd) {
options(stringsAsFactors = F)
setwd(wd)
files = list.files(pattern = "*clustered.txt$")
for (i in 1:length(files)) {
origColNames = as.vector(read.table(files[i], nrows = 1, sep="\t"))
signifColumns = c(grep(pattern="frequency", origColNames))
if (length(signifColumns) > 0) {
newColNames = origColNames[-signifColumns]
currFile = read.table(files[i], header=TRUE, sep="\t")
newFile = currFile[,-signifColumns]
colnames(newFile) = newColNames
write.table(newFile, file = files[i], row.names = F, sep = "\t", quote = F)
}
}
return("Frequency significance columns removed.")
}
print_Feature_Template = function(wd){
setwd(wd)
files = list.files(pattern = "*.clustered.txt$")
temp_mat = data.frame(fileName = files, feature_value = rep(NA, length = length(files)))
write.csv(temp_mat, paste(wd,"Feature_Template.csv", sep="/"), row.names = FALSE)
}
##Functions for Boolean experession analysis
get_num_clusters = function(wd){
files = list.files(pattern = "*clustered.txt$")
testFile = read.table(files[1], header=TRUE, sep="\t")
return(length(testFile[,1]))
}
get_rdata_col_names <- function(working.directory, f.name)
{
rdata.file <- readRDS(paste(working.directory, f.name, sep = "/"))
ret <- as.vector(colnames(rdata.file))
if(any(is.na(ret)))
{
w <- is.na(ret)
ret[w] <- as.vector(w)
}
return(ret)
}
buildBooleanMatrix = function(wd, feature, booleanThreshold, asinh.cofactor, num_clusters){
setwd(wd)
dataFiles = as.matrix(list.files(pattern = "*.RData$"))
booleanMatrix = mat.or.vec(num_clusters, length(dataFiles))
populateBooleanMatrixColumn = function(dataFile) {
currFile = readRDS(file = dataFile)
getBooleanFrequency = function(cluster) {
currVec = currFile[which(currFile$cellType == cluster), which(colnames(currFile) == feature)]
booleanFrequency = as.vector(sum(currVec >= asinh(booleanThreshold / asinh.cofactor)) / length(currVec))
return(booleanFrequency)
}
clusters = as.matrix(1:num_clusters)
booleanFreqVec = apply(clusters, 1, getBooleanFrequency)
return(booleanFreqVec)
}
fullBooleanMatrix = apply(dataFiles, 1, populateBooleanMatrixColumn)
rownames(fullBooleanMatrix) = 1:num_clusters
colnames(fullBooleanMatrix) = dataFiles
#write.csv(fullBooleanMatrix, paste(wd, paste(feature, "BooleanFreq_FULL.csv", sep=""), sep="/"), row.names = TRUE)
row_without_values = apply(fullBooleanMatrix, 1, function(row) all(is.na(row)))
row_without_values_indecies = which(row_without_values == TRUE)
if (length(row_without_values_indecies) > 0) {
culledBooleanMatrix = fullBooleanMatrix[-row_without_values_indecies,]
} else {
culledBooleanMatrix = fullBooleanMatrix
}
culledBooleanMatrix[is.na(culledBooleanMatrix)] = 0
row_all_zero = apply(culledBooleanMatrix, 1, function(row) all(row==0))
row_all_zero_indecies = which(row_all_zero == TRUE)
if (length(row_all_zero_indecies) > 0) {
booleanMatrix = culledBooleanMatrix[-row_all_zero_indecies,]
} else {
booleanMatrix = culledBooleanMatrix
}
return(booleanMatrix)
}
append_expr_signif = function(wd, colNames, signif_matrix, feature, include_foldC = FALSE, set_max_Val = 2) {
setwd(wd)
files = as.matrix(list.files(pattern = "*clustered.txt$"))
bind_signif_by_file = function(newFile) {
currFile = read.table(newFile, header = TRUE, sep = "\t")
colnames(currFile) = as.matrix(colNames)
if(include_foldC) {
appendedFile = cbind(currFile, signif_matrix[,-1])
colnames(appendedFile)[grep("Significance", colnames(appendedFile))] = paste(feature, "BooleanSignif", sep="")
colnames(appendedFile)[grep("SAMFoldChange", colnames(appendedFile))] = paste(feature, "BooleanFoldChange", sep="")
colnames(appendedFile)[grep("All_Log2_FoldChange", colnames(appendedFile))] = paste(feature, "Boolean_ALLFoldChange", sep="")
#Log2 Normalize SAM output
SAM_Index <- which(grepl(paste(feature, "BooleanFoldChange", sep=""), colnames(appendedFile)))
appendedFile[,SAM_Index] = log2(as.numeric(appendedFile[,SAM_Index]))
# Normalize Fold Changes
set_min_Val = -set_max_Val
FCs <- c(paste(feature, "BooleanFoldChange", sep=""),paste(feature, "Boolean_ALLFoldChange", sep=""))
for (i in 1:2){
FC_dat <- appendedFile[,which(grepl(FCs[i], colnames(appendedFile)))]
if(any(FC_dat > set_max_Val)) {FC_dat[which(FC_dat > set_max_Val)] = set_max_Val}
if(any(FC_dat < set_min_Val)) {FC_dat[which(FC_dat < set_min_Val)] = set_min_Val}
normalized = (FC_dat-set_min_Val)/(set_max_Val-set_min_Val)
if(any(normalized>1)) {normalized[which(normalized>1)] = 0.99999} ##represents greatest increase, 1 used for black landmarks
if(any(normalized==1)) {normalized[which(normalized==1)] = 0.99999}
appendedFile[,which(grepl(FCs[i], colnames(appendedFile)))] = normalized
}
} else{
appendedFile = cbind(currFile, signif_matrix[,grep("Significance", colnames(signif_matrix))])
colnames(appendedFile)[ncol(appendedFile)] = paste(feature, "BooleanSignif", sep="")
}
write.table(appendedFile, file = newFile, row.names = F, sep = "\t", quote = F)
}
apply(files, 1, bind_signif_by_file)
}
analyze_cluster_expression = function(wd, group1, group2, qValue_cutoff, nperms, feature, booleanThreshold,
asinh.cofactor, include_foldC, set_max_Val) {
setwd(wd)
num_clusters = get_num_clusters(wd)
booleanMatrix = buildBooleanMatrix(wd, feature = feature,
booleanThreshold = booleanThreshold,
asinh.cofactor = asinh.cofactor,
num_clusters = num_clusters)
write.csv(booleanMatrix, paste(wd, paste(feature, "BooleanFreq.csv", sep=""), sep="/"), row.names = TRUE)
sampleID = getSampleIDs(group1, group2, wd)
model = run_SAM_analysis(booleanMatrix, sampleID = sampleID, nperms = nperms)
getFC_express <- calculate_FoldChange(booleanMatrix, group1, group2)
signif_matrix = getSignifMatrix(model, num_clusters = num_clusters, qValue_cutoff = qValue_cutoff, include_foldC)
signif_matrix = cbind(signif_matrix, getFC_express)
colnames(signif_matrix)[ncol(signif_matrix)] = "All_Log2_FoldChange"
write.csv(signif_matrix, paste(wd, paste(feature, "BooleanSignif.csv", sep=""), sep="/"), row.names = FALSE)
colNames = extractColnames(wd)
append_expr_signif(wd, colNames = colNames, signif_matrix = signif_matrix, feature = feature,
include_foldC = include_foldC, set_max_Val = set_max_Val)
return(list.files(pattern = "*.clustered.txt$"))
}
remove_exprsignif_columns = function(wd) {
options(stringsAsFactors = F)
setwd(wd)
files = list.files(pattern = "*clustered.txt$")
for (i in 1:length(files)) {
origColNames = as.vector(read.table(files[i], nrows = 1, sep="\t"))
signifColumns = c(grep(pattern="Boolean", origColNames))
if (length(signifColumns) > 0) {
newColNames = origColNames[-signifColumns]
currFile = read.table(files[i], header=TRUE, sep="\t")
newFile = currFile[,-signifColumns]
colnames(newFile) = newColNames
write.table(newFile, file = files[i], row.names = F, sep = "\t", quote = F)
}
}
return("Expression significance columns removed.")
}
########
## Added 20190.10.10 to plot features correlated with cluster abundances
analyze_cluster_correlation = function(wd, corFeature, corTest, corPlotTypes, qVal_cutoff) {
setwd(wd)
num_clusters = get_num_clusters(wd)
corr_matrix = data.frame(cellType = 1:num_clusters, Correlation = rep(NA, length = num_clusters), pVal = rep(NA, length = num_clusters))
freqMatrix = getFrequencies(wd = wd, totalCellNumbers = "NA")
freqMatrix = freqMatrix[,order(colnames(freqMatrix))]
corr_featureMatrix = read.csv(paste(wd, corFeature,sep="/"), stringsAsFactors = FALSE)
corr_featureMatrix = corr_featureMatrix[order(corr_featureMatrix$fileName),]
if(all(colnames(freqMatrix) == corr_featureMatrix$fileName)){
feature_values <- corr_featureMatrix$feature_value
for(i in 1:num_clusters) {
this_clus = freqMatrix[i,]
corr_result = cor.test(feature_values, this_clus, method = corTest, exact = FALSE)
corr_matrix$Correlation[i] = corr_result$estimate
corr_matrix$pVal[i] = corr_result$p.value
if(is.na(corr_matrix$Correlation[i])) {
corr_matrix$Correlation[i] = 0
corr_matrix$pVal[i] = 1
}
}
corr_matrix$pVal <- p.adjust(corr_matrix$pVal, method = "hochberg")
colNames = extractColnames(wd)
corfeatureName <- as.character(corFeature); corfeatureName <- gsub(".csv","",corfeatureName)
append_corr_signif(wd, corfeatureName, colNames = colNames, corr_matrix, corPlotTypes, qVal_cutoff)
write.csv(corr_matrix, paste(wd,"/",corfeatureName,"_correlationSignif.csv", sep=""), row.names = FALSE)
return(list.files(pattern = "*.clustered.txt$"))
} else {
return("Error, please include a feature value for each file in your directory")
}
}
append_corr_signif = function(wd, corfeatureName, colNames, corr_matrix, corPlotTypes, qVal_cutoff) {
setwd(wd)
files = as.matrix(list.files(pattern = "*clustered.txt$"))
bind_signif_by_file = function(newFile) {
currFile = read.table(newFile, header = TRUE, sep = "\t")
colnames(currFile) = as.matrix(colNames)
significant_Corrs = corr_matrix$Correlation
significant_Corrs[which(corr_matrix$pVal > (qVal_cutoff/100))] = 0
appendedFile = cbind(currFile, significant_Corrs)
colnames(appendedFile)[ncol(appendedFile)] <- paste(corfeatureName,"correlationSIG",sep="_")
if(corPlotTypes){
appendedFile = cbind(appendedFile, corr_matrix$Correlation)
colnames(appendedFile)[ncol(appendedFile)] <- paste(corfeatureName,"correlationALL",sep="_")
}
write.table(appendedFile, file = newFile, row.names = F, sep = "\t", quote = F)
}
apply(files, 1, bind_signif_by_file)
}
remove_correlation_columns = function(wd) {
options(stringsAsFactors = F)
setwd(wd)
files = list.files(pattern = "*clustered.txt$")
for (i in 1:length(files)) {
origColNames = as.vector(read.table(files[i], nrows = 1, sep="\t"))
signifColumns = c(grep(pattern="correlation", origColNames))
if (length(signifColumns) > 0) {
newColNames = origColNames[-signifColumns]
currFile = read.table(files[i], header=TRUE, sep="\t")
newFile = currFile[,-signifColumns]
colnames(newFile) = newColNames
write.table(newFile, file = files[i], row.names = F, sep = "\t", quote = F)
}
}
return("Expression significance columns removed.")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.