testColumnNames <- function(name, files, datapath) {
# Set Working Directory on level back
d <- "Datasets loaded:"
d <- paste(d, name, collapse = " ")
cat(paste0("testColumnNames", "\t"), file = logFile, append = TRUE)
cat(paste0(d, "\t"), file = logFile, append = TRUE)
cat(paste0("read data", "\t"), file = logFile, append = TRUE)
cat(paste0("read data", "\t"), file = logFile, append = TRUE)
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
# used columns
if (use_only_useful_columns) {
input <- read.csv(system.file("extdata/param",
"used_columns_only_useful.csv",
package = "tripr",
mustWork = TRUE
),
sep = ";", stringsAsFactors = FALSE, header = FALSE
)
} else {
input <- read.csv(system.file("extdata/param",
"used_columns.csv",
package = "tripr",
mustWork = TRUE
),
sep = ";", stringsAsFactors = FALSE, header = FALSE
)
}
used_columns <- list()
all_used_columns <- c()
for (i in seq(1, 29, 3)) {
file_name <- paste0(input[i, 1], ".txt")
input[i, 1] <- strsplit(input[i, 1], "_")[[1]][2]
input[i, 1] <- make.names(input[i, 1])
input[i + 1, ] <- make.names(input[i + 1, ])
a <- input[i + 1, which(input[i + 1, ] != "X")]
a <- paste0(input[i, 1], ".", a)
used_columns[[input[i, 1]]] <- paste0(input[i, 1], ".", input[i + 1, ])
if (file_name %in% files) {
all_used_columns <- c(all_used_columns, a[which(!stringr::str_detect(a, ".NA."))])
}
}
all_used_columns <- c("dataName", all_used_columns)
## Stored in 'e' environment ("global.R", L:4)
e$all_used_columns <- all_used_columns
e$used_columns <- used_columns
# filter_column contains the columns that are used for each one of the 9 filters with ids=1:9
filter_column <- c()
if ("1_Summary.txt" %in% files) {
filter_column <- c(
used_columns[["Summary"]][3],
used_columns[["Summary"]][18],
used_columns[["Summary"]][2],
used_columns[["Summary"]][18],
used_columns[["Summary"]][4],
used_columns[["Summary"]][3],
used_columns[["Summary"]][8],
used_columns[["Summary"]][11],
used_columns[["Summary"]][15],
used_columns[["Summary"]][18]
)
}
# Log start time and memory currently used
start.time <- Sys.time()
rawDataSet <- list()
count_wrong <- 0
worng_columns_id <- list()
worng_columns_names <- list()
wrong_dataset <- c()
# Load datasets individually
for (i in seq_len(length(name))) {
firstSepData <- TRUE
if (strsplit(name[i], "_")[[1]][1] != name[i] && suppressWarnings(as.numeric(strsplit(name[i], "_")[[1]][2])) > 1) firstSepData <- FALSE
rawDataSet[[name[i]]] <- data.frame(dataName = strsplit(name[i], "_")[[1]][1])
worng_columns_names_temp <- c()
worng_columns_id_temp <- c()
num_initial_col <- 0
for (j in seq_len(length(files))) {
b <- strsplit(files[j], "_")
b2 <- gsub(".txt", "", b[[1]][2])
b2 <- gsub("-", ".", b2)
var.name <- b2
temp <- data.frame(assign(var.name, read.csv(paste0(datapath, "/", name[i], "/", files[j]), sep = "\t", stringsAsFactors = FALSE, fill = TRUE)))
num_initial_col <- num_initial_col + ncol(temp)
colnames(temp) <- paste0(b2, ".", colnames(temp))
if (paste0(b2, ".X") %in% colnames(temp)) {
temp <- temp[, -match(paste0(b2, ".X"), names(temp))]
}
rawDataSet[[name[i]]] <- data.frame(rawDataSet[[name[i]]], temp)
# Check the column names
for (k in seq_len(length(used_columns[[b2]]))) {
if ((used_columns[[b2]][k] %in% colnames(rawDataSet[[name[i]]])) || stringr::str_detect(used_columns[[b2]][k], ".NA") || stringr::str_detect(used_columns[[b2]][k], ".X")) {
# do nothing
} else {
message <- "wrong column names"
tmp2 <- paste0(files[j], ": ", gsub("[.]", " ", gsub(b2, "", used_columns[[b2]][k])))
if (tmp2 %in% worng_columns_names_temp) {
} else {
worng_columns_names_temp <- c(worng_columns_names_temp, tmp2)
worng_columns_id_temp <- c(worng_columns_id_temp, which(all_used_columns %in% used_columns[[b2]][k]))
}
}
}
}
if (length(worng_columns_id_temp) > 0) {
wrong_dataset <- c(wrong_dataset, name[i])
count_wrong <- count_wrong + 1
worng_columns_id[[count_wrong]] <- worng_columns_id_temp
worng_columns_names[[count_wrong]] <- worng_columns_names_temp
} else {
# Drop all the columns that will not be used
rawDataSet[[name[i]]] <- rawDataSet[[name[i]]] %>% dplyr::select(all_used_columns)
}
}
for (i in seq_len(length(name))) {
firstSepData <- TRUE
if (strsplit(name[i], "_")[[1]][1] != name[i] && suppressWarnings(as.numeric(strsplit(name[i], "_")[[1]][2])) == 0) {
newName <- strsplit(name[i], "_")[[1]][1]
rawDataSet[[newName]] <- rawDataSet[[name[i]]]
rawDataSet[[name[i]]] <- NULL
}
if (strsplit(name[i], "_")[[1]][1] != name[i] && suppressWarnings(as.numeric(strsplit(name[i], "_")[[1]][2])) > 0) {
newName <- strsplit(name[i], "_")[[1]][1]
rawDataSet[[newName]] <- rbind(rawDataSet[[newName]], rawDataSet[[name[i]]])
rawDataSet[[name[i]]] <- NULL
}
num_of_datasets <- length(rawDataSet)
}
newDatasetNames <- names(rawDataSet)
k <- c()
# check if the column names are correct
# log time end and memory used
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cat(pryr::mem_used(), file = logFile, append = TRUE, sep = "\n")
confirm <- "Data Loaded!"
if (length(worng_columns_id) > 0) {
return(list(
"confirm" = confirm,
"logFile" = logFile,
"message" = "wrong column names",
"wrong_dataset" = wrong_dataset,
"worng_columns_id" = worng_columns_id,
"worng_columns_names" = worng_columns_names,
"rawDataSet" = rawDataSet
))
}
# Correct the wrong column names
return(list(
"confirm" = confirm,
"logFile" = logFile,
"newDatasetNames" = newDatasetNames,
"message" = "",
"wrong_dataset" = c(),
"worng_columns_id" = c(),
"worng_columns_names" = c(),
"rawDataSet" = rawDataSet
))
}
######################################################################################################################################
correctColumnNames <- function(files, rawDataSet, allDatasets, wrong_dataset, new_columns = list(), worng_columns_id = list(), name) {
nr <- 0
for (i in names(rawDataSet)) {
nr <- nrow(rawDataSet[[i]]) + nr
}
# logfile
cat(paste0("correctColumnNames", "\t"), file = logFile, append = TRUE)
cat(paste0("wrong datasets", paste(wrong_dataset, sep = ","), "\t"), file = logFile, append = TRUE)
cat(paste0(nr, "\t"), file = logFile, append = TRUE)
cat(paste0(ncol(rawDataSet[[names(rawDataSet)[1]]]), "\t"), file = logFile, append = TRUE)
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
# filter_column contains the columns that are used for each one of the 9 filters with ids=1:9
filter_column <- c(used_columns[["Summary"]][3], used_columns[["Summary"]][18], used_columns[["Summary"]][2], used_columns[["Summary"]][18], used_columns[["Summary"]][4], used_columns[["Summary"]][3], used_columns[["Summary"]][8], used_columns[["Summary"]][11], used_columns[["Summary"]][15], used_columns[["Summary"]][18])
# used columns
input <- read.csv(system.file("extdata/param", "used_columns.csv",
package = "tripr", mustWork = TRUE
),
sep = ";", stringsAsFactors = FALSE, header = FALSE
)
used_columns <- list()
all_used_columns <- c()
for (i in seq(1, 29, 3)) {
file_name <- paste0(input[i, 1], ".txt")
input[i, 1] <- strsplit(input[i, 1], "_")[[1]][2]
input[i, 1] <- make.names(input[i, 1])
input[i + 1, ] <- make.names(input[i + 1, ])
a <- input[i + 1, which(input[i + 1, ] != "X")]
a <- paste0(input[i, 1], ".", a)
used_columns[[input[i, 1]]] <- a
if (file_name %in% files) {
all_used_columns <- c(all_used_columns, a)
}
}
all_used_columns <- c("dataName", all_used_columns)
# Log start time and memory currently used
start.time <- Sys.time()
correct <- 0
w <- 0
for (i in seq_len(length(wrong_dataset))) {
w <- w + length(worng_columns_id[[i]])
# update columns
# wrong column names
for (j in seq_len(length(new_columns[[i]]))) {
if (length(which(names(rawDataSet[[wrong_dataset[i]]]) == new_columns[[i]][j])) > 0) {
correct <- correct + 1
}
names(rawDataSet[[wrong_dataset[i]]])[which(names(rawDataSet[[wrong_dataset[i]]]) == new_columns[[i]][j])] <- all_used_columns[worng_columns_id[[i]][j]]
}
}
if (correct == w) {
correct <- "yes"
for (i in seq_len(length(name))) {
rawDataSet[[name[i]]] <- rawDataSet[[name[i]]] %>% select(all_used_columns)
}
} else {
correct <- "no"
}
# log time end and memory used
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cat(pryr::mem_used(), file = logFile, append = TRUE, sep = "\n")
return(list("rawDataSet" = rawDataSet, "correct" = correct))
}
######################################################################################################################################
imgtcleaning <- function(rawDataSet, name, allDatasets, files, cell_id = 1, filter_id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), filter_out_char1 = " P", filter_out_char2 = "[*]|X|#|[.]", filter_in_char = "productive", filterStart = "^*", filterEnd = "*$", identityLow = 95, identityHigh = 100, VGene = "", JGene = "", DGene = "", lengthLow = 7, lengthHigh = 15, aminoacid = "CASSPPDTGELFF", seq1 = 1, seq2 = 2, Tcell) {
used_columns <- e$used_columns
a <- "Filter ids "
for (i in seq_len(length(filter_id))) {
a <- paste0(a, ",", filter_id[i])
}
nr <- 0
for (i in names(rawDataSet)) {
nr <- nrow(rawDataSet[[i]]) + nr
}
# logfile
cat(paste0("imgtcleaning", "\t"), file = logFile, append = TRUE)
cat(paste0(a, "\t"), file = logFile, append = TRUE)
cat(paste0(nr, "\t"), file = logFile, append = TRUE)
cat(paste0(ncol(rawDataSet[[names(rawDataSet)[1]]]), "\t"), file = logFile, append = TRUE)
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cleaning_criteria <- c(
"Functional V-Gene",
"CDR3 with no Special Characters",
"Productive Sequence",
"Productive Sequences"
)
# filter_column contains the columns that are used for each one of the 9 filters with ids=1:9
filter_column <- c(
used_columns[["Summary"]][3],
used_columns[["Summary"]][18],
used_columns[["Summary"]][2],
used_columns[["Summary"]][18],
used_columns[["Summary"]][4],
used_columns[["Summary"]][3],
used_columns[["Summary"]][8],
used_columns[["Summary"]][11],
used_columns[["Summary"]][15],
used_columns[["Summary"]][18],
"Summary.V.REGION.identity....with.ins.del.events."
)
filterOut <- list()
workflow <- matrix(0, length(filter_id), 3)
# Log start time and memory currently used
start.time <- Sys.time()
# Combine raw name
for (i in seq_len(length(name))) {
if (i == 1) {
allData <- rawDataSet[[name[i]]]
} else {
allData <- rbind(allData, rawDataSet[[name[i]]])
}
}
test_column <- c(filter_column[1], filter_column[2], filter_column[5], used_columns[["Nt.sequences"]][1])
# Drop all the columns that will not be used
# IMPORTANT: Datasets should start with T
# dataSetIDs <- as.list(levels(unique(allData$dataName)))
allDataInitial <- allData
workflow_datasets <- list()
a <- matrix(0, length(filter_id), 3)
for (j in seq_len(length(name))) {
workflow_datasets[[name[j]]] <- a
}
# Take only the first gene (V, J D) (Separated with "or")
a <- which(stringr::str_detect(allData[[filter_column[1]]], " or|,"))
if (length(a) > 0) {
a2 <- strsplit(allData[[filter_column[1]]][a], " or|,")
allData[[filter_column[1]]][a] <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
}
a <- which(stringr::str_detect(allData[[filter_column[7]]], " or|,"))
if (length(a) > 0) {
a2 <- strsplit(allData[[filter_column[7]]][a], " or|,")
allData[[filter_column[7]]][a] <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
}
if (!all(is.na(allData[[filter_column[8]]]))) {
a <- which(stringr::str_detect(allData[[filter_column[8]]], " or|,"))
if (length(a) > 0) {
a2 <- strsplit(allData[[filter_column[8]]][a], " or|,")
allData[[filter_column[8]]][a] <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
}
}
# Remove (see comment) from AA.JUNCTION
a <- which(stringr::str_detect(allData[[filter_column[2]]], " [(]see"))
if (length(a) > 0) {
a2 <- strsplit(allData[[filter_column[2]]][a], " [(]see")
allData[[filter_column[2]]][a] <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
}
# Apply the requested filters
if (any(filter_id == 1)) {
i <- which(filter_id == 1)
filterOut[[1]] <- allData %>% dplyr::filter(stringr::str_detect(allData[[filter_column[1]]], filter_out_char1))
if (nrow(filterOut[[1]]) > 0) filterOut[[1]] <- cbind(filterOut[[1]], FilterId = cleaning_criteria[1])
allData <- allData %>% dplyr::filter(!stringr::str_detect(allData[[filter_column[1]]], filter_out_char1))
workflow[i, 1] <- filter_id[i]
workflow[i, 2] <- nrow(filterOut[[1]])
workflow[i, 3] <- nrow(allData)
for (j in seq_len(length(name))) {
if (nrow(filterOut[[1]]) > 0) {
filterOut_datasets <- filterOut[[1]] %>% dplyr::filter(filterOut[[1]]$dataName == name[j])
} else {
filterOut_datasets <- filterOut[[1]]
}
if (i == 1) {
prev <- nrow(rawDataSet[[name[j]]])
} else {
prev <- (workflow_datasets[[name[j]]][i - 1, 3])
}
workflow_datasets[[name[j]]][i, 3] <- prev - nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 2] <- nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 1] <- filter_id[i]
}
}
if (any(filter_id == 2)) {
i <- which(filter_id == 2)
filterOut[[2]] <- allData %>% dplyr::filter(stringr::str_detect(allData[[filter_column[2]]], filter_out_char2))
if (nrow(filterOut[[2]]) > 0) filterOut[[2]] <- cbind(filterOut[[2]], FilterId = cleaning_criteria[2])
allDataInitial_tmp <- allData
allData <- allData %>% dplyr::filter(!stringr::str_detect(allData[[filter_column[2]]], filter_out_char2))
workflow[i, 1] <- filter_id[i]
workflow[i, 2] <- nrow(filterOut[[filter_id[i]]])
workflow[i, 3] <- nrow(allData)
for (j in seq_len(length(name))) {
if (nrow(filterOut[[2]]) > 0) {
filterOut_datasets <- filterOut[[2]] %>% dplyr::filter(filterOut[[2]]$dataName == name[j])
} else {
filterOut_datasets <- filterOut[[2]]
}
if (i == 1) {
prev <- nrow(rawDataSet[[name[j]]])
} else {
prev <- workflow_datasets[[name[j]]][i - 1, 3]
}
workflow_datasets[[name[j]]][i, 3] <- prev - nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 2] <- nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 1] <- filter_id[i]
}
}
if (any(filter_id == 3)) {
i <- which(filter_id == 3)
if (Tcell) {
filterOut[[3]] <- allData %>% dplyr::filter(!stringr::str_detect(allData[[filter_column[3]]], "^productive$"))
allData <- allData %>% dplyr::filter(stringr::str_detect(allData[[filter_column[3]]], "^productive$"))
} else {
filterOut[[3]] <- allData %>% dplyr::filter(!stringr::str_detect(allData[[filter_column[3]]], "^productive"))
allData <- allData %>% dplyr::filter(stringr::str_detect(allData[[filter_column[3]]], "^productive"))
ins_del <- which(!is.na(allData[[filter_column[11]]]))
if (length(ins_del) > 0) {
# check if V-REGION deletions or V-REGION insertions contain (cause frameshift)
delete <- allData[ins_del, ] %>% dplyr::filter(stringr::str_detect(allData[[used_columns[["Summary"]][21]]][ins_del], "[(]cause frameshift[)]"))
delete2 <- allData[ins_del, ] %>% dplyr::filter(stringr::str_detect(allData[[used_columns[["Summary"]][22]]][ins_del], "[(]cause frameshift[)]"))
delete_all <- unique(rbind(delete, delete2))
filterOut[[3]] <- unique(rbind(filterOut[[3]], delete_all))
allData <- allData %>% dplyr::filter(!(allData[[used_columns[["Summary"]][1]]] %in% filterOut[[3]][[used_columns[["Summary"]][1]]]))
}
}
if (nrow(filterOut[[3]]) > 0) filterOut[[3]] <- cbind(filterOut[[3]], FilterId = cleaning_criteria[3])
workflow[i, 3] <- nrow(allData)
workflow[i, 1] <- filter_id[i]
workflow[i, 2] <- nrow(filterOut[[3]])
for (j in seq_len(length(name))) {
if (nrow(filterOut[[3]]) > 0) {
filterOut_datasets <- filterOut[[3]] %>% dplyr::filter(filterOut[[3]]$dataName == name[j])
} else {
filterOut_datasets <- filterOut[[3]]
}
if (i == 1) {
prev <- nrow(rawDataSet[[name[j]]])
} else {
prev <- (workflow_datasets[[name[j]]][i - 1, 3])
}
workflow_datasets[[name[j]]][i, 3] <- (prev) - nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 2] <- nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 1] <- filter_id[i]
}
}
if (any(filter_id == 4)) {
i <- which(filter_id == 4)
if (filterStart == "" && filterEnd == "") {
filterOut[[4]] <- allData %>% dplyr::filter(!(stringr::str_detect(allData[[filter_column[4]]], filterStart)))
} else if (filterEnd == "") {
filterOut[[4]] <- allData %>% dplyr::filter(!(stringr::str_detect(allData[[filter_column[4]]], filterStart)))
allData <- allData %>% dplyr::filter((stringr::str_detect(allData[[filter_column[4]]], filterStart)))
} else if (filterStart == "") {
filterOut[[4]] <- allData %>% dplyr::filter(!stringr::str_detect(allData[[filter_column[4]]], filterEnd))
allData <- allData %>% dplyr::filter(stringr::str_detect(allData[[filter_column[4]]], filterEnd))
} else {
filterOut[[4]] <- allData %>% dplyr::filter(!(stringr::str_detect(allData[[filter_column[4]]], filterStart)))
filterOut[[4]] <- rbind(filterOut[[4]], (allData %>% dplyr::filter(!(stringr::str_detect(allData[[filter_column[4]]], filterEnd)))))
allData <- allData %>% dplyr::filter((stringr::str_detect(allData[[filter_column[4]]], filterStart)))
allData <- allData %>% dplyr::filter(stringr::str_detect(allData[[filter_column[4]]], filterEnd))
}
if (nrow(filterOut[[4]]) > 0) filterOut[[4]] <- cbind(filterOut[[4]], FilterId = cleaning_criteria[4])
workflow[i, 1] <- filter_id[i]
workflow[i, 2] <- nrow(filterOut[[4]])
workflow[i, 3] <- nrow(allData)
for (j in seq_len(length(name))) {
if (nrow(filterOut[[4]]) > 0) {
filterOut_datasets <- filterOut[[4]] %>% dplyr::filter(filterOut[[4]]$dataName == name[j])
} else {
filterOut_datasets <- filterOut[[4]]
}
if (i == 1) {
prev <- nrow(rawDataSet[[name[j]]])
} else {
prev <- workflow_datasets[[name[j]]][i - 1, 3]
}
workflow_datasets[[name[j]]][i, 3] <- nrow(allData %>% dplyr::filter(allData$dataName == name[j]))
workflow_datasets[[name[j]]][i, 2] <- prev - nrow(allData %>% dplyr::filter(allData$dataName == name[j]))
workflow_datasets[[name[j]]][i, 1] <- filter_id[i]
}
}
filterOutSum <- c()
if (length(filter_id) > 0) {
for (i in seq_len(length(filter_id))) {
if (i == 1) {
filterOutSum <- filterOut[[filter_id[1]]]
} else {
filterOutSum <- rbind(filterOutSum, filterOut[[filter_id[i]]])
}
}
}
a <- name[1]
if (length(name) > 1) {
for (i in 2:length(name)) {
a <- paste0(a, ", ", name[i])
}
}
b <- filter_id[1]
if (length(filter_id) > 1) {
for (i in 2:length(filter_id)) {
b <- paste0(b, ", ", filter_id[i])
}
}
# Separate allData to different tables
initial_datasets <- list()
for (i in seq_len(length(name))) {
initial_datasets[[name[i]]] <- allDataInitial %>% dplyr::filter(allDataInitial$dataName == name[i])
}
cleaned_datasets <- list()
if (length(allData) > 0) {
for (i in seq_len(length(name))) {
cleaned_datasets[[name[i]]] <- allData %>% dplyr::filter(allData$dataName == name[i])
}
}
cleaned_out_datasets <- list()
if (length(filterOutSum) > 0) {
for (i in seq_len(length(name))) {
cleaned_out_datasets[[name[i]]] <- filterOutSum %>% dplyr::filter(filterOutSum$dataName == name[i])
}
}
confirm <- paste0("Datasets cleaned: ", a, ". Cleaning Filters applied: ", b)
# log time end and memory used
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cat(pryr::mem_used(), file = logFile, append = TRUE, sep = "\n")
result <- list(
"message" = "",
"dim" = dim(allData),
"workflow" = workflow,
"workflow_datasets" = workflow_datasets,
"allDataInitial" = allDataInitial,
"allData" = allData,
"filterOutSum" = filterOutSum,
"initial_datasets" = initial_datasets,
"cleaned_datasets" = cleaned_datasets,
"cleaned_out_datasets" = cleaned_out_datasets,
"confirm" = confirm
)
return(result)
}
######################################################################################################################################
imgtfilter <- function(rawDataSet, name, allData, cell_id = 1, filter_id = c(5, 6, 7, 8, 9, 10), filter_out_char1 = " P", filter_out_char2 = "[:punct:]|X", filter_in_char = "productive", filterStart = "^*", filterEnd = "*$", identityLow = 95, identityHigh = 100, VGene = "", JGene = "", DGene = "", lengthLow = 7, lengthHigh = 15, aminoacid = "CASSPPDTGELFF", seq1 = 1, seq2 = 2) {
# logfile
used_columns <- e$used_columns
a <- "Filter ids "
for (i in seq_len(length(filter_id))) {
a <- paste0(a, ",", filter_id[i])
}
cat(paste0("imgtfilter", "\t"), file = logFile, append = TRUE)
cat(paste0(a, "\t"), file = logFile, append = TRUE)
cat(paste0(nrow(allData), "\t"), file = logFile, append = TRUE)
cat(paste0(ncol(rawDataSet[[names(rawDataSet)[1]]]), "\t"), file = logFile, append = TRUE)
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cleaning_criteria <- c("Functional V-Gene", "CDR3 with no Special Characters", "Productive Sequence", "Productive Sequences")
filtering_criteria <- c("V-REGION identity %", "Specific V Gene", "Specific J Gene", "Specific D Gene", "CDR3 length range", "CDR3 length range")
criteria <- c(cleaning_criteria, filtering_criteria)
# filter_column contains the columns that are used for each one of the 9 filters with ids=1:9
filter_column <- c(used_columns[["Summary"]][3], used_columns[["Summary"]][18], used_columns[["Summary"]][2], used_columns[["Summary"]][18], used_columns[["Summary"]][4], used_columns[["Summary"]][3], used_columns[["Summary"]][8], used_columns[["Summary"]][11], used_columns[["Summary"]][15], used_columns[["Summary"]][18], "Summary.V.REGION.identity....with.ins.del.events.")
filterOut <- list()
workflow <- matrix(0, length(filter_id), 3)
# Log start time and memory currently used
start.time <- Sys.time()
test_column <- c(filter_column[1], filter_column[2], filter_column[5], used_columns[["Nt.sequences"]][1])
used_column <- c("dataName")
for (i in seq_len(length(filter_id))) {
used_column <- c(used_column, filter_column[filter_id[i]])
}
# IMPORTANT: Datasets should start with T
# dataSetIDs <- as.list(levels(unique(allData$dataName)))
allDataInitial <- allData
workflow_datasets <- list()
a <- matrix(0, length(filter_id), 3)
for (j in seq_len(length(name))) {
workflow_datasets[[name[j]]] <- a
}
# Apply the requested filters
if (any(filter_id == 5)) {
i <- which(filter_id == 5)
ins_del <- which(!is.na(allData[[filter_column[11]]]))
if (length(ins_del) > 0) {
allData[[filter_column[5]]][ins_del] <- allData[[filter_column[11]]][ins_del]
}
filterOut[[5]] <- allData[which(allData[[filter_column[5]]] > identityHigh | allData[[filter_column[5]]] < identityLow), ]
allData <- allData[which(allData[[filter_column[5]]] <= identityHigh & allData[[filter_column[5]]] >= identityLow), ]
if (nrow(filterOut[[5]]) > 0) filterOut[[5]] <- cbind(filterOut[[5]], FilterId = criteria[5])
workflow[i, 3] <- nrow(allData)
workflow[i, 1] <- filter_id[i] - 4
workflow[i, 2] <- nrow(filterOut[[filter_id[i]]])
for (j in seq_len(length(name))) {
if (nrow(filterOut[[5]]) > 0) {
filterOut_datasets <- filterOut[[5]] %>% dplyr::filter(filterOut[[5]]$dataName == name[j])
} else {
filterOut_datasets <- filterOut[[5]]
}
if (i == 1) {
prev <- nrow(rawDataSet[[name[j]]])
} else {
prev <- workflow_datasets[[name[j]]][i - 1, 3]
}
workflow_datasets[[name[j]]][i, 3] <- (prev) - nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 2] <- nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 1] <- filter_id[i] - 4
}
}
if (any(filter_id == 6)) {
i <- which(filter_id == 6)
filterOut[[6]] <- allData %>% dplyr::filter(!stringr::str_detect(allData[[filter_column[6]]], gsub("[(]", "[(]", gsub("[)]", "[)]", gsub("[*]", "[*]", VGene)))))
if (nrow(filterOut[[6]]) > 0) filterOut[[6]] <- cbind(filterOut[[6]], FilterId = criteria[6])
workflow[i, 3] <- nrow(allData) - nrow(filterOut[[filter_id[i]]])
allData <- allData %>% dplyr::filter(stringr::str_detect(allData[[filter_column[6]]], gsub("[(]", "[(]", gsub("[)]", "[)]", gsub("[*]", "[*]", VGene)))))
workflow[i, 1] <- filter_id[i] - 4
workflow[i, 2] <- nrow(filterOut[[filter_id[i]]])
for (j in seq_len(length(name))) {
if (nrow(filterOut[[6]]) > 0) {
filterOut_datasets <- filterOut[[6]] %>% dplyr::filter(filterOut[[6]]$dataName == name[j])
} else {
filterOut_datasets <- filterOut[[6]]
}
if (i == 1) {
prev <- nrow(rawDataSet[[name[j]]])
} else {
prev <- workflow_datasets[[name[j]]][i - 1, 3]
}
workflow_datasets[[name[j]]][i, 3] <- (prev) - nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 2] <- nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 1] <- filter_id[i] - 4
}
}
if (any(filter_id == 7)) {
i <- which(filter_id == 7)
filterOut[[7]] <- allData %>% dplyr::filter(!stringr::str_detect(allData[[filter_column[7]]], gsub("[(]", "[(]", gsub("[)]", "[)]", gsub("[*]", "[*]", JGene)))))
if (nrow(filterOut[[7]]) > 0) filterOut[[7]] <- cbind(filterOut[[7]], FilterId = criteria[7])
workflow[i, 3] <- nrow(allData) - nrow(filterOut[[filter_id[i]]])
allData <- allData %>% dplyr::filter(stringr::str_detect(allData[[filter_column[7]]], gsub("[(]", "[(]", gsub("[)]", "[)]", gsub("[*]", "[*]", JGene)))))
workflow[i, 1] <- filter_id[i] - 4
workflow[i, 2] <- nrow(filterOut[[filter_id[i]]])
for (j in seq_len(length(name))) {
if (nrow(filterOut[[7]]) > 0) {
filterOut_datasets <- filterOut[[7]] %>% dplyr::filter(filterOut[[7]]$dataName == name[j])
} else {
filterOut_datasets <- filterOut[[7]]
}
if (i == 1) {
prev <- nrow(rawDataSet[[name[j]]])
} else {
prev <- workflow_datasets[[name[j]]][i - 1, 3]
}
workflow_datasets[[name[j]]][i, 3] <- (prev) - nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 2] <- nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 1] <- filter_id[i] - 4
}
}
if (any(filter_id == 8)) {
i <- which(filter_id == 8)
filterOut[[8]] <- allData %>% dplyr::filter(!stringr::str_detect(allData[[filter_column[8]]], gsub("[(]", "[(]", gsub("[)]", "[)]", gsub("[*]", "[*]", DGene)))))
if (nrow(filterOut[[8]]) > 0) filterOut[[8]] <- cbind(filterOut[[8]], FilterId = criteria[8])
workflow[i, 3] <- nrow(allData) - nrow(filterOut[[filter_id[i]]])
allData <- allData %>% dplyr::filter(stringr::str_detect(allData[[filter_column[8]]], gsub("[(]", "[(]", gsub("[)]", "[)]", gsub("[*]", "[*]", DGene)))))
workflow[i, 1] <- filter_id[i] - 4
workflow[i, 2] <- nrow(filterOut[[filter_id[i]]])
for (j in seq_len(length(name))) {
if (nrow(filterOut[[8]]) > 0) {
filterOut_datasets <- filterOut[[8]] %>% dplyr::filter(filterOut[[8]]$dataName == name[j])
} else {
filterOut_datasets <- filterOut[[8]]
}
if (i == 1) {
prev <- nrow(rawDataSet[[name[j]]])
} else {
prev <- workflow_datasets[[name[j]]][i - 1, 3]
}
workflow_datasets[[name[j]]][i, 3] <- (prev) - nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 2] <- nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 1] <- filter_id[i] - 4
}
}
if (any(filter_id == 9)) {
i <- which(filter_id == 9)
filterOut[[9]] <- allData[which(as.numeric(allData[[filter_column[9]]]) > lengthHigh | as.numeric(allData[[filter_column[9]]]) < lengthLow), ]
if (nrow(filterOut[[9]]) > 0) filterOut[[9]] <- cbind(filterOut[[9]], FilterId = criteria[9])
workflow[i, 3] <- nrow(allData) - nrow(filterOut[[filter_id[i]]])
allData <- allData[which(as.numeric(allData[[filter_column[9]]]) <= lengthHigh & as.numeric(allData[[filter_column[9]]]) >= lengthLow), ]
workflow[i, 1] <- filter_id[i] - 4
workflow[i, 2] <- nrow(filterOut[[filter_id[i]]])
for (j in seq_len(length(name))) {
if (nrow(filterOut[[9]]) > 0) {
filterOut_datasets <- filterOut[[9]] %>% dplyr::filter(filterOut[[9]]$dataName == name[j])
} else {
filterOut_datasets <- filterOut[[9]]
}
if (i == 1) {
prev <- nrow(rawDataSet[[name[j]]])
} else {
prev <- workflow_datasets[[name[j]]][i - 1, 3]
}
workflow_datasets[[name[j]]][i, 3] <- (prev) - nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 2] <- nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 1] <- filter_id[i] - 4
}
}
if (any(filter_id == 10)) {
i <- which(filter_id == 10)
filterOut[[10]] <- allData %>% dplyr::filter(!stringr::str_detect(allData[[filter_column[10]]], aminoacid))
if (nrow(filterOut[[10]]) > 0) filterOut[[10]] <- cbind(filterOut[[10]], FilterId = criteria[10])
workflow[i, 3] <- nrow(allData) - nrow(filterOut[[filter_id[i]]])
allData <- allData %>% dplyr::filter(stringr::str_detect(allData[[filter_column[10]]], aminoacid))
workflow[i, 1] <- filter_id[i] - 4
workflow[i, 2] <- nrow(filterOut[[filter_id[i]]])
for (j in seq_len(length(name))) {
if (nrow(filterOut[[10]]) > 0) {
filterOut_datasets <- filterOut[[10]] %>% dplyr::filter(filterOut[[10]]$dataName == name[j])
} else {
filterOut_datasets <- filterOut[[10]]
}
if (i == 1) {
prev <- nrow(rawDataSet[[name[j]]])
} else {
prev <- workflow_datasets[[name[j]]][i - 1, 3]
}
workflow_datasets[[name[j]]][i, 3] <- (prev) - nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 2] <- nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 1] <- filter_id[i] - 4
}
}
# Provide a 1_Summary.txt of the datasets
filterOutSum <- c()
if (length(filter_id) > 0) {
for (i in seq_len(length(filter_id))) {
if (i == 1) {
filterOutSum <- filterOut[[filter_id[1]]]
} else {
filterOutSum <- rbind(filterOutSum, filterOut[[filter_id[i]]])
}
}
}
# Do the actual analysis - targeted tables
a <- name[1]
if (length(name) > 1) {
for (i in 2:length(name)) {
a <- paste0(a, ", ", name[i])
}
}
b <- filter_id[1] - 4
if (length(filter_id) > 1) {
for (i in 2:length(filter_id)) {
b <- paste0(b, ", ", (filter_id[i] - 4))
}
}
# Delete (see comment) from genes
a <- which(stringr::str_detect(allData[[used_columns[["Summary"]][3]]], " or|,| [(]see"))
if (length(a) > 0) {
a2 <- strsplit(allData[[used_columns[["Summary"]][3]]][a], " or|,| [(]see")
allData[[used_columns[["Summary"]][3]]][a] <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
}
a <- which(stringr::str_detect(allData[[used_columns[["Summary"]][8]]], " or|,| [(]see"))
if (length(a) > 0) {
a2 <- strsplit(allData[[used_columns[["Summary"]][8]]][a], " or|,| [(]see")
allData[[used_columns[["Summary"]][8]]][a] <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
}
if (!all(is.na(allData[[used_columns[["Summary"]][11]]]))) {
a <- which(stringr::str_detect(allData[[used_columns[["Summary"]][11]]], " or|,| [(]see"))
if (length(a) > 0) {
a2 <- strsplit(allData[[used_columns[["Summary"]][11]]][a], " or|,| [(]see")
allData[[used_columns[["Summary"]][11]]][a] <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
}
}
# Separate allData to different tables
initial_datasets <- list()
for (i in seq_len(length(name))) {
initial_datasets[[name[i]]] <- allDataInitial %>% dplyr::filter(allDataInitial$dataName == name[i])
}
filtered_datasets <- list()
if (length(allData) > 0) {
for (i in seq_len(length(name))) {
filtered_datasets[[name[i]]] <- allData %>% dplyr::filter(allData$dataName == name[i])
if (save_tables_individually_filter_in) {
write.table((filtered_datasets[[name[i]]]), paste0(e$output_folder, "/", "filter_in_", name[i], ".txt"), sep = "\t", row.names = FALSE, col.names = TRUE)
}
}
}
filtered_out_datasets <- list()
if (length(filterOutSum) > 0) {
for (i in seq_len(length(name))) {
filtered_out_datasets[[name[i]]] <- filterOutSum %>% dplyr::filter(filterOutSum$dataName == name[i])
}
}
a <- ""
for (i in seq_len(length(name))) {
a <- paste(a, name[i])
}
if (save_tables_individually_filter_in) {
write.table((allData), paste0(e$output_folder, "/", "filter_in_All_Data", ".txt"), sep = "\t", row.names = FALSE, col.names = TRUE)
}
confirm <- paste0("Datasets filtered: ", a, ". Filters applied: ", b)
# log time end and memory used
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cat(pryr::mem_used(), file = logFile, append = TRUE, sep = "\n")
result <- list(
"message" = "",
"dim" = dim(allData),
"workflow" = workflow,
"workflow_datasets" = workflow_datasets,
"allDataInitial" = allDataInitial,
"allData" = allData,
"filterOutSum" = filterOutSum,
"initial_datasets" = initial_datasets,
"filtered_datasets" = filtered_datasets,
"filtered_out_datasets" = filtered_out_datasets,
"confirm" = confirm
)
return(result)
}
######################################################################################################################################
imgtcleaningLow <- function(rawDataSet, name, allDatasets, files, cell_id = 1, filter_id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), filter_out_char1 = " P", filter_out_char2 = "[*]|X|#|[.]", filter_in_char = "productive", filterStart = "^*", filterEnd = "*$", identityLow = 95, identityHigh = 100, VGene = "", JGene = "", DGene = "", lengthLow = 7, lengthHigh = 15, aminoacid = "CASSPPDTGELFF", seq1 = 1, seq2 = 2, Tcell) {
used_columns <- e$used_columns
a <- "Filter ids "
for (i in seq_len(length(filter_id))) {
a <- paste0(a, ",", filter_id[i])
}
nr <- 0
for (i in names(rawDataSet)) {
nr <- nrow(rawDataSet[[i]]) + nr
}
# logfile
cat(paste0("imgtcleaning", "\t"), file = logFile, append = TRUE)
cat(paste0(a, "\t"), file = logFile, append = TRUE)
cat(paste0(nr, "\t"), file = logFile, append = TRUE)
cat(paste0(ncol(rawDataSet[[names(rawDataSet)[1]]]), "\t"), file = logFile, append = TRUE)
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cleaning_criteria <- c("Functional V-Gene", "CDR3 with no Special Characters", "Productive Sequence", "Productive Sequences")
# filter_column contains the columns that are used for each one of the 9 filters with ids=1:9
filter_column <- c(used_columns[["Summary"]][3], used_columns[["Summary"]][18], used_columns[["Summary"]][2], used_columns[["Summary"]][18], used_columns[["Summary"]][4], used_columns[["Summary"]][3], used_columns[["Summary"]][8], used_columns[["Summary"]][11], used_columns[["Summary"]][15], used_columns[["Summary"]][18], "Summary.V.REGION.identity....with.ins.del.events.")
filterOut <- list()
filterIn <- list()
workflow <- matrix(0, length(filter_id), 3)
# Log start time and memory currently used
start.time <- Sys.time()
# Combine raw name
for (i in seq_len(length(name))) {
if (i == 1) {
allData <- rawDataSet[[name[i]]]
} else {
allData <- rbind(allData, rawDataSet[[name[i]]])
}
}
test_column <- c(filter_column[1], filter_column[2], filter_column[5], used_columns[["Nt.sequences"]][1])
used_column <- c("dataName")
for (i in seq_len(length(filter_id))) {
used_column <- c(used_column, filter_column[filter_id[i]])
}
# IMPORTANT: Datasets should start with T
# dataSetIDs <- as.list(levels(unique(allData$dataName)))
allDataInitial <- allData
workflow_datasets <- list()
a <- matrix(0, length(filter_id), 3)
for (j in seq_len(length(name))) {
workflow_datasets[[name[j]]] <- a
}
# Take only the first gene (V, J D) (Separated with "or")
a <- which(stringr::str_detect(allData[[filter_column[1]]], " or|,"))
if (length(a) > 0) {
a2 <- strsplit(allData[[filter_column[1]]][a], " or|,")
allData[[filter_column[1]]][a] <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
}
a <- which(stringr::str_detect(allData[[filter_column[7]]], " or|,"))
if (length(a) > 0) {
a2 <- strsplit(allData[[filter_column[7]]][a], " or|,")
allData[[filter_column[7]]][a] <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
}
if (!all(is.na(allData[[filter_column[8]]]))) {
a <- which(stringr::str_detect(allData[[filter_column[8]]], " or|,"))
if (length(a) > 0) {
a2 <- strsplit(allData[[filter_column[8]]][a], " or|,")
allData[[filter_column[8]]][a] <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
}
}
# Remove (see comment) from AA.JUNCTION
a <- which(stringr::str_detect(allData[[filter_column[2]]], " [(]see"))
if (length(a) > 0) {
a2 <- strsplit(allData[[filter_column[2]]][a], " [(]see")
allData[[filter_column[2]]][a] <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
}
# Apply the requested filters
if (any(filter_id == 1)) {
i <- which(filter_id == 1)
filterOut[[1]] <- allDataInitial %>% dplyr::filter(stringr::str_detect(allDataInitial[[filter_column[1]]], filter_out_char1))
if (nrow(filterOut[[1]]) > 0) filterOut[[1]] <- cbind(filterOut[[1]], FilterId = cleaning_criteria[1])
workflow[i, 3] <- nrow(allDataInitial) - nrow(filterOut[[filter_id[i]]])
allData <- allData %>% dplyr::filter(!stringr::str_detect(allData[[filter_column[1]]], filter_out_char1))
workflow[i, 1] <- filter_id[i]
workflow[i, 2] <- nrow(filterOut[[filter_id[i]]])
for (j in seq_len(length(name))) {
if (nrow(filterOut[[1]]) > 0) {
filterOut_datasets <- filterOut[[1]] %>% dplyr::filter(filterOut[[1]]$dataName == name[j])
} else {
filterOut_datasets <- filterOut[[1]]
}
if (i == 1) {
prev <- nrow(rawDataSet[[name[j]]])
} else {
prev <- (workflow_datasets[[name[j]]][i - 1, 3])
}
workflow_datasets[[name[j]]][i, 3] <- prev - nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 2] <- nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 1] <- filter_id[i]
}
}
if (any(filter_id == 2)) {
i <- which(filter_id == 2)
filterOut[[2]] <- allDataInitial %>% dplyr::filter(stringr::str_detect(allDataInitial[[filter_column[2]]], filter_out_char2))
if (nrow(filterOut[[2]]) > 0) filterOut[[2]] <- cbind(filterOut[[2]], FilterId = cleaning_criteria[2])
workflow[i, 3] <- nrow(allDataInitial) - nrow(filterOut[[filter_id[i]]])
allData <- allData %>% dplyr::filter(!stringr::str_detect(allData[[filter_column[2]]], filter_out_char2))
workflow[i, 1] <- filter_id[i]
workflow[i, 2] <- nrow(filterOut[[filter_id[i]]])
for (j in seq_len(length(name))) {
if (nrow(filterOut[[2]]) > 0) {
filterOut_datasets <- filterOut[[2]] %>% dplyr::filter(filterOut[[2]]$dataName == name[j])
} else {
filterOut_datasets <- filterOut[[2]]
}
if (i == 1) {
prev <- nrow(rawDataSet[[name[j]]])
} else {
prev <- workflow_datasets[[name[j]]][i - 1, 3]
}
workflow_datasets[[name[j]]][i, 3] <- prev - nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 2] <- nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 1] <- filter_id[i]
}
}
if (any(filter_id == 3)) {
i <- which(filter_id == 3)
if (Tcell) {
filterOut[[3]] <- allData %>% dplyr::filter(!stringr::str_detect(allData[[filter_column[3]]], "^productive$"))
allData <- allData %>% dplyr::filter(stringr::str_detect(allData[[filter_column[3]]], "^productive$"))
} else {
filterOut[[3]] <- allData %>% dplyr::filter(!stringr::str_detect(allData[[filter_column[3]]], "^productive"))
allData <- allData %>% dplyr::filter(stringr::str_detect(allData[[filter_column[3]]], "^productive"))
ins_del <- which(!is.na(allData[[filter_column[11]]]))
if (length(ins_del) > 0) {
# check if V-REGION deletions or V-REGION insertions contain (cause frameshift)
delete <- allData[ins_del, ] %>% dplyr::filter(stringr::str_detect(allData[[used_columns[["Summary"]][21]]][ins_del], "[(]cause frameshift[)]"))
delete2 <- allData[ins_del, ] %>% dplyr::filter(stringr::str_detect(allData[[used_columns[["Summary"]][22]]][ins_del], "[(]cause frameshift[)]"))
delete_all <- unique(rbind(delete, delete2))
filterOut[[3]] <- unique(rbind(filterOut[[3]], delete_all))
allData <- allData %>% dplyr::filter(!(allData[[used_columns[["Summary"]][1]]] %in% filterOut[[3]][[used_columns[["Summary"]][1]]]))
}
}
if (nrow(filterOut[[3]]) > 0) filterOut[[3]] <- cbind(filterOut[[3]], FilterId = cleaning_criteria[3])
workflow[i, 3] <- nrow(allDataInitial) - nrow(filterOut[[filter_id[i]]])
workflow[i, 1] <- filter_id[i]
workflow[i, 2] <- nrow(filterOut[[filter_id[i]]])
for (j in seq_len(length(name))) {
if (nrow(filterOut[[3]]) > 0) {
filterOut_datasets <- filterOut[[3]] %>% dplyr::filter(filterOut[[3]]$dataName == name[j])
} else {
filterOut_datasets <- filterOut[[3]]
}
if (i == 1) {
prev <- nrow(rawDataSet[[name[j]]])
} else {
prev <- (workflow_datasets[[name[j]]][i - 1, 3])
}
workflow_datasets[[name[j]]][i, 3] <- (prev) - nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 2] <- nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 1] <- filter_id[i]
}
}
if (any(filter_id == 4)) {
i <- which(filter_id == 4)
if (filterStart == "" && filterEnd == "") {
filterOut[[4]] <- allDataInitial %>% dplyr::filter(!(stringr::str_detect(allDataInitial[[filter_column[4]]], filterStart)))
workflow[i, 3] <- nrow(allDataInitial) - nrow(filterOut[[filter_id[i]]])
} else if (filterEnd == "") {
filterOut[[4]] <- allDataInitial %>% dplyr::filter(!(stringr::str_detect(allDataInitial[[filter_column[4]]], filterStart)))
workflow[i, 3] <- nrow(allDataInitial) - nrow(filterOut[[filter_id[i]]])
allData <- allData %>% dplyr::filter((stringr::str_detect(allData[[filter_column[4]]], filterStart)))
} else if (filterStart == "") {
filterOut[[4]] <- allDataInitial %>% dplyr::filter(!stringr::str_detect(allDataInitial[[filter_column[4]]], filterEnd))
workflow[i, 3] <- nrow(allDataInitial) - nrow(filterOut[[filter_id[i]]])
allData <- allData %>% dplyr::filter(stringr::str_detect(allData[[filter_column[4]]], filterEnd))
} else {
filterOut[[4]] <- allDataInitial %>% dplyr::filter(!(stringr::str_detect(allDataInitial[[filter_column[4]]], filterStart)))
filterOut[[4]] <- rbind(filterOut[[4]], (allDataInitial %>% dplyr::filter(!(stringr::str_detect(allDataInitial[[filter_column[4]]], filterEnd)))))
workflow[i, 3] <- nrow(allDataInitial) - nrow(filterOut[[filter_id[i]]])
allData <- allData %>% dplyr::filter((stringr::str_detect(allData[[filter_column[4]]], filterStart)))
allData <- allData %>% dplyr::filter(stringr::str_detect(allData[[filter_column[4]]], filterEnd))
}
if (nrow(filterOut[[4]]) > 0) filterOut[[4]] <- cbind(filterOut[[4]], FilterId = cleaning_criteria[4])
workflow[i, 1] <- filter_id[i]
workflow[i, 2] <- nrow(filterOut[[filter_id[i]]])
for (j in seq_len(length(name))) {
if (nrow(filterOut[[4]]) > 0) {
filterOut_datasets <- filterOut[[4]] %>% dplyr::filter(filterOut[[4]]$dataName == name[j])
} else {
filterOut_datasets <- filterOut[[4]]
}
if (i == 1) {
prev <- nrow(rawDataSet[[name[j]]])
} else {
prev <- workflow_datasets[[name[j]]][i - 1, 3]
}
workflow_datasets[[name[j]]][i, 3] <- nrow(allData %>% dplyr::filter(allData$dataName == name[j]))
workflow_datasets[[name[j]]][i, 2] <- prev - nrow(allData %>% dplyr::filter(allData$dataName == name[j]))
workflow_datasets[[name[j]]][i, 1] <- filter_id[i]
}
}
filterOutSum <- c()
if (length(filter_id) > 0) {
for (i in seq_len(length(filter_id))) {
if (i == 1) {
filterOutSum <- filterOut[[filter_id[1]]]
} else {
filterOutSum <- rbind(filterOutSum, filterOut[[filter_id[i]]])
}
}
# Handle conflictions
if (nrow(filterOutSum) > 0) {
filterOutSum <- aggregate(filterOutSum[, c(1, 3:ncol(filterOutSum))], list(filterOutSum[, 2]), function(x) paste0(unique(x)))
colnames(filterOutSum)[1] <- used_columns[["Summary"]][1]
filterOutSum <- filterOutSum[, c(2, 1, 3:ncol(filterOutSum))]
}
}
a <- name[1]
if (length(name) > 1) {
for (i in 2:length(name)) {
a <- paste0(a, ", ", name[i])
}
}
b <- filter_id[1]
if (length(filter_id) > 1) {
for (i in 2:length(filter_id)) {
b <- paste0(b, ", ", filter_id[i])
}
}
# Separate allData to different tables
initial_datasets <- list()
for (i in seq_len(length(name))) {
initial_datasets[[name[i]]] <- allDataInitial %>% dplyr::filter(allDataInitial$dataName == name[i])
}
cleaned_datasets <- list()
if (length(allData) > 0) {
for (i in seq_len(length(name))) {
cleaned_datasets[[name[i]]] <- allData %>% dplyr::filter(allData$dataName == name[i])
}
}
cleaned_out_datasets <- list()
if (length(filterOutSum) > 0) {
for (i in seq_len(length(name))) {
cleaned_out_datasets[[name[i]]] <- filterOutSum %>% dplyr::filter(filterOutSum$dataName == name[i])
}
}
confirm <- paste0("Datasets cleaned: ", a, ". Cleaning Filters applied: ", b)
# log time end and memory used
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cat(pryr::mem_used(), file = logFile, append = TRUE, sep = "\n")
result <- list("message" = "", "dim" = dim(allData), "workflow" = workflow, "workflow_datasets" = workflow_datasets, "allDataInitial" = allDataInitial, "allData" = allData, "filterOutSum" = filterOutSum, "initial_datasets" = initial_datasets, "cleaned_datasets" = cleaned_datasets, "cleaned_out_datasets" = cleaned_out_datasets, "confirm" = confirm)
return(result)
}
######################################################################################################################################
imgtfilterLow <- function(rawDataSet, name, allData, cell_id = 1, filter_id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), filter_out_char1 = " P", filter_out_char2 = "[:punct:]|X", filter_in_char = "productive", filterStart = "^*", filterEnd = "*$", identityLow = 95, identityHigh = 100, VGene = "", JGene = "", DGene = "", lengthLow = 7, lengthHigh = 15, aminoacid = "CASSPPDTGELFF", seq1 = 1, seq2 = 2) {
used_columns <- e$used_columns
# logfile
a <- "Filter ids "
for (i in seq_len(length(filter_id))) {
a <- paste0(a, ",", filter_id[i])
}
cat(paste0("imgtfilter", "\t"), file = logFile, append = TRUE)
cat(paste0(a, "\t"), file = logFile, append = TRUE)
cat(paste0(nrow(allData), "\t"), file = logFile, append = TRUE)
cat(paste0(ncol(rawDataSet[[names(rawDataSet)[1]]]), "\t"), file = logFile, append = TRUE)
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cleaning_criteria <- c("Functional V-Gene", "CDR3 with no Special Characters", "Productive Sequence", "Productive Sequences")
filtering_criteria <- c("V-REGION identity %", "Specific V Gene", "Specific J Gene", "Specific D Gene", "CDR3 length range", "CDR3 length range")
criteria <- c(cleaning_criteria, filtering_criteria)
# filter_column contains the columns that are used for each one of the 9 filters with ids=1:9
filter_column <- c(used_columns[["Summary"]][3], used_columns[["Summary"]][18], used_columns[["Summary"]][2], used_columns[["Summary"]][18], used_columns[["Summary"]][4], used_columns[["Summary"]][3], used_columns[["Summary"]][8], used_columns[["Summary"]][11], used_columns[["Summary"]][15], used_columns[["Summary"]][18], "Summary.V.REGION.identity....with.ins.del.events.")
filterOut <- list()
workflow <- matrix(0, length(filter_id), 3)
# Log start time and memory currently used
start.time <- Sys.time()
# mem_used()
test_column <- c(filter_column[1], filter_column[2], filter_column[5], used_columns[["Nt.sequences"]][1])
used_column <- c("dataName")
for (i in seq_len(length(filter_id))) {
used_column <- c(used_column, filter_column[filter_id[i]])
}
# IMPORTANT: Datasets should start with T
# dataSetIDs <- as.list(levels(unique(allData$dataName)))
allDataInitial <- allData
workflow_datasets <- list()
a <- matrix(0, length(filter_id), 3)
for (j in seq_len(length(name))) {
workflow_datasets[[name[j]]] <- a
}
# Apply the requested filters
if (any(filter_id == 5)) {
i <- which(filter_id == 5)
ins_del <- which(!is.na(allData[[filter_column[11]]]))
if (length(ins_del) > 0) {
allData[[filter_column[5]]][ins_del] <- allData[[filter_column[11]]][ins_del]
}
filterOut[[5]] <- allDataInitial[which(allDataInitial[[filter_column[5]]] > identityHigh | allDataInitial[[filter_column[5]]] < identityLow), ]
allData <- allData[which(allData[[filter_column[5]]] <= identityHigh & allData[[filter_column[5]]] >= identityLow), ]
if (nrow(filterOut[[5]]) > 0) filterOut[[5]] <- cbind(filterOut[[5]], FilterId = criteria[5])
workflow[i, 3] <- nrow(allDataInitial) - nrow(filterOut[[filter_id[i]]])
workflow[i, 1] <- filter_id[i] - 4
workflow[i, 2] <- nrow(filterOut[[filter_id[i]]])
for (j in seq_len(length(name))) {
if (nrow(filterOut[[5]]) > 0) {
filterOut_datasets <- filterOut[[5]] %>% dplyr::filter(filterOut[[5]]$dataName == name[j])
} else {
filterOut_datasets <- filterOut[[5]]
}
if (i == 1) {
prev <- nrow(rawDataSet[[name[j]]])
} else {
prev <- workflow_datasets[[name[j]]][i - 1, 3]
}
workflow_datasets[[name[j]]][i, 3] <- (prev) - nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 2] <- nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 1] <- filter_id[i] - 4
}
}
if (any(filter_id == 6)) {
i <- which(filter_id == 6)
filterOut[[6]] <- allDataInitial %>% dplyr::filter(!stringr::str_detect(allDataInitial[[filter_column[6]]], gsub("[(]", "[(]", gsub("[)]", "[)]", gsub("[*]", "[*]", VGene)))))
if (nrow(filterOut[[6]]) > 0) filterOut[[6]] <- cbind(filterOut[[6]], FilterId = criteria[6])
workflow[i, 3] <- nrow(allDataInitial) - nrow(filterOut[[filter_id[i]]])
allData <- allData %>% dplyr::filter(stringr::str_detect(allData[[filter_column[6]]], gsub("[(]", "[(]", gsub("[)]", "[)]", gsub("[*]", "[*]", VGene)))))
workflow[i, 1] <- filter_id[i] - 4
workflow[i, 2] <- nrow(filterOut[[filter_id[i]]])
for (j in seq_len(length(name))) {
if (nrow(filterOut[[6]]) > 0) {
filterOut_datasets <- filterOut[[6]] %>% dplyr::filter(filterOut[[6]]$dataName == name[j])
} else {
filterOut_datasets <- filterOut[[6]]
}
if (i == 1) {
prev <- nrow(rawDataSet[[name[j]]])
} else {
prev <- workflow_datasets[[name[j]]][i - 1, 3]
}
workflow_datasets[[name[j]]][i, 3] <- (prev) - nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 2] <- nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 1] <- filter_id[i] - 4
}
}
if (any(filter_id == 7)) {
i <- which(filter_id == 7)
filterOut[[7]] <- allDataInitial %>% dplyr::filter(!stringr::str_detect(allDataInitial[[filter_column[7]]], gsub("[(]", "[(]", gsub("[)]", "[)]", gsub("[*]", "[*]", JGene)))))
if (nrow(filterOut[[7]]) > 0) filterOut[[7]] <- cbind(filterOut[[7]], FilterId = criteria[7])
workflow[i, 3] <- nrow(allDataInitial) - nrow(filterOut[[filter_id[i]]])
allData <- allData %>% dplyr::filter(stringr::str_detect(allData[[filter_column[7]]], gsub("[(]", "[(]", gsub("[)]", "[)]", gsub("[*]", "[*]", JGene)))))
workflow[i, 1] <- filter_id[i] - 4
workflow[i, 2] <- nrow(filterOut[[filter_id[i]]])
for (j in seq_len(length(name))) {
if (nrow(filterOut[[7]]) > 0) {
filterOut_datasets <- filterOut[[7]] %>% dplyr::filter(filterOut[[7]]$dataName == name[j])
} else {
filterOut_datasets <- filterOut[[7]]
}
if (i == 1) {
prev <- nrow(rawDataSet[[name[j]]])
} else {
prev <- workflow_datasets[[name[j]]][i - 1, 3]
}
workflow_datasets[[name[j]]][i, 3] <- (prev) - nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 2] <- nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 1] <- filter_id[i] - 4
}
}
if (any(filter_id == 8)) {
i <- which(filter_id == 8)
filterOut[[8]] <- allDataInitial %>% dplyr::filter(!stringr::str_detect(allDataInitial[[filter_column[8]]], gsub("[(]", "[(]", gsub("[)]", "[)]", gsub("[*]", "[*]", DGene)))))
if (nrow(filterOut[[8]]) > 0) filterOut[[8]] <- cbind(filterOut[[8]], FilterId = criteria[8])
workflow[i, 3] <- nrow(allDataInitial) - nrow(filterOut[[filter_id[i]]])
allData <- allData %>% dplyr::filter(stringr::str_detect(allData[[filter_column[8]]], gsub("[(]", "[(]", gsub("[)]", "[)]", gsub("[*]", "[*]", DGene)))))
workflow[i, 1] <- filter_id[i] - 4
workflow[i, 2] <- nrow(filterOut[[filter_id[i]]])
for (j in seq_len(length(name))) {
if (nrow(filterOut[[8]]) > 0) {
filterOut_datasets <- filterOut[[8]] %>% dplyr::filter(filterOut[[8]]$dataName == name[j])
} else {
filterOut_datasets <- filterOut[[8]]
}
if (i == 1) {
prev <- nrow(rawDataSet[[name[j]]])
} else {
prev <- workflow_datasets[[name[j]]][i - 1, 3]
}
workflow_datasets[[name[j]]][i, 3] <- (prev) - nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 2] <- nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 1] <- filter_id[i] - 4
}
}
if (any(filter_id == 9)) {
i <- which(filter_id == 9)
filterOut[[9]] <- allDataInitial[which(as.numeric(allDataInitial[[filter_column[9]]]) > lengthHigh | as.numeric(allDataInitial[[filter_column[9]]]) < lengthLow), ]
if (nrow(filterOut[[9]]) > 0) filterOut[[9]] <- cbind(filterOut[[9]], FilterId = criteria[9])
allData <- allData[which(as.numeric(allData[[filter_column[9]]]) <= lengthHigh & as.numeric(allData[[filter_column[9]]]) >= lengthLow), ]
workflow[i, 1] <- filter_id[i] - 4
workflow[i, 2] <- nrow(filterOut[[filter_id[i]]])
workflow[i, 3] <- nrow(allDataInitial) - nrow(filterOut[[filter_id[i]]])
for (j in seq_len(length(name))) {
if (nrow(filterOut[[9]]) > 0) {
filterOut_datasets <- filterOut[[9]] %>% dplyr::filter(filterOut[[9]]$dataName == name[j])
} else {
filterOut_datasets <- filterOut[[9]]
}
if (i == 1) {
prev <- nrow(rawDataSet[[name[j]]])
} else {
prev <- workflow_datasets[[name[j]]][i - 1, 3]
}
workflow_datasets[[name[j]]][i, 3] <- (prev) - nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 2] <- nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 1] <- filter_id[i] - 4
}
}
if (any(filter_id == 10)) {
i <- which(filter_id == 10)
filterOut[[10]] <- allDataInitial %>% dplyr::filter(!stringr::str_detect(allDataInitial[[filter_column[10]]], aminoacid))
if (nrow(filterOut[[10]]) > 0) filterOut[[10]] <- cbind(filterOut[[10]], FilterId = criteria[10])
workflow[i, 3] <- nrow(allDataInitial) - nrow(filterOut[[filter_id[i]]])
allData <- allData %>% dplyr::filter(stringr::str_detect(allData[[filter_column[10]]], aminoacid))
workflow[i, 1] <- filter_id[i] - 4
workflow[i, 2] <- nrow(filterOut[[filter_id[i]]])
for (j in seq_len(length(name))) {
if (nrow(filterOut[[10]]) > 0) {
filterOut_datasets <- filterOut[[10]] %>% dplyr::filter(filterOut[[10]]$dataName == name[j])
} else {
filterOut_datasets <- filterOut[[10]]
}
if (i == 1) {
prev <- nrow(rawDataSet[[name[j]]])
} else {
prev <- workflow_datasets[[name[j]]][i - 1, 3]
}
workflow_datasets[[name[j]]][i, 3] <- (prev) - nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 2] <- nrow(filterOut_datasets)
workflow_datasets[[name[j]]][i, 1] <- filter_id[i] - 4
}
}
# Provide a 1_Summary.txt of the datasets
# mem_used() # log memory used after filtering (no significant change should be evident)
filterOutSum <- c()
if (length(filter_id) > 0) {
for (i in seq_len(length(filter_id))) {
if (i == 1) {
filterOutSum <- filterOut[[filter_id[1]]]
} else {
filterOutSum <- rbind(filterOutSum, filterOut[[filter_id[i]]])
}
}
# Handle conflictions
if (nrow(filterOutSum) > 0) {
filterOutSum <- aggregate(filterOutSum[, c(1, 3:ncol(filterOutSum))], list(filterOutSum[, 2]), function(x) paste0(unique(x)))
colnames(filterOutSum)[1] <- used_columns[["Summary"]][1]
filterOutSum <- filterOutSum[, c(2, 1, 3:ncol(filterOutSum))]
}
}
# Do the actual analysis - targeted tables
a <- name[1]
if (length(name) > 1) {
for (i in 2:length(name)) {
a <- paste0(a, ", ", name[i])
}
}
b <- filter_id[1] - 4
if (length(filter_id) > 1) {
for (i in 2:length(filter_id)) {
b <- paste0(b, ", ", (filter_id[i] - 4))
}
}
# Delete (see comment) from genes
a <- which(stringr::str_detect(allData[[used_columns[["Summary"]][3]]], " or|,| [(]see"))
if (length(a) > 0) {
a2 <- strsplit(allData[[used_columns[["Summary"]][3]]][a], " or|,| [(]see")
allData[[used_columns[["Summary"]][3]]][a] <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
}
a <- which(stringr::str_detect(allData[[used_columns[["Summary"]][8]]], " or|,| [(]see"))
if (length(a) > 0) {
a2 <- strsplit(allData[[used_columns[["Summary"]][8]]][a], " or|,| [(]see")
allData[[used_columns[["Summary"]][8]]][a] <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
}
if (!all(is.na(allData[[used_columns[["Summary"]][11]]]))) {
a <- which(stringr::str_detect(allData[[used_columns[["Summary"]][11]]], " or|,| [(]see"))
if (length(a) > 0) {
a2 <- strsplit(allData[[used_columns[["Summary"]][11]]][a], " or|,| [(]see")
allData[[used_columns[["Summary"]][11]]][a] <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
}
}
# Separate allData to different tables
initial_datasets <- list()
for (i in seq_len(length(name))) {
initial_datasets[[name[i]]] <- allDataInitial %>% dplyr::filter(allDataInitial$dataName == name[i])
}
filtered_datasets <- list()
if (length(allData) > 0) {
for (i in seq_len(length(name))) {
filtered_datasets[[name[i]]] <- allData %>% dplyr::filter(allData$dataName == name[i])
}
}
filtered_out_datasets <- list()
if (length(filterOutSum) > 0) {
for (i in seq_len(length(name))) {
filtered_out_datasets[[name[i]]] <- filterOutSum %>% dplyr::filter(filterOutSum$dataName == name[i])
}
}
confirm <- paste0("Datasets filtered: ", a, ". Filters applied: ", b)
# log time end and memory used
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cat(pryr::mem_used(), file = logFile, append = TRUE, sep = "\n")
result <- list("message" = "", "dim" = dim(allData), "workflow" = workflow, "workflow_datasets" = workflow_datasets, "allDataInitial" = allDataInitial, "allData" = allData, "filterOutSum" = filterOutSum, "initial_datasets" = initial_datasets, "filtered_datasets" = filtered_datasets, "filtered_out_datasets" = filtered_out_datasets, "confirm" = confirm)
return(result)
}
######################################################################################################################################
clonotypes <- function(allData, allele, gene, junction, name, run_diagnosis) { # run_shm
used_columns <- e$used_columns
# logfile
message("Clonotype Analysis Step 1")
if (allele == FALSE) {
g <- stringr::str_replace(gene, ".and.allele", "")
} else {
g <- gene
}
cat(paste0("clonotypes", "\t"), file = logFile, append = TRUE)
cat(paste0(paste(g, junction, sep = ","), "\t"), file = logFile, append = TRUE)
cat(paste0(nrow(allData), "\t"), file = logFile, append = TRUE)
cat(paste0(ncol(allData), "\t"), file = logFile, append = TRUE)
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
# clonotypes all Data
allData_initial <- allData
view_specific_clonotype_allData <- list()
convergent_evolution <- c()
convergent_evolution_list_allData <- list()
cluster_id <- c()
freq_cluster_id <- c()
v.gene.and.allele <- c()
aa.junction <- c()
if (length(name) > 1) {
if (length(gene) > 0) {
message("Clonotype Analysis Step 2.a")
if (allele == FALSE) {
allData[[gene]] <- stringr::str_split(allData[[gene]], "\\*", simplify = TRUE)[, 1] # as.character(plyr::ldply(a2, function(s){t(data.frame(unlist(s)))})[,1])
}
distinctVGenes_CDR3 <- allData %>%
dplyr::group_by(Genes = allData[[gene]], CDR3 = allData[[junction]]) %>%
dplyr::count(sort = TRUE) # dplyr::summarise(n = n())
distinctVGenes_CDR3$clonotype <- paste(distinctVGenes_CDR3$Genes,
distinctVGenes_CDR3$CDR3,
sep = " - "
) # do.call(paste, c(distinctVGenes_CDR3[c("Genes", "CDR3")], sep = " - "))
for (i in seq_len(nrow(distinctVGenes_CDR3))) {
# Get specific clonotype data
inputVGenes_CDR3 <- which((allData[[gene]] == distinctVGenes_CDR3$Genes[i]) & (allData[[junction]] == distinctVGenes_CDR3$CDR3[i]))
view_specific_clonotype_allData[[distinctVGenes_CDR3$clonotype[i]]] <- allData_initial[inputVGenes_CDR3, ]
# Count unique 'IMGT.gapped.nt.sequences.CDR3.IMGT'
ce <- view_specific_clonotype_allData[[distinctVGenes_CDR3$clonotype[i]]] %>%
dplyr::count(view_specific_clonotype_allData[[distinctVGenes_CDR3$clonotype[i]]][[used_columns[["IMGT.gapped.nt.sequences"]][9]]],
sort = TRUE
)
colnames(ce) <- c(used_columns[["IMGT.gapped.nt.sequences"]][9], "convergent_evolution")
# Create convergent evolution list
convergent_evolution_list_allData[[as.character(i)]] <- ce
convergent_evolution <- c(convergent_evolution, paste0("cluster ", i, " : ", nrow(ce)))
# Calculate clonotype frequency
freq_cluster_id[inputVGenes_CDR3] <- 100 * length(inputVGenes_CDR3) / nrow(allData_initial)
cluster_id[inputVGenes_CDR3] <- i
}
} else {
message("Clonotype Analysis Step 2.b")
distinctVGenes_CDR3 <- allData %>%
dplyr::group_by(clonotype = allData[[junction]]) %>%
dplyr::count(sort = TRUE) # dplyr::summarise(n = n())
for (i in seq_len(nrow(distinctVGenes_CDR3))) {
# Get specific clonotype data
inputVGenes_CDR3 <- which(allData[[junction]] == distinctVGenes_CDR3$clonotype[i])
view_specific_clonotype_allData[[distinctVGenes_CDR3$clonotype[i]]] <- allData[inputVGenes_CDR3, ]
# Count unique 'IMGT.gapped.nt.sequences.CDR3.IMGT'
ce <- view_specific_clonotype_allData[[distinctVGenes_CDR3$clonotype[i]]] %>%
dplyr::count(view_specific_clonotype_allData[[distinctVGenes_CDR3$clonotype[i]]][[used_columns[["IMGT.gapped.nt.sequences"]][9]]],
sort = TRUE
)
colnames(ce) <- c(used_columns[["IMGT.gapped.nt.sequences"]][9], "convergent_evolution")
# Create coergenet evolution list
convergent_evolution_list_allData[[as.character(i)]] <- ce
convergent_evolution <- c(convergent_evolution, paste0("cluster ", i, " : ", nrow(ce)))
# Calculate clonotype frequency
freq_cluster_id[inputVGenes_CDR3] <- 100 * length(inputVGenes_CDR3) / nrow(allData_initial)
cluster_id[inputVGenes_CDR3] <- i
# Get V genes and AA junctions
v.gene.and.allele <- c(v.gene.and.allele, unique(allData[inputVGenes_CDR3, ]$Summary.V.GENE.and.allele))
aa.junction <- c(aa.junction, unique(allData[inputVGenes_CDR3, ]$Summary.AA.JUNCTION))
}
}
clono_allData <- distinctVGenes_CDR3[, c("clonotype", "n")]
clono_allData <- cbind(clono_allData, Freq = 100 * clono_allData$n / nrow(allData), convergent_evolution)
colnames(clono_allData) <- c("clonotype", "N", "Freq", "Convergent Evolution")
if (junction == "Summary.Sequence") {
clono_allData <- cbind(clono_allData, v.gene.and.allele, aa.junction)
colnames(clono_allData) <- c("clonotype", "N", "Freq", "Convergent Evolution", "V Gene and allele", "AA Junction")
}
clono_allData_freq <- cbind(allData_initial, "cluster_id" = cluster_id, "freq_cluster_id" = freq_cluster_id)
if (save_tables_individually) {
if (junction == "Summary.Sequence") {
clono_write <- clono_allData
} else {
## Changed cbind to merge
clono_write <- as.data.frame(cbind(distinctVGenes_CDR3[, c("Genes", "CDR3")], clono_allData[, c("N", "Freq", "Convergent Evolution")]))
colnames(clono_write) <- c("clonotype", "CDR3", "N", "Freq", "Convergent Evolution")
}
write.table(clono_write,
paste0(e$output_folder, "/", "Clonotypes_", paste0(g, "+", junction), "All_Data", ".txt"),
sep = "\t",
row.names = FALSE,
col.names = TRUE,
quote = FALSE
)
write.table(cbind(allData_initial, "cluster_id" = cluster_id, "freq_cluster_id" = freq_cluster_id),
paste0(e$output_folder, "/", "filterin_clono_All_Data", ".txt"),
sep = "\t",
row.names = FALSE,
col.names = TRUE,
quote = FALSE
)
}
}
# clonotypes datasets
## Unnecessary
# run_diagnosis <<- run_diagnosis
clono_datasets <- list()
filterin_clono_datasets <- list()
view_specific_clonotype_datasets <- list()
convergent_evolution_list_datasets <- list()
convergent_evolution_list_datasets_only_num <- list()
diagnosis <- list()
cluster_id <- list()
freq_cluster_id <- list()
clono_datasets_freq <- list()
group.freq.seq <- list()
message("Clonotype Analysis Step 3")
one_run <- function(j) {
message("Clonotype Analysis Step 5")
data <- allData %>% dplyr::filter(allData$dataName == name[j])
data_initial <- allData_initial %>% dplyr::filter(allData$dataName == name[j])
view_specific_clonotype_datasets[[name[j]]] <- list()
convergent_evolution <- c()
convergent_evolution_list_datasets[[name[j]]] <- list()
convergent_evolution_list_datasets_only_num[[name[j]]] <- c()
freq_cluster_id[[name[j]]] <- c()
cluster_id[[name[j]]] <- c()
v.gene.and.allele <- c()
aa.junction <- c()
group.freq.seq[[name[j]]] <- data
if (length(gene) > 0) {
if (allele == FALSE) {
data[[gene]] <- stringr::str_split(data[[gene]], "\\*", simplify = TRUE)[, 1] # as.character(plyr::ldply(a2, function(s){t(data.frame(unlist(s)))})[,1])
}
distinctVGenes_CDR3 <- data %>%
dplyr::group_by(
Genes = data[[gene]],
CDR3 = data[[junction]]
) %>%
dplyr::count(sort = TRUE) # dplyr::summarise(n = n())
distinctVGenes_CDR3$clonotype <- paste(distinctVGenes_CDR3$Genes,
distinctVGenes_CDR3$CDR3,
sep = " - "
) # do.call(paste, c(distinctVGenes_CDR3[c("Genes", "CDR3")], sep = " - "))
group.freq.seq[[name[j]]] <- group.freq.seq[[name[j]]] %>%
dplyr::group_by(
Genes = group.freq.seq[[name[j]]][[gene]],
CDR3 = group.freq.seq[[name[j]]][[junction]]
)
group.freq.seq[[name[j]]]$clonotype <- paste(group.freq.seq[[name[j]]]$Genes,
group.freq.seq[[name[j]]]$CDR3,
sep = " - "
) # do.call(paste, c(group.freq.seq[[name[j]]][c("Genes", "CDR3")], sep = " - "))
group.freq.seq[[name[j]]] <- data.table::as.data.table(group.freq.seq[[name[j]]])
for (i in seq_len(nrow(distinctVGenes_CDR3))) {
# Get specific clonotype data
inputVGenes_CDR3 <- which((data[[gene]] == distinctVGenes_CDR3$Genes[i]) & (data[[junction]] == distinctVGenes_CDR3$CDR3[i]))
view_specific_clonotype_datasets[[name[j]]][[distinctVGenes_CDR3$clonotype[i]]] <- data_initial[inputVGenes_CDR3, ]
# Count unique 'IMGT.gapped.nt.sequences.CDR3.IMGT'
ce <- view_specific_clonotype_datasets[[name[j]]][[distinctVGenes_CDR3$clonotype[i]]] %>%
dplyr::count(view_specific_clonotype_datasets[[name[j]]][[distinctVGenes_CDR3$clonotype[i]]][[used_columns[["IMGT.gapped.nt.sequences"]][9]]],
sort = TRUE
)
colnames(ce) <- c(used_columns[["IMGT.gapped.nt.sequences"]][9], "convergent_evolution")
# Create convergent evolution
convergent_evolution_list_datasets[[name[j]]][[as.character(i)]] <- ce
convergent_evolution_list_datasets_only_num[[name[j]]] <- c(convergent_evolution_list_datasets_only_num[[name[j]]], nrow(ce))
convergent_evolution <- c(convergent_evolution, paste0("cluster ", i, " : ", nrow(ce)))
# Calculate clonotype frequency
freq_cluster_id[[name[j]]][inputVGenes_CDR3] <- 100 * length(inputVGenes_CDR3) / nrow(data_initial)
cluster_id[[name[j]]][[name[j]]][inputVGenes_CDR3] <- i
}
} else {
distinctVGenes_CDR3 <- data %>%
dplyr::group_by(clonotype = data[[junction]]) %>%
dplyr::count(sort = TRUE) # dplyr::summarise(n=n())
group.freq.seq[[name[j]]] <- group.freq.seq[[name[j]]] %>% dplyr::group_by(clonotype = group.freq.seq[[name[j]]][[junction]])
group.freq.seq[[name[j]]] <- data.table::as.data.table(group.freq.seq[[name[j]]])
for (i in seq_len(nrow(distinctVGenes_CDR3))) {
# Get specific clonotype data
inputVGenes_CDR3 <- which(data[[junction]] == distinctVGenes_CDR3$clonotype[i])
view_specific_clonotype_datasets[[name[j]]][[distinctVGenes_CDR3$clonotype[i]]] <- data_initial[inputVGenes_CDR3, ]
# Count unique 'IMGT.gapped.nt.sequences.CDR3.IMGT'
ce <- view_specific_clonotype_datasets[[name[j]]][[distinctVGenes_CDR3$clonotype[i]]] %>%
dplyr::count(view_specific_clonotype_datasets[[name[j]]][[distinctVGenes_CDR3$clonotype[i]]][[used_columns[["IMGT.gapped.nt.sequences"]][9]]],
sort = TRUE
)
colnames(ce) <- c(used_columns[["IMGT.gapped.nt.sequences"]][9], "convergent_evolution")
# Create convergent evolution list
convergent_evolution_list_datasets[[name[j]]][[as.character(i)]] <- ce
convergent_evolution_list_datasets_only_num[[name[j]]] <- c(convergent_evolution_list_datasets_only_num[[name[j]]], nrow(ce))
convergent_evolution <- c(convergent_evolution, paste0("cluster ", i, " : ", nrow(ce)))
# Get V gene and AA junction
v.gene.and.allele <- c(v.gene.and.allele, unique(data_initial[inputVGenes_CDR3, ]$Summary.V.GENE.and.allele))
aa.junction <- c(aa.junction, unique(data_initial[inputVGenes_CDR3, ]$Summary.AA.JUNCTION))
# Calculate clonotype frequency
freq_cluster_id[[name[j]]][[name[j]]][[name[j]]][inputVGenes_CDR3] <- 100 * length(inputVGenes_CDR3) / nrow(data_initial)
cluster_id[[name[j]]][inputVGenes_CDR3] <- i
}
}
clono_datasets[[name[j]]] <- distinctVGenes_CDR3[, c("clonotype", "n")]
clono_datasets[[name[j]]] <- cbind(clono_datasets[[name[j]]], Freq = 100 * clono_datasets[[name[j]]]$n / nrow(data))
clono_datasets[[name[j]]] <- cbind(clono_datasets[[name[j]]], convergent_evolution)
colnames(clono_datasets[[name[j]]]) <- c("clonotype", "N", "Freq", "Convergent Evolution")
if (junction == "Summary.Sequence") {
clono_datasets[[name[j]]] <- cbind(clono_datasets[[name[j]]], v.gene.and.allele, aa.junction)
colnames(clono_datasets[[name[j]]]) <- c("clonotype", "N", "Freq", "Convergent Evolution", "V Gene and allele", "AA Junction")
}
clono_datasets_freq[[name[j]]] <- cbind(data_initial, "cluster_id" = cluster_id[[name[j]]], "freq_cluster_id" = freq_cluster_id[[name[j]]])
if (run_diagnosis) {
group.freq.seq[[name[j]]] <- get_frequent_sequence(group.freq.seq[[name[j]]])
}
if (save_tables_individually) {
if (junction == "Summary.Sequence") {
clono_write <- clono_datasets[[name[j]]]
} else {
## CHANGED cbind() to merge()
clono_write <- as.data.frame(cbind(
distinctVGenes_CDR3[, c("Genes", "CDR3")],
clono_datasets[[name[j]]][, c("N", "Freq", "Convergent Evolution")]
))
colnames(clono_write) <- c("clonotype", "CDR3", "N", "Freq", "Convergent Evolution")
}
# Printing clonotypes file
write.table(clono_write,
paste0(e$output_folder, "/", "Clonotypes_", paste(g, "+", junction, "_", sep = ""), name[j], ".txt"),
sep = "\t",
row.names = FALSE,
col.names = TRUE,
quote = FALSE
)
# Printing whole integrated clonotypes file
write.table(clono_datasets_freq[[name[j]]],
paste0(e$output_folder, "/", "filterin_clono_", name[j], ".txt"),
sep = "\t",
row.names = FALSE,
col.names = TRUE,
quote = FALSE
)
# Printing SHM file
# Printing diagnosis file
if (run_diagnosis) {
write.table(group.freq.seq[[name[j]]],
paste0(e$output_folder, "/", "diagnosis_", name[j], ".txt"),
sep = "\t",
row.names = FALSE,
col.names = TRUE,
quote = FALSE
)
}
}
result <- list()
result[["clono_datasets"]] <- clono_datasets[[name[j]]]
result[["filterin_highly_clono"]] <- clono_datasets_freq[[name[j]]]
result[["view_specific_clonotype_datasets"]] <- view_specific_clonotype_datasets[[name[j]]]
result[["convergent_evolution_list_datasets"]] <- convergent_evolution_list_datasets[[name[j]]]
result[["convergent_evolution_list_datasets_only_num"]] <- convergent_evolution_list_datasets_only_num[[name[j]]]
result[["Diagnosis"]] <- group.freq.seq[[name[j]]]
return(result)
}
if (Sys.info()[1] == "Windows") {
message("Clonotype Analysis Step 4.a")
a <- lapply(seq_len(length(name)), one_run)
for (i in seq_len(length(name))) {
view_specific_clonotype_datasets[[name[i]]] <- a[[i]]$view_specific_clonotype_datasets
clono_datasets[[name[i]]] <- a[[i]]$clono_datasets
convergent_evolution_list_datasets[[name[i]]] <- a[[i]]$convergent_evolution_list_datasets
convergent_evolution_list_datasets_only_num[[name[i]]] <- a[[i]]$convergent_evolution_list_datasets_only_num
diagnosis[[name[i]]] <- a[[i]]$Diagnosis
}
} else {
message("Clonotype Analysis Step 4.b")
a <- parallel::mclapply(seq_len(length(name)), one_run, mc.cores = parallel::detectCores(all.tests = FALSE, logical = TRUE), mc.preschedule = TRUE)
# a = lapply(seq_len(length(name)), one_run) ## for debugging use lapply
for (i in seq_len(length(name))) {
view_specific_clonotype_datasets[[name[i]]] <- a[[i]]$view_specific_clonotype_datasets
clono_datasets[[name[i]]] <- a[[i]]$clono_datasets
convergent_evolution_list_datasets[[name[i]]] <- a[[i]]$convergent_evolution_list_datasets
convergent_evolution_list_datasets_only_num[[name[i]]] <- a[[i]]$convergent_evolution_list_datasets_only_num
diagnosis[[name[i]]] <- a[[i]]$Diagnosis
}
}
if (length(name) == 1) {
clono_allData <- clono_datasets[[name[1]]]
convergent_evolution_list_allData <- convergent_evolution_list_datasets
view_specific_clonotype_allData <- view_specific_clonotype_datasets[[name[1]]]
clono_allData_freq <- a[[1]]$filterin_highly_clono
if (save_tables_individually) {
if (junction == "Summary.Sequence") {
clono_write <- clono_allData
} else {
clono_write <- stringr::str_split(clono_allData$clonotype, " - ", simplify = TRUE)
clono_write <- data.table::as.data.table(clono_write)
clono_write <- cbind(clono_write, clono_allData[, c("N", "Freq", "Convergent Evolution")])
colnames(clono_write) <- c("Genes", "CDR3", "N", "Freq", "Convergent Evolution")
}
write.table(clono_write, paste0(e$output_folder, "/", "Clonotypes_All Data", ".txt"),
sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE
)
write.table(clono_allData_freq, paste0(e$output_folder, "/", "filterin_clono_All_Data", ".txt"),
sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE
)
}
}
confirm <- paste0("Clonotypes run!")
result <- list(
"clono_allData" = clono_allData,
"clono_datasets" = clono_datasets,
"filterin_highly_clono" = clono_allData_freq,
"view_specific_clonotype_allData" = view_specific_clonotype_allData,
"convergent_evolution_list_allData" = convergent_evolution_list_allData,
"view_specific_clonotype_datasets" = view_specific_clonotype_datasets,
"convergent_evolution_list_datasets" = convergent_evolution_list_datasets,
"convergent_evolution_list_datasets_only_num" = convergent_evolution_list_datasets_only_num,
"diagnosis" = diagnosis,
"confirm" = confirm
)
# log time end and memory used
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cat(pryr::mem_used(), file = logFile, append = TRUE, sep = "\n")
return(result)
}
######################################################################################################################################
get_frequent_sequence <- function(data) {
clonos <- as.list(unique(data$clonotype))
one.run <- function(index, temp) {
clono_in <- temp[which(temp$clonotype == index), ]
freq <- 100 * nrow(clono_in) / nrow(temp)
x1 <- table(clono_in$Summary.Sequence)
x1 <- data.table::as.data.table(x1)
colnames(x1) <- c("Sequence", "count")
sequences <- x1[which(x1$count == max(x1$count)), ]$Sequence
clono_in <- clono_in[!duplicated(Summary.Sequence), ]
map <- BiocGenerics::match(sequences, clono_in$Summary.Sequence)
v.region.identity <- clono_in[map, ]$Summary.V.REGION.identity..
total <- data.table(
Clonotype = unique(clono_in$clonotype),
Freq = freq,
V.Region.Identity = v.region.identity,
Sequence = sequences
)
return(total)
}
out <- lapply(clonos, one.run, data)
out <- rbindlist(out)
return(out)
}
######################################################################################################################################
highly_similar_clonotypes <- function(clono_allData, clono_datasets, num_of_mismatches, take_gene, cdr3_lengths, gene_clonotypes, clonotype_freq_thr_for_highly_sim, name) {
# logfile
cat(paste0("highly_similar_clonotypes", "\t"), file = logFile, append = TRUE)
cat(paste0(paste("take_gene ", take_gene, "threshold", clonotype_freq_thr_for_highly_sim, sep = ","), "\t"), file = logFile, append = TRUE)
cat(paste0(nrow(clono_allData), "\t"), file = logFile, append = TRUE)
cat(paste0(ncol(clono_allData), "\t"), file = logFile, append = TRUE)
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
####################################### All Data #######################################
clono_allData_only_cdr3 <- clono_allData
if (stringr::str_detect(clono_allData$clonotype[1], " - ") && take_gene == "No") {
a2 <- strsplit(clono_allData$clonotype, " - ")
clono_allData_only_cdr3$clonotype <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 2])
clono_allData_only_cdr3 <- data.table::as.data.table(clono_allData_only_cdr3[, 1:3])[, lapply(.SD, sum), by = .(clonotype = clonotype)]
clono_allData_only_cdr3 <- clono_allData_only_cdr3[order(-clono_allData_only_cdr3$N), ]
}
if (stringr::str_detect(clono_allData$clonotype[1], " - ") && take_gene == "Yes") {
a2 <- strsplit(clono_allData$clonotype, " - ")
clono_allData_only_cdr3$clonotype <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 2])
clono_allData_only_cdr3$gene <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
}
# if the gene does matter than I do not have to exclude it from the clono_allData_only_cdr3 table
# group by cdr3
clono_allData_only_cdr3$cluster_id <- as.numeric(row.names(clono_allData_only_cdr3))
# for each cdr3 length
highly_sim_view_specific_clonotypes <- list()
for (i in seq_len(length(cdr3_lengths))) {
highly_sim_view_specific_clonotypes[[paste0("length ", cdr3_lengths[i])]] <- list()
clonotypes_of_this_length <- clono_allData_only_cdr3 %>% dplyr::filter(str_length(clono_allData_only_cdr3$clonotype) == (cdr3_lengths[i] + 2))
end_process_for_this_length <- FALSE
while (end_process_for_this_length == FALSE) {
major_clonotype <- clonotypes_of_this_length$clonotype[1]
if (nrow(clonotypes_of_this_length) > 0) {
if (clonotypes_of_this_length$Freq[1] > clonotype_freq_thr_for_highly_sim) {
if (take_gene == "Yes") {
clonotypes_of_this_length_gene <- clonotypes_of_this_length %>% dplyr::filter(clonotypes_of_this_length$gene == clonotypes_of_this_length$gene[1])
dist_from_major <- stringdist::stringdist(clonotypes_of_this_length_gene$clonotype, major_clonotype)
matched_clonotypes <- clonotypes_of_this_length_gene %>% dplyr::filter(dist_from_major <= num_of_mismatches[i])
not_matched_clonotypes <- clonotypes_of_this_length %>% dplyr::filter(!(clonotypes_of_this_length$clonotype %in% matched_clonotypes$clonotype))
} else {
dist_from_major <- stringdist::stringdist(clonotypes_of_this_length$clonotype, major_clonotype)
matched_clonotypes <- clonotypes_of_this_length %>% dplyr::filter(dist_from_major <= num_of_mismatches[i])
not_matched_clonotypes <- clonotypes_of_this_length %>% dplyr::filter(dist_from_major > num_of_mismatches[i])
}
if (nrow(matched_clonotypes) > 0) {
# save it
if (take_gene == "No") {
highly_sim_view_specific_clonotypes[[paste0("length ", cdr3_lengths[i])]][[clonotypes_of_this_length$clonotype[1]]] <- matched_clonotypes # save the corresponding clonotype from clono_allData table not just the cdr3
highly_sim_view_specific_clonotypes[[paste0("length ", cdr3_lengths[i])]][[clonotypes_of_this_length$clonotype[1]]]$prev_cluster <- as.numeric(row.names(matched_clonotypes))
} else {
highly_sim_view_specific_clonotypes[[paste0("length ", cdr3_lengths[i])]][[clono_allData$clonotype[clonotypes_of_this_length$cluster_id[1]]]] <- clono_allData[matched_clonotypes$cluster_id, ] # save the corresponding clonotype from clono_allData table not just the cdr3
highly_sim_view_specific_clonotypes[[paste0("length ", cdr3_lengths[i])]][[clono_allData$clonotype[clonotypes_of_this_length$cluster_id[1]]]]$prev_cluster <- as.numeric(row.names(clono_allData[matched_clonotypes$cluster_id, ]))
}
}
if (nrow(not_matched_clonotypes) == 0) {
end_process_for_this_length <- TRUE
} else {
clonotypes_of_this_length <- not_matched_clonotypes
}
} else {
# terminate the process for this length
end_process_for_this_length <- TRUE
}
} else {
# terminate the process for this length
end_process_for_this_length <- TRUE
}
}
}
##### Results to tables
highly_sim_clonotypes <- list()
highly_sim_clonotypes_allGroups <- list()
for (i in seq_len(length(cdr3_lengths))) {
clonotype <- names(highly_sim_view_specific_clonotypes[[paste0("length ", cdr3_lengths[i])]])
if (!(is.null(clonotype))) {
N <- c()
Freq <- c()
prev_cluster <- c()
for (j in seq_len(length(highly_sim_view_specific_clonotypes[[paste0("length ", cdr3_lengths[i])]]))) {
highly_sim_view_specific_clonotypes[[paste0("length ", cdr3_lengths[i])]][[clonotype[j]]]$HS_cluster_id <- j
N <- c(N, sum(highly_sim_view_specific_clonotypes[[paste0("length ", cdr3_lengths[i])]][[clonotype[j]]]$N))
Freq <- c(Freq, sum(highly_sim_view_specific_clonotypes[[paste0("length ", cdr3_lengths[i])]][[clonotype[j]]]$Freq))
prev_cluster_c <- ""
for (k in seq_len(nrow(highly_sim_view_specific_clonotypes[[paste0("length ", cdr3_lengths[i])]][[clonotype[j]]]))) {
prev_cluster_c <- paste(prev_cluster_c, highly_sim_view_specific_clonotypes[[paste0("length ", cdr3_lengths[i])]][[clonotype[j]]]$prev_cluster[k])
}
prev_cluster <- c(prev_cluster, prev_cluster_c)
}
highly_sim_clonotypes[[paste0("length ", cdr3_lengths[i])]] <- data.frame(clonotype, N, Freq, prev_cluster, stringsAsFactors = FALSE) # I have this data frame for each length
highly_sim_clonotypes[[paste0("length ", cdr3_lengths[i])]]$HS_cluster_id <- as.numeric(row.names(highly_sim_clonotypes[[paste0("length ", cdr3_lengths[i])]]))
}
highly_sim_clonotypes_allGroups[[paste0("length ", cdr3_lengths[i])]] <- do.call(rbind.data.frame, highly_sim_view_specific_clonotypes[[paste0("length ", cdr3_lengths[i])]])
# extra clonotypes
if (take_gene == "Yes") {
clonotypes_of_this_length_id <- which(str_length(clono_allData_only_cdr3$clonotype) == (cdr3_lengths[i] + 2))
extra_clono <- clono_allData[clonotypes_of_this_length_id, ] %>% dplyr::filter(!(clono_allData[clonotypes_of_this_length_id, ]$clonotype %in% highly_sim_clonotypes_allGroups[[paste0("length ", cdr3_lengths[i])]]$clonotype))
} else {
clonotypes_of_this_length <- clono_allData_only_cdr3 %>% dplyr::filter(str_length(clono_allData_only_cdr3$clonotype) == (cdr3_lengths[i] + 2))
extra_clono <- clonotypes_of_this_length %>% dplyr::filter(!(clonotypes_of_this_length$clonotype %in% highly_sim_clonotypes_allGroups[[paste0("length ", cdr3_lengths[i])]]$clonotype))
}
if (nrow(extra_clono) > 0) {
extra_clono$prev_cluster <- 0
for (k in seq_len(nrow(extra_clono))) {
temp <- as.character(extra_clono[["Convergent Evolution"]][k])
extra_clono$prev_cluster[k] <- as.numeric(strsplit(strsplit(temp, "cluster ")[[1]][2], " :")[[1]][1])
}
if (!(is.null(clonotype))) {
extra_clono$HS_cluster_id <- (max(highly_sim_clonotypes[[paste0("length ", cdr3_lengths[i])]]$HS_cluster_id) + 1):(max(highly_sim_clonotypes[[paste0("length ", cdr3_lengths[i])]]$HS_cluster_id) + nrow(extra_clono))
} else {
extra_clono$HS_cluster_id <- seq_len(nrow(extra_clono))
}
highly_sim_clonotypes_allGroups[[paste0("length ", cdr3_lengths[i])]] <- rbind(highly_sim_clonotypes_allGroups[[paste0("length ", cdr3_lengths[i])]], extra_clono)
highly_sim_clonotypes[[paste0("length ", cdr3_lengths[i])]] <- rbind(highly_sim_clonotypes[[paste0("length ", cdr3_lengths[i])]], extra_clono[, c("clonotype", "N", "Freq", "prev_cluster", "HS_cluster_id")])
}
row.names(highly_sim_clonotypes_allGroups[[paste0("length ", cdr3_lengths[i])]]) <- seq_len(nrow(highly_sim_clonotypes_allGroups[[paste0("length ", cdr3_lengths[i])]]))
if (save_tables_individually) {
filename <- paste0(e$output_folder, "/", "Highly_sim_Clonotypes_", "All_Data_length_", cdr3_lengths[i], ".txt")
write.table(highly_sim_clonotypes[[paste0("length ", cdr3_lengths[i])]], filename, sep = "\t", row.names = FALSE, col.names = TRUE)
filename <- paste0(e$output_folder, "/", "Highly_sim_Clonotypes_groups_", "All_Data_length_", cdr3_lengths[i], ".txt")
write.table(highly_sim_clonotypes_allGroups[[paste0("length ", cdr3_lengths[i])]], filename, sep = "\t", row.names = FALSE, col.names = TRUE)
}
}
####################################### Separate Datasets #######################################
highly_sim_view_specific_clonotypes_datasets <- list()
highly_sim_clonotypes_datasets <- list()
highly_sim_clonotypes_allGroups_datasets <- list()
one_run <- function(j) {
clono_allData_only_cdr3 <- clono_datasets[[name[j]]]
if (stringr::str_detect(clono_datasets[[name[j]]]$clonotype[1], " - ") && take_gene == "No") {
a2 <- strsplit(clono_datasets[[name[j]]]$clonotype, " - ")
clono_allData_only_cdr3$clonotype <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 2])
clono_allData_only_cdr3 <- data.table::as.data.table(clono_allData_only_cdr3[, 1:3])[, lapply(.SD, sum), by = .(clonotype = clonotype)]
clono_allData_only_cdr3 <- clono_allData_only_cdr3[order(-clono_allData_only_cdr3$N), ]
}
if (stringr::str_detect(clono_datasets[[name[j]]]$clonotype[1], " - ") && take_gene == "Yes") {
a2 <- strsplit(clono_datasets[[name[j]]]$clonotype, " - ")
clono_allData_only_cdr3$clonotype <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 2])
clono_allData_only_cdr3$gene <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
}
clono_allData_only_cdr3$cluster_id <- as.numeric(row.names(clono_allData_only_cdr3))
#### analysis
# for each cdr3 length
highly_sim_view_specific_clonotypes_datasets[[name[j]]] <- list()
for (i in seq_len(length(cdr3_lengths))) {
highly_sim_view_specific_clonotypes_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]] <- list()
clonotypes_of_this_length <- clono_allData_only_cdr3 %>% dplyr::filter(str_length(clono_allData_only_cdr3$clonotype) == (cdr3_lengths[i] + 2))
end_process_for_this_length <- FALSE
while (end_process_for_this_length == FALSE) {
major_clonotype <- clonotypes_of_this_length$clonotype[1]
if (nrow(clonotypes_of_this_length) > 0) {
if (clonotypes_of_this_length$Freq[1] > clonotype_freq_thr_for_highly_sim) {
if (take_gene == "Yes") {
clonotypes_of_this_length_gene <- clonotypes_of_this_length %>% dplyr::filter(clonotypes_of_this_length$gene == clonotypes_of_this_length$gene[1])
dist_from_major <- stringdist::stringdist(clonotypes_of_this_length_gene$clonotype, major_clonotype)
matched_clonotypes <- clonotypes_of_this_length_gene %>% dplyr::filter(dist_from_major <= num_of_mismatches[i])
not_matched_clonotypes <- clonotypes_of_this_length %>% dplyr::filter(!(clonotypes_of_this_length$clonotype %in% matched_clonotypes$clonotype))
} else {
dist_from_major <- stringdist::stringdist(clonotypes_of_this_length$clonotype, major_clonotype)
matched_clonotypes <- clonotypes_of_this_length %>% dplyr::filter(dist_from_major <= num_of_mismatches[i])
not_matched_clonotypes <- clonotypes_of_this_length %>% dplyr::filter(dist_from_major > num_of_mismatches[i])
}
if (nrow(matched_clonotypes) > 0) {
# save it
if (take_gene == "No") {
highly_sim_view_specific_clonotypes_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]][[clonotypes_of_this_length$clonotype[1]]] <- matched_clonotypes # save the corresponding clonotype from clono_allData table not just the cdr3
highly_sim_view_specific_clonotypes_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]][[clonotypes_of_this_length$clonotype[1]]]$prev_cluster <- row.names(matched_clonotypes) # save the corresponding clonotype from clono_allData table not just the cdr3
} else {
highly_sim_view_specific_clonotypes_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]][[clono_datasets[[name[j]]]$clonotype[clonotypes_of_this_length$cluster_id[1]]]] <- clono_datasets[[name[j]]][matched_clonotypes$cluster_id, ] # save the corresponding clonotype from clono_allData table not just the cdr3
highly_sim_view_specific_clonotypes_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]][[clono_datasets[[name[j]]]$clonotype[clonotypes_of_this_length$cluster_id[1]]]]$prev_cluster <- row.names(clono_datasets[[name[j]]][matched_clonotypes$cluster_id, ]) # save the corresponding clonotype from clono_allData table not just the cdr3
}
}
if (nrow(not_matched_clonotypes) == 0) {
end_process_for_this_length <- TRUE
} else {
clonotypes_of_this_length <- not_matched_clonotypes
}
} else {
# terminate the process for this length
end_process_for_this_length <- TRUE
}
} else {
# terminate the process for this length
end_process_for_this_length <- TRUE
}
}
}
##### Results to tables
highly_sim_clonotypes_datasets[[name[j]]] <- list()
highly_sim_clonotypes_allGroups_datasets[[name[j]]] <- list()
for (i in seq_len(length(cdr3_lengths))) {
clonotype <- names(highly_sim_view_specific_clonotypes_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]])
if (!(is.null(clonotype))) {
N <- c()
Freq <- c()
prev_cluster <- c()
for (cl in seq_len(length(highly_sim_view_specific_clonotypes_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]]))) {
highly_sim_view_specific_clonotypes_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]][[clonotype[cl]]]$HS_cluster_id <- cl
N <- c(N, sum(highly_sim_view_specific_clonotypes_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]][[clonotype[cl]]]$N))
Freq <- c(Freq, sum(highly_sim_view_specific_clonotypes_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]][[clonotype[cl]]]$Freq))
prev_cluster_c <- ""
for (k in seq_len(nrow(highly_sim_view_specific_clonotypes_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]][[clonotype[cl]]]))) {
prev_cluster_c <- paste(prev_cluster_c, highly_sim_view_specific_clonotypes_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]][[clonotype[cl]]]$prev_cluster[k])
}
prev_cluster <- c(prev_cluster, prev_cluster_c)
}
highly_sim_clonotypes_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]] <- data.frame(clonotype, N, Freq, prev_cluster, stringsAsFactors = FALSE) # I have this data frame for each length
highly_sim_clonotypes_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]]$HS_cluster_id <- as.numeric(row.names(highly_sim_clonotypes_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]]))
}
highly_sim_clonotypes_allGroups_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]] <- do.call(rbind.data.frame, highly_sim_view_specific_clonotypes_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]])
# extra clonotypes
if (take_gene == "Yes") {
clonotypes_of_this_length_id <- which(str_length(clono_allData_only_cdr3$clonotype) == (cdr3_lengths[i] + 2))
extra_clono <- clono_datasets[[name[j]]][clonotypes_of_this_length_id, ] %>% dplyr::filter(!(clono_datasets[[name[j]]][clonotypes_of_this_length_id, ]$clonotype %in% highly_sim_clonotypes_allGroups_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]]$clonotype))
} else {
clonotypes_of_this_length <- clono_allData_only_cdr3 %>% dplyr::filter(str_length(clono_allData_only_cdr3$clonotype) == (cdr3_lengths[i] + 2))
extra_clono <- clonotypes_of_this_length %>% dplyr::filter(!(clonotypes_of_this_length$clonotype %in% highly_sim_clonotypes_allGroups_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]]$clonotype))
}
if (nrow(extra_clono) > 0) {
extra_clono$prev_cluster <- 0
for (k in seq_len(nrow(extra_clono))) {
temp <- as.character(extra_clono[["Convergent Evolution"]][k])
extra_clono$prev_cluster[k] <- as.numeric(strsplit(strsplit(temp, "cluster ")[[1]][2], " :")[[1]][1])
}
if (!(is.null(clonotype))) {
extra_clono$HS_cluster_id <- (max(highly_sim_clonotypes_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]]$HS_cluster_id) + 1):(max(highly_sim_clonotypes_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]]$HS_cluster_id) + nrow(extra_clono))
} else {
extra_clono$HS_cluster_id <- seq_len(nrow(extra_clono))
}
highly_sim_clonotypes_allGroups_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]] <- rbind(highly_sim_clonotypes_allGroups_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]], extra_clono)
highly_sim_clonotypes_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]] <- rbind(highly_sim_clonotypes_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]], extra_clono[, c("clonotype", "N", "Freq", "prev_cluster", "HS_cluster_id")])
row.names(highly_sim_clonotypes_allGroups_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]]) <- seq_len(nrow(highly_sim_clonotypes_allGroups_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]]))
}
if (save_tables_individually) {
filename <- paste0(e$output_folder, "/", "Highly_sim_Clonotypes_", name[j], "_length_", cdr3_lengths[i], ".txt")
write.table(highly_sim_clonotypes_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]], filename, sep = "\t", row.names = FALSE, col.names = TRUE)
filename <- paste0(e$output_folder, "/", "Highly_sim_Clonotypes_groups_", name[j], "_length_", cdr3_lengths[i], ".txt")
write.table(highly_sim_clonotypes_allGroups_datasets[[name[j]]][[paste0("length ", cdr3_lengths[i])]], filename, sep = "\t", row.names = FALSE, col.names = TRUE)
}
}
result <- list()
result[["highly_sim_clonotypes_allGroups_datasets"]] <- highly_sim_clonotypes_allGroups_datasets[[name[j]]]
result[["highly_sim_clonotypes_datasets"]] <- highly_sim_clonotypes_datasets[[name[j]]]
result[["highly_sim_view_specific_clonotypes_datasets"]] <- highly_sim_view_specific_clonotypes_datasets[[name[j]]]
return(result)
}
if (Sys.info()[1] == "Windows") {
a <- lapply(seq_len(length(name)), one_run)
for (i in seq_len(length(name))) {
highly_sim_clonotypes_allGroups_datasets[[name[i]]] <- a[[i]]$highly_sim_clonotypes_allGroups_datasets
clono_datasets[[name[i]]] <- a[[i]]$clono_datasets
highly_sim_clonotypes_datasets[[name[i]]] <- a[[i]]$highly_sim_clonotypes_datasets
highly_sim_view_specific_clonotypes_datasets[[name[i]]] <- a[[i]]$highly_sim_view_specific_clonotypes_datasets
}
} else {
a <- lapply(seq_len(length(name)), one_run)
# a=parallel::mclapply(seq_len(length(name)),one_run,mc.cores = num_of_cores, mc.preschedule = TRUE)
for (i in seq_len(length(name))) {
highly_sim_clonotypes_allGroups_datasets[[name[i]]] <- a[[i]]$highly_sim_clonotypes_allGroups_datasets
clono_datasets[[name[i]]] <- a[[i]]$clono_datasets
highly_sim_clonotypes_datasets[[name[i]]] <- a[[i]]$highly_sim_clonotypes_datasets
highly_sim_view_specific_clonotypes_datasets[[name[i]]] <- a[[i]]$highly_sim_view_specific_clonotypes_datasets
}
}
confirm <- paste0("Highly Similar Clonotypes run!")
result <- list(
"highly_sim_view_specific_clonotypes" = highly_sim_view_specific_clonotypes,
"highly_sim_clonotypes" = highly_sim_clonotypes,
"highly_sim_view_specific_clonotypes_datasets" = highly_sim_view_specific_clonotypes_datasets,
"highly_sim_clonotypes_datasets" = highly_sim_clonotypes_datasets,
"highly_sim_clonotypes_allGroups" = highly_sim_clonotypes_allGroups,
"highly_sim_clonotypes_allGroups_datasets" = highly_sim_clonotypes_allGroups_datasets,
"confirm" = confirm
)
# log time end and memory used
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cat(pryr::mem_used(), file = logFile, append = TRUE, sep = "\n")
return(result)
}
######################################################################################################################################
public_clonotypes <- function(clono_allData, clono_datasets, take_gene, use_reads, public_clonotype_thr, name, highly) {
# logfile
cat(paste0("public_clonotypes", "\t"), file = logFile, append = TRUE)
cat(paste0(paste("take_gene ", take_gene, "threshold", public_clonotype_thr, sep = ","), "\t"), file = logFile, append = TRUE)
cat(paste0(nrow(clono_allData), "\t"), file = logFile, append = TRUE)
cat(paste0(ncol(clono_allData), "\t"), file = logFile, append = TRUE)
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
if (stringr::str_detect(clono_allData$clonotype[1], " - ") && take_gene == "No") {
a2 <- strsplit(clono_allData$clonotype, " - ")
clono_allData$clonotype <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 2])
clono_allData <- data.table::as.data.table(clono_allData[, 1:3])[, lapply(.SD, sum), by = .(clonotype = clonotype)]
clono_allData <- clono_allData[order(-clono_allData$N), ]
}
initial_sum <- list()
for (n in name) {
initial_sum[[n]] <- sum(clono_datasets[[n]]$N)
}
if (use_reads) {
clono_allData <- clono_allData %>% dplyr::filter(N > public_clonotype_thr)
} else {
clono_allData <- clono_allData[1:public_clonotype_thr, ]
}
public_clono <- data.frame(clonotype = unique(clono_allData$clonotype), stringsAsFactors = FALSE)
# for each dataset
for (n in name) {
if (stringr::str_detect(clono_datasets[[n]]$clonotype[1], " - ") && take_gene == "No") {
a2 <- strsplit(clono_datasets[[n]]$clonotype, " - ")
clono_datasets[[n]]$clonotype <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 2])
clono_datasets[[n]] <- data.table::as.data.table(clono_datasets[[n]][, 1:3])[, lapply(.SD, sum), by = .(clonotype = clonotype)]
clono_datasets[[n]] <- clono_datasets[[n]][order(-clono_datasets[[n]]$N), ]
}
if (use_reads) {
clono_datasets[[n]] <- clono_datasets[[n]] %>% dplyr::filter(N > public_clonotype_thr)
} else {
clono_datasets[[n]] <- clono_datasets[[n]][1:public_clonotype_thr, ]
}
ids_dataset <- which(clono_datasets[[n]]$clonotype %in% public_clono$clonotype)
ids <- match(clono_datasets[[n]]$clonotype, public_clono$clonotype)[which(!(is.na(match(clono_datasets[[n]]$clonotype, public_clono$clonotype))))]
public_clono[[paste0(n, "_Reads/Total")]] <- NA
public_clono[[paste0(n, "_Freq")]] <- NA
public_clono[[paste0(n, "_Reads/Total")]][ids] <- paste0(clono_datasets[[n]]$N[ids_dataset], "/", initial_sum[[n]])
public_clono[[paste0(n, "_Freq")]][ids] <- clono_datasets[[n]]$Freq[ids_dataset]
}
public_clono$Num_of_patients <- NA
# filter results
public_clono$Num_of_patients <- (apply(public_clono, 1, function(x) sum(!(is.na(x)))) - 1) / 2
public_clono <- public_clono %>% dplyr::filter(Num_of_patients > 1)
# replace NA with 0
public_clono[is.na(public_clono)] <- 0
if (save_tables_individually) {
if (highly) {
filename <- paste0(e$output_folder, "/", "public_highly_clonotypes", ".txt")
write.table(public_clono, filename, sep = "\t", row.names = FALSE, col.names = TRUE)
} else {
filename <- paste0(e$output_folder, "/", "public_clonotypes", ".txt")
write.table(public_clono, filename, sep = "\t", row.names = FALSE, col.names = TRUE)
}
}
confirm <- paste0("Shared Clonotypes run!")
result <- list("public_clono" = public_clono, "confirm" = confirm)
# log time end and memory used
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cat(pryr::mem_used(), file = logFile, append = TRUE, sep = "\n")
return(result)
}
######################################################################################################################################
createLink <- function(val, on_click_js) {
#<a class="btn btn-primary">Info</a>'
as.character(tags$a(href = "#", onclick = sprintf(on_click_js, val), val))
}
######################################################################################################################################
viewClonotypes <- function(allData, allele, gene, junction, val1, val2) {
# logfile
cat(paste0("viewClonotypes", "\t"), file = logFile, append = TRUE)
cat(paste0(nrow(allData), "\t"), file = logFile, append = TRUE)
cat(paste0(ncol(allData), "\t"), file = logFile, append = TRUE)
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
temp <- allData
if (length(gene) > 0) {
if (allele == FALSE) {
for (i in seq_len(nrow(temp))) {
temp[[gene]][i] <- strsplit(temp[[gene]][i], "[*]")[[1]][1]
}
} else {
for (i in seq_len(nrow(temp))) {
temp[[gene]][i] <- strsplit(temp[[gene]][i], "(see comment)")[[1]][1]
}
}
inputVGenes_CDR3 <- which((temp[[gene]] == val1) & (temp[[junction]] == val2))
} else {
inputVGenes_CDR3 <- which(temp[[junction]] == val1)
}
a <- allData[inputVGenes_CDR3, ]
# log time end and memory used
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cat(pryr::mem_used(), file = logFile, append = TRUE, sep = "\n")
return(a)
}
######################################################################################################################################
repertoires <- function(clono_allData, clono_datasets, allele, allele_clonotypes, gene, gene_clonotypes, name, view_specific_clonotype_allData, view_specific_clonotype_datasets, highly_sim) {
if (allele == FALSE) {
g <- stringr::str_replace(gene, ".and.allele", "")
} else {
g <- gene
}
# logfile
cat(paste0("repertoires", "\t"), file = logFile, append = TRUE)
cat(paste0(g, "\t"), file = logFile, append = TRUE)
cat(paste0(nrow(clono_allData), "\t"), file = logFile, append = TRUE)
cat(paste0(ncol(clono_allData), "\t"), file = logFile, append = TRUE)
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
for (i in seq_len(nrow(clono_allData))) {
clono_allData[i, 1] <- strsplit(as.character(clono_allData[i, 1]), " - ")[[1]][1]
}
for (j in seq_len(length(name))) {
for (i in seq_len(nrow(clono_datasets[[name[j]]]))) {
clono_datasets[[name[j]]][i, 1] <- strsplit(as.character(clono_datasets[[name[j]]][i, 1]), " - ")[[1]][1]
}
}
if (gene == gene_clonotypes && allele == allele_clonotypes && !(is.null(gene_clonotypes))) {
####################################### All Data
Repertoires_allData <- clono_allData %>%
dplyr::group_by(clono_allData[["clonotype"]]) %>%
dplyr::summarise(n = n())
Repertoires_allData <- Repertoires_allData[order(-Repertoires_allData$n), ]
Repertoires_allData <- cbind(Repertoires_allData, Freq = 100 * Repertoires_allData$n / nrow(clono_allData))
####################################### Separate Datasets
Repertoires_datasets <- list()
one_run <- function(j) {
Repertoires_datasets[[name[j]]] <- clono_datasets[[name[j]]] %>%
dplyr::group_by(clono_datasets[[name[j]]][["clonotype"]]) %>%
dplyr::summarise(n = n())
Repertoires_datasets[[name[j]]] <- Repertoires_datasets[[name[j]]][order(-Repertoires_datasets[[name[j]]]$n), ]
Repertoires_datasets[[name[j]]] <- cbind(Repertoires_datasets[[name[j]]], Freq = 100 * Repertoires_datasets[[name[j]]]$n / nrow(clono_datasets[[name[j]]]))
colnames(Repertoires_datasets[[name[j]]]) <- c("Gene", "N", "Freq")
if (save_tables_individually) {
filename <- paste0(e$output_folder, "/", "Repertoires_", g, "_", name[j], ".txt")
write.table(Repertoires_datasets[[name[j]]], filename, sep = "\t", row.names = FALSE, col.names = TRUE)
}
return(Repertoires_datasets[[name[j]]])
}
if (Sys.info()[1] == "Windows") {
Repertoires_datasets <- lapply(seq_len(length(name)), one_run)
} else {
Repertoires_datasets <- parallel::mclapply(seq_len(length(name)), one_run, mc.cores = num_of_cores, mc.preschedule = TRUE)
}
names(Repertoires_datasets) <- name
} else {
# find the most frequent gene that exists in each specific clonotype
####################################### All Data
freq_gene_name <- data.frame()
for (i in names(view_specific_clonotype_allData)) {
a <- view_specific_clonotype_allData[[i]]
# if ((allele==F) & (allele!=allele_clonotypes)){
if (allele == FALSE) {
if (!all(!(stringr::str_detect(a[[gene]], "[*]")))) {
a2 <- strsplit(a[[gene]], "[*]")
a[[gene]] <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
}
}
freq_gene <- a %>%
dplyr::group_by(a[[gene]]) %>%
dplyr::summarise(n = n())
freq_gene <- freq_gene[order(-freq_gene$n), ]
freq_gene_name[i, 1] <- freq_gene[1, 1]
}
colnames(freq_gene_name) <- c("Gene")
freq_gene_name <- freq_gene_name %>%
dplyr::group_by(freq_gene_name[["Gene"]]) %>%
dplyr::summarise(n = n())
freq_gene_name <- freq_gene_name[order(-freq_gene_name$n), ]
freq_gene_name <- cbind(freq_gene_name, Freq = 100 * freq_gene_name$n / nrow(clono_allData))
colnames(freq_gene_name) <- c("Gene", "N", "Freq")
####################################### Separate Datasets
freq_gene_name_datasets <- list()
one_run <- function(j) {
freq_gene_name_datasets[[name[[j]]]] <- data.frame()
for (i in names(view_specific_clonotype_datasets[[name[j]]])) {
a <- view_specific_clonotype_datasets[[name[j]]][[i]]
if (allele == FALSE) {
if (!all(!(stringr::str_detect(a[[gene]], "[*]")))) {
a2 <- strsplit(a[[gene]], "[*]")
a[[gene]] <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
}
}
freq_gene <- a %>%
dplyr::group_by(a[[gene]]) %>%
dplyr::summarise(n = n())
freq_gene <- freq_gene[order(-freq_gene$n), ]
freq_gene_name_datasets[[name[[j]]]][i, 1] <- freq_gene[1, 1]
}
colnames(freq_gene_name_datasets[[name[[j]]]]) <- c("Gene")
freq_gene_name_datasets[[name[[j]]]] <- freq_gene_name_datasets[[name[[j]]]] %>%
dplyr::group_by(freq_gene_name_datasets[[name[[j]]]][["Gene"]]) %>%
dplyr::summarise(n = n())
freq_gene_name_datasets[[name[j]]] <- freq_gene_name_datasets[[name[j]]][order(-freq_gene_name_datasets[[name[j]]]$n), ]
freq_gene_name_datasets[[name[j]]] <- cbind(freq_gene_name_datasets[[name[j]]], Freq = 100 * freq_gene_name_datasets[[name[j]]]$n / nrow(clono_datasets[[name[j]]]))
colnames(freq_gene_name_datasets[[name[j]]]) <- c("Gene", "N", "Freq")
if (save_tables_individually) {
filename <- paste0(e$output_folder, "/", "Repertoires_", g, "_", name[j], ".txt")
write.table(freq_gene_name_datasets[[name[j]]], filename, sep = "\t", row.names = FALSE, col.names = TRUE)
}
return(freq_gene_name_datasets[[name[j]]])
}
if (Sys.info()[1] == "Windows") {
freq_gene_name_datasets <- lapply(seq_len(length(name)), one_run)
} else {
freq_gene_name_datasets <- parallel::mclapply(seq_len(length(name)), one_run, mc.cores = num_of_cores, mc.preschedule = TRUE)
}
names(freq_gene_name_datasets) <- name
Repertoires_allData <- freq_gene_name
Repertoires_datasets <- freq_gene_name_datasets
}
colnames(Repertoires_allData) <- c("Gene", "N", "Freq")
if (save_tables_individually) {
filename <- paste0(e$output_folder, "/", "Repertoires_", g, "_", "All_Data", ".txt")
write.table(Repertoires_allData, filename, sep = "\t", row.names = FALSE, col.names = TRUE)
}
confirm <- paste0("Repertoires run!")
result <- list("Repertoires_allData" = Repertoires_allData, "Repertoires_datasets" = Repertoires_datasets, "confirm" = confirm)
# log time end and memory used
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cat(pryr::mem_used(), file = logFile, append = TRUE, sep = "\n")
return(result)
}
######################################################################################################################################
repertoires_highly_similar <- function(clono_allData, clono_datasets, allele, allele_clonotypes, gene, gene_clonotypes, name, view_specific_clonotype_allData, view_specific_clonotype_datasets, take_gene) {
# logfile
if (allele == FALSE) {
g <- stringr::str_replace(gene, ".and.allele", "")
} else {
g <- gene
}
# logfile
cat(paste0("repertoires_highly_similar", "\t"), file = logFile, append = TRUE)
cat(paste0(g, "\t"), file = logFile, append = TRUE)
cat(paste0(nrow(clono_allData), "\t"), file = logFile, append = TRUE)
cat(paste0(ncol(clono_allData), "\t"), file = logFile, append = TRUE)
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
if (allele == FALSE) {
g <- stringr::str_replace(gene, ".and.allele", "")
} else {
g <- gene
}
clono_allData_initial <- clono_allData
clono_datasets_initial <- clono_datasets
for (i in seq_len(nrow(clono_allData))) {
clono_allData[i, 1] <- strsplit(as.character(clono_allData[i, 1]), " - ")[[1]][1]
}
for (j in seq_len(length(name))) {
for (i in seq_len(nrow(clono_datasets[[name[j]]]))) {
clono_datasets[[name[j]]][i, 1] <- strsplit(as.character(clono_datasets[[name[j]]][i, 1]), " - ")[[1]][1]
}
}
if (gene == gene_clonotypes && allele == allele_clonotypes && take_gene == "Yes" && length(gene_clonotypes) > 0) {
####################################### All Data
Repertoires_allData <- clono_allData %>%
dplyr::group_by(clono_allData[["clonotype"]]) %>%
dplyr::summarise(n = n())
Repertoires_allData <- Repertoires_allData[order(-Repertoires_allData$n), ]
Repertoires_allData <- cbind(Repertoires_allData, Freq = 100 * Repertoires_allData$n / nrow(clono_allData))
####################################### Separate Datasets
Repertoires_datasets <- list()
one_run <- function(j) {
Repertoires_datasets[[name[j]]] <- clono_datasets[[name[j]]] %>%
dplyr::group_by(clono_datasets[[name[j]]][["clonotype"]]) %>%
dplyr::summarise(n = n())
Repertoires_datasets[[name[j]]] <- Repertoires_datasets[[name[j]]][order(-Repertoires_datasets[[name[j]]]$n), ]
Repertoires_datasets[[name[j]]] <- cbind(Repertoires_datasets[[name[j]]], Freq = 100 * Repertoires_datasets[[name[j]]]$n / nrow(clono_datasets[[name[j]]]))
colnames(Repertoires_datasets[[name[j]]]) <- c("Gene", "N", "Freq")
if (save_tables_individually) {
filename <- paste0(e$output_folder, "/", "Repertoires_HighlySim_", g, "_", name[j], ".txt")
write.table(Repertoires_datasets[[name[j]]], filename, sep = "\t", row.names = FALSE, col.names = TRUE)
}
return(Repertoires_datasets[[name[j]]])
}
if (Sys.info()[1] == "Windows") {
Repertoires_datasets <- lapply(seq_len(length(name)), one_run)
} else {
Repertoires_datasets <- parallel::mclapply(seq_len(length(name)), one_run, mc.cores = num_of_cores, mc.preschedule = TRUE)
}
names(Repertoires_datasets) <- name
} else {
# find the most frequent gene that exists in each specific clonotype
####################################### All Data
freq_gene_name <- data.frame()
if (take_gene == "Yes") {
for (i in names(view_specific_clonotype_allData)) {
if (i %in% clono_allData_initial$clonotype) {
a <- view_specific_clonotype_allData[[i]]
if (allele == FALSE) {
if (!all(!(stringr::str_detect(a[[gene]], "[*]")))) {
a2 <- strsplit(a[[gene]], "[*]")
a[[gene]] <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
}
}
freq_gene <- a %>%
dplyr::group_by(a[[gene]]) %>%
dplyr::summarise(n = n())
freq_gene <- freq_gene[order(-freq_gene$n), ]
freq_gene_name[i, 1] <- freq_gene[1, 1]
}
}
}
colnames(freq_gene_name) <- c("Gene")
freq_gene_name <- freq_gene_name %>%
dplyr::group_by(freq_gene_name[["Gene"]]) %>%
dplyr::summarise(n = n())
freq_gene_name <- freq_gene_name[order(-freq_gene_name$n), ]
freq_gene_name <- cbind(freq_gene_name, Freq = 100 * freq_gene_name$n / nrow(clono_allData))
colnames(freq_gene_name) <- c("Gene", "N", "Freq")
####################################### Separate Datasets
freq_gene_name_datasets <- list()
one_run <- function(j) {
freq_gene_name_datasets[[name[[j]]]] <- data.frame()
if (take_gene == "Yes") {
for (i in names(view_specific_clonotype_datasets[[name[j]]])) {
if (i %in% clono_datasets_initial[[name[j]]]$clonotype) {
a <- view_specific_clonotype_datasets[[name[j]]][[i]]
if (allele == FALSE) {
if (!all(!(stringr::str_detect(a[[gene]], "[*]")))) {
a2 <- strsplit(a[[gene]], "[*]")
a[[gene]] <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
}
}
freq_gene <- a %>%
dplyr::group_by(a[[gene]]) %>%
dplyr::summarise(n = n())
freq_gene <- freq_gene[order(-freq_gene$n), ]
freq_gene_name_datasets[[name[[j]]]][i, 1] <- freq_gene[1, 1]
}
}
colnames(freq_gene_name_datasets[[name[[j]]]]) <- c("Gene")
freq_gene_name_datasets[[name[[j]]]] <- freq_gene_name_datasets[[name[[j]]]] %>%
dplyr::group_by(freq_gene_name_datasets[[name[[j]]]][["Gene"]]) %>%
dplyr::summarise(n = n())
freq_gene_name_datasets[[name[j]]] <- freq_gene_name_datasets[[name[j]]][order(-freq_gene_name_datasets[[name[j]]]$n), ]
freq_gene_name_datasets[[name[j]]] <- cbind(freq_gene_name_datasets[[name[j]]], Freq = 100 * freq_gene_name_datasets[[name[j]]]$n / nrow(clono_datasets[[name[j]]]))
colnames(freq_gene_name_datasets[[name[j]]]) <- c("Gene", "N", "Freq")
if (save_tables_individually) {
filename <- paste0(e$output_folder, "/", "Repertoires_HiglySim_", g, "_", name[j], ".txt")
write.table(freq_gene_name_datasets[[name[j]]], filename, sep = "\t", row.names = FALSE, col.names = TRUE)
}
}
return(freq_gene_name_datasets[[name[j]]])
}
if (Sys.info()[1] == "Windows") {
freq_gene_name_datasets <- lapply(seq_len(length(name)), one_run)
} else {
freq_gene_name_datasets <- parallel::mclapply(seq_len(length(name)), one_run, mc.cores = num_of_cores, mc.preschedule = TRUE)
}
names(freq_gene_name_datasets) <- name
Repertoires_allData <- freq_gene_name
Repertoires_datasets <- freq_gene_name_datasets
}
colnames(Repertoires_allData) <- c("Gene", "N", "Freq")
if (save_tables_individually) {
filename <- paste0(e$output_folder, "/", "Repertoires_HighlySim_", g, "_", "All_Data", ".txt")
write.table(Repertoires_allData, filename, sep = "\t", row.names = FALSE, col.names = TRUE)
}
confirm <- paste0("Repertoires run!")
result <- list(
"Repertoires_allData" = Repertoires_allData,
"Repertoires_datasets" = Repertoires_datasets,
"confirm" = confirm
)
# log time end and memory used
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cat(pryr::mem_used(), file = logFile, append = TRUE, sep = "\n")
return(result)
}
######################################################################################################################################
repertoires_comparison <- function(Repertoires_allData, Repertoires_datasets, name, highly_sim, id) { # set name equal to the selected dataset
# logfile
if (!highly_sim) {
n <- "repertoires_comparison"
} else {
n <- "repertoires_comparison_higly_similar"
}
cat(paste0(n, "\t"), file = logFile, append = TRUE)
cat(paste0(paste("Number of datasets: ", length(name), sep = ""), "\t"), file = logFile, append = TRUE)
cat(paste0(nrow(Repertoires_allData), "\t"), file = logFile, append = TRUE)
cat(paste0(ncol(Repertoires_allData), "\t"), file = logFile, append = TRUE)
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
unique_repertoires <- data.frame(Gene = Repertoires_allData$Gene, stringsAsFactors = FALSE)
# for each dataset
for (n in name) {
ids_dataset <- which(Repertoires_datasets[[n]]$Gene %in% unique_repertoires$Gene)
ids <- match(Repertoires_datasets[[n]]$Gene, unique_repertoires$Gene)[which(!(is.na(match(Repertoires_datasets[[n]]$Gene, unique_repertoires$Gene))))]
unique_repertoires[[paste0(n, "_N/Total")]] <- NA
unique_repertoires[[paste0(n, "_Freq")]] <- NA
unique_repertoires[[paste0(n, "_N/Total")]][ids] <- paste0(Repertoires_datasets[[n]]$N[ids_dataset], "/", sum(Repertoires_datasets[[n]]$N))
unique_repertoires[[paste0(n, "_Freq")]][ids] <- Repertoires_datasets[[n]]$Freq[ids_dataset]
}
# replace NA with 0
unique_repertoires[is.na(unique_repertoires)] <- 0
# mean freq
unique_repertoires$Mean_Freq <- NA
unique_repertoires$Mean_Freq <- apply(unique_repertoires[, seq(3, ncol(unique_repertoires), 2)], 1, function(x) mean(x))
if (save_tables_individually) {
if (highly_sim) {
filename <- paste0(e$output_folder, "/", "highlySim_repertoires_comparison_table_", id, ".txt")
write.table(unique_repertoires, filename, sep = "\t", row.names = FALSE, col.names = TRUE)
} else {
filename <- paste0(e$output_folder, "/", "repertoires_comparison_table_", id, ".txt")
write.table(unique_repertoires, filename, sep = "\t", row.names = FALSE, col.names = TRUE)
}
}
confirm <- paste0("Repertoires Comparison run!")
result <- list("unique_repertoires" = unique_repertoires, "confirm" = confirm)
# log time end and memory used
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cat(pryr::mem_used(), file = logFile, append = TRUE, sep = "\n")
return(result)
}
######################################################################################################################################
Multiple_value_comparison <- function(clono_allData, clono_datasets, allele_clonotypes, gene_clonotypes, view_specific_clonotype_allData, view_specific_clonotype_datasets, val1, val2, name, identity_groups) {
# logfile
cat(paste0("Multiple_value_comparison", "\t"), file = logFile, append = TRUE)
cat(paste0(paste(val1, val2, sep = ","), "\t"), file = logFile, append = TRUE)
cat(paste0(nrow(clono_allData), "\t"), file = logFile, append = TRUE)
cat(paste0(ncol(clono_allData), "\t"), file = logFile, append = TRUE)
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
val1_initial <- val1
val2_initial <- val2
val_initial <- c(val1_initial, val2_initial)
if (val1 == "Molecular mass" || val1 == "pI") {
val1 <- paste0("Junction.", val1)
} else {
val1 <- paste0("Summary.", val1)
}
val1 <- gsub(" ", ".", val1)
val1 <- gsub("-", ".", val1)
val1 <- gsub("%", ".", val1)
if (val2 == "Molecular mass" || val2 == "pI") {
val2 <- paste0("Junction.", val2)
} else {
val2 <- paste0("Summary.", val2)
}
val2 <- gsub(" ", ".", val2)
val2 <- gsub("-", ".", val2)
val2 <- gsub("%", ".", val2)
if (!stringr::str_detect(val1, "allele") && stringr::str_detect(val1, "GENE")) {
val1 <- paste0(val1, ".and.allele")
}
if (!stringr::str_detect(val2, "allele") && stringr::str_detect(val2, "GENE")) {
val2 <- paste0(val2, ".and.allele")
}
Multiple_value_comparison_datasets <- list()
for (i in seq_len(nrow(clono_allData))) {
clono_allData[i, 1] <- strsplit(as.character(clono_allData[i, 1]), " - ")[[1]][1]
}
for (j in seq_len(length(name))) {
for (i in seq_len(nrow(clono_datasets[[name[j]]]))) {
clono_datasets[[name[j]]][i, 1] <- strsplit(as.character(clono_datasets[[name[j]]][i, 1]), " - ")[[1]][1]
}
}
multi_allData <- c()
val <- c(val1, val2)
multi_datasets <- list()
for (vals in seq_len(2)) {
if (stringr::str_detect(val[vals], "GENE")) {
gene <- val[vals]
if (gene == gene_clonotypes && (stringr::str_detect(val_initial[vals], "allele")) == allele_clonotypes && !(is.null(gene_clonotypes))) {
####################################### All Data
multi_allData <- cbind(multi_allData, clono_allData[["clonotype"]])
colnames(multi_allData)[ncol(multi_allData)] <- gene
####################################### Seperate Datasets
one_run <- function(j) {
multi_datasets[[name[j]]] <- cbind(multi_datasets[[name[j]]], clono_datasets[[name[j]]][["clonotype"]])
colnames(multi_datasets[[name[j]]])[ncol(multi_datasets[[name[j]]])] <- gene
return(multi_datasets[[name[j]]])
}
if (Sys.info()[1] == "Windows") {
multi_datasets <- lapply(seq_len(length(name)), one_run)
} else {
multi_datasets <- parallel::mclapply(seq_len(length(name)), one_run, mc.cores = num_of_cores, mc.preschedule = TRUE)
}
names(multi_datasets) <- name
} else {
# find the most frequent gene that exists in each specific clonotype
####################################### All Data
freq_gene_name <- data.frame()
for (i in names(view_specific_clonotype_allData)) {
a <- view_specific_clonotype_allData[[i]]
if ((stringr::str_detect(val_initial[vals], "allele") == FALSE)) {
if (!all(!(stringr::str_detect(a[[gene]], "[*]")))) {
a2 <- strsplit(a[[gene]], "[*]")
a[[gene]] <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
}
}
freq_gene <- a %>%
dplyr::group_by(a[[gene]]) %>%
dplyr::summarise(n = n())
freq_gene <- freq_gene[order(-freq_gene$n), ]
freq_gene_name[i, 1] <- freq_gene[1, 1]
}
colnames(freq_gene_name) <- gene
multi_allData <- cbind(multi_allData, freq_gene_name[[gene]])
colnames(multi_allData)[ncol(multi_allData)] <- gene
####################################### Seperate Datasets
one_run <- function(j) {
freq_gene_name <- data.frame()
for (i in names(view_specific_clonotype_datasets[[name[j]]])) {
a <- view_specific_clonotype_datasets[[name[j]]][[i]]
if ((stringr::str_detect(val_initial[vals], "allele") == FALSE)) {
if (!all(!(stringr::str_detect(a[[gene]], "[*]")))) {
a2 <- strsplit(a[[gene]], "[*]")
a[[gene]] <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
}
}
freq_gene <- a %>%
dplyr::group_by(a[[gene]]) %>%
dplyr::summarise(n = n())
freq_gene <- freq_gene[order(-freq_gene$n), ]
freq_gene_name[i, 1] <- freq_gene[1, 1]
}
colnames(freq_gene_name) <- gene
multi_datasets[[name[j]]] <- cbind(multi_datasets[[name[j]]], freq_gene_name[[gene]])
colnames(multi_datasets[[name[j]]])[ncol(multi_datasets[[name[j]]])] <- gene
return(multi_datasets[[name[j]]])
}
if (Sys.info()[1] == "Windows") {
multi_datasets <- lapply(seq_len(length(name)), one_run)
} else {
multi_datasets <- parallel::mclapply(seq_len(length(name)), one_run, mc.cores = num_of_cores, mc.preschedule = TRUE)
}
names(multi_datasets) <- name
}
} else {
a <- c()
for (i in names(view_specific_clonotype_allData)) {
a <- c(a, median(as.numeric(view_specific_clonotype_allData[[i]][[val[vals]]]), na.rm = TRUE))
}
if ((!is.null(identity_groups)) && (val[vals] == used_columns[["Summary"]][4])) {
a <- as.numeric(a)
temp <- a
for (values in seq_len(nrow(identity_groups))) {
if (values == nrow(identity_groups)) {
index <- which(a >= identity_groups[values, 1] & a <= identity_groups[values, 2])
} else {
index <- which(a >= identity_groups[values, 1] & a < identity_groups[values, 2])
}
temp[index] <- identity_groups$label[values]
}
a <- temp
}
multi_allData <- cbind(multi_allData, a)
colnames(multi_allData)[ncol(multi_allData)] <- val[vals]
####################################### Seperate Datasets
one_run <- function(j) {
a <- c()
for (i in names(view_specific_clonotype_datasets[[name[j]]])) {
a <- c(a, median(as.numeric(view_specific_clonotype_datasets[[name[j]]][[i]][[val[vals]]]), na.rm = TRUE))
}
if ((!is.null(identity_groups)) && (val[vals] == used_columns[["Summary"]][4])) {
a <- as.numeric(a)
temp <- a
for (values in seq_len(nrow(identity_groups))) {
if (values == nrow(identity_groups)) {
index <- which(a >= identity_groups[values, 1] & a <= identity_groups[values, 2])
} else {
index <- which(a >= identity_groups[values, 1] & a < identity_groups[values, 2])
}
temp[index] <- identity_groups$label[values]
}
a <- temp
}
multi_datasets[[name[j]]] <- cbind(multi_datasets[[name[j]]], a)
colnames(multi_datasets[[name[j]]])[ncol(multi_datasets[[name[j]]])] <- val[vals]
return(multi_datasets[[name[j]]])
}
if (Sys.info()[1] == "Windows") {
multi_datasets <- lapply(seq_len(length(name)), one_run)
} else {
multi_datasets <- parallel::mclapply(seq_len(length(name)), one_run, mc.cores = num_of_cores, mc.preschedule = TRUE)
}
names(multi_datasets) <- name
}
}
multi_allData <- data.frame(multi_allData, stringsAsFactors = FALSE)
Multiple_value_comparison_allData <- multi_allData %>%
dplyr::group_by(multi_allData[[val1]], multi_allData[[val2]]) %>%
dplyr::summarise(n = n())
Multiple_value_comparison_allData <- Multiple_value_comparison_allData[order(-Multiple_value_comparison_allData$n), ]
Multiple_value_comparison_allData <- cbind(Multiple_value_comparison_allData, Freq = 100 * Multiple_value_comparison_allData$n / nrow(multi_allData))
colnames(Multiple_value_comparison_allData) <- c(val1_initial, val2_initial, "N", "Freq")
####################################### Seperate Datasets
one_run <- function(j) {
multi_datasets[[name[j]]] <- data.frame(multi_datasets[[name[j]]], stringsAsFactors = FALSE)
Multiple_value_comparison_datasets[[name[j]]] <- multi_datasets[[name[j]]] %>%
dplyr::group_by(multi_datasets[[name[j]]][[val1]], multi_datasets[[name[j]]][[val2]]) %>%
dplyr::summarise(n = n())
Multiple_value_comparison_datasets[[name[j]]] <- Multiple_value_comparison_datasets[[name[j]]][order(-Multiple_value_comparison_datasets[[name[j]]]$n), ]
Multiple_value_comparison_datasets[[name[j]]] <- cbind(Multiple_value_comparison_datasets[[name[j]]], Freq = 100 * Multiple_value_comparison_datasets[[name[j]]]$n / nrow(multi_datasets[[name[j]]]))
colnames(Multiple_value_comparison_datasets[[name[j]]]) <- c(val1_initial, val2_initial, "N", "Freq")
Multiple_value_comparison_datasets[[name[j]]] <- data.frame(Multiple_value_comparison_datasets[[name[j]]], stringsAsFactors = FALSE)
if (save_tables_individually) {
filename <- paste0(e$output_folder, "/", "Multiple_value_comparison_", stringr::str_replace(val1_initial, "%", ""), "_", stringr::str_replace(val2_initial, "%", ""), "_", name[j], ".txt")
write.table(Multiple_value_comparison_datasets[[name[j]]], filename, sep = "\t", row.names = FALSE, col.names = TRUE)
}
return(Multiple_value_comparison_datasets[[name[j]]])
}
if (Sys.info()[1] == "Windows") {
Multiple_value_comparison_datasets <- lapply(seq_len(length(name)), one_run)
} else {
Multiple_value_comparison_datasets <- parallel::mclapply(seq_len(length(name)), one_run, mc.cores = num_of_cores, mc.preschedule = TRUE)
}
names(Multiple_value_comparison_datasets) <- name
if (save_tables_individually) {
filename <- paste0(e$output_folder, "/", "Multiple_value_comparison_", stringr::str_replace(val1_initial, "%", ""), "_", stringr::str_replace(val2_initial, "%", ""), "_", "All_Data", ".txt")
write.table(Multiple_value_comparison_allData, filename, sep = "\t", row.names = FALSE, col.names = TRUE)
}
confirm <- paste0("Multiple_value_comparison ", val1_initial, " - ", val2_initial, " run!")
result <- list("Multiple_value_comparison_allData" = Multiple_value_comparison_allData, "Multiple_value_comparison_datasets" = Multiple_value_comparison_datasets, "confirm" = confirm)
# log time end and memory used
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cat(pryr::mem_used(), file = logFile, append = TRUE, sep = "\n")
return(result)
}
######################################################################################################################################
Multiple_value_comparison_highly_similar <- function(clono_allData, clono_datasets, allele_clonotypes, gene_clonotypes, view_specific_clonotype_allData, view_specific_clonotype_datasets, val1, val2, name, identity_groups) {
# logfile
cat(paste0("Multiple_value_comparison_highly_similar", "\t"), file = logFile, append = TRUE)
cat(paste0(paste(val1, val2, sep = ","), "\t"), file = logFile, append = TRUE)
cat(paste0(nrow(clono_allData), "\t"), file = logFile, append = TRUE)
cat(paste0(ncol(clono_allData), "\t"), file = logFile, append = TRUE)
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
val1_initial <- val1
val2_initial <- val2
val_initial <- c(val1_initial, val2_initial)
if (val1 == "Molecular mass" || val1 == "pI") {
val1 <- paste0("Junction.", val1)
} else {
val1 <- paste0("Summary.", val1)
}
val1 <- gsub(" ", ".", val1)
val1 <- gsub("-", ".", val1)
val1 <- gsub("%", ".", val1)
if (val2 == "Molecular mass" || val2 == "pI") {
val2 <- paste0("Junction.", val2)
} else {
val2 <- paste0("Summary.", val2)
}
val2 <- gsub(" ", ".", val2)
val2 <- gsub("-", ".", val2)
val2 <- gsub("%", ".", val2)
if (!stringr::str_detect(val1, "allele") && stringr::str_detect(val1, "GENE")) {
val1 <- paste0(val1, ".and.allele")
}
if (!stringr::str_detect(val2, "allele") && stringr::str_detect(val2, "GENE")) {
val2 <- paste0(val2, ".and.allele")
}
Multiple_value_comparison_datasets <- list()
for (i in seq_len(nrow(clono_allData))) {
clono_allData[i, 1] <- strsplit(as.character(clono_allData[i, 1]), " - ")[[1]][1]
}
for (j in seq_len(length(name))) {
for (i in seq_len(nrow(clono_datasets[[name[j]]]))) {
clono_datasets[[name[j]]][i, 1] <- strsplit(as.character(clono_datasets[[name[j]]][i, 1]), " - ")[[1]][1]
}
}
multi_allData <- c()
val <- c(val1, val2)
multi_datasets <- list()
for (vals in seq_len(2)) {
if (stringr::str_detect(val[vals], "GENE")) {
gene <- val[vals]
if (gene == gene_clonotypes && (stringr::str_detect(val_initial[vals], "allele")) == allele_clonotypes && !(is.null(gene_clonotypes))) {
####################################### All Data
multi_allData <- cbind(multi_allData, clono_allData[["clonotype"]])
colnames(multi_allData)[ncol(multi_allData)] <- gene
####################################### Seperate Datasets
one_run <- function(j) {
multi_datasets[[name[j]]] <- cbind(multi_datasets[[name[j]]], clono_datasets[[name[j]]][["clonotype"]])
colnames(multi_datasets[[name[j]]])[ncol(multi_datasets[[name[j]]])] <- gene
return(multi_datasets[[name[j]]])
}
if (Sys.info()[1] == "Windows") {
multi_datasets <- lapply(seq_len(length(name)), one_run)
} else {
multi_datasets <- parallel::mclapply(seq_len(length(name)), one_run, mc.cores = num_of_cores, mc.preschedule = TRUE)
}
names(multi_datasets) <- name
} else {
# find the most frequent gene that exists in each specific clonotype
####################################### All Data
freq_gene_name <- data.frame()
id <- 0
for (i in seq_len(nrow(clono_allData))) {
prev_clono <- as.numeric(strsplit(as.character(clono_allData$prev_cluster[i]), " ")[[1]][2:length(strsplit(as.character(clono_allData$prev_cluster[i]), " ")[[1]])])
a <- view_specific_clonotype_allData[[prev_clono[1]]]
if (length(prev_clono) > 1) {
for (cl in 2:length(prev_clono)) {
a <- rbind(a, view_specific_clonotype_allData[[prev_clono[cl]]])
}
}
if ((stringr::str_detect(val_initial[vals], "allele") == FALSE)) {
if (!all(!(stringr::str_detect(a[[gene]], "[*]")))) {
a2 <- strsplit(a[[gene]], "[*]")
a[[gene]] <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
}
}
freq_gene <- a %>%
dplyr::group_by(a[[gene]]) %>%
dplyr::summarise(n = n())
freq_gene <- freq_gene[order(-freq_gene$n), ]
freq_gene_name[i, 1] <- freq_gene[1, 1]
}
colnames(freq_gene_name) <- gene
multi_allData <- cbind(multi_allData, freq_gene_name[[gene]])
colnames(multi_allData)[ncol(multi_allData)] <- gene
####################################### Seperate Datasets
one_run <- function(j) {
freq_gene_name <- data.frame()
for (i in seq_len(nrow(clono_datasets[[name[j]]]))) {
prev_clono <- as.numeric(strsplit(as.character(clono_datasets[[name[j]]]$prev_cluster[i]), " ")[[1]][2:length(strsplit(as.character(clono_datasets[[name[j]]]$prev_cluster[i]), " ")[[1]])])
prev_clono <- prev_clono[!is.na(prev_clono)]
a <- view_specific_clonotype_datasets[[name[j]]][[prev_clono[1]]]
if (length(prev_clono) > 1) {
for (cl in 2:length(prev_clono)) {
a <- rbind(a, view_specific_clonotype_datasets[[name[j]]][[prev_clono[cl]]])
}
}
if ((stringr::str_detect(val_initial[vals], "allele") == FALSE)) {
if (!all(!(stringr::str_detect(a[[gene]], "[*]")))) {
a2 <- strsplit(a[[gene]], "[*]")
a[[gene]] <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
}
}
freq_gene <- a %>%
dplyr::group_by(a[[gene]]) %>%
dplyr::summarise(n = n())
freq_gene <- freq_gene[order(-freq_gene$n), ]
freq_gene_name[i, 1] <- freq_gene[1, 1]
}
colnames(freq_gene_name) <- gene
multi_datasets[[name[j]]] <- cbind(multi_datasets[[name[j]]], freq_gene_name[[gene]])
colnames(multi_datasets[[name[j]]])[ncol(multi_datasets[[name[j]]])] <- gene
return(multi_datasets[[name[j]]])
}
if (Sys.info()[1] == "Windows") {
multi_datasets <- lapply(seq_len(length(name)), one_run)
} else {
multi_datasets <- parallel::mclapply(seq_len(length(name)), one_run, mc.cores = num_of_cores, mc.preschedule = TRUE)
}
names(multi_datasets) <- name
}
} else {
a <- c()
for (i in seq_len(nrow(clono_allData))) {
prev_clono <- as.numeric(strsplit(as.character(clono_allData$prev_cluster[i]), " ")[[1]][2:length(strsplit(as.character(clono_allData$prev_cluster[i]), " ")[[1]])])
prev_clono <- prev_clono[!is.na(prev_clono)]
view <- view_specific_clonotype_allData[[prev_clono[1]]]
if (length(prev_clono) > 1) {
for (cl in 2:length(prev_clono)) {
view <- rbind(view, view_specific_clonotype_allData[[prev_clono[cl]]])
}
}
a <- c(a, median(as.numeric(view[[val[vals]]]), na.rm = TRUE))
}
if ((!is.null(identity_groups)) && (val[vals] == used_columns[["Summary"]][4])) {
a <- as.numeric(a)
temp <- a
for (values in seq_len(nrow(identity_groups))) {
if (values == nrow(identity_groups)) {
index <- which(a >= identity_groups[values, 1] & a <= identity_groups[values, 2])
} else {
index <- which(a >= identity_groups[values, 1] & a < identity_groups[values, 2])
}
temp[index] <- identity_groups$label[values]
}
a <- temp
}
multi_allData <- cbind(multi_allData, a)
colnames(multi_allData)[ncol(multi_allData)] <- val[vals]
####################################### Seperate Datasets
one_run <- function(j) {
a <- c()
id <- 0
for (i in seq_len(nrow(clono_datasets[[name[j]]]))) {
prev_clono <- as.numeric(strsplit(as.character(clono_datasets[[name[j]]]$prev_cluster[i]), " ")[[1]][2:length(strsplit(as.character(clono_datasets[[name[j]]]$prev_cluster[i]), " ")[[1]])])
prev_clono <- prev_clono[!is.na(prev_clono)]
view <- view_specific_clonotype_datasets[[name[j]]][[prev_clono[1]]]
if (length(prev_clono) > 1) {
for (cl in 2:length(prev_clono)) {
view <- rbind(view, view_specific_clonotype_datasets[[name[j]]][[prev_clono[cl]]])
}
}
a <- c(a, median(as.numeric(view[[val[vals]]]), na.rm = TRUE))
}
if ((!is.null(identity_groups)) && (val[vals] == used_columns[["Summary"]][4])) {
a <- as.numeric(a)
temp <- a
for (values in seq_len(nrow(identity_groups))) {
if (values == nrow(identity_groups)) {
index <- which(a >= identity_groups[values, 1] & a <= identity_groups[values, 2])
} else {
index <- which(a >= identity_groups[values, 1] & a < identity_groups[values, 2])
}
temp[index] <- identity_groups$label[values]
}
a <- temp
}
multi_datasets[[name[j]]] <- cbind(multi_datasets[[name[j]]], a)
colnames(multi_datasets[[name[j]]])[ncol(multi_datasets[[name[j]]])] <- val[vals]
return(multi_datasets[[name[j]]])
}
if (Sys.info()[1] == "Windows") {
multi_datasets <- lapply(seq_len(length(name)), one_run)
} else {
multi_datasets <- parallel::mclapply(seq_len(length(name)), one_run, mc.cores = num_of_cores, mc.preschedule = TRUE)
}
names(multi_datasets) <- name
}
}
multi_allData <- data.frame(multi_allData, stringsAsFactors = FALSE)
Multiple_value_comparison_allData <- multi_allData %>%
dplyr::group_by(multi_allData[[val1]], multi_allData[[val2]]) %>%
dplyr::summarise(n = n())
Multiple_value_comparison_allData <- Multiple_value_comparison_allData[order(-Multiple_value_comparison_allData$n), ]
Multiple_value_comparison_allData <- cbind(Multiple_value_comparison_allData, Freq = 100 * Multiple_value_comparison_allData$n / nrow(multi_allData))
colnames(Multiple_value_comparison_allData) <- c(val1_initial, val2_initial, "N", "Freq")
####################################### Seperate Datasets
one_run <- function(j) {
multi_datasets[[name[j]]] <- data.frame(multi_datasets[[name[j]]], stringsAsFactors = FALSE)
Multiple_value_comparison_datasets[[name[j]]] <- multi_datasets[[name[j]]] %>%
dplyr::group_by(multi_datasets[[name[j]]][[val1]], multi_datasets[[name[j]]][[val2]]) %>%
dplyr::summarise(n = n())
Multiple_value_comparison_datasets[[name[j]]] <- Multiple_value_comparison_datasets[[name[j]]][order(-Multiple_value_comparison_datasets[[name[j]]]$n), ]
Multiple_value_comparison_datasets[[name[j]]] <- cbind(Multiple_value_comparison_datasets[[name[j]]], Freq = 100 * Multiple_value_comparison_datasets[[name[j]]]$n / nrow(multi_datasets[[name[j]]]))
colnames(Multiple_value_comparison_datasets[[name[j]]]) <- c(val1_initial, val2_initial, "N", "Freq")
Multiple_value_comparison_datasets[[name[j]]] <- data.frame(Multiple_value_comparison_datasets[[name[j]]], stringsAsFactors = FALSE)
if (save_tables_individually) {
filename <- paste0(e$output_folder, "/", "Multiple_value_comparison_highly_similar", stringr::str_replace(val1_initial, "%", ""), "_", stringr::str_replace(val2_initial, "%", ""), "_", name[j], ".txt")
write.table(Multiple_value_comparison_datasets[[name[j]]], filename, sep = "\t", row.names = FALSE, col.names = TRUE)
}
return(Multiple_value_comparison_datasets[[name[j]]])
}
if (Sys.info()[1] == "Windows") {
Multiple_value_comparison_datasets <- lapply(seq_len(length(name)), one_run)
} else {
Multiple_value_comparison_datasets <- parallel::mclapply(seq_len(length(name)), one_run, mc.cores = num_of_cores, mc.preschedule = TRUE)
}
names(Multiple_value_comparison_datasets) <- name
if (save_tables_individually) {
filename <- paste0(e$output_folder, "/", "Multiple_value_comparison_highly_similar", stringr::str_replace(val1_initial, "%", ""), "_", stringr::str_replace(val2_initial, "%", ""), "_", "All_Data", ".txt")
write.table(Multiple_value_comparison_allData, filename, sep = "\t", row.names = FALSE, col.names = TRUE)
}
confirm <- paste0("Multiple_value_comparison ", val1_initial, " - ", val2_initial, " run!")
result <- list("Multiple_value_comparison_allData" = Multiple_value_comparison_allData, "Multiple_value_comparison_datasets" = Multiple_value_comparison_datasets, "confirm" = confirm)
# log time end and memory used
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cat(pryr::mem_used(), file = logFile, append = TRUE, sep = "\n")
return(result)
}
######################################################################################################################################
createFrequencyTableCDR3 <- function(region_name, input, name, regionLength, FtopN, topClonotypesAlldata, topClonotypesDatasets, gene, junction, allele) {
# logfile
cat(paste0("createFrequencyTableCDR3", "\t"), file = logFile, append = TRUE)
cat(paste0(paste(region_name, paste0("Top N: ", FtopN), sep = ","), "\t"), file = logFile, append = TRUE)
cat(paste0(nrow(input), "\t"), file = logFile, append = TRUE)
cat(paste0(ncol(input), "\t"), file = logFile, append = TRUE)
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
input_initial <- input
if (FtopN) {
################################# combine all conent of the top N clonotypes
input_topN <- c()
for (i in seq_len(nrow(topClonotypesAlldata))) {
cl <- strsplit(topClonotypesAlldata$clonotype[i], " - ")
if (is.na(cl[[1]][1])) {
cl[[1]][1] <- ""
}
if (is.na(cl[[1]][2])) {
cl[[1]][2] <- ""
}
input_topN <- rbind(input_topN, viewClonotypes(input, allele, gene, junction, cl[[1]][1], cl[[1]][2]))
}
input <- input_topN
}
if (region_name == "CDR3") region_name <- "CDR3.IMGT"
region_name <- paste0("IMGT.gapped.AA.sequences.", region_name)
###################################### all data #######################################
# select AA juction or CDR3
region <- unique(input[[region_name]])
region_with_specific_length <- input %>% dplyr::filter(str_length(input[[region_name]]) == regionLength)
region_with_specific_length <- region_with_specific_length[[region_name]]
region_split <- strsplit(region, "")
region <- as.data.frame(region)
ancestral <- as.data.frame(c("I", "L", "V", "A", "M", "C", "T", "S", "H", "K", "R", "E", "D", "P", "G", "Y", "F", "W", "Q", "N"))
colnames(ancestral) <- "x"
new_list <- list()
k <- 0
for (i in seq_len(length(region_split))) {
if (length(region_split[[i]]) == regionLength) {
k <- k + 1
new_list[[k]] <- region_split[[i]]
}
}
region_new <- as.data.frame(new_list)
region_new <- data.frame(t(region_new))
row.names(region_new) <- NULL
if (length(region_new) > 0) {
# remove factors
region_new <- region_new %>% mutate_if(is.factor, as.character)
ancestral <- ancestral %>% mutate_if(is.factor, as.character)
a <- as.data.frame(region_new %>% dplyr::group_by(x = region_new[, 1]) %>% dplyr::summarise(n = n()))
table_count <- join(as.data.frame(ancestral), a, type = "left", by = "x")
for (i in 2:ncol(region_new)) {
a <- as.data.frame(region_new %>% dplyr::group_by(x = region_new[, i]) %>% dplyr::summarise(n = n()))
table_count <- join(table_count, a, type = "left", by = "x")
}
table_count[is.na(table_count)] <- 0
row.names(table_count) <- table_count[, 1]
# Create frequency table
table_freq <- table_count
table_freq[, 2:ncol(table_count)] <- 100 * table_count[, 2:ncol(table_count)] / k
colnames(table_freq) <- c("AA", seq_len((ncol(table_freq) - 1)))
colnames(table_count) <- c("AA", seq_len((ncol(table_count) - 1)))
if (region_name == paste0("IMGT.gapped.AA.sequences.", "CDR3.IMGT")) {
if ((ncol(table_count) - 1) == 13) {
a <- 105:117
} else if ((ncol(table_count) - 1) == 12) {
a <- c(105:110, 112:117)
} else if ((ncol(table_count) - 1) == 11) {
a <- c(105:110, 113:117)
} else if ((ncol(table_count) - 1) == 10) {
a <- c(105:109, 113:117)
} else if ((ncol(table_count) - 1) == 9) {
a <- c(105:109, 114:117)
} else if ((ncol(table_count) - 1) == 8) {
a <- c(105:108, 114:117)
} else if ((ncol(table_count) - 1) == 7) {
a <- c(105:108, 115:117)
} else if ((ncol(table_count) - 1) == 6) {
a <- c(105:107, 115:117)
} else if ((ncol(table_count) - 1) == 5) a <- c(105:107, 116:117)
colnames(table_count) <- c("AA", a)
colnames(table_freq) <- c("AA", a)
}
} else {
table_freq <- c()
table_count <- c()
warning("No data for this")
}
###################################### Separate datasets #######################################
table_count_datasets <- list()
table_freq_datasets <- list()
region_with_specific_length_dataset <- list()
for (j in seq_len(length(name))) {
namej <- name[j]
input_dataset <- input_initial %>% dplyr::filter(input_initial$dataName == name[j])
region_with_specific_length_dataset[[name[j]]] <- input_dataset %>% dplyr::filter(str_length(input_dataset[[region_name]]) == regionLength)
region_with_specific_length_dataset[[name[j]]] <- region_with_specific_length_dataset[[name[j]]][[region_name]]
if (FtopN) {
################################# combine all conent of the top N clonotypes
# thhe function should take allele,gene,junction as input!!!!!!!!!!!!
input_topN <- c()
for (i in seq_len(nrow(topClonotypesDatasets[[name[j]]]))) {
cl <- strsplit(topClonotypesDatasets[[name[j]]]$clonotype[i], " - ")
if (is.na(cl[[1]][1])) {
cl[[1]][1] <- ""
}
if (is.na(cl[[1]][2])) {
cl[[1]][2] <- ""
}
input_topN <- rbind(input_topN, viewClonotypes(input_dataset, allele, gene, junction, cl[[1]][1], cl[[1]][2]))
}
input_dataset <- input_topN
}
region <- unique(input_dataset[[region_name]])
region_split <- strsplit(region, "")
region <- as.data.frame(region)
ancestral <- as.data.frame(c("I", "L", "V", "A", "M", "C", "T", "S", "H", "K", "R", "E", "D", "P", "G", "Y", "F", "W", "Q", "N"))
colnames(ancestral) <- "x"
# take the region with length=region
new_list <- list()
k <- 0
for (i in seq_len(length(region_split))) {
if (length(region_split[[i]]) == regionLength) {
k <- k + 1
new_list[[k]] <- region_split[[i]]
}
}
name[j] <- namej
region_new <- as.data.frame(new_list)
region_new <- data.frame(t(region_new))
row.names(region_new) <- NULL
if (length(region_new) > 0) {
# remove factors
region_new <- region_new %>% mutate_if(is.factor, as.character)
ancestral <- ancestral %>% mutate_if(is.factor, as.character)
a <- as.data.frame(region_new %>% dplyr::group_by(x = region_new[, 1]) %>% dplyr::summarise(n = n()))
table_count_datasets[[name[j]]] <- join(as.data.frame(ancestral), a, type = "left", by = "x")
for (i in 2:ncol(region_new)) {
a <- as.data.frame(region_new %>% dplyr::group_by(x = region_new[, i]) %>% dplyr::summarise(n = n()))
table_count_datasets[[name[j]]] <- join(table_count_datasets[[name[j]]], a, type = "left", by = "x")
}
table_count_datasets[[name[j]]][is.na(table_count_datasets[[name[j]]])] <- 0
row.names(table_count_datasets[[name[j]]]) <- table_count_datasets[[name[j]]][, 1]
# Create frequency table
table_freq_datasets[[name[j]]] <- table_count_datasets[[name[j]]]
table_freq_datasets[[name[j]]][, 2:ncol(table_count_datasets[[name[j]]])] <- 100 * table_count_datasets[[name[j]]][, 2:ncol(table_count_datasets[[name[j]]])] / k
colnames(table_count_datasets[[name[j]]]) <- c("AA", seq_len((ncol(table_count_datasets[[name[j]]]) - 1)))
colnames(table_freq_datasets[[name[j]]]) <- c("AA", seq_len((ncol(table_freq_datasets[[name[j]]]) - 1)))
if (region_name == paste0("IMGT.gapped.AA.sequences.", "CDR3.IMGT")) {
if ((ncol(table_count_datasets[[name[j]]]) - 1) == 13) {
a <- 105:117
} else if ((ncol(table_count_datasets[[name[j]]]) - 1) == 12) {
a <- c(105:110, 112:117)
} else if ((ncol(table_count_datasets[[name[j]]]) - 1) == 11) {
a <- c(105:110, 113:117)
} else if ((ncol(table_count_datasets[[name[j]]]) - 1) == 10) {
a <- c(105:109, 113:117)
} else if ((ncol(table_count_datasets[[name[j]]]) - 1) == 9) {
a <- c(105:109, 114:117)
} else if ((ncol(table_count_datasets[[name[j]]]) - 1) == 8) {
a <- c(105:108, 114:117)
} else if ((ncol(table_count_datasets[[name[j]]]) - 1) == 7) {
a <- c(105:108, 115:117)
} else if ((ncol(table_count_datasets[[name[j]]]) - 1) == 6) {
a <- c(105:107, 115:117)
} else if ((ncol(table_count_datasets[[name[j]]]) - 1) == 5) a <- c(105:107, 116:117)
colnames(table_count_datasets[[name[j]]]) <- c("AA", a)
colnames(table_freq_datasets[[name[j]]]) <- c("AA", a)
}
} else {
table_freq_datasets[[name[j]]] <- c()
table_count_datasets[[name[j]]] <- c()
}
}
confirm <- paste0("Frequency tables run!")
result <- list("region_with_specific_length_dataset" = region_with_specific_length_dataset, "region_with_specific_length" = region_with_specific_length, "table_count" = table_count, "table_freq" = table_freq, "table_count_datasets" = table_count_datasets, "table_freq_datasets" = table_freq_datasets, "confirm" = confirm)
# log time end and memory used
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cat(pryr::mem_used(), file = logFile, append = TRUE, sep = "\n")
return(result)
}
######################################################################################################################################
createLogo <- function(table_count, table_count_datasets, name) {
# logfile
cat(paste0("createLogo", "\t"), file = logFile, append = TRUE)
cat(paste0("logo plot", "\t"), file = logFile, append = TRUE)
cat(paste0(nrow(table_count), "\t"), file = logFile, append = TRUE)
cat(paste0(ncol(table_count), "\t"), file = logFile, append = TRUE)
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
# Create color matrix
mat <- matrix(nrow = 20, ncol = 2)
mat[1, ] <- c("I", "#0000FF")
mat[2, ] <- c("L", "#0000FF")
mat[3, ] <- c("V", "#0000FF")
mat[4, ] <- c("A", "#0000FF")
mat[5, ] <- c("M", "#C6E2FF")
mat[6, ] <- c("C", "#C6E2FF")
mat[7, ] <- c("T", "#54FF9F")
mat[8, ] <- c("S", "#54FF9F")
mat[9, ] <- c("H", "#FF0000")
mat[10, ] <- c("K", "#FF0000")
mat[11, ] <- c("R", "#FF0000")
mat[12, ] <- c("E", "#FFD700")
mat[13, ] <- c("D", "#FFD700")
mat[14, ] <- c("P", "#FFD700")
mat[15, ] <- c("G", "#00EE00")
mat[16, ] <- c("Y", "#C1FFC1")
mat[17, ] <- c("F", "#1E90FF")
mat[18, ] <- c("W", "#BA55D3")
mat[19, ] <- c("Q", "#ED9121")
mat[20, ] <- c("N", "#ED9121")
##########################
if (!is.null(table_count)) {
p <- motifStack::pcm2pfm(table_count)
color_set <- c()
for (i in seq_len(length(rownames(p)))) {
color <- which(mat[, 1] == rownames(p)[i])
color_set <- c(color_set, mat[color, 2])
}
names(color_set) <- row.names(p)
# plotMotifLogo(p,p=rep(1/length(rownames(p)),length(rownames(p))),ic.scale=FALSE, colset = color_set, ylab="probability")
motif_all <- new("pcm", mat = as.matrix(p), name = "")
motif_all$color <- color_set
motif_datasets <- list()
for (j in seq_len(length(name))) {
if (!is.null(table_count_datasets[[name[j]]])) {
if ("AA" %in% colnames(table_count_datasets[[name[j]]])) {
table_count_datasets[[name[j]]] <- table_count_datasets[[name[j]]][, 2:ncol(table_count_datasets[[name[j]]])]
}
p <- motifStack::pcm2pfm(table_count_datasets[[name[j]]])
color_set <- c()
for (i in seq_len(length(rownames(p)))) {
color <- which(mat[, 1] == rownames(p)[i])
color_set <- c(color_set, mat[color, 2])
}
names(color_set) <- row.names(p)
# plotMotifLogo(p,p=rep(1/length(rownames(p)),length(rownames(p))),ic.scale=FALSE, colset = color_set, ylab="probability")
motif <- new("pcm", mat = as.matrix(p), name = "")
motif$color <- color_set
motif_datasets[[name[j]]] <- motif
} else {
motif_datasets[[name[j]]] <- c()
}
}
} else {
motif_all <- c()
motif_datasets <- c()
}
confirm <- "Logo run!"
result <- list("motif_all" = motif_all, "motif_datasets" = motif_datasets, "confirm" = confirm)
# log time end and memory used
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cat(pryr::mem_used(), file = logFile, append = TRUE, sep = "\n")
return(result)
# write to png file
# dev.print(png, file=paste0("K",j,"_H",j," AA.png"),width=1000, height=550)
}
######################################################################################################################################
alignment <- function(input, region, germline, name, only_one_germline, use_genes_germline, Tcell, AAorNtAlignment, clono_allData, clono_datasets, view_specific_clonotype_allData, view_specific_clonotype_datasets, topNClono, FtopN, thrClono, Fthr, highly) {
used_columns <- e$used_columns
# logfile
cat(paste0("alignment", "\t"), file = logFile, append = TRUE)
cat(paste0(paste(region, AAorNtAlignment, "top", topNClono, "clonotypes", sep = ","), "\t"), file = logFile, append = TRUE)
cat(paste0(nrow(input), "\t"), file = logFile, append = TRUE)
cat(paste0(ncol(input), "\t"), file = logFile, append = TRUE)
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
if (AAorNtAlignment == "aa") {
file <- "IMGT.gapped.AA.sequences."
} else {
file <- "IMGT.gapped.nt.sequences."
}
if (region == "CDR3") region <- "JUNCTION"
region <- paste0(file, region)
max_length_region <- max(str_length(input[[region]]))
if (Tcell == FALSE && only_one_germline == FALSE) {
type <- strsplit(strsplit(as.character(input[[used_columns[["Summary"]][3]]][1]), " ")[[1]][2], "V")[[1]][1]
if ((type == "IGK") | (type == "IGL")) {
germline_file <- paste0("Germline_sequences_alignments_", "IGK", "V_", AAorNtAlignment, ".csv")
Tgermlines <- read.csv(system.file("extdata/param", germline_file,
package = "tripr", mustWork = TRUE
),
sep = ";", stringsAsFactors = FALSE,
colClasses = c("character")
)
if (AAorNtAlignment == "aa") {
Tgermlines[, 113:117] <- "."
} else {
Tgermlines[, 336:351] <- "."
}
germline_file <- paste0("Germline_sequences_alignments_", "IGL", "V_", AAorNtAlignment, ".csv")
te <- read.csv(system.file("extdata/param", germline_file,
package = "tripr", mustWork = TRUE
),
sep = ";", stringsAsFactors = FALSE,
colClasses = c("character")
)
colnames(Tgermlines) <- colnames(te)
Tgermlines <- rbind(Tgermlines, te)
} else {
germline_file <- paste0("Germline_sequences_alignments_", type, "V_", AAorNtAlignment, ".csv")
Tgermlines <- read.csv(system.file("extdata/param", germline_file,
package = "tripr", mustWork = TRUE
),
sep = ";", stringsAsFactors = FALSE,
colClasses = c("character")
)
}
Tgermlines <- unique(Tgermlines)
colnames(Tgermlines) <- c("V1", seq_len((ncol(Tgermlines) - 1)))
a2 <- strsplit(Tgermlines$V1, " or|,| [(]see| OR")
Tgermlines$V1 <- as.character(plyr::ldply(a2, function(s) {
t(data.frame(unlist(s)))
})[, 1])
if (max_length_region > (ncol(Tgermlines) - 1)) {
extra_dots <- matrix(".", nrow(Tgermlines), max_length_region - ncol(Tgermlines))
Tgermlines <- cbind(Tgermlines, extra_dots)
colnames(Tgermlines) <- c("V1", seq_len((max_length_region - 1)))
Tgermlines[Tgermlines == ""] <- "."
Tgermlines[is.na(Tgermlines)] <- "."
}
}
if (region %in% colnames(input)) {
############### Clonotypes ##############
cluster_id <- c()
freq_cluster_id <- c()
if (length(view_specific_clonotype_allData) == 0) {
cluster_id[1:nrow(input)] <- 0
freq_cluster_id[1:nrow(input)] <- 0
} else {
if (!highly) {
for (i in seq_len(length(view_specific_clonotype_allData))) {
index <- which(input[[used_columns[["Summary"]][1]]] %in% view_specific_clonotype_allData[[names(view_specific_clonotype_allData)[i]]][[used_columns[["Summary"]][1]]])
if (index[1] > 0) {
freq_cluster_id[index] <- clono_allData$Freq[i]
cluster_id[index] <- i
}
}
} else {
for (i in seq_len(nrow(clono_allData))) {
prev_clono <- as.numeric(strsplit(as.character(clono_allData$prev_cluster[i]), " ")[[1]][2:length(strsplit(as.character(clono_allData$prev_cluster[i]), " ")[[1]])])
view <- view_specific_clonotype_allData[[prev_clono[1]]]
if (length(prev_clono) > 1) {
for (cl in 2:length(prev_clono)) {
view <- rbind(view, view_specific_clonotype_allData[[prev_clono[cl]]])
}
}
index <- which(input[[used_columns[["Summary"]][1]]] %in% view[[used_columns[["Summary"]][1]]])
if (index[1] > 0) {
freq_cluster_id[index] <- clono_allData$Freq[i]
cluster_id[index] <- i
}
}
}
}
#########################################
region_split <- strsplit(input[[region]], "")
for (i in seq_len(length(region_split))) {
if (length(region_split[[i]]) > max_length_region) {
region_split[[i]] <- region_split[[i]][1:max_length_region]
}
if (length(region_split[[i]]) < max_length_region) {
region_split[[i]][length(region_split[[i]]):max_length_region] <- "."
}
}
region_split <- as.data.frame(region_split)
region_split <- t(region_split)
row.names(region_split) <- NULL
region_alignment <- cbind(as.data.frame(cluster_id),
as.data.frame(freq_cluster_id),
Functionality = "productive"
)
region_alignment <- cbind(region_alignment,
J.GENE.and.allele = input[[used_columns[["Summary"]][8]]],
D.GENE.and.allele = input[[used_columns[["Summary"]][11]]],
V.GENE.and.allele = input[[used_columns[["Summary"]][3]]],
region_split, stringsAsFactors = FALSE
)
region_alignment$cluster_id <- as.character(cluster_id)
region_alignment$freq_cluster_id <- as.character(freq_cluster_id)
if (FtopN) {
region_alignment <- region_alignment %>% dplyr::filter(as.numeric(as.character(region_alignment$cluster_id)) <= topNClono | region_alignment$cluster_id == "-")
}
if (Fthr) {
region_alignment <- region_alignment %>% dplyr::filter(as.numeric(as.character(freq_cluster_id)) >= thrClono | region_alignment$cluster_id == "-")
}
if (only_one_germline) {
germline <- strsplit(germline, "")[[1]]
germline <- data.frame(t(germline), stringsAsFactors = FALSE)
germline <- c("-", "-", "germline", "-", "-", "-", germline)
germline <- as.data.frame(germline, stringsAsFactors = FALSE)
colnames(germline) <- colnames(region_alignment[, 1:ncol(germline)])
alignment_with_germline <- rbind(germline, region_alignment[, 1:ncol(germline)])
a <- t(apply(alignment_with_germline[2:nrow(alignment_with_germline), 3:length(alignment_with_germline)], 1, function(x) {
x == alignment_with_germline[1, 3:length(alignment_with_germline)] & x != "."
})) # x: a row of input[count,XColumns]
temp <- replace(alignment_with_germline[2:nrow(alignment_with_germline), 3:length(alignment_with_germline)], a == TRUE, "-")
# add the first and the second columns
temp2 <- cbind(alignment_with_germline[2:nrow(alignment_with_germline), 1:2], temp)
# add the last columns
if ((length(alignment_with_germline) + 1) < length(region_alignment)) {
temp2 <- rbind(temp2, region_alignment[, (length(alignment_with_germline) + 1):length(region_alignment)])
}
# add the germline (first row)
germline_new <- germline
# germline_new[,(length(germline)+1):length(region_alignment)]="."
colnames(germline_new) <- colnames(temp2)
output <- rbind(germline_new[1, ], temp2)
} else {
if (use_genes_germline) {
Tgermlines <- Tgermlines %>% dplyr::filter(stringr::str_detect(Tgermlines$V1, "[*]01 F"))
for (i in seq_len(nrow(Tgermlines))) {
Tgermlines$V1[i] <- strsplit(Tgermlines$V1, "[*]")[[i]][1]
}
region_alignment <- region_alignment
for (i in seq_len(nrow(region_alignment))) {
region_alignment$V.GENE.and.allele[i] <- strsplit(region_alignment$V.GENE.and.allele[i], "[*]")[[1]][1]
}
}
if ((ncol(region_alignment) - ncol(Tgermlines) - 5) > 0) {
a <- matrix(".", ncol = ncol(region_alignment) - ncol(Tgermlines) - 5, nrow = nrow(Tgermlines))
germlines <- cbind("-", 0, "germline", "-", "-", Tgermlines, a)
colnames(germlines) <- colnames(region_alignment)
alignment_with_germline <- rbind(germlines, region_alignment)
} else {
germlines <- cbind("-", 0, "germline", "-", "-", Tgermlines)
germlines <- germlines[, 1:ncol(region_alignment)]
colnames(germlines) <- colnames(region_alignment)
alignment_with_germline <- rbind(germlines, region_alignment)
}
df3 <- alignment_with_germline
germline <- c()
output <- c()
a <- c()
XColumns <- seq_len((ncol(region_alignment) - 6))
XColumns <- as.character(XColumns)
alignment_with_germline <- data.table::as.data.table(alignment_with_germline)
for (germ in unique(alignment_with_germline$V.GENE.and.allele)) {
y <- alignment_with_germline[which(alignment_with_germline$V.GENE.and.allele == germ), ]
germline <- which(y[["Functionality"]] == "germline")
productive <- which(y[["Functionality"]] == "productive")
if (length(germline) > 0 && length(productive) > 0) {
germline <- germline[1]
t <- as.matrix(y[germline, ..XColumns])
t <- rep(t, length(productive))
t <- matrix(data = t, nrow = length(productive), byrow = TRUE)
a <- y[productive, ..XColumns] == t & y[productive, ..XColumns] != "."
temp <- replace(y[productive, ..XColumns], a == TRUE, "-")
temp.names <- colnames(alignment_with_germline[, 1:6])
temp2 <- cbind(y[productive, ..temp.names], temp)
temp.names <- c(temp.names, XColumns)
output <- rbind(output, y[germline, ..temp.names], temp2)
}
}
}
alignment_allData <- output %>% select(-c(Functionality))
################################ for Separate datasets ###############################
alignment_datasets <- list()
one_run <- function(j) {
input_tmp <- input %>% dplyr::filter(input$dataName == name[j])
############### Clonotypes ##############
cluster_id <- c()
freq_cluster_id <- c()
if (length(view_specific_clonotype_allData) == 0) {
cluster_id[1:nrow(input_tmp)] <- 0
freq_cluster_id[1:nrow(input)] <- 0
} else {
if (!highly) {
for (i in seq_len(length(view_specific_clonotype_datasets[[name[j]]]))) {
index <- which(input_tmp[[used_columns[["Summary"]][1]]] %in% view_specific_clonotype_datasets[[name[j]]][[names(view_specific_clonotype_datasets[[name[j]]])[i]]][[used_columns[["Summary"]][1]]])
if (index[1] > 0) {
cluster_id[index] <- i
freq_cluster_id[index] <- clono_datasets[[name[j]]]$Freq[i]
}
}
} else {
for (i in seq_len(nrow(clono_datasets[[name[j]]]))) {
prev_clono <- as.numeric(strsplit(as.character(clono_datasets[[name[j]]]$prev_cluster[i]), " ")[[1]][2:length(strsplit(as.character(clono_datasets[[name[j]]]$prev_cluster[i]), " ")[[1]])])
prev_clono <- prev_clono[!is.na(prev_clono)]
view <- view_specific_clonotype_datasets[[name[j]]][[prev_clono[1]]]
if (length(prev_clono) > 1) {
for (cl in 2:length(prev_clono)) {
view <- rbind(view, view_specific_clonotype_datasets[[name[j]]][[prev_clono[cl]]])
}
}
index <- which(input_tmp[[used_columns[["Summary"]][1]]] %in% view_specific_clonotype_datasets[[name[j]]][[names(view_specific_clonotype_datasets[[name[j]]])[i]]][[used_columns[["Summary"]][1]]])
if (index[1] > 0) {
cluster_id[index] <- i
freq_cluster_id[index] <- clono_datasets[[name[j]]]$Freq[i]
}
}
}
}
#########################################
region_split <- strsplit(input_tmp[[region]], "")
for (i in seq_len(length(region_split))) {
if (length(region_split[[i]]) > max_length_region) {
region_split[[i]] <- region_split[[i]][1:max_length_region]
}
if (length(region_split[[i]]) < max_length_region) {
region_split[[i]][length(region_split[[i]]):max_length_region] <- "."
}
}
region_split <- as.data.frame(region_split)
region_split <- t(region_split)
row.names(region_split) <- NULL
region_alignment <- cbind(as.data.frame(cluster_id),
as.data.frame(freq_cluster_id),
Functionality = "productive"
)
region_alignment <- cbind(region_alignment,
J.GENE.and.allele = input_tmp[[used_columns[["Summary"]][8]]],
D.GENE.and.allele = input_tmp[[used_columns[["Summary"]][11]]],
V.GENE.and.allele = input_tmp[[used_columns[["Summary"]][3]]],
region_split, stringsAsFactors = FALSE
)
region_alignment$cluster_id <- as.character(cluster_id)
region_alignment$freq_cluster_id <- as.character(freq_cluster_id)
if (FtopN) {
region_alignment <- region_alignment %>%
dplyr::filter(as.numeric(as.character(region_alignment$cluster_id)) <= topNClono | region_alignment$cluster_id == "-")
}
if (Fthr) {
region_alignment <- region_alignment %>%
dplyr::filter(as.numeric(as.character(freq_cluster_id)) >= thrClono | region_alignment$cluster_id == "-")
}
if (only_one_germline) {
alignment_with_germline <- rbind(germline, region_alignment[, 1:length(germline)])
a <- t(apply(alignment_with_germline[2:nrow(alignment_with_germline), 3:length(alignment_with_germline)], 1, function(x) {
x == alignment_with_germline[1, 3:length(alignment_with_germline)] & x != "."
})) # x: a row of input[count,XColumns]
temp <- replace(alignment_with_germline[2:nrow(alignment_with_germline), 3:length(alignment_with_germline)], a == TRUE, "-")
# add the first and the second columns
temp2 <- cbind(alignment_with_germline[2:nrow(alignment_with_germline), 1:2], temp)
# add the last columns
if ((length(alignment_with_germline) + 1) < length(region_alignment)) {
temp2 <- rbind(temp2, region_alignment[, (length(alignment_with_germline) + 1):length(region_alignment)])
}
# add the germline (first row)
germline_new <- germline
colnames(germline_new) <- colnames(temp2)
output <- rbind(germline_new[1, ], temp2)
} else {
if (use_genes_germline) {
for (i in seq_len(nrow(region_alignment))) {
region_alignment$V.GENE.and.allele[i] <- strsplit(region_alignment$V.GENE.and.allele[i], "[*]")[[1]][1]
}
}
a <- matrix(".", ncol = ncol(region_alignment) - ncol(Tgermlines) - 5, nrow = nrow(Tgermlines))
germlines <- cbind("-", 0, "germline", "-", "-", Tgermlines, a)
colnames(germlines) <- colnames(region_alignment)
alignment_with_germline <- rbind(germlines, region_alignment)
germline <- c()
output <- c()
a <- c()
XColumns <- seq_len((ncol(region_alignment) - 6))
XColumns <- as.character(XColumns)
alignment_with_germline <- data.table::as.data.table(alignment_with_germline)
for (germ in unique(alignment_with_germline$V.GENE.and.allele)) {
y <- alignment_with_germline[which(alignment_with_germline$V.GENE.and.allele == germ), ]
germline <- which(y[["Functionality"]] == "germline")
productive <- which(y[["Functionality"]] == "productive")
if (length(germline) > 0 && length(productive) > 0) {
germline <- germline[1]
t <- as.matrix(y[germline, ..XColumns])
t <- rep(t, length(productive))
t <- matrix(data = t, nrow = length(productive), byrow = TRUE)
a <- y[productive, ..XColumns] == t & y[productive, ..XColumns] != "."
temp <- replace(y[productive, ..XColumns], a == TRUE, "-")
temp.names <- colnames(alignment_with_germline[, 1:6])
temp2 <- cbind(y[productive, ..temp.names], temp)
temp.names <- c(temp.names, XColumns)
output <- rbind(output, y[germline, ..temp.names], temp2)
}
}
}
alignment_datasets[[name[j]]] <- output %>% select(-c(Functionality))
if (save_tables_individually) {
filename <- paste0(e$output_folder, "/", "Alignment_", AAorNtAlignment, "_", name[j], ".txt")
write.table(alignment_datasets[[name[j]]], filename, sep = "\t", row.names = FALSE, col.names = TRUE)
}
return(alignment_datasets[[name[j]]])
}
if (Sys.info()[1] == "Windows") {
if (length(name) == 1) {
alignment_datasets[[name]] <- alignment_allData
} else {
alignment_datasets <- lapply(seq_len(length(name)), one_run)
}
} else {
if (length(name) == 1) {
alignment_datasets[[name]] <- alignment_allData
} else {
alignment_datasets <- parallel::mclapply(seq_len(length(name)), one_run, mc.cores = num_of_cores, mc.preschedule = TRUE)
}
}
names(alignment_datasets) <- name
if (save_tables_individually) {
filename <- paste0(e$output_folder, "/", "Alignment_", AAorNtAlignment, "_", "All_Data", ".txt")
write.table(alignment_allData, filename, sep = "\t", row.names = FALSE, col.names = TRUE)
}
confirm <- "Alignment run!"
result <- list(
"alignment_allData" = alignment_allData,
"alignment_datasets" = alignment_datasets,
"confirm" = confirm
)
# log time end and memory used
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cat(pryr::mem_used(), file = logFile, append = TRUE, sep = "\n")
return(result)
} else {
return(0)
}
}
######################################################################################################################################
mutations <- function(align, align_datasets, thr, AAorNtMutations, name, topNClono, FtopN, FclonoSeperately, cl, Fthr, thrClono, FthrSep = TRUE, thrSep) {
# logfile
cat(paste0("mutations", "\t"), file = logFile, append = TRUE)
cat(paste0(paste("region", AAorNtMutations, sep = ","), "\t"), file = logFile, append = TRUE)
cat(paste0(nrow(align), "\t"), file = logFile, append = TRUE)
cat(paste0(ncol(align), "\t"), file = logFile, append = TRUE)
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
if (FtopN) {
align <- dplyr::filter(align, cluster_id %in% c("-", as.character(seq_len(topNClono))))
}
if (Fthr) {
align <- align %>% dplyr::filter(align$freq_cluster_id >= thrClono | align$cluster_id == "-")
}
if (FclonoSeperately) {
cl_id <- paste0("_cluster_id_", cl)
if (FthrSep) {
align <- dplyr::filter(align, cluster_id %in% c("-", as.character(cl)))
} else {
align <- dplyr::filter(align, cluster_id %in% c("-", as.character(cl)))
}
} else {
cl_id <- ""
}
align[] <- lapply(align, factor) # the "[]" keeps the dataframe structure
colnames(align)[4:ncol(align)] <- paste0("X", colnames(align)[4:ncol(align)])
# Find IMGT-region from position
region_names <- c("FR1-IMGT", "CDR1-IMGT", "FR2-IMGT", "CDR2-IMGT", "FR3-IMGT", "CDR3-IMGT")
if (AAorNtMutations == "aa") {
index_1 <- c(1, 27, 39, 56, 66, 105)
index_2 <- c(26, 38, 55, 65, 104, 114)
start_g <- 85
IMGT_groups <- list(
Acidic = c("E", "D"), Basic = c("K", "R", "H"), Aromatic = c("F", "W", "Y"),
Non_polar_aliphatic = c("G", "A", "I", "V", "L", "P", "M"), Polar_non_charged = c("S", "T", "C", "N", "Q")
)
} else {
index_1 <- c(1, 79, 115, 166, 196, 313)
index_2 <- c(78, 114, 165, 195, 312, 342)
start_g <- 260
IMGT_groups <- list(transitions = c("a>g", "g>a", "c>t", "t>c"), transversions = c("a>c", "c>a", "g>t", "t>g", "c>g", "g>c", "a>t", "t>a"))
}
# for each position find the number of sequences that do not contain "-"
count <- 0
mutation_change_allData <- list()
level_counts <- c()
output <- c()
b <- by(
align, align$V.GENE.and.allele,
function(y) {
germline <<- which(y$cluster_id == "-")
germline <<- germline[1]
productive <<- which(y$cluster_id != "-")
if (length(productive) > 0 & length(germline)) {
mc <- c()
germ_length <- ncol(y) - length(which(y[germline, start_g:ncol(align)] == "."))
pos <- which((colSums(y[productive, 5:germ_length] != "-") / nrow(y)) > thr) # till germline length
for (i in pos) {
level_counts <<- plyr::count(y[productive, ], paste0("X", i))
letter_ <- which(level_counts[, 1] == "-")
letter_dot <- which(level_counts[, 1] == ".")
if (length(letter_) > 0 & length(letter_dot) > 0) {
index <- which(as.numeric(row.names(level_counts)) != letter_ & as.numeric(row.names(level_counts)) != letter_dot)
} else if (length(letter_) > 0 & length(letter_dot) == 0) {
index <- which(as.numeric(row.names(level_counts)) != letter_)
} else if (length(letter_) == 0 & length(letter_dot) > 0) {
index <- which(as.numeric(row.names(level_counts)) != letter_dot)
} else if (length(letter_) == 0 & length(letter_dot) == 0) index <- as.numeric(row.names(level_counts))
if (length(index) > 0) {
max_freq_id <- which(level_counts[index, 2] == max(level_counts[index, 2]))
new_row <- cbind(level_counts[index, ][max_freq_id, ], pos = i, germline = y[[paste0("X", i)]][germline])
colnames(new_row)[1] <- "X"
if (new_row[, 2][1] / nrow(y) > thr) {
mc <- rbind(mc, new_row)
}
}
}
mc$region <- c()
mc$germ_physico <- c()
mc$new_physico <- c()
mc$Change_in_physicochemical_properties <- c()
if (length(mc) > 0) {
for (r in seq_len(length(region_names))) {
mc[which(mc$pos >= index_1[r] & mc$pos <= index_2[r]), 5] <- region_names[r]
}
if (AAorNtMutations == "aa") {
for (r in names(IMGT_groups)) {
mc[which(mc$germline %in% IMGT_groups[[r]]), 6] <- r
mc[which(mc$X %in% IMGT_groups[[r]]), 7] <- r
}
mc$Change_in_physicochemical_properties <- paste0(mc$V6, "->", mc$V7)
} else {
for (r in names(IMGT_groups)) {
mc[which(paste0(mc$germline, ">", mc$X) %in% IMGT_groups[[r]]), 6] <- r
}
mc$Change_in_physicochemical_properties <- mc$V6
}
mc <- cbind(Gene = as.character(y$V.GENE.and.allele[germline]), Change_in_position = paste0(mc$germline, mc$pos, ">", mc$X), Region = mc$V5, Change_in_physicochemical_properties = mc$Change_in_physicochemical_properties, N = mc$freq, Freq = 100 * mc$freq / (nrow(y) - 1))
output <<- rbind(output, mc)
}
}
}
)
mutation_change_allData <- output
################################ for Separate datasets ###############################
mutation_change_datasets <- list()
one_run <- function(j) {
if (FtopN) {
align_datasets[[name[j]]] <- dplyr::filter(align_datasets[[name[j]]], cluster_id %in% c("-", as.character(seq_len(topNClono))))
}
if (FclonoSeperately) {
align_datasets[[name[j]]] <- dplyr::filter(align_datasets[[name[j]]], cluster_id %in% c("-", as.character(cl)))
}
align_datasets[[name[j]]][] <- lapply(align_datasets[[name[j]]], factor)
colnames(align_datasets[[name[j]]])[4:ncol(align_datasets[[name[j]]])] <- paste0("X", colnames(align_datasets[[name[j]]])[4:ncol(align_datasets[[name[j]]])])
# for each position find the number of sequences that do not contain "-"
count <- 0
output <- c()
mutation_change_temp <- list()
b <- by(
align_datasets[[name[j]]], align_datasets[[name[j]]]$V.GENE.and.allele,
function(y) {
germline <<- which(y[, "cluster_id"] == "-")
germline <<- germline[1]
productive <<- which(y[, "cluster_id"] != "-")
if (length(productive) > 0 & length(germline)) {
mc <- c()
germ_length <- ncol(y) - length(which(y[germline, start_g:ncol(align_datasets[[name[j]]])] == "."))
pos <- which((colSums(y[productive, 5:germ_length] != "-") / nrow(y)) > thr) # till germline length
for (i in pos) {
level_counts <- plyr::count(y[productive, ], paste0("X", i))
letter_ <- which(level_counts[, 1] == "-")
letter_dot <- which(level_counts[, 1] == ".")
if (length(letter_) > 0 & length(letter_dot) > 0) {
index <- which(as.numeric(row.names(level_counts)) != letter_ & as.numeric(row.names(level_counts)) != letter_dot)
} else if (length(letter_) > 0 & length(letter_dot) == 0) {
index <- which(as.numeric(row.names(level_counts)) != letter_)
} else if (length(letter_) == 0 & length(letter_dot) > 0) {
index <- which(as.numeric(row.names(level_counts)) != letter_dot)
} else if (length(letter_) == 0 & length(letter_dot) == 0) index <- as.numeric(row.names(level_counts))
if (length(index) > 0) {
max_freq_id <- which(level_counts[index, 2] == max(level_counts[index, 2]))
new_row <- cbind(level_counts[index, ][max_freq_id, ], pos = i, germline = y[[paste0("X", i)]][germline])
colnames(new_row)[1] <- "X"
if (new_row[, 2][1] / nrow(y) > thr) {
mc <- rbind(mc, new_row)
}
}
}
mc$region <- c()
mc$germ_physico <- c()
mc$new_physico <- c()
mc$Change_in_physicochemical_properties <- c()
if (length(mc) > 0) {
for (r in seq_len(length(region_names))) {
mc[which(mc$pos >= index_1[r] & mc$pos <= index_2[r]), 5] <- region_names[r]
}
if (AAorNtMutations == "aa") {
for (r in names(IMGT_groups)) {
mc[which(mc$germline %in% IMGT_groups[[r]]), 6] <- r
mc[which(mc$X %in% IMGT_groups[[r]]), 7] <- r
}
mc$Change_in_physicochemical_properties <- paste0(mc$V6, "->", mc$V7)
} else {
for (r in names(IMGT_groups)) {
mc[which(paste0(mc$germline, ">", mc$X) %in% IMGT_groups[[r]]), 6] <- r
}
mc$Change_in_physicochemical_properties <- mc$V6
}
mc <- cbind(Gene = as.character(y$V.GENE.and.allele[germline]), Change_in_position = paste0(mc$germline, mc$pos, ">", mc$X), Region = mc$V5, Change_in_physicochemical_properties = mc$Change_in_physicochemical_properties, N = mc$freq, Freq = 100 * mc$freq / (nrow(y) - 1))
output <<- rbind(output, mc)
}
}
}
)
mutation_change_datasets[[name[j]]] <- output
if (save_tables_individually) {
filename <- paste0(e$output_folder, "/", "Mutations_thr", thr, "_", AAorNtMutations, "_", name[j], cl_id, ".txt")
write.table(mutation_change_datasets[[name[j]]], filename, sep = "\t", row.names = FALSE, col.names = TRUE)
}
return(mutation_change_datasets[[name[j]]])
}
if (Sys.info()[1] == "Windows") {
mutation_change_datasets <- lapply(seq_len(length(name)), one_run)
} else {
mutation_change_datasets <- parallel::mclapply(seq_len(length(name)), one_run, mc.cores = num_of_cores, mc.preschedule = TRUE)
}
names(mutation_change_datasets) <- name
if (save_tables_individually) {
filename <- paste0(e$output_folder, "/", "Mutations_thr", thr, "_", AAorNtMutations, "_", "All_Data", cl_id, ".txt")
write.table(mutation_change_allData, filename, sep = "\t", row.names = FALSE, col.names = TRUE)
}
confirm <- "Mutations run!"
result <- list("mutation_change_allData" = mutation_change_allData, "mutation_change_datasets" = mutation_change_datasets, "confirm" = confirm)
# log time end and memory used
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cat(pryr::mem_used(), file = logFile, append = TRUE, sep = "\n")
return(result)
}
######################################################################################################################################
# input: the alignment file
groupedAlignment <- function(alignment_allData, alignment_datasets, name, AAorNtAlignment) {
# logfile
cat(paste0("groupedAlignment", "\t"), file = logFile, append = TRUE)
cat(paste0("grouping", "\t"), file = logFile, append = TRUE)
cat(paste0(nrow(alignment_allData), "\t"), file = logFile, append = TRUE)
cat(paste0(ncol(alignment_allData), "\t"), file = logFile, append = TRUE)
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
input <- data.table(alignment_allData)
XColumns <- seq_len((ncol(alignment_allData) - 6))
XColumns <- as.character(XColumns)
XColumns <- c("V.GENE.and.allele", "cluster_id", "freq_cluster_id", XColumns)
distinct_changes <- input[, list(Freq = .N), by = XColumns]
grouped_alignment_allData <- cbind(N = distinct_changes$Freq, distinct_changes[, 1:(ncol(distinct_changes) - 1)])
grouped_alignment_datasets <- list()
one_run <- function(j) {
input <- data.table(alignment_datasets[[name[j]]])
distinct_changes <- input[, list(Freq = .N), by = XColumns]
grouped_alignment_datasets[[name[j]]] <- cbind(N = distinct_changes$Freq, distinct_changes[, 1:(ncol(distinct_changes) - 1)])
if (save_tables_individually) {
filename <- paste0(e$output_folder, "/", "Grouped Alignment_", AAorNtAlignment, "_", "All_Data", ".txt")
write.table(grouped_alignment_datasets[[name[j]]], filename, sep = "\t", row.names = FALSE, col.names = TRUE)
}
if (save_tables_individually) {
filename <- paste0(e$output_folder, "/", "Grouped Alignment_", AAorNtAlignment, "_", name[j], ".txt")
write.table(grouped_alignment_datasets[[name[j]]], filename, sep = "\t", row.names = FALSE, col.names = TRUE)
}
return(grouped_alignment_datasets[[name[j]]])
}
if (Sys.info()[1] == "Windows") {
grouped_alignment_datasets <- lapply(seq_len(length(name)), one_run)
} else {
grouped_alignment_datasets <- parallel::mclapply(seq_len(length(name)), one_run, mc.cores = num_of_cores, mc.preschedule = TRUE)
}
names(grouped_alignment_datasets) <- name
if (save_tables_individually) {
filename <- paste0(e$output_folder, "/", "Grouped Alignment_", AAorNtAlignment, "_", "All_Data", ".txt")
write.table(grouped_alignment_allData, filename, sep = "\t", row.names = FALSE, col.names = TRUE)
}
confirm <- "Grouped Alignment run!"
result <- list("grouped_alignment_allData" = grouped_alignment_allData, "grouped_alignment_datasets" = grouped_alignment_datasets, "confirm" = confirm)
# log time end and memory used
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cat(pryr::mem_used(), file = logFile, append = TRUE, sep = "\n")
return(result)
}
######################################################################################################################################
find_cdr3_diff1P <- function(allData, max_length_cdr3, position, name) {
# logfile
cat(paste0("find_cdr3_diff1P", "\t"), file = logFile, append = TRUE)
cat(paste0("max length ", max_length_cdr3, ",", "Position ", position, "\t"), file = logFile, append = TRUE)
cat(paste0(nrow(allData), "\t"), file = logFile, append = TRUE)
cat(paste0(ncol(allData), "\t"), file = logFile, append = TRUE)
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
################## Clonotypes ################
used_columns <- e$used_columns
gene <- used_columns[["Summary"]][3]
junction <- used_columns[["Summary"]][18]
unique_genes <- unique(allData[[gene]])
cdr3_diff1P_allData <- c()
for (i in seq_len(length(unique_genes))) {
gene_name <- unique_genes[i]
allData_temp <- allData %>% dplyr::filter(allData[[gene]] == gene_name)
distinctVGenes_CDR3 <- allData_temp %>%
dplyr::group_by(JUNCTION = allData_temp[[junction]]) %>%
dplyr::summarise(Freq = n())
distinctVGenes_CDR3 <- cbind(distinctVGenes_CDR3, cluster_id = row.names(distinctVGenes_CDR3))
clono_datasets <- list()
for (j in seq_len(length(name))) {
data <- allData_temp %>% dplyr::filter(allData_temp$dataName == name[j])
clono_datasets[[name[j]]] <- data %>%
dplyr::group_by(JUNCTION = data[[junction]]) %>%
dplyr::summarise(Freq = n())
clono_datasets[[name[j]]] <- cbind(clono_datasets[[name[j]]], cluster_id = row.names(clono_datasets[[name[j]]]))
}
################## Find the CDR3 that have difference only at the positions 8 and 9 with P ###############
sub_groups <- c()
sub_groups2 <- c()
used_rows <- c()
cdr3_split <- strsplit(distinctVGenes_CDR3$JUNCTION, "")
cdr3_split_new <- c()
k <- 0
for (j in seq_len(length(cdr3_split))) {
if (length(cdr3_split[[j]]) == max_length_cdr3) {
k <- k + 1
used_rows <- c(used_rows, j)
cdr3_split_new[[k]] <- cdr3_split[[j]]
}
if (length(cdr3_split[[j]]) == (max_length_cdr3 - 1)) {
# shift right from the position 8 till the end
k <- k + 1
cdr3_split[[j]][(position + 1):max_length_cdr3] <- cdr3_split[[j]][position:(max_length_cdr3 - 1)]
cdr3_split_new[[k]] <- cdr3_split[[j]]
used_rows <- c(used_rows, j)
}
distinctVGenes_CDR3 <- distinctVGenes_CDR3[used_rows, ]
}
cdr3_split_new <- as.data.frame(cdr3_split_new)
cdr3_split_new <- as.data.frame(t(cdr3_split_new), stringsAsFactors = FALSE)
row.names(cdr3_split_new) <- NULL
if (nrow(cdr3_split_new) > 0) {
# Find distinct cdr3_split_new
if (nrow(distinct(cdr3_split_new)) < nrow(cdr3_split_new)) {
cdr3_split_new <- data.table(cdr3_split_new)
VColumns <- colnames(cdr3_split_new)
temp <- cdr3_split_new[, list(Freq_sub_cluster = .N), by = VColumns]
v_cdr3_cluster_id <- c()
for (j in seq_len(nrow(distinctVGenes_CDR3))) {
v_cdr3_cluster_id[j] <- which((do.call(paste0, cdr3_split_new[j, ]) == do.call(paste0, temp[, 1:max_length_cdr3])))
}
multiple_objects_id <- c()
for (j in seq_len(length(v_cdr3_cluster_id))) {
if (length(which(v_cdr3_cluster_id == v_cdr3_cluster_id[j])) > 1) multiple_objects_id <- c(multiple_objects_id, j)
}
# Combine to one data frame
cdr3_split_cluster_id <- cbind.data.frame(cdr3_split_new, v_cdr3_cluster_id, stringsAsFactors = FALSE)
result <- cbind(cluster_id = distinctVGenes_CDR3$cluster_id[multiple_objects_id], JUNCTION = distinctVGenes_CDR3$JUNCTION[multiple_objects_id], Freq = distinctVGenes_CDR3$Freq[multiple_objects_id])
result <- as.data.frame(result, stringsAsFactors = FALSE)
a <- distinctVGenes_CDR3[multiple_objects_id, ]
a <- cbind(gene_name, a)
result <- a[order(a[, "JUNCTION"]), ]
cdr3_diff1P_allData <- rbind(cdr3_diff1P_allData, result)
}
}
}
################################### Separate Datasets ######################################
cdr3_diff1P_datasets <- list()
for (n in seq_len(length(name))) {
data_dataset <- allData %>% dplyr::filter(allData$dataName == name[n])
unique_genes <- unique(data_dataset[[gene]])
cdr3_diff1P_datasets[[name[n]]] <- c()
for (i in seq_len(length(unique_genes))) {
gene_name <- unique_genes[i]
allData_temp <- data_dataset %>% dplyr::filter(data_dataset[[gene]] == gene_name)
distinctVGenes_CDR3 <- allData_temp %>%
dplyr::group_by(JUNCTION = allData_temp[[junction]]) %>%
dplyr::summarise(Freq = n())
distinctVGenes_CDR3 <- cbind(distinctVGenes_CDR3, cluster_id = row.names(distinctVGenes_CDR3))
clono_datasets <- list()
for (j in seq_len(length(name))) {
data <- allData_temp %>% dplyr::filter(allData_temp$dataName == name[j])
clono_datasets[[name[j]]] <- data %>%
dplyr::group_by(JUNCTION = data[[junction]]) %>%
dplyr::summarise(Freq = n())
clono_datasets[[name[j]]] <- cbind(clono_datasets[[name[j]]], cluster_id = row.names(clono_datasets[[name[j]]]))
}
################## Find the CDR3 that have difference only at the positions 8 and 9 with P ###############
sub_groups <- c()
sub_groups2 <- c()
used_rows <- c()
cdr3_split <- strsplit(distinctVGenes_CDR3$JUNCTION, "")
cdr3_split_new <- c()
k <- 0
for (j in seq_len(length(cdr3_split))) {
if (length(cdr3_split[[j]]) == max_length_cdr3) {
k <- k + 1
used_rows <- c(used_rows, j)
cdr3_split_new[[k]] <- cdr3_split[[j]]
}
if (length(cdr3_split[[j]]) == (max_length_cdr3 - 1)) {
# shift right from the position 8 till the end
k <- k + 1
cdr3_split[[j]][(position + 1):max_length_cdr3] <- cdr3_split[[j]][position:(max_length_cdr3 - 1)]
cdr3_split_new[[k]] <- cdr3_split[[j]]
used_rows <- c(used_rows, j)
}
distinctVGenes_CDR3 <- distinctVGenes_CDR3[used_rows, ]
}
cdr3_split_new <- as.data.frame(cdr3_split_new)
cdr3_split_new <- as.data.frame(t(cdr3_split_new), stringsAsFactors = FALSE)
row.names(cdr3_split_new) <- NULL
if (nrow(cdr3_split_new) > 0) {
# Find distinct cdr3_split_new
if (nrow(distinct(cdr3_split_new)) < nrow(cdr3_split_new)) {
cdr3_split_new <- data.table(cdr3_split_new)
VColumns <- colnames(cdr3_split_new)
temp <- cdr3_split_new[, list(Freq_sub_cluster = .N), by = VColumns]
v_cdr3_cluster_id <- c()
for (j in seq_len(nrow(distinctVGenes_CDR3))) {
v_cdr3_cluster_id[j] <- which((do.call(paste0, cdr3_split_new[j, ]) == do.call(paste0, temp[, 1:max_length_cdr3])))
}
multiple_objects_id <- c()
for (j in seq_len(length(v_cdr3_cluster_id))) {
if (length(which(v_cdr3_cluster_id == v_cdr3_cluster_id[j])) > 1) multiple_objects_id <- c(multiple_objects_id, j)
}
# Combine to one data frame
cdr3_split_cluster_id <- cbind.data.frame(cdr3_split_new, v_cdr3_cluster_id, stringsAsFactors = FALSE)
result <- cbind(cluster_id = distinctVGenes_CDR3$cluster_id[multiple_objects_id], JUNCTION = distinctVGenes_CDR3$JUNCTION[multiple_objects_id], Freq = distinctVGenes_CDR3$Freq[multiple_objects_id])
result <- as.data.frame(result, stringsAsFactors = FALSE)
a <- distinctVGenes_CDR3[multiple_objects_id, ]
a <- cbind(gene_name, a)
result <- a[order(a[, "JUNCTION"]), ]
cdr3_diff1P_datasets[[name[n]]] <- rbind(cdr3_diff1P_datasets[[name[n]]], result)
}
}
}
}
confirm <- "CDR3 1 diff length run!"
result <- list("cdr3_diff1P_allData" = cdr3_diff1P_allData, "cdr3_diff1P_datasets" = cdr3_diff1P_datasets, "confirm" = confirm)
# log time end and memory used
cat(paste0(Sys.time(), "\t"), file = logFile, append = TRUE)
cat(pryr::mem_used(), file = logFile, append = TRUE, sep = "\n")
return(result)
}
######################################################################################################################################
addRepertoryFct <- function(id, btn) {
insertUI(
selector = "#placeholderRepertories",
ui = tags$div(
selectInput(btn, "Select type:", c(
"V Gene", "V Gene and allele",
"J Gene", "J Gene and allele",
"D Gene", "D Gene and allele"
), width = "170px"),
id = id
)
)
}
######################################################################################################################################
addMultipleValues <- function(id, btn, columns_for_Multiple_value_comparison, default_val1 = NULL, default_val2 = NULL) {
insertUI(
selector = "#placeholder",
## wrap element in a div with id for ease of removal
ui = tags$div(
# forloop for columns
div(style = "display:inline-block", selectInput(paste0("select_MultipleValues_column1_", btn), "Select 1st column:", columns_for_Multiple_value_comparison, selected = default_val1, width = "170px")),
div(style = "display:inline-block", selectInput(paste0("select_MultipleValues_column2_", btn), "Select 2nd column:", columns_for_Multiple_value_comparison, selected = default_val2, width = "170px")),
id = id
)
)
}
######################################################################################################################################
# Set up a button to have an animated loading indicator and a checkmark
# for better user experience
# Need to use with the corresponding `withBusyIndicator` server function
withBusyIndicatorUI <- function(button) {
id <- button[["attribs"]][["id"]]
div(
`data-for-btn` = id,
button,
span(
class = "btn-loading-container",
hidden(
img(src = "www/ajax-loader-bar.gif", class = "btn-loading-indicator"),
icon("check", class = "btn-done-indicator")
)
),
hidden(div(
class = "btn-err",
div(
icon("exclamation-circle"),
tags$b("Error: "),
span(class = "btn-err-msg")
)
))
)
}
######################################################################################################################################
# Call this function from the server with the button id that is clicked and the
# expression to run when the button is clicked
withBusyIndicatorServer <- function(buttonId, expr) {
# UX stuff: show the "busy" message, hide the other messages, disable the button
loadingEl <- sprintf("[data-for-btn=%s] .btn-loading-indicator", buttonId)
doneEl <- sprintf("[data-for-btn=%s] .btn-done-indicator", buttonId)
errEl <- sprintf("[data-for-btn=%s] .btn-err", buttonId)
shinyjs::disable(buttonId)
shinyjs::show(selector = loadingEl)
shinyjs::hide(selector = doneEl)
shinyjs::hide(selector = errEl)
on.exit({
shinyjs::enable(buttonId)
shinyjs::hide(selector = loadingEl)
})
# Try to run the code when the button is clicked and show an error message if
# an error occurs or a success message if it completes
tryCatch(
{
value <- expr
shinyjs::show(selector = doneEl)
shinyjs::delay(2000, shinyjs::hide(selector = doneEl, anim = TRUE, animType = "fade", time = 0.5))
value
},
error = function(err) {
errorFunc(err, buttonId)
}
)
}
# When an error happens after a button click, show the error
errorFunc <- function(err, buttonId) {
errEl <- sprintf("[data-for-btn=%s] .btn-err", buttonId)
errElMsg <- sprintf("[data-for-btn=%s] .btn-err-msg", buttonId)
errMessage <- gsub("^ddpcr: (.*)", "\\1", err$message)
shinyjs::html(html = errMessage, selector = errElMsg)
shinyjs::show(selector = errEl, anim = TRUE, animType = "fade")
}
appCSS <- "
.btn-loading-container {
margin-left: 10px;
font-size: 1.2em;
}
.btn-done-indicator {
color: green;
}
.btn-err {
margin-top: 10px;
color: red;
}
"
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.