knitr::opts_chunk$set(echo = TRUE)
library(ape) library(plyr) library(tidyr) library(stringr) library(readr) library(profvis) library(microbenchmark) library(CompareTools)
file_path <- system.file("extdata", "Dolloout", package = "CompareTools") files <- file_path tree_path <- system.file("extdata", "71g_species.tree", package = "CompareTools") tree <- read.tree(tree_path) group_path <- system.file("extdata", "groups.txt", package = "CompareTools") groups <- group_path #groups <- read_tsv(group_path) # groups <- NULL # partitioning <- TRUE # part_name <- "probe"
probe1 <- DolloImport3(tree = tree, files = files) microbenchmark(DolloImport3(tree = tree, files = files), times = 10)
group_df <- read_tsv(groups) cl_vector <- as.character(group_df$cluster) colnames(probe1[[2]][[1]])[1:50]
Test Slice function
probe1_sliced <- SliceComp(compare = probe1, clusters = colnames(probe1[[2]][[1]])[1:50], data = 2) probe1_sliced2 <- SliceComp(compare = probe1, clusters = colnames(probe1[[2]][[1]])[1:50], overwrite = FALSE) probe1_sliced3 <- SliceComp(compare = probe1_sliced2, clusters = colnames(probe1[[2]][[1]])[1:10], data = c(2:3), overwrite = TRUE)
Test transform
probe_t <- TransformComp(probe1, builtin = "NetCumGain") probe_t2 <- TransformComp(probe_t, builtin = "CumGain") probe_t3 <- TransformComp(probe_t, builtin = "CumLoss") identical(probe_t2$event_data$CumGain[,1] - probe_t3$event_data$CumLoss[,1], probe_t$event_data$NetCumGain[,1]) # hozzunk letre megegy eventdata-t probe2 <- SliceComp(compare = probe1, clusters = colnames(probe_t[[2]][[1]])[1:50], overwrite = FALSE) probe_t <- TransformComp(probe2, builtin = "NetCumGain") probe_t2 <- TransformComp(probe2, builtin = "CumGain") probe_t3 <- TransformComp(probe2, builtin = "CumLoss") identical(probe_t2$event_data$CumGain[,1] - probe_t3$event_data$CumLoss[,1], probe_t$event_data$NetCumGain[,1]) identical(probe_t2$event_data$CumGain[,1:50], probe_t2$event_data_SLICED$CumGain)
Test group
grouped_c <- GroupComp(probe1_sliced2, groups = group_df) grouped_c2 <- GroupComp(probe1_sliced2, groups = group_df, event_data = c(2,3))
Test plot event
PlotEvent(grouped_c2, dataset = "event_data_GROUPED", stat = "gains", data = c(1:3), sizemeth = "standard", size = 8, label.offset = 0.1, cex = 0.8)
probe1 <- GroupComp(probe1, groups = data.frame(colnames(probe1$event_data$gains), "all"), "event_data") PlotEvent(probe1, "event_data_GROUPED", "gains", data = 1, sizemeth = "standard", size = 8, label.offset = 0.1, cex = 0.8)
gr <- CorComp(probe1, dataset = "event_data", "gains", plot = FALSE, group_n = 5) probe2 <- GroupComp(probe1, groups = gr, "event_data") names(probe2)[4] <- "gr" PlotEvent(probe2, "event_data_GROUPED", "gains", data = 1, sizemeth = "standard", size = 8, label.offset = 0.1, cex = 0.8) PlotEvent(probe2, "gr", "gains", data = 1:5, sizemeth = "standard", size = 8, label.offset = 0.1, cex = 0.8) CorComp(probe1, dataset = "event_data", "gains", plot = TRUE, group_n = 5)
file_path <- system.file("extdata", "Dolloout_all_cluster", package = "CompareTools") files <- file_path tree_path <- system.file("extdata", "71g_species.tree", package = "CompareTools") tree <- read.tree(tree_path) group_path <- system.file("extdata", "groups.txt", package = "CompareTools") groups <- group_path
probe1 <- DolloImport3(tree = tree, files = files) probe1 <- GroupComp(probe1, groups = data.frame(colnames(probe1$event_data$gains), "all"), "event_data") PlotEvent(probe1, "event_data_GROUPED", "gains", data = 1, sizemeth = "standard", size = 8, label.offset = 0.1, cex = 0.8)
CorComp(probe1, dataset = "event_data", "gains", plot = TRUE, heatmap = FALSE, members = 1000)
probe1 <- DolloImport2(tree = tree, partitioning = FALSE, files = files, part_name = "every") d <- probe1$raw_data a <- probe1$every$all_cluster probe2 <- retrieve_event2(probe1, partitioning = TRUE, part_name = "all_cl")
microbenchmark(DolloImport2(tree = tree, partitioning = TRUE, files = files, part_name = "every"), times = 10)
Tehat mean 1.34 sec volt a futasa a kodnak....
profvis(DolloImport2(tree = tree, partitioning = FALSE, files = files, part_name = "all"))
profvis alapjan 1360 ms-ba telik az eges es ebbol 1310 a retrieve event fuggveny a retrieve fuggvenyen belul 110 + 70 ms volt az ains <- str_count(raw_data[,1], paste0(cluster_names[i], "/\d")) es a losses <- str_count(raw_data[,2], paste0(cluster_names[i], "/\d")) sorok
maga egy sor 1000 ms:
for(j in 1:node_n){
path_temp <- path[j]
v <- as.numeric(unlist(str_split(path_temp, ";")))
copy_num1[j] <- sum(all_cl_sep[[k]][v, 1])
copy_num2[j] <- sum(all_cl_sep[[k]][v, 2])
}
ezen belul is a v <- as.numeric(unlist(str_split(path_temp, ";"))) sor a leghosszabb...
akkor ezt a sort lecserelem erre:
v <- as.numeric(str_split(path_temp, ";", simplify = TRUE)[1,])
microbenchmark(DolloImport2(tree = tree, partitioning = TRUE, files = files, part_name = "every"), times = 10)
na most atlagba 1.36...
profvis(DolloImport2(tree = tree, partitioning = TRUE, files = files, part_name = "every"))
ugyan ott hasonlo ido...ez olyan volt mint halottnak a csok..
ok akkor most a belso loopbol kivettem a str kibontogatast:
node_ls <- vector("list", node_n) for(j in 1:node_n){ path_temp <- path[j] node_ls[[j]] <- as.numeric(str_split(path_temp, ";", simplify = TRUE)[1,]) }
for(k in 1:length(all_cl_sep)){ copy_num1 <- vector(mode = "numeric", length = node_n) copy_num2 <- vector(mode = "numeric", length = node_n) for(j in 1:node_n){ copy_num1[j] <- sum(all_cl_sep[[k]][node_ls[[j]], 1]) copy_num2[j] <- sum(all_cl_sep[[k]][node_ls[[j]], 2]) } all_cl_sep[[k]] <- cbind(all_cl_sep[[k]], copy_num1) all_cl_sep[[k]] <- cbind(all_cl_sep[[k]], copy_num2) all_cl_sep[[k]] <- unname(all_cl_sep[[k]]) }
microbenchmark(DolloImport2(tree = tree, partitioning = TRUE, files = files, part_name = "every"), times = 10)
hoppa javult! most mar csak 384 ms!! 3x gyorsabb lett!
profvis(DolloImport2(tree = tree, partitioning = TRUE, files = files, part_name = "every"))
Na akkor nezzuk meg hogy a zsolttol kapott nagyobb adatsor milyen gyorsan megy le:
tree <- read.tree("~/Documents/MUNKA/PhD/Enikonek/zsolttol/species_bigdata.tree") tree <- LadderizeTree(tree = tree) microbenchmark(DolloImport2(tree = tree, partitioning = TRUE, files = "~/Documents/MUNKA/PhD/Enikonek/zsolttol/Dolloout_bigdata", part_name = "every"), times = 10)
Ez mar sokkal rosszabbul hangzik mert 34 s az atlagos futasi ido......
profvis(DolloImport2(tree = tree, partitioning = TRUE, files = "~/Documents/MUNKA/PhD/Enikonek/zsolttol/Dolloout_bigdata", part_name = "every"))
na profvis alapjan latszik hogy a kodban ket resz van ami bottleneck lehet es az egyik fa nagysag fuggo a masik pedig cluster fuggo:
Ez a resz a fa nagysagatol fuggoen fog lassabban vagy gyorsabban futni:
for(k in 1:length(all_cl_sep)){ copy_num1 <- vector(mode = "numeric", length = node_n) copy_num2 <- vector(mode = "numeric", length = node_n) for(j in 1:node_n){ copy_num1[j] <- sum(all_cl_sep[[k]][node_ls[[j]], 1]) copy_num2[j] <- sum(all_cl_sep[[k]][node_ls[[j]], 2]) }
Ez a resz pedig a clusterek szamatol fogguen fog lassabban vagy gyorsabban futni.
cluster_names <- na.omit(unique(str_split(unique(unlist(str_split(raw_data, pattern = " "))), pattern = "/", simplify = TRUE)[,1]))
all_cl_sep <- vector("list", length = length(cluster_names))
for(i in seq_along(cluster_names)){
gains <- str_count(raw_data[,1], paste0(cluster_names[i], "/\d"))
losses <- str_count(raw_data[,2], paste0(cluster_names[i], "/\d"))
gains[is.na(gains)] <- 0
losses[is.na(losses)] <- 0
all_cl_sep[[i]] <- matrix(c(gains, losses), nrow = node_n, ncol = 2)
}
probe2 <- DolloImport2(tree = tree, partitioning = TRUE, files = "~/Documents/MUNKA/PhD/Enikonek/zsolttol/Dolloout_bigdata", part_name = "every")
tree, files = NULL, partitioning = FALSE, groups = NULL, part_name = NULL) if(is.null(files)){ stop("You need to give a path with the file name!") } #import raw data raw_file <- read_lines(files) rows_needed <- c(which(str_detect(raw_file, "GAIN") == TRUE), which(str_detect(raw_file, "LOSSES") == TRUE)) raw_df <- raw_file[rows_needed] raw_df2 <- str_split(raw_df, "\t") raw_df2 <- ldply(raw_df2, rbind) raw_df2 <- raw_df2[,-c(2,4)] colnames(raw_df2) <- c("event", "node", "chars") # itt eloszor le kene gyartani az objektumot a tree-vel es a raw file-al #match the phylo object nodes and the compare output nodes: node_def <- which(str_detect(raw_file, "node\\d") == TRUE) node_df <- raw_file[node_def] node_df2 <- str_split(node_df, "\t") node_df2 <- ldply(node_df2, rbind) colnames(node_df2) <- c("nodes", "species") node_df2$nodes <- str_replace(node_df2$nodes, "node", "") node_df2$species <- str_sub(node_df2$species, start = 2) # spacet kitakaritom dollo_species <- unique(unlist(sapply(node_df2$species, function(x) str_split(x, " ")))) if(any(dollo_species %in% tree$tip.label == FALSE)){ wrong_names_d <- which(dollo_species %in% tree$tip.label == FALSE) warning(paste0("The following species names don't match!\t", "Species in Dollo file:\t", dollo_species[wrong_names])) } node_df2$tree_nodes <- rep(NA, nrow(node_df2)) for(i in 1:nrow(node_df2)){ species <- unlist(str_split(node_df2[i,"species"], " ")) if(length(species) > 1){ node_df2[i, "tree_nodes"] <- getMRCA(tree, species) }else{ node_df2[i, "tree_nodes"] <- which(tree$tip.label == species) } } compare_ls <- list() compare_ls[[1]] <- list() compare_ls[[1]] <- tree #create the raw matrix node_df2$gain_event <- rep(NA, nrow(node_df2)) node_df2$loss_event <- rep(NA, nrow(node_df2)) gain_split <- raw_df2[raw_df2$event == "GAIN", ] loss_split <- raw_df2[raw_df2$event == "LOSSES", ] node_df2$gain_event <- gain_split$chars[match(node_df2$nodes, gain_split$node)] node_df2$loss_event <- loss_split$chars[match(node_df2$nodes, loss_split$node)] #node_df2 <- node_df2[,-c(1,2)] node_df2 <- node_df2[order(node_df2$tree_nodes),] sp <- node_df2$species match(sp[1:71], tree$tip.label) extract.clade(tree, 109)$tip.label raw_matrix <- as.matrix(node_df2[,2:3]) raw_matrix <- unname(raw_matrix) compare_ls[[2]] <- raw_matrix names(compare_ls) <- c("tree", "raw_data") # use the retrieve_event function events_ls <- retrieve_event2(compare = compare_ls, partitioning = partitioning, groups = groups, part_name = part_name) return(events_ls)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.