for(m in 1:5) {
# feature_matrix %>%
# select_if(function(x) any(is.na(x))) %>%
# summarise_each(funs(sum(is.na(.)))) -> extra_NA
cat(m, "\n")
d = distmix(feature_matrix %>% dplyr::select(-1) %>% as.data.frame(),
method = methods[m], idnum = idnum, idbin = idbin)
assign(paste0("d_", methods[m]), d)
# gower_dist <- cluster::daisy(feature_matrix[, -1],
# metric = "gower")
}
ade4::mantel.rtest(as.dist(d_gower), as.dist(d_gower))
vegan::mantel(as.dist(d_gower), as.dist(d_wishart))
vegan::mantel(as.dist(d_gower[1:1000,1:1000]), as.dist(d_gower[1:1000,1:1000]))
#toplot = feature_matrix
feature_matrix_df = feature_matrix %>% dplyr::select(-1) %>% as.data.frame()
feature_matrix_bin = trait.continuous2binary(feature_matrix)
feature_matrix_bin_df = feature_matrix_bin %>% dplyr::select(-id) %>% as.data.frame()
range = 1:ncol(feature_matrix_bin_df)
cor_matrix = cor(feature_matrix_bin_df[, range])
cor_melted = melt_dist(cor_matrix)
pdf(height = 6, width = 12,
file.path(base, paste0(dataset, ".", "trait_matrixatgranularity3", "_traitcorr-hist.pdf")))
h = hist(cor_melted[,"dist"], breaks = seq(-0.6, 1, 0.1), xlab = "trait-trait correlation", ylab = "count", main = "", cex.lab = 1.5,xaxt= "n")
axis(side=1, at = c(seq(-0.6, 0, 0.1), seq(0.1, 1, 0.1)), labels = c(seq(-0.6, 0, 0.1), seq(0.1, 1, 0.1)))
dev.off()
write.table(cor_melted,
file = file.path(base, paste0(dataset, ".", "trait_matrixatgranularity3", "_traitcorr.xls")),
row.names = F, col.names = T, sep = "\t", quote = F)
# cluster soil genomes
# Aquatic: 4698, Terrestrial: 3693, Plants: 376
feature_matrix = genomeset_results_wmetadata[["trait_matrixatgranularity3"]] %>%
dplyr::filter(`Ecosystem_Category` %in% c("Terrestrial", "Plants")) %>%
dplyr::filter(`Ecosystem_Type` %in% c("Soil", "Rhizoplane", "Rhizosphere", "Roots")) %>%
dplyr::select(-`Resource Use:Chemotrophy:chemolithoautotrophy:anaerobic ammonia oxidation`) %>%
dplyr::select(c(1:189))
feature_matrix_bin = trait.continuous2binary(feature_matrix)
feature_matrix_bin_df = feature_matrix_bin %>% dplyr::select(-id) %>% as.data.frame()
saveRDS(feature_matrix_bin_df, file.path(base, paste0(dataset, ".", "feature_matrix_bin_soil_df.rds")))
compute.prevalence.help
ngenomes = nrow(feature_matrix_bin)
#ngenomes = 200
toplot = feature_matrix_bin %>% dplyr::select(-id) %>% slice(1:ngenomes) %>% as.data.frame()
saveRDS(toplot, file.path(base, paste0(dataset, ".", "toplot.rds")))
pheatmap.outlist = list()
# all: k=277; 70.19208
# soil: k=192; 70.01773
pheatmapout = pheatmap(toplot,
#kmeans_k = k, iter.max = 10, nstart = 25,
cutree_rows = 192, cutree_cols=15,
color = c("white", "red"), breaks = c(0,0.9,1),
show_rownames = F, show_colnames = T,
treeheight_row = 150, treeheight_col = 200,
#cellwidth = 1, cellheight = 1, labels_row = NA, border_color = NA, treeheight_row = 50,
height = 80, width = 30,
fontsize_row = 6, fontsize_col = 8, angle_col = 90, border_color = "black",
cluster_cols = T, clustering_distance_rows = "binary",
#annotation_row = annotation_row, annotation_colors = annotation_colors,
silent = T, legend = F,
filename = file.path(base, paste0(dataset, ".alltraits.heatmap.pdf"))
)
hc = hclust(dist(toplot, method = "binary"), method = "complete")
v = cutree(hc, 192)[hc$order]
table(v)
cutree_rows
#c(totss = pheatmapout$kmeans$totss,
# tot.withinss = pheatmapout$kmeans$tot.withinss,
# betweenss = pheatmapout$kmeans$betweenss)
#pheatmapout$kmeans$cluster
for(k in 2:ngenomes) {
cat(k, "\n")
temp = kmeans(toplot, k, iter.max = 10, nstart = 25)
between_over_tot = temp$betweenss/temp$totss*100
pheatmap.outlist[[as.character(k)]][["ss"]] = c(totss = temp$totss,
tot.withinss = temp$tot.withinss,
betweenss = temp$betweenss)
pheatmap.outlist[[as.character(k)]][["cluster"]] = temp$kmeans$cluster
}
p = factoextra::fviz_nbclust(toplot, kmeans, method = "wss", k.max = 100)
cluster::clusGap()
between_SS / total_SS
# gap_stat <- cluster::clusGap(toplot, FUN = kmeans, iter.max = 10, nstart = 25,
# K.max = 200, B = 20)
# factoextra::fviz_gap_stat(gap_stat)
temp = feature_matrix_bin_df[1:200,]
d = vegan::vegdist(temp)
M1 = cor(mtcars)
corrplot(M1, method = 'color', diag = FALSE, type = 'upper', tl.cex = 9) # colorful number
detach("package:pheatmap", unload=TRUE)
draw_colnames_45 <- function (coln, gaps, ...) {
coord <- pheatmap:::find_coordinates(length(coln), gaps)
x <- coord$coord - 0.5 * coord$size
res <- grid::textGrob(
coln, x = x, y = unit(1, "npc") - unit(3,"bigpts"),
vjust = 0.75, hjust = 1, rot = 45, gp = grid::gpar(...)
)
return(res)
}
assignInNamespace(
x = "draw_colnames",
value = "draw_colnames_45",
#value = "draw_colnames",
ns = asNamespace("pheatmap")
)
pheatmap(
mat = mat,
color = inferno(length(mat_breaks) - 1),
breaks = mat_breaks,
border_color = NA,
cluster_cols = mat_cluster_cols,
cluster_rows = mat_cluster_rows,
cellwidth = 20,
show_colnames = TRUE,
show_rownames = FALSE,
annotation_col = mat_col,
annotation_colors = mat_colors,
drop_levels = TRUE,
fontsize = 14,
main = "Rotated Column Names"
)
alltraits.dist = vegan::vegdist(feature_matrix, method = "jaccard")
toplot1 = toplot %>%
dplyr::mutate_at(vars(!starts_with("id")),
#funs(case_when(. >= 1 ~ 1,TRUE ~ 0))) %>%
.funs = list(binary = ~case_when(. >= 22 ~ 1,TRUE ~ 0)))
compute.prevalence.help <- function(data) {
prevalence = data %>% tibble::as_tibble() %>%
# dplyr::select(c("id",
# traits_listbygranularity[[3]] %>%
# #dplyr::filter(`microtrait_trait-type` == "binary") %>%
# dplyr::pull(`microtrait_trait-name`))) %>%
dplyr::mutate_at(vars(!starts_with("id")),
funs(case_when(. >= 1 ~ 1,TRUE ~ 0))) %>%
tidyr::pivot_longer(!id, names_to = "trait", values_to = "presence") %>%
dplyr::select(-id) %>%
dplyr::count(trait, presence,.drop=FALSE) %>%
dplyr::group_by(trait) %>% #methaneoxidation_genes = c("mmoX", "mmoY", "mmoZ", "mmoC", "mmoD", "pmoA-amoA", "pmoB-amoB", "pmoC-amoC")
dplyr::mutate(percent = n/sum(n)*100) %>%
dplyr::filter(`presence` == 1) %>%
# dplyr::mutate(`percent` = 100-`percent`) %>%
dplyr::select(c("trait", "n", "percent"))
prevalence
}
#####cori
toplot = readRDS("organisms_habitatfiltered_matchedbyanalysis_wIMGTaxonID.feature_matrix_bin_soil_df.rds")
compute_variance_results = parallel::mclapply(2:300,
function(i) {
returnList = compute_variance(toplot, i)
returnList
},
mc.cores = 56)
#compute_variance_results[[50]][[1]]$ss
compute_variance = function(data, k) {
temp = kmeans(toplot, k, iter.max = 10, nstart = 25)
result = list()
result[[as.character(k)]][["ss"]] = c(totss = temp$totss,
tot.withinss = temp$tot.withinss,
betweenss = temp$betweenss,
betweenovertot_percent = temp$betweenss/temp$totss*100
)
result[[as.character(k)]][["cluster"]] = temp$cluster
result
}
#####cori
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.