library(magrittr) library(ggplot2)
df_samples <- readr::read_tsv("data/FARMM/20200619_farmm_metadata.tsv") df_samples <- df_samples %>% dplyr::filter(!is.na(SubjectID)) %>% dplyr::filter(study_day != "PS") %>% dplyr::mutate(study_day = as.numeric(study_day), SubjectID = as.character(SubjectID)) metaphlan <- read.delim("data/FARMM/kraken_results.tsv", sep = "\t", check.names = FALSE, stringsAsFactors = FALSE) mat_abd <- metaphlan[, -1] %>% as.matrix %>% apply(2, as.numeric) %>% set_rownames(metaphlan[, 1]) # common samples samples_common <- intersect(df_samples$SampleID, colnames(mat_abd)) mat_abd <- mat_abd[, df_samples$SampleID] # filter features # subset to bacteria taxa_bacteria <- rownames(mat_abd) %>% stringr::str_subset(stringr::fixed("k__Bacteria")) mat_abd <- mat_abd[taxa_bacteria, ] # filter for abundance mat_abd <- mat_abd[apply(mat_abd >= 1e-5, 1, sum) >= 5, ] which_allzero <- apply(mat_abd == 0, 2, all) mat_abd <- mat_abd[, !which_allzero] df_samples <- df_samples[!which_allzero, ] mat_count <- round(t(t(mat_abd) * df_samples$both_kept * 0.05)) # create tensor X_array <- array(NA, dim = c(nrow(mat_count), 30, 16)) dimnames(X_array) <- list(rownames(mat_count), unique(df_samples$SubjectID), seq(0, 15)) for(k in seq(1, dim(X_array)[3])) { k_df_samples <- df_samples %>% dplyr::filter(study_day == k - 1) X_array[, k_df_samples$SubjectID, k] <- mat_count[, k_df_samples$SampleID] } # sanity check sum(is.na(X_array)) == (dim(X_array)[2] * dim(X_array)[3] - nrow(df_samples)) * dim(X_array)[1] save(X_array, file = "data/FARMM/X_array.RData") save(df_samples, file = "data/FARMM/df_samples.RData") save(mat_count, file = "data/FARMM/mat_count.RData") save(mat_abd, file = "data/FARMM/mat_abd.RData")
# data obtained from https://codeocean.com/capsule/6494482/tree/v1 df_otu <- readr::read_tsv("data/DIABIMMUNE/table-matched.tsv") df_tax <- readr::read_tsv("data/DIABIMMUNE/taxonomy.qza/data/taxonomy.tsv") all(df_otu$`...1` == df_tax$`Feature ID`) mat_tax <- df_tax$Taxon %>% stringr::str_split_fixed("; ", n = 7) mat_tax[mat_tax[, 2] == "", 2] <- "p__" # collapse OTU table to genus level genus_collapsed <- apply(mat_tax[, -7], 1, paste0, collapse = "|") genus_collapsed_unique <- unique(genus_collapsed) mat_otu_genus <- vapply( genus_collapsed_unique, function(i_genus) { df_otu[genus_collapsed == i_genus, -1] %>% as.matrix() %>% apply(2, sum) }, rep(0, ncol(df_otu) - 1) ) %>% t() dimnames(mat_otu_genus) <- list(genus_collapsed_unique, colnames(df_otu)[-1]) # filter for low abundance features mat_otu_genus <- (apply(mat_otu_genus, 2, function(x) x / sum(x)) > 1e-5) %>% {apply(., 1, sum) > 10} %>% {mat_otu_genus[., ]} df_samples <- readr::read_tsv("data/DIABIMMUNE/11884_20190508-173103-added-month-abx.txt") all(df_samples$`#SampleID` == colnames(mat_otu_genus)) # some subjects can have multiple samples in the same month. Use the earlier # one. df_samples <- df_samples %>% dplyr::group_by(month, subjectid) %>% dplyr::arrange(age_at_collection) %>% dplyr::slice(1) %>% dplyr::ungroup() mat_otu_genus <- mat_otu_genus[, df_samples$`#SampleID`] # create tensor X_array <- array(NA, dim = c(nrow(mat_otu_genus), length(unique(df_samples$subjectid)), length(unique(df_samples$month)))) dimnames(X_array) <- list(rownames(mat_otu_genus), unique(df_samples$subjectid), seq(1, 36)) for(k in seq(1, dim(X_array)[3])) { k_df_samples <- df_samples %>% dplyr::filter(month == k - 1) X_array[, k_df_samples$subjectid, k] <- mat_otu_genus[, k_df_samples$`#SampleID`] } # sanity check sum(is.na(X_array)) == (dim(X_array)[2] * dim(X_array)[3] - nrow(df_samples)) * dim(X_array)[1] save(X_array, file = "data/DIABIMMUNE/X_array.RData") save(df_samples, file = "data/DIABIMMUNE/df_samples.RData") save(mat_otu_genus, file = "data/DIABIMMUNE/mat_otu_genus.RData")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.