#' @export
loadQiimeData <- function(){
otu_table <- system.file("extdata","otu_table_mc10_w_tax_w_metadata.txt", package="nychanesmicrobiome", mustWork = TRUE)
mapping_file <- system.file("extdata","mappingfile_corrected.txt", package="nychanesmicrobiome", mustWork = TRUE)
tree_otu <- system.file("extdata","rep_set.tre", package="nychanesmicrobiome", mustWork = TRUE)
rep_set <- system.file("extdata","rep_set.fna", package="nychanesmicrobiome", mustWork = TRUE)
phylo <- import_biom(BIOMfilename = otu_table, treefilename = read_tree(tree_otu), refseqfilename = rep_set, refseqFunction = parse_taxonomy_default)
colnames(tax_table(phylo)) <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species")
#Remove samples with less than 1000 reads
phylo <- prune_samples(sample_sums(phylo)>1000, phylo)
#Remove OTU classified as chloroplasts and mitochondria
phylo <- subset_taxa(phylo, !Class %in% c("D_2__Chloroplast") & !Family %in% c("D_4__Mitochondria"))
#Merge splitted genera
splitted_genera <- sapply(data.frame(tax_table(phylo)[,"Genus"]), function(x) grep(" [1-9]",x))
tax_table(phylo)[splitted_genera,"Genus"] <- sapply(tax_table(phylo)[splitted_genera,"Genus"], function(x) gsub(" [1-9]","", x))
drop_prefixes <- function(x) {
x <- as.character(x)
x <- strsplit(x,"__")
x <- sapply(x, `[`, 2)
x
}
tax_table(phylo)[,"Phylum"] <- drop_prefixes(tax_table(phylo)[,"Phylum"])
tax_table(phylo)[,"Genus"] <- drop_prefixes(tax_table(phylo)[,"Genus"] )
phylo
}
loadQiimeITSData <- function(){
library(phyloseq)
otu_table <- "../input/qiime_data/ITS/otu_table_mc2_w_tax_w_metadata.txt"
mapping_file <- "../input/qiime_data/ITS/mappingFile_ITS.tsv_corrected.txt"
rep_set <- "../input/qiime_data/ITS/rep_set.fna"
import_biom(BIOMfilename = otu_table, refseqfilename = rep_set, refseqFunction = parse_taxonomy_default)
}
annotateFactors <- function(NYC_HANES){
colnames(tax_table(NYC_HANES)) <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species")
sample_data(NYC_HANES)$SMOKER3CAT <- factor(sample_data(NYC_HANES)$SMOKER3CAT, labels = c("Never smoker",
"Current smoker",
"Former smoker"))
sample_data(NYC_HANES)$COTININE <- as.numeric(sample_data(NYC_HANES)$COTININE)
sample_data(NYC_HANES)$COTININECAT <- factor(sample_data(NYC_HANES)$COTININECAT, labels = c("Active smoker",
"Non-smoker + secondhand smoke",
"Non-smoker no secondhand smoke",
"Skipped"))
sample_data(NYC_HANES)$RACE <- factor(sample_data(NYC_HANES)$RACE, labels = c("Non-Hispanic White",
"Non-Hispanic Black",
"Hispanic",
"Asian",
"Other"))
sample_data(NYC_HANES)$GENDER <- factor(sample_data(NYC_HANES)$GENDER, labels = c("Male","Female"))
sample_data(NYC_HANES)$AGEGRP5C <- factor(sample_data(NYC_HANES)$AGEGRP5C, labels = c("20-29",
"30-39",
"40-49",
"50-59",
"60 AND OVER"))
sample_data(NYC_HANES)$SR_ACTIVE <- factor(sample_data(NYC_HANES)$SR_ACTIVE, labels = c("Very active",
"Somewhat active",
"Not very active/not active at all"))
sample_data(NYC_HANES)$EDU4CAT <- relevel(factor(sample_data(NYC_HANES)$EDU4CAT, labels = c( "Less than High school diploma",
"High school graduate/GED",
"Some College or associate's degree",
"College graduate or more")), "College graduate or more")
sample_data(NYC_HANES)$OHQ_3 <- factor(sample_data(NYC_HANES)$OHQ_3, labels = c("Yes","No",NA))
sample_data(NYC_HANES)$OHQ_5 <- as.numeric(sample_data(NYC_HANES)$OHQ_5)
sample_data(NYC_HANES)$OHQ_5_3CAT <- cut(sample_data(NYC_HANES)$OHQ_5, breaks=c(-1,0,5,7),
labels = c("0","1-5","6-7"), include.lowest = FALSE)
sample_data(NYC_HANES)$DBTS_NEW <- factor(sample_data(NYC_HANES)$DBTS_NEW, labels = c("Yes",
"No",NA))
sample_data(NYC_HANES)$smokingstatus <- factor(sample_data(NYC_HANES)$smokingstatus, levels = c("cigarette",
"never",
"former",
"alternativeonly",
"secondhand"),
labels = c("Cigarette","Never smoker","Former smoker","Alternative smoker","Secondhand"))
sample_data(NYC_HANES)$CURRENT_SMKER <- factor(sample_data(NYC_HANES)$CURRENT_SMKER, labels = c("Yes","No"))
sample_data(NYC_HANES)$CIGARETTES <- factor(sample_data(NYC_HANES)$CIGARETTES, labels = c("Yes","No",NA))
sample_data(NYC_HANES)$BMI <- as.double(sample_data(NYC_HANES)$BMI)
sample_data(NYC_HANES)$SPAGE <- as.integer(sample_data(NYC_HANES)$SPAGE)
sample_data(NYC_HANES)$MERCURYU <- as.double(sample_data(NYC_HANES)$MERCURYU )
sample_data(NYC_HANES)$SMOKER <- factor(sample_data(NYC_HANES)$SMOKER, labels = c("Smoker",
"Non smoker"))
sample_data(NYC_HANES)$SMQ_13_1_1 <- factor(sample_data(NYC_HANES)$SMQ_13_1_1, labels = c("Cigarettes",
"Cigars/Cigarillos",
#"Chewing tobacco",
#"Snuff",
"Hookah Pipe",
"E-Cigsarettes",
#"Nicotine patches/Gum/Other",
"NA"))
sample_data(NYC_HANES)$SMQ_7 <- as.integer(sample_data(NYC_HANES)$SMQ_7)
sample_data(NYC_HANES)$INC25KMOD <- factor(sample_data(NYC_HANES)$INC25KMOD, labels = c("Less Than $20,000",
"$20,000-$49,999",
"$50,000-$74,999",
"$75,000-$99,999",
"$100,000 or More",
"NA"))
sample_data(NYC_HANES)$POVGROUP6_0812CT <- factor(sample_data(NYC_HANES)$POVGROUP6_0812CT, labels = c("0 to 5%",
"5 to 10%",
"10 to 20%",
"20 to 30%",
"30 to 40%",
"40 to 100%"))
sample_data(NYC_HANES)$SMQ_4UNIT <- factor(sample_data(NYC_HANES)$SMQ_4UNIT, labels = c(1,4,52,NA))
sample_data(NYC_HANES)$US_BORN <- factor(sample_data(NYC_HANES)$US_BORN,
levels=c("1","2","NA"),
labels = c("US-Born, 50 States, DC, PR and Territories",
"Other", NA))
sample_data(NYC_HANES)$DMQ_7YEAR <- as.integer(sample_data(NYC_HANES)$DMQ_7YEAR)
sample_data(NYC_HANES)$DMQ_2 <- factor(sample_data(NYC_HANES)$DMQ_2,
levels=1:6,
labels=c("Married","Widowed","Divorced",
"Separated","Never married",
"Living with partner"))
sample_data(NYC_HANES)$AGEGRP3C <- factor(sample_data(NYC_HANES)$AGEGRP3C, levels=1:3,
labels=c("20-34", "35-64", "65 and over"))
sample_data(NYC_HANES)$EDU3CAT <- relevel(factor(sample_data(NYC_HANES)$EDU3CAT, levels=1:3,
labels=c("High School Diploma or Less", "Some College or Associate's Degree",
"College Graduate or More")),"College Graduate or More")
sample_data(NYC_HANES)$INC10K <- as.numeric(as.character(sample_data(NYC_HANES)$INC10K))
sample_data(NYC_HANES)$INC3C <- relevel(cut(sample_data(NYC_HANES)$INC10K,
breaks = stats::quantile(sample_data(NYC_HANES)$INC10K, probs=c(0, 1/3, 2/3, 1), na.rm=TRUE),
include.lowest = TRUE,
labels=c("Less Than $30,000", "$30,000 - $60,000", "$60,000 or more")), "$60,000 or more")
sample_data(NYC_HANES)$A1C <- as.integer(sample_data(NYC_HANES)$A1C)
sample_data(NYC_HANES)$GLUCOSE <- as.integer(sample_data(NYC_HANES)$GLUCOSE)
sample_data(NYC_HANES)$DBQ_10 <- as.numeric(sample_data(NYC_HANES)$DBQ_10)
sample_data(NYC_HANES)$DBQ_10 <- ifelse(sample_data(NYC_HANES)$DBQ_10UNIT=="1",
sample_data(NYC_HANES)$DBQ_10*7,
ifelse(sample_data(NYC_HANES)$DBQ_10UNIT=="3",
sample_data(NYC_HANES)$DBQ_10/4.33333,
sample_data(NYC_HANES)$DBQ_10))
sample_data(NYC_HANES)$DBQ_10_3CAT <- cut(sample_data(NYC_HANES)$DBQ_10, breaks=c(-1,.999,5,999),
labels = c("0-<1","1-5","6 or more"))
NYC_HANES
}
annotateFullDataset <- function(dat) {
# Similar to annotateFactors but for use with the complete
# NYCHANES data as data.frame
#converting NaN to NA
dat[ is.na(dat) ] <- NA
dat$RACE <- factor(dat$RACE, labels = c("Non-Hispanic White",
"Non-Hispanic Black",
"Hispanic",
"Asian",
"Other"))
dat$GENDER <- factor(dat$GENDER, labels = c("Male","Female"))
dat$AGEGRP5C <- factor(dat$AGEGRP5C, labels = c("20-29",
"30-39",
"40-49",
"50-59",
"60 AND OVER"))
dat$AGEGRP4C <- factor(dat$AGEGRP4C, labels = c("20-34",
"35-49",
"50-64",
"65 AND OVER"))
dat$AGEGRP3C <- factor(dat$AGEGRP3C, labels = c("20-34",
"35-64",
"65 AND OVER"))
dat$SR_ACTIVE <- factor(dat$SR_ACTIVE, labels = c("Very active",
"Somewhat active",
"Not very active/not active at all"))
dat$EDU3CAT <- factor(dat$EDU3CAT, labels = c("High school diploma or less",
"Some college or associate's degree",
"College graduate or more"))
dat$EDU4CAT <- factor(dat$EDU4CAT, labels = c( "Less than High school diploma",
"High school graduate/GED",
"Some College or associate's degree",
"College graduate or more"))
dat$OHQ_3 <- factor(dat$OHQ_3, labels = c("Yes",
"No"))
dat$OHQ_5 <- as.numeric(sample_data(dat)$OHQ_5)
dat$OHQ_5_3CAT <- cut(dat$OHQ_5, breaks=c(-1,0,5,7),
labels = c("0","1-5","6-7"), include.lowest = FALSE)
dat$DBTS_NEW <- factor(dat$DBTS_NEW, labels = c("Yes","No"))
dat$CURRENT_SMKER <- factor(dat$CURRENT_SMKER, labels = c("Yes","No"))
dat$BMI <- as.double(dat$BMI)
dat$SPAGE <- as.integer(dat$SPAGE)
dat$MERCURYU <- as.double(dat$MERCURYU )
dat$SMOKER <- factor(dat$SMOKER, labels = c("Smoker",
"Non smoker"))
dat$SMQ_13_1_1 <- factor(dat$SMQ_13_1_1, labels = c("Cigarettes",
"Cigars/Cigarillos",
"Chewing tobacco",
"Snuff",
"Hookah Pipe",
"E-Cigsarettes",
"Nicotine patches/Gum/Other"))
dat$SMOKER3CAT <- factor(dat$SMOKER3CAT, labels = c("Never smoker",
"Current smoker",
"Former smoker"))
dat$SMQ_7 <- as.integer(dat$SMQ_7)
dat$INC25KMOD <- factor(dat$INC25KMOD, labels = c("Less Than $20,000",
"$20,000-$49,999",
"$50,000-$74,999",
"$75,000-$99,999",
"$100,000 or More"))
dat$INC10K <- as.numeric(as.character(dat$INC10K))
dat$INC3C <- cut(dat$INC10K,
breaks = stats::quantile(dat$INC10K, probs=c(0, 1/3, 2/3, 1), na.rm=TRUE),
include.lowest = TRUE,
labels=c("Less Than $30,000", "$30,000 - $60,000", "$60,000 or more"))
dat$POVGROUP6_0812CT <- factor(dat$POVGROUP6_0812CT, labels = c("0 to 5%",
"5 to 10%",
"10 to 20%",
"20 to 30%",
"30 to 40%",
"40 to 100%"))
dat$US_BORN <- factor(dat$US_BORN,
levels=c("1","2"),
labels = c("US-Born, 50 States, DC, PR and Territories",
"Other"))
dat$DMQ_7YEAR <- as.integer(dat$DMQ_7YEAR)
dat$DMQ_2 <- factor(dat$DMQ_2,
levels=1:6,
labels=c("Married","Widowed","Divorced",
"Separated","Never married",
"Living with partner"))
dat$A1C <- as.integer(dat$A1C)
dat$GLUCOSE <- as.integer(dat$GLUCOSE)
dat$GLUCOSESI <- as.numeric(dat$GLUCOSESI)
dat$DBQ_10 <- as.numeric(dat$DBQ_10)
dat$DBQ_10UNIT <- as.character(dat$DBQ_10UNIT)
dat$DBQ_10 <- ifelse(dat$DBQ_10UNIT=="1",
dat$DBQ_10*7,
ifelse(dat$DBQ_10UNIT=="3",
dat$DBQ_10/4.33333,
dat$DBQ_10))
dat$DBQ_10_3CAT <- cut(dat$DBQ_10, breaks=c(-1,.999,5,999),
labels = c("0-<1","1-5","6 or more"))
dat
}
impute_missing_values <- function(NYC_HANES){
suppressPackageStartupMessages(library(mice))
metadata <- data.frame(sample_data(NYC_HANES))
metadata$DBTS_NEW <- factor(metadata$DBTS_NEW)
metadata$OHQ_3 <- factor(metadata$OHQ_3)
init = mice(metadata, maxit=0)
meth = init$method
predM = init$predictorMatrix
meth["DBTS_NEW"]="pmm"
meth["OHQ_3"]="pmm"
set.seed(103)
imputed = mice(metadata, method=meth, predictorMatrix=predM, m=10)
metadata.imputed <- complete(imputed)
sample_data(NYC_HANES) <- metadata.imputed
NYC_HANES
}
theme_transparent <- ggplot2::theme(axis.title = ggplot2::element_text(colour = "white"),
axis.text = ggplot2::element_text(colour = "white"),
title = ggplot2::element_text(colour = "white"),
legend.text = ggplot2::element_text(colour = "white"),
axis.ticks = ggplot2::element_line(colour = "white"),
panel.border = ggplot2::element_rect(colour = "white"),
panel.background = ggplot2::element_rect(fill = "transparent",colour = NA),
plot.background = ggplot2::element_rect(fill = "transparent",colour = NA),
legend.background = ggplot2::element_rect(fill = "transparent",colour = NA),
legend.key = ggplot2::element_rect(fill = "transparent",colour = NA),
panel.grid = ggplot2::element_blank())
scale_palette <- c("#e41a1c","#377eb8","#4daf4a","#984ea3","#ff7f00")
formatMetadata <- function(dat) {
#returns a data frame of the sample_data with variable names formatted for printing
#publication-ready tables
rawdata <- if("phyloseq" %in% class(dat)) phyloseq::sample_data(dat) else dat
newdata <- data.frame(
`Age (yrs)` = rawdata$SPAGE,
`Age group` = rawdata$AGEGRP5C,
Gender = rawdata$GENDER,
`Educational achievement` = rawdata$EDU4CAT,
`Annual family income` = rawdata$INC3C,
`Marital Status` = rawdata$DMQ_2,
`Race/ethnicity` = rawdata$RACE,
`Place of birth` = rawdata$US_BORN,
`Gum disease (self-reported)` = rawdata$OHQ_3,
`Mouthwash use (times per week)` = rawdata$OHQ_5_3CAT,
`Sugar-sweetened beverages (per week)` = rawdata$DBQ_10_3CAT,
`Smoking status` = rawdata$SMOKER3CAT,
check.names=FALSE
)
levels(newdata$`Age group`) <- c('20-29', '30-39', '40-49', '50-59', '60 and over')
levels(newdata$`Mouthwash use (times per week)`) <- c("None", "1 to 5", "6 to 7")
newdata$`Place of birth` <- factor(as.character(newdata$`Place of birth`),
labels=c("US, PR and Territories","Other"))
newdata
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.