# from microbiota.R -------------------------------------------------------
filter.phyloseq.REPLACED <- function(phy, ..., prune_unused_taxa=TRUE,prune_unused_samples=FALSE) {
criteria <- quos(...)
is.sample.criteria <- map_lgl(criteria,~eval_phyloseq_expr(.x,phy))
cli_text(col_blue("filter:"))
for (i in seq_along(criteria)) {
expr <- criteria[[i]]
use.samp <- is.sample.criteria[i]
if (use.samp) {
ssub <- phy %>% get.samp() %>% filter(!!expr)
n.samps.old <- nsamples(phy)
n.samps.new <- nrow(ssub)
cli::cli_text(col_blue("sample_data")," ({n.samps.old} to {n.samps.new} sample{?s}): {as_label(expr)}")
phy <- prune_samples(ssub$sample,phy)
} else {
tsub <- phy %>% get.tax() %>% filter(!!expr)
n.taxa.old <- ntaxa(phy)
n.taxa.new <- nrow(tsub)
cli::cli_text(col_blue("tax_table")," ({n.taxa.old} to {n.taxa.new}): {as_label(expr)}")
phy <- prune_taxa(tsub$otu,phy)
}
}
if (prune_unused_taxa && any(is.sample.criteria)) {
cli_text("Removing unused taxa...")
phy <- phy %>% prune_unused_taxa()
}
if (prune_unused_samples && any(!is.sample.criteria)) {
cli_text("Removing unused samples...")
phy <- phy %>% prune_samples(sample_sums(.)>0,.)
}
return(phy)
}
mutate.phyloseq.OLD <- function(phy, ...) {
commands <- quos(...)
is.sample.command <- map_lgl(commands,~eval_phyloseq_expr(.x,phy))
cli_text(col_blue("mutate:"))
for (i in seq_along(commands)) {
expr <- commands[[i]]
use.samp <- is.sample.command[i]
var <- names(commands)[i]
if (use.samp) {
if (var!="") {
samp <- get.samp(phy) %>% mutate(!!var:=!!expr)
} else {
samp <- get.samp(phy) %>% mutate(!!expr)
}
# get.samp(phy) %>% mutate(!!expr)
cli_text(col_blue("sample_data"),": {var} = {as_label(expr)}")
if (!identical(sample_names(phy),samp$sample)) {
message("Note, sample_names() were altered")
sample_names(phy) <- samp$sample
}
sample_data(phy) <- samp %>% set.samp()
} else {
if (var!="") {
tax <- get.tax(phy) %>% mutate(!!var:=!!expr)
} else {
tax <- get.tax(phy) %>% mutate(!!expr)
}
cli_text(col_blue("tax_table"),": {var} = {as_label(expr)}")
if (!identical(taxa_names(phy),tax$otu)) {
message("Note, taxa_names() were altered")
taxa_names(phy) <- tax$otu
}
tax_table(phy) <- tax %>% set.tax()
}
}
return(phy)
}
filter.phyloseq.old <- function(phy, ..., prune_unused_taxa=TRUE) {
ssub <- phy %>% get.samp() %>% filter(...)
physub <- prune_samples(ssub$sample,phy)
if (prune_unused_taxa) {
physub <- prune_unused_taxa(physub)
}
return(physub)
}
#' Calculate the `taxhorn` distance
#'
#'
#' Ephraim Slamka helped to develop this metric, in which the Horn distance is calculated over after
#' collapsing at each taxonomic level and then taking the weighted average of distance values.
#' @param phy phyloseq object
#'
#' @return distance metric of `taxhorn` distances
#' @export
#'
#' @examples
calc.taxhorn.distance <- function(phy) {
message("YTNote: calc.taxhorn.distance is deprecated. Consider using distance2(phy, 'mean.horn')")
fn <- function(x){
set <- x[-1]
weights <- length(set):1
sum(set*weights) / sum(weights)
}
method <- "horn"
phy <- phyloseq(otu_table(phy),tax_table(phy))
ranks <- rank_names(phy)
samples <- sample_names(phy)
# create multiple phyloseq objects, collapsed at the
# Superkingdom, Phylum, .... , Species level.
phy.levels <- ranks %>% seq_along() %>%
map(~ranks[1:.x]) %>% map(~phy.collapse(phy,taxranks=.x)) %>%
setNames(ranks)
phy.levels <- c(phy.levels,list("asv"=phy))
all.levels <- names(phy.levels)
# calculate the distance matrix (metric=method) for each level.
# this is a list of distance matrices.
dist.levels <- phy.levels %>% map(~distance(.x,method=method))
# run get.pairwise() to get a list of pairwise distances.
pairwise.levels <- dist.levels %>% imap(~{
newname <- str_glue("dist.{.y}")
get.pairwise(.x) %>% rename(!!sym(newname):=dist)
})
pairwise.all <- pairwise.levels[[1]]
for (i in seq_along(all.levels)[-1]) {
pairwise.all <- pairwise.all %>% full_join(pairwise.levels[[i]],by=c("sample1","sample2"))
}
pairwise.melt <- pairwise.all %>% pivot_longer(cols=-c(sample1,sample2),
names_to="dist.type",values_to="dist")
pairwise.calcdist <- pairwise.melt %>% group_by(sample1,sample2) %>%
summarize(dist.list=list(setNames(dist,dist.type)),
dist=map_dbl(dist.list,fn),
.groups = "drop")
# if (show.work) {
# return(pairwise.calcdist)
# }
taxdist <- get.dist(pairwise.calcdist)
taxdist
}
# Calculate distances for specified samples from phyloseq
#
# Calculate distances from a phyloseq object. This is the similar to [calc.distance()],
# except that this does not return a distance matrix. Instead, it returns a vector of distances,
# only calculating the comparisons you specified.
# @param sample1 a character vector specifying the first sample(s) for comparison. Should be a sample in `phy`.
# @param sample2 a character vector specifying the second sample(s) for comparison. Should be a sample in `phy`.
# @param phy a phyloseq object containing the samples to be compared.
# @param method the distance metric method to be used. Can be a method from
# [`phyloseq`][`phyloseq::distanceMethodList`], or `"taxhorn'`.
#
# @return a vector of distances, corresponding to `sample1` and `sample2`.
# @export
#
# @examples
# library(tidyverse)
# tbl <- tibble(sample1=c("191A", "228A", "132A", "1045", "179B"),
# sample2=c("198A", "205B", "202C", "175B", "192D"))
# tbl.dists <- tbl %>%
# mutate(taxhorn.dist=calc.pairwise(sample1,sample2,cid.phy,method="taxhorn"))
# calc.pairwise <- function(sample1,sample2,phy,method="bray") {
# message("YTNote: calc.pairwise is deprecated. Consider using calc.pairwise.dist")
# stopifnot(length(sample1)==length(sample2))
# stopifnot(all(c(sample1,sample2) %in% sample_names(phy)))
# dist <- map2_dbl(as.character(sample1),as.character(sample2),~{
# if (.x==.y) return(0)
# physub <- prune_samples(c(.x,.y),phy)
# calc.distance(physub,method=method)
# })
# return(dist)
# }
# Calculate distance matrix from phyloseq data
#
# Basically same as [phyloseq::distance()], but adds `taxhorn` metric
# @param phy phyloseq object
# @param method character string indicating distance metric to be calculated. Can be a method from
# [`phyloseq`][`phyloseq::distanceMethodList`], or `"taxhorn'`, or a function.
# @param ... passed to [phyloseq::distance()]
#
# @return a distance metric
# @export
# #'
# #' @examples
# calc.distance <- function(phy, method, ...) {
# message("YTNote: calc.distance is deprecated. Consider using distance2")
# if (rlang::is_function(method)) {
# dist <- method(phy)
# } else if (is.character(method)) {
# if (method=="taxhorn") {
# dist <- calc.taxhorn.distance(phy)
# } else {
# dist <- distance(physeq=phy, method=method, ...)
# }
# }
# return(dist)
# }
#' Calculate axis limits
#'
#' Determines the actual limits of X and Y, for a given ggplot object. This is used by [gg.align.xlim()].
#' @param gg the ggplot object
#' @return a list containing inforation about limits for X and Y.
#' @example
#' g1 <- ggplot(mtcars,aes(x=mpg)) + geom_histogram()
#' g2 <- ggplot(mtcars,aes(x=mpg,y=disp,color=factor(cyl))) + geom_point()
#' g3 <- ggplot(mtcars,aes(x=mpg)) + geom_histogram(bins=3) + coord_cartesian(expand=FALSE)
#' g4 <- ggplot(mtcars,aes(x=mpg,y=disp)) + geom_point() + coord_cartesian(xlim=c(2,55),expand=TRUE)
#' gg.axis.limits(g1)
#' gg.axis.limits(g2)
#' gg.axis.limits(g3)
#' gg.axis.limits(g4)
#'
#' g1 <- ggplot(mtcars,aes(x=mpg)) + geom_histogram() + scale_x_log10()
#' g2 <- ggplot(mtcars,aes(x=mpg,y=disp,color=factor(cyl))) + geom_point() + scale_x_log10()
#' g3 <- ggplot(mtcars,aes(x=mpg)) + geom_histogram(bins=3) + coord_cartesian(expand=FALSE) + scale_x_log10()
#' g4 <- ggplot(mtcars,aes(x=mpg,y=disp)) + geom_point() + coord_cartesian(expand=FALSE,xlim=c(2,55)) + scale_x_log10()
#' gg.axis.limits(g1)
#' gg.axis.limits(g2)
#' gg.axis.limits(g3)
#' gg.axis.limits(g4)
#'
#' g1 <- ggplot(starwars,aes(x=eye_color)) + geom_bar()
#' g2 <- ggplot(starwars,aes(x=eye_color,y=height)) + geom_boxplot()
#' g3 <- ggplot(starwars,aes(x=eye_color,fill=species)) + geom_bar(width=3)
#' g4 <- ggplot(starwars,aes(x=eye_color,y=height)) + geom_boxplot()
#' gg.axis.limits(g1)
#' gg.axis.limits(g2)
#' gg.axis.limits(g3)
#' gg.axis.limits(g4)
#'
#' g1 <- ggplot(presidential,aes(x=start)) + geom_histogram()
#' g2 <- ggplot(presidential,aes(x=end)) + geom_histogram()
#' g3 <- ggplot(presidential,aes(y=name,yend=name,x=start,xend=end)) + geom_segment(size=5)
#' g4 <- ggplot(presidential,aes(y=name,yend=name,x=start,xend=end,fill=party)) + geom_segment(size=5)
#' gg.axis.limits(g1)
#' gg.axis.limits(g2)
#' gg.axis.limits(g3)
#' gg.axis.limits(g4)
#' @export
gg.axis.limits <- function(gg) {
message("YTNote: gg.axis.limits is deprecated. ggfun::xrange() and ggfun::yrange() perform this function.")
gb <- suppressMessages(ggplot_build(gg))
coord_flip <- is(gb$layout$coord,"CoordFlip")
# expand <- gb$layout$coord$expand
x <- list(
lim = gb$layout$panel_params[[1]]$x.range, #****the ultimate plot limits, post transform, post expansion, post coord lim
# lim.fct = gb$layout$panel_scales_x[[1]]$range_c$range, #exists if categorical, and is numeric representation of lim
# lim2 = gb$layout$panel_scales_x[[1]]$range$range, #lim is the data limits, can be numeric or factor, pre-expansion, post-transform, if not overruled by coord.
# lim3 = gb$layout$panel_params[[1]]$x$limits, #basically same as lim
# lim4 = gb$layout$panel_params[[1]]$x$continuous_range,
# lim5 = gb$layout$panel_params[[1]]$x$get_limits(),
# lim.coord=gb$layout$coord$limits$x,
# expansion = gb$layout$panel_scales_x[[1]]$expand,
transform = gb$layout$panel_scales_x[[1]]$trans$transform,
inverse = gb$layout$panel_scales_x[[1]]$trans$inverse
)
y <- list(
lim = gb$layout$panel_params[[1]]$y.range,
transform = gb$layout$panel_scales_y[[1]]$trans$transform,
inverse = gb$layout$panel_scales_y[[1]]$trans$inverse
)
if (is.null(x$inverse)) {
x$coord_lim <- x$lim
} else {
x$coord_lim <- x$inverse(x$lim)
}
if (is.null(y$inverse)) {
y$coord_lim <- y$lim
} else {
y$coord_lim <- y$inverse(y$lim)
}
return(list(x=x,y=y,coord_flip=coord_flip))
}
#' Create a color palette for taxonomy
#'
#' Given microbiota data, generate a color palette that can be used in ggplot2 plots.
#'
#' Note that the `tax.palette` formula list is evaluated in order, and should probably end in TRUE (similar to [dplyr::case_when()]).
#' @param data taxonomic data, can be [`phyloseq`][`phyloseq::phyloseq-class`], [get.otu.melt()] data frame, or [get.tax()] data frame.
#' @param unitvar the granular column (bare unquoted) by which colors will be assigned. Default is `Species`.
#' Sometimes you might want to switch to another granular identifer, such as `otu`. Depending on the situation.
#' @param tax.palette a list of formulas used to assign colors. Each element should take the form: `"<label>" = <true/false expression> ~ <color vector>`. See examples and details.
#' @return named vector of colors, which can be used in: `ggplot( ... ) + scale_fill_manual(values = <pal> )`
#' @export
#' @examples
#' # generate a stacked plot for one subject
#' otusub <- cid.phy %>% get.otu.melt() %>% filter(Patient_ID=="221") %>%
#' arrange(!!!syms(rank_names(cid.phy))) %>%
#' mutate(otu=fct_inorder(otu))
#' g <- ggplot(otusub,aes(x=day,y=pctseqs,fill=otu)) +
#' geom_col(show.legend=FALSE,width=1) +
#' expand_limits(x=50)
#' g
#'
#' # use default bacterial palette (yt.palette3)
#' pal1 <- get.tax.palette(cid.phy,unitvar="otu")
#' legend1 <- get.tax.legend() %>%
#' annotation_custom(xmin=30, xmax=50, ymin=0.7, ymax=1)
#' g + scale_fill_manual(values=pal1) + legend1
#'
#' # generate a custom palette
#' custom_pal <- exprs(
#' "Gram positives" = Phylum=="Firmicutes" ~ shades("purple",variation=0.25),
#' "Gram negatives" = Phylum=="Bacteroidetes" | Phylum=="Proteobacteria" ~ shades("red",variation=0.25),
#' "Others" = TRUE ~ shades("gray", variation=0.25)
#' )
#' pal2 <- get.tax.palette(cid.phy,unitvar="otu",tax.palette = custom_pal)
#' legend2 <- get.tax.legend(tax.palette = custom_pal) %>%
#' annotation_custom(xmin=30, xmax=50, ymin=0.7, ymax=1)
#' g + scale_fill_manual(values=pal2) + legend2
get.tax.palette <- function(data,unitvar=Species,tax.palette=yt.palette3) {
# data=phy1;unitvar="Species";tax.palette=yt.palette2
warning("YTWarning: Please note that this function is deprecated, consider using geom_taxonomy / scale_fill_taxonomy")
requireNamespace("phyloseq",quietly=TRUE)
unitvar <- ensym(unitvar)
if (is(data,"phyloseq") | is(data,"taxonomyTable")) {
data <- get.tax(data)
}
if (!(is.list(tax.palette) && all(map_lgl(tax.palette,is_formula)))) {
stop("YTError: tax.palette needs to be a list of formulas!")
}
vars.needed <- tax.palette %>% map(~{
rlang::f_lhs(.x) %>% all.vars()
}) %>% simplify() %>% c(as_label(unitvar)) %>% unique() %>% as.character()
if (!all(vars.needed %in% names(data))) {
missing.vars <- setdiff(vars.needed,names(data))
stop("YTError: the tax.palette has vars that were not found in data: ",paste(missing.vars,collapse=","))
}
tax <- data %>% select(!!!syms(vars.needed)) %>% unique() %>%
assert_grouping_vars(id_vars=!!unitvar,stopIfTRUE = FALSE)
is_color <- function(x) {
iscolor <- grepl('^#[0-9A-Fa-f]{6}([0-9A-Fa-f]{2})?$', x) |
(x %in% c(colors(),as.character(1:8),"transparent")) |
is.na(x)
return(all(iscolor))
}
color.list <- map(tax.palette,function(exp) {
colors <- rlang::f_rhs(exp) %>% rlang::eval_tidy()
if (!is_color(colors)) {
stop("YTError: not a valid color set: {paste(colors,collapse=', ')}")
}
criteria <- rlang::f_lhs(exp)
# message(criteria)
color.yes.no <- tax %>%
mutate(criteria=!!criteria,
criteria=criteria & !is.na(criteria)) %>%
pull(criteria)
# x.color <- character()
x.color <- rep(NA_character_,length.out=nrow(tax))
x.color[color.yes.no] <- rep(colors,length.out=sum(color.yes.no))
return(x.color)
}) %>% do.call(coalesce,.)
tax$color <- color.list
pal <- setNames(tax$color,tax[[as_label(unitvar)]])
return(pal)
}
#' Generate tax legend
#'
#' @param tax.palette a list of formulas specifying the palette. Default is [`yt.palette3`].
#' @param fontsize Font size. Default is 5.
#' @return a ggplot object showing the legend.
#' @describeIn get.tax.palette
#' @export
#' @examples
get.tax.legend <- function(tax.palette=yt.palette3,fontsize=5) {
warning("YTWarning: Please note that this function is deprecated, consider using geom_taxonomy / scale_fill_taxonomy")
glist <- imap(tax.palette,function(exp,label) {
colors <- rlang::f_rhs(exp) %>% rlang::eval_tidy()
criteria <- rlang::f_lhs(exp)
d <- tibble(color=colors)
divs <- seq(0,1,length.out=nrow(d)+1)
d$xmin <- divs[-length(divs)]
d$xmax <- divs[-1]
ggplot() + expand_limits(x=-2) +
geom_rect(data=d,aes(xmin=xmin,xmax=xmax,ymin=-0.5,ymax=0.5,fill=color)) +
annotate("text",x=0,y=0,label=label,hjust=1,size=fontsize) +
scale_fill_identity() + theme_void()
})
rlang::inject(gridExtra::arrangeGrob(!!!glist,ncol=1))
}
get.tax.legend2 <- function(tax.palette=yt.palette3,fontsize=5,ncol=NULL,nrow=NULL) {
warning("YTWarning: Please note that this function is deprecated, consider using geom_taxonomy / scale_fill_taxonomy")
if (is.null(ncol) && is.null(nrow)) {
ncol <- 1
}
glist <- imap(tax.palette,function(exp,label) {
colors <- rlang::f_rhs(exp) %>% rlang::eval_tidy()
criteria <- rlang::f_lhs(exp)
d <- tibble(color=colors)
divs <- seq(0,1,length.out=nrow(d)+1)
d$xmin <- divs[-length(divs)]
d$xmax <- divs[-1]
ggplot() + expand_limits(x=-2) +
geom_rect(data=d,aes(xmin=xmin,xmax=xmax,ymin=-0.5,ymax=0.5,fill=color)) +
annotate("text",x=0,y=0,label=label,hjust=1,size=fontsize) +
scale_fill_identity() + theme_void()
})
Reduce(`+`,glist) + plot_layout(ncol=ncol,nrow=nrow)
}
#' Plot tax
#'
#' @param t data frame containing melted tax data. Needs to have vars sample, pctseqs, Kingdom, ... , Species
#' @param xvar xvar by which to plot data. This should be a distinct identifier for samples.
#' @param data whether to return data frame only (`TRUE`), or proceed with plotting (`FALSE`).
#' @param pctseqs the relative abundance column name (default `pctseqs`)
#' @param unitvar unit variable (bare unquoted). Default `Species`
#' @param label column name (bare unquoted) for labels var. Default is `Species`.
#' @param tax.levels tax ranks.
#' @param label.pct.cutoff cutoff by which to label abundances, stored in tax.label
#'
#' @return either ggplot2 object, or data frame.
#' @examples
#' @author Ying Taur
#' @export
tax.plot <- function(t,xvar=sample,pctseqs=pctseqs,unitvar=Species,
label=Species,
tax.levels = c("Superkingdom","Phylum","Class","Order","Family","Genus","Species"),
data=TRUE,label.pct.cutoff=0.3) {
warning("YTWarning: Please note that this function is deprecated, consider using geom_taxonomy / scale_fill_taxonomy")
#t=get.otu.melt(phy.species)
# tax.levels <- c("Superkingdom","Phylum","Class","Order","Family","Genus","Species")
xvar <- ensym(xvar)
pctseqs <- ensym(pctseqs)
unitvar <- ensym(unitvar)
label <- ensym(label)
vars <- c(as_label(xvar),as_label(pctseqs),tax.levels,as_label(unitvar),as_label(label)) %>% unique()
if (!all(vars %in% names(t))) {
missing.vars <- setdiff(vars,names(t))
stop("YTError: missing var:",paste(missing.vars,collapse=","))
}
t <- t %>% arrange(!!!syms(tax.levels)) %>%
mutate(!!unitvar:=fct_inorder(!!unitvar)) %>%
group_by(!!xvar) %>% arrange(!!unitvar) %>%
mutate(cum.pct=cumsum(!!pctseqs),
y.text=(cum.pct + c(0,cum.pct[-length(cum.pct)])) / 2,
y.text=1-y.text) %>%
ungroup() %>%
select(-cum.pct) %>%
mutate(tax.label=ifelse(!!pctseqs>=label.pct.cutoff,as.character(!!label),""))
# pal <- get.yt.palette(t,use.cid.colors=use.cid.colors)
# attr(t,"pal") <- pal
if (data) {
return(t)
} else {
g <- ggplot() +
geom_bar(data=t,aes(x=!!xvar,y=!!pctseqs,fill=!!unitvar),stat="identity",position="fill") +
geom_text(data=t,aes(x=!!xvar,y=y.text,label=tax.label),angle=-90,lineheight=0.9) +
scale_fill_manual(values=attr(t,"pal")) +
theme(legend.position="none")
return(g)
}
}
old3.phy.collapse.base <- function(otudt,taxdt,taxranks,level,criteria,fillin.levels) {
# declare.args( otudt=get.otu.melt(cid.phy,sample_data=FALSE,tax_data=FALSE) %>% as.data.table(), taxdt=get.tax(cid.phy) %>% as.data.table(), taxranks=rank_names(cid.phy), criteria=quo(max.pctseqs<=0.001 | pct.detectable<=0.005), level=7, fillin.levels=FALSE, yingtools2:::phy.collapse.base)
requireNamespace("data.table",quietly=TRUE)
criteria <- enquo(criteria)
otudt <- data.table::as.data.table(otudt) #make sure it's data.table
taxdt <- data.table::as.data.table(taxdt)
nsamps <- data.table::uniqueN(otudt$sample)
allranks <- c(taxranks,"strain")
subscript <- function(x,i) {
paste(x,i,sep="_")
}
# taxdt[,strain:=otu] %>% setnames(allranks,subscript(allranks,1))
taxdt <- taxdt[,strain:=otu] %>% data.table::setnames(allranks,subscript(allranks,1))
# taxdt <- taxdt %>% mutate(strain=otu) %>% rename_with(.fn=~paste(.x,"1",sep="_"),.cols=all_of(allranks))
# look at criteria, determine necessary calculations in make.tax
allcalcs <- rlang::exprs(n.detectable = sum(numseqs > 0),
pct.detectable = sum(numseqs > 0)/..nsamps, # pct.detectable=mean(pctseqs>0),
mean.pctseqs = sum(pctseqs)/..nsamps,
median.pctseqs = median(c(pctseqs, rep(0, length.out = ..nsamps - .N))), # median.pctseqs=median(pctseqs),
max.pctseqs = max(pctseqs),
min.pctseqs = fifelse(..nsamps == .N, min(pctseqs), 0), # min.pctseqs=min(pctseqs),
total.numseqs = sum(numseqs),
max.numseqs = max(numseqs),
min.numseqs = fifelse(..nsamps == .N, min(numseqs), 0), # min.numseqs=min(numseqs),
n.samps = ..nsamps)
# allcalcs <- rlang::exprs(n.detectable=sum(numseqs>0),pct.detectable=sum(numseqs>0) / nsamps,mean.pctseqs=sum(pctseqs) / nsamps,median.pctseqs=median(c(pctseqs,rep(0,length.out=nsamps-n()))),max.pctseqs=max(pctseqs),min.pctseqs=ifelse(nsamps==n(),min(pctseqs),0),total.numseqs=sum(numseqs),max.numseqs=max(numseqs),min.numseqs=ifelse(nsamps==n(),min(numseqs),0),n.samps=nsamps,n.rows=nsamps)
# some calcs depend on other lines, determine the dependencies
depends <- allcalcs %>% imap(~all.vars(.x) %>% intersect(names(allcalcs)) %>% c(.y))
calcvars <- depends[all.vars(criteria)] %>% unname() %>% simplify()
# subset of allcalc that is needed
calcs <- allcalcs[names(allcalcs) %in% calcvars]
make.otu <- function(ss,tt,i) {
by1 <- subscript(allranks,i)
by2 <- subscript(allranks,i+1)
# ss %>% inner_join(tt,by=by1) %>% group_by(!!!syms(by2),sample) %>% summarize(pctseqs=sum(pctseqs),numseqs=sum(numseqs),.groups="drop")
ss %>%
merge(tt,by=by1) %>%
.[, .(pctseqs=sum(pctseqs),numseqs=sum(numseqs)), by=c("sample",by2)]
}
make.tax <- function(ss,i) {
by1 <- subscript(allranks,i)
by2 <- subscript(allranks,i+1)
collapse_var <- subscript("collapse",i)
rank <- length(allranks)+1-i
parent.groups <- by1[1:(rank-1)]
new.tax.var.exprs <- seq_along(by1) %>% setNames(by2) %>%
map(~{
if (.x<rank) {
cur.rank <- sym(by1[.x])
expr(!!cur.rank)
} else if (.x==rank) {
parent <- sym(by1[rank-1])
expr(ifelse(!!sym(collapse_var),
paste("<miscellaneous>",!!parent),
!!sym(by1[.x])))
} else {
expr(ifelse(!!sym(collapse_var),
NA_character_,
!!sym(by1[.x])))
}
})
# tt <- ss %>% group_by(!!!syms(by1)) %>% summarize(!!!calcs0, .groups="drop") %>% mutate(!!sym(collapse_var):=!!criteria, !!!new.tax.var.exprs) %>% group_by(!!!syms(parent.groups)) %>% mutate(n.collapse=sum(!!sym(collapse_var)), nrows=n()) %>% ungroup() %>% mutate(!!sym(collapse_var):=!!sym(collapse_var) & n.collapse>1) %>% select(!!sym(collapse_var),!!!syms(by1),!!!syms(by2))
tt <- inject(
ss # ss %>%
[, .(!!!calcs), by = by1] # group_by(!!!syms(by1)) %>% summarize(!!!calcs, .groups = "drop") %>%
[, (collapse_var) := !!quo_squash(criteria)] # mutate(!!sym(collapse_var) := !!criteria) %>%
[, `:=`(!!!new.tax.var.exprs)] # mutate(!!!new.tax.var.exprs) %>%
[, `:=`(n.collapse=sum(!!sym(collapse_var)), # group_by(!!!syms(parent.groups)) %>%
nrows = .N), by=parent.groups] # mutate(n.collapse = sum(!!sym(collapse_var)), nrows = n()) %>%
[, (collapse_var) := !!sym(collapse_var) & n.collapse>1] # mutate(!!sym(collapse_var) := !!sym(collapse_var) & n.collapse > 1) %>%
[, c(collapse_var,by1,by2), with=FALSE] # select(!!sym(collapse_var), !!!syms(by1), !!!syms(by2))
)
return(tt)
}
# each iteration checks criteria and collapses one level
# ss <- taxdt %>% inner_join(otudt,by="otu")
ss <- taxdt %>% merge(otudt,by="otu")
taxmap.raw <- taxdt
trace <- c()
for (i in 1:level) {
# i=1
# message(i)
tt <- make.tax(ss,i)
ss <- make.otu(ss,tt,i)
byvar <- subscript(allranks,i)
taxmap.raw <- taxmap.raw %>% data.table::merge.data.table(tt,all.x=TRUE, by=byvar)
# taxmap.raw <- taxmap.raw %>% left_join(tt,by=byvar)
trace <- c(trace,nrow(tt))
}
by.tax <- subscript(allranks,i+1) %>% setNames(allranks) %>% map(~expr(!!sym(.x)))
# taxmap <- taxmap.raw %>% transmute(otu,!!!by.tax)
taxmap <- inject(taxmap.raw[, .(otu,!!!by.tax)])
# new.tax.otu <- otudt %>% left_join(taxmap,by="otu") %>% group_by(sample,!!!syms(allranks)) %>% summarize(numseqs=sum(numseqs), pctseqs=sum(pctseqs), .groups="drop") %>% mutate(otu=paste2(!!!syms(allranks),sep="|")) %>% arrange(otu)
new.tax.otu <- inject(
data.table::merge.data.table(otudt,taxmap, all.x=TRUE, by="otu") # left_join(taxmap,by="otu") %>%
[, .(numseqs=sum(numseqs), # group_by(sample,!!!syms(allranks)) %>%
pctseqs=sum(pctseqs)), by=c("sample",allranks)] # summarize(numseqs=sum(numseqs),pctseqs=sum(pctseqs),.groups="drop") %>%
[, otu:=paste2(!!!syms(allranks),sep="|")] # mutate(otu=paste2(!!!syms(allranks),sep="|"))
)
# new.tax.otu <- as_tibble(new.tax.otu)
if (fillin.levels) {
for (i in seq_along(taxranks)[-1]) {
var <- taxranks[i]
allvars <- taxranks[i:1]
inject(new.tax.otu[, (var):=coalesce(!!!syms(allvars))])
# new.tax.otu <- new.tax.otu %>% mutate(!!sym(var):=coalesce(!!!syms(allvars)))
}
}
# new.tax <- new.tax.otu %>% select(otu,!!!syms(taxranks)) %>% unique()
# tcols <- c("otu",taxranks)
# new.tax <- new.tax.otu[, ..tcols] %>% unique()
new.tax <- new.tax.otu[, c("otu",taxranks), with=FALSE] %>% unique()
# new.otu <- new.tax.otu %>% select(otu,sample,numseqs,pctseqs)
# ocols <- c("otu","sample","numseqs","pctseqs")
# new.otu <- new.tax.otu[, ..ocols]
new.otu <- new.tax.otu[, c("otu","sample","numseqs","pctseqs"), with=FALSE]
trace <- c(trace,nrow(new.tax)) %>% setNames(rev(allranks)[1:(level+1)])
message(str_glue("Evaluated across levels: {paste(names(trace),collapse=', ')} ({length(trace)-1} rounds)"))
message(str_glue("Number of taxa: {paste(trace,collapse=' -> ')} (final number of taxa)"))
# ntaxa.final <- nrow(new.tax)
# message(str_glue("Collapsed taxa, {ntaxa.orig} to {ntaxa.final}"))
list(tax=new.tax,otu=new.otu)
}
#' @rdname old3.phy.collapse.bins
#' @export
old3.phy.collapse.bins <- function(x,...) UseMethod("old3.phy.collapse.bins")
#' Collapse phyloseq or otu.melt data into smaller tax bins
#'
#' Collapse [`phyloseq`][`phyloseq::phyloseq-class`] object into smaller bins, using specified criteria based on the data.
#' Examines the data at each level (e.g. otu, Species, Genus, Family, and so on), and at each level, collapses 2 or more taxa if they meet the specified criteria.
#'
#' The binning criteria can be defined multiple ways. These variables are available for criteria, for each taxon:
#'
#' * `n.detectable` number of samples where taxa abundance is above 0.
#'
#' * `pct.detectable` percent of samples where taxa abundance is above 0.
#'
#' * `mean.pctseqs` mean relative abundance for a taxon.
#'
#' * `median.pctseqs` median relative abundance for a taxon.
#'
#' * `max.pctseqs` highest relative abundance of sequences for a taxon (across all samples)
#'
#' * `min.pctseqs` lowest relative abundance of sequences for a taxon (across all samples)
#'
#' * `total.numseqs` total number of sequences for a taxon (across all samples)
#'
#' * `max.numseqs` highest number of sequences for a taxon (across all samples)
#'
#' * `min.numseqs` lowest number of sequences for a taxon (across all samples)
#'
#' * `n.samps` total number of samples (regardless of abundance)
#'
#' * `n.rows` total number of rows for a taxon
#'
#' @param phy [`phyloseq`][`phyloseq::phyloseq-class`] object to be collapsed
#' @param level number of tax levels to evaluate and collapse by, starting with `otu` and moving up. For instance, `level=4` means collapse at `otu`, `Species`, `Genus`, `Family`.
#' @param fillin.levels Whether or not to fill in `NA` values with the collapsed taxa name. Default is `FALSE`.
#' @param criteria an expression that evaluates to TRUE/FALSE, whether to collapse at a particular level. Create this using calculated stats for each taxa (see Details)
#' @return [`phyloseq`][`phyloseq::phyloseq-class`] object or data frame (depending on what was supplied), representing the data, after criteria-based taxonomic binning.
#'
#' @examples
#' # original phyloseq
#' cid.phy
#'
#' # after default binning
#' phy.collapse.bins(cid.phy)
#'
#' # customized binning
#' phy.collapse.bins(cid.phy, level = 5,
#' criteria = mean.pctseqs < 0.001 & n.detectable <= 2)
#' @rdname old3.phy.collapse.bins
#' @export
old3.phy.collapse.bins.phyloseq <- function(phy,
level=length(rank_names(phy)),
fillin.levels=FALSE,
criteria=max.pctseqs<=0.001 | pct.detectable<=0.005) {
# phy=cid.phy;criteria <- quo(max.pctseqs<=0.001 | pct.detectable<=0.005);level=7;fillin.levels=FALSE
# declare.args(phy=cid.phy,yingtools2:::phy.collapse.bins.phyloseq)
requireNamespace(c("phyloseq","data.table"),quietly=TRUE)
criteria <- enquo(criteria)
phy <- suppressMessages(prune_unused_taxa(phy))
taxranks <- rank_names(phy)
otudt <- get.otu.melt(phy,sample_data=FALSE,tax_data=FALSE) %>% data.table::as.data.table()
taxdt <- get.tax(phy) %>% data.table::as.data.table()
objset <- old3.phy.collapse.base(otudt=otudt,taxdt=taxdt,taxranks=taxranks,level=level,criteria=!!criteria,fillin.levels=fillin.levels)
new.tax <- objset$tax %>% as_tibble()
new.otu <- objset$otu %>%
data.table::dcast.data.table(formula=otu ~ sample,value.var="numseqs",fill=0) %>%
as_tibble()
# new.otu <- new.otu %>% as_tibble() %>% pivot_wider(id_cols=otu,names_from=sample,values_from=numseqs,values_fn = sum,values_fill=0)
new.phy <- phyloseq(set.otu(new.otu),set.tax(new.tax),sample_data(phy))
return(new.phy)
}
#' @param data data frame, formatted as [get.otu.melt()] data. Note, it must have columns `otu`, `sample`, `numseqs`, `pctseqs`, and all values in `taxranks`.
#' @param taxranks character vector of taxonomic ranks in `data`.
#' @param sample_vars whether to include sample variables in the data.
#' Can be `TRUE` (include sample vars, and try to determine which ones),
#' `FALSE` no sample vars, or a character vector specifying col names in `data` to be included.
#' @param sample_id name of sample column identifier. Default is `"sample"`.
#' @param taxa_id name of taxon column identifier. Default is `"otu"`.
#' @rdname old3.phy.collapse.bins
#' @examples
#' # You can also enter the data in long form (rows=sample*taxa, such as that produced by [get.otu.melt()]).
#' otu <- get.otu.melt(cid.phy)
#' otu.bin <- phy.collapse.bins(otu,
#' taxranks = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "taxon"),
#' sample_vars = c("Sample_ID", "Patient_ID", "day", "day.old", "Consistency"),
#' level = 5,
#' criteria = mean.pctseqs < 0.001 & n.detectable <= 2)
#' n_distinct(otu$otu)
#' n_distinct(otu.bin$otu)
#' @export
old3.phy.collapse.bins.data.frame <- function(data,
taxranks=c("Superkingdom","Phylum","Class","Order","Family","Genus","Species"),
level=length(taxranks),
fillin.levels=FALSE,
sample_id="sample",
taxa_id="otu",
abundance_var="numseqs",
criteria=max.pctseqs<=0.001 | pct.detectable<=0.005,
sample_vars=TRUE) {
requireNamespace("data.table",quietly=TRUE)
# declare.args(data=get.otu.melt(cid.phy), taxranks <- rank_names(cid.phy), criteria=quo(max.pctseqs<=0.001 | pct.detectable<=0.005), yingtools2:::phy.collapse.bins.data.frame)
criteria <- enquo(criteria)
needvars <- c(taxa_id, sample_id, abundance_var, taxranks)
if (!all(needvars %in% names(data))) {
stop(str_glue("YTError: vars not found in data: {paste(setdiff(needvars,names(data)),collapse=',')}"))
}
data <- data %>% rename(sample=!!sym(sample_id),otu=!!sym(taxa_id),numseqs=!!sym(abundance_var))
rows.are.distinct <- is.distinct(data,otu,sample)
if (!rows.are.distinct) {
stop(str_glue("YTError: rows are not distinct across (sample_id x taxa_id)!"))
}
taxdt <- data %>% select(otu,!!!syms(taxranks)) %>% data.table::as.data.table() %>% unique()
if (isTRUE(sample_vars)) { #logical and true
message("Attempting to determine sample vars...\n")
vars.to.check <- setdiff(names(data),c("otu","numseqs",taxranks))
distinct_sample_vars <- data %>%
test_if_nonvarying_by_group(id_vars=sample, test_vars=all_of(vars.to.check)) %>%
{names(.)[.]} %>% setdiff("sample")
sample_vars <- distinct_sample_vars
} else if (isFALSE(sample_vars)) {
#no sample variables
sample_vars <- c()
} else if (is.character(sample_vars)) {
# sample_vars are specified do nothing
sample_vars <- setdiff(sample_vars,"sample")
} else {
stop("YTError: sample_vars should be a character or logical!")
}
otudt <- data %>% select(otu,sample,numseqs,pctseqs) %>% data.table::as.data.table()
# sampdt <- samp %>% data.table::as.data.table()
objset <- old3.phy.collapse.base(otudt=otudt,taxdt=taxdt,taxranks=taxranks,level=level,criteria=!!criteria,fillin.levels=fillin.levels)
new.otudt <- objset$otu
new.taxdt <- objset$tax
new.dt <- new.otudt %>%
data.table::merge.data.table(new.taxdt, by="otu",all.x = TRUE)
if (length(sample_vars)>0) {
sampdt <- data %>% select(sample,!!!syms(sample_vars)) %>% distinct() %>%
data.table::as.data.table()
if (anyDuplicated(sampdt$sample)>0) {
stop("YTError: sample data with selected sample vars is not unique!")
}
new.dt <- new.dt %>%
data.table::merge.data.table(sampdt,by="sample",all.x = TRUE)
}
new.data <- new.dt %>%
as_tibble() %>%
rename(!!sym(sample_id):=sample,!!sym(taxa_id):=otu,!!sym(abundance_var):=numseqs)
# new.data <- new.otu %>% left_join(new.tax,by="otu") %>% left_join(samp,by="sample") %>% rename(!!sym(taxa_id):=otu,!!sym(sample_id):=sample)
return(new.data)
}
compareOLD.data.frame.OLD <- function(x,y,by=NULL) {
# declare.args(x=d1,y=d1[,-1],yingtools2:::compare.data.frame)
# x.name <- as_label(enquo(x))
# y.name <- as_label(enquo(y))
x.name <- deparse1(substitute(x))
y.name <- deparse1(substitute(y))
if (is.null(by)) {
by <- intersect(names(x),names(y))
}
by.x <- names(by) %||% by
by.y <- unname(by)
x.is.distinct <- x %>% is.distinct(!!!syms(by.x))
y.is.distinct <- y %>% is.distinct(!!!syms(by.y))
if (all(by.x==by.y)) {
by.x <- paste0(by.x,".x")
by.y <- paste0(by.y,".y")
}
# both <- inner_join_replace(x,y,by=by,errorIfDifferent = FALSE)
all <- full_join(x,y,by=by,keep=TRUE) %>%
mutate(.status=case_when(
!is.na(!!sym(by.x[1])) & !is.na(!!sym(by.y[1])) ~ str_glue("both {x.name}(x) and {y.name}(y)"),
!is.na(!!sym(by.x[1])) & is.na(!!sym(by.y[1])) ~ str_glue("{x.name}(x) only"),
is.na(!!sym(by.x[1])) & !is.na(!!sym(by.y[1])) ~ str_glue("{y.name}(y) only")
))
x.vars0 <- str_extract_all(names(all),"(?<=^).+(?=\\.x$)") %>% unlist()
y.vars0 <- str_extract_all(names(all),"(?<=^).+(?=\\.y$)") %>% unlist()
overlap.vars <- intersect(x.vars0,y.vars0)
for (var in overlap.vars) {
var.x <- paste0(var,".x")
var.y <- paste0(var,".y")
diff <- all[[var.x]]!=all[[var.y]]
diff <- !is.na(diff) & diff
}
diffs <- map(overlap.vars,~{
var.x <- paste0(.x,".x")
var.y <- paste0(.x,".y")
diff <- all[[var.x]]!=all[[var.y]]
diff <- !is.na(diff) & diff
ifelse(diff,.x,NA_character_)
}) %>% set_names(overlap.vars) %>%
do.call(paste2,.)
all$.diffs <- diffs
message(str_glue("X <{x.name}> vs. Y <{y.name}>"))
message(str_glue("X: {pretty_number(nrow(x))} rows ({ifelse(x.is.distinct,\"distinct\",\"not distinct\")})"))
message(str_glue("Y: {pretty_number(nrow(y))} rows ({ifelse(y.is.distinct,\"distinct\",\"not distinct\")})"))
all %>% count(.status,.diffs) %>% print()
invisible(all)
}
test_if_nonvarying_by_group_OLD <- function(data,
id_vars = all_of(group_vars(data)),
test_vars = everything(),
verbose = FALSE) {
# data=get.otu.melt(cid.phy);id_vars=quo(sample);test_vars=quo(everything())
id_vars <- enquo(id_vars)
test_vars <- enquo(test_vars)
id_vars_ts <- tidyselect::eval_select(id_vars, data=data)
test_vars_ts <- tidyselect::eval_select(test_vars, data=data)
test_vars_ts <- test_vars_ts[!(test_vars_ts %in% id_vars_ts)]
id_var_names <- names(id_vars_ts)
test_var_names <- names(test_vars_ts)
if (length(id_vars)==0) {
warning("YTWarning: no groups detected")
}
data.rootgroup <- setNames(rep_along(id_var_names,TRUE),id_var_names)
# dt.testing <- data %>% ungroup() %>%
# as.data.table(key=id_var_names) %>%
# .[,group:=.GRP,by=id_var_names]
# groups <- dt.testing[["group"]] %>% unique()
# to.test <- test_var_names
# test <- function(x) {
# uniqueN(x)==1
# }
#
# for (i in groups) {
# # message(i)
# results <- dt.testing[group==i, lapply(.SD, test), .SDcols=to.test] %>% unlist()
# to.test <- names(results)[results]
# if (length(to.test)==0) {
# break
# }
# }
#whatever is left is non-varying.
data.testing <- setNames(test_var_names %in% to.test,test_var_names)
data.testing <- data %>% ungroup() %>% as.data.table(key=id_var_names) %>%
.[, lapply(.SD, uniqueN), .SDcols=test_var_names, by=id_var_names] %>%
.[, lapply(.SD, function(x) all(x==1)), .SDcols=test_var_names] %>%
as_tibble() %>% unlist()
# data.testing0 <- data %>% ungroup() %>% group_by(!!!syms(id_var_names)) %>%
# summarize(across(.cols=all_of(test_var_names), .fns=n_distinct), .groups="drop") %>%
# summarize(across(.cols=all_of(test_var_names), .fns=~all(.x==1))) %>% unlist()
if (verbose) {
test_var_names_cangroup <- names(data.testing)[data.testing]
test_var_names_cannotgroup <- names(data.testing)[!data.testing]
message(str_glue("* ID grouping var(s): {paste(id_var_names,collapse=',')}"))
message(str_glue("* Additional nonvarying grouping vars: {paste(test_var_names_cangroup,collapse=',')}"))
message(str_glue("* Varying non-grouping vars: {paste(test_var_names_cannotgroup,collapse=',')}"))
}
test <- c(data.rootgroup,data.testing)
test
}
phy.collapse.base.old <- function(otu,tax,taxranks,level,criteria,fillin.levels) {
# phy=cid.phy;criteria <- quo(max.pctseqs<=0.001 | pct.detectable<=0.005);level=7;fillin.levels=FALSE
criteria <- enquo(criteria)
# ntaxa.orig <- nrow(tax)
nsamps <- n_distinct(otu$sample)
allranks <- c(taxranks,"strain")
tax <- tax %>% mutate(strain=otu) %>%
rename_with(.fn=~paste(.x,"1",sep="_"),.cols=all_of(allranks))
# look at criteria, determine necessary calculations in make.tax
allcalcs <- rlang::exprs(n.detectable=sum(pctseqs>0),
pct.detectable=n.detectable / nsamps, # pct.detectable=mean(pctseqs>0),
mean.pctseqs=sum(pctseqs) / nsamps,
median.pctseqs=median(c(pctseqs,rep(0,length.out=nsamps-n()))), # median.pctseqs=median(pctseqs),
max.pctseqs=max(pctseqs),
min.pctseqs=ifelse(nsamps==n(),min(pctseqs),0), # min.pctseqs=min(pctseqs),
total.numseqs=sum(numseqs),
max.numseqs=max(numseqs),
min.numseqs=ifelse(nsamps==n(),min(numseqs),0), # min.numseqs=min(numseqs),
n.samps=nsamps,
n.rows=nsamps)
# some calcs depend on other lines, determine the dependencies
depends <- allcalcs %>% imap(~all.vars(.x) %>% intersect(names(allcalcs)) %>% c(.y))
calcvars <- depends[all.vars(criteria)] %>% unname() %>% simplify()
# subset of allcalc that is needed
calcs <- allcalcs[names(allcalcs) %in% calcvars]
make.otu <- function(ss,tt,level) {
by1 <- paste(allranks,level,sep="_")
by2 <- paste(allranks,level+1,sep="_")
ss %>%
inner_join(tt,by=by1) %>%
group_by(!!!syms(by2),
sample) %>%
summarize(pctseqs=sum(pctseqs),
numseqs=sum(numseqs),
.groups="drop")
}
make.tax <- function(ss,level) {
by1 <- paste(allranks,level,sep="_")
by2 <- paste(allranks,level+1,sep="_")
collapse_var <- str_glue("collapse{level}")
rank <- length(allranks)+1-level
parent.groups <- by1[1:(rank-1)]
new.tax.var.exprs <- seq_along(by1) %>% setNames(by2) %>%
map(~{
if (.x<rank) {
cur.rank <- sym(by1[.x])
expr(!!cur.rank)
} else if (.x==rank) {
parent <- sym(by1[rank-1])
expr(ifelse(!!sym(collapse_var),
paste("<miscellaneous>",!!parent),
!!sym(by1[.x])))
} else {
expr(ifelse(!!sym(collapse_var),
NA_character_,
!!sym(by1[.x])))
}
})
tt <- ss %>%
group_by(!!!syms(by1)) %>%
summarize(!!!calcs,
.groups="drop") %>%
mutate(!!sym(collapse_var):=!!criteria,
!!!new.tax.var.exprs) %>%
group_by(!!!syms(parent.groups)) %>%
mutate(n.collapse=sum(!!sym(collapse_var)),
nrows=n()) %>%
ungroup() %>%
mutate(!!sym(collapse_var):=!!sym(collapse_var) & n.collapse>1,
!!!new.tax.var.exprs) %>%
select(!!sym(collapse_var),
!!!syms(by1),!!!syms(by2))
return(tt)
}
# each iteration checks criteria and collapses one level
ss <- tax %>% inner_join(otu,by="otu")
taxmap.raw <- tax
trace <- c()
for (i in 1:level) {
# message(i)
tt <- make.tax(ss,i)
ss <- make.otu(ss,tt,i)
byvar <- paste(allranks,i,sep="_")
taxmap.raw <- taxmap.raw %>% left_join(tt,by=byvar)
trace <- c(trace,nrow(tt))
}
by.tax <- paste(allranks,i+1,sep="_") %>% setNames(allranks) %>% map(~expr(!!sym(.x)))
taxmap <- taxmap.raw %>% transmute(otu,!!!by.tax)
# coll_vars <- paste0("collapse",1:level) %>% rev()
# taxmap.raw %>% count(!!!syms(coll_vars))
new.tax.otu <- otu %>%
left_join(taxmap,by="otu") %>%
group_by(sample,!!!syms(allranks)) %>%
summarize(numseqs=sum(numseqs),
pctseqs=sum(pctseqs),
.groups="drop") %>%
mutate(otu=paste2(!!!syms(allranks),sep="|")) %>% arrange(otu)
if (fillin.levels) {
for (i in seq_along(taxranks)[-1]) {
var <- taxranks[i]
allvars <- taxranks[i:1]
new.tax.otu <- new.tax.otu %>%
mutate(!!sym(var):=coalesce(!!!syms(allvars)))
}
}
new.tax <- new.tax.otu %>% select(otu,!!!syms(taxranks)) %>% unique()
new.otu <- new.tax.otu %>% select(otu,sample,numseqs,pctseqs)
trace <- c(trace,nrow(new.tax)) %>% setNames(rev(allranks[1:level]))
message(str_glue("Evaluated across levels: {paste(names(trace),collapse=', ')} ({length(trace)-1} rounds)"))
message(str_glue("Number of taxa: {paste(trace,collapse=' -> ')} (final number of taxa)"))
# ntaxa.final <- nrow(new.tax)
# message(str_glue("Collapsed taxa, {ntaxa.orig} to {ntaxa.final}"))
list(tax=new.tax,otu=new.otu)
}
#' @rdname phy.collapse.bins.old
#' @export
phy.collapse.bins.old <- function(x,...) UseMethod("phy.collapse.bins.old")
#' Collapse phyloseq or otu.melt data into smaller tax bins
#'
#' Collapse phyloseq object into smaller bins, using specified criteria based on the data.
#' Examines the data at each level (e.g. otu, Species, Genus, Family, and so on), and at each level, collapses 2 or more taxa if they meet the specified criteria.
#'
#' The binning criteria can be defined multiple ways. These variables are available for criteria, for each taxon:
#'
#' * `n.detectable` number of samples where taxa abundance is above 0.
#'
#' * `pct.detectable` percent of samples where taxa abundance is above 0.
#'
#' * `mean.pctseqs` mean relative abundance for a taxon.
#'
#' * `median.pctseqs` median relative abundance for a taxon.
#'
#' * `max.pctseqs` highest relative abundance of sequences for a taxon (across all samples)
#'
#' * `min.pctseqs` lowest relative abundance of sequences for a taxon (across all samples)
#'
#' * `total.numseqs` total number of sequences for a taxon (across all samples)
#'
#' * `max.numseqs` highest number of sequences for a taxon (across all samples)
#'
#' * `min.numseqs` lowest number of sequences for a taxon (across all samples)
#'
#' * `n.samps` total number of samples (regardless of abundance)
#'
#' * `n.rows` total number of rows for a taxon
#'
#' @param phy phyloseq object to be collapsed
#' @param level number of tax levels to evaluate and collapse by, starting with `otu` and moving up. For instance, level=4 means collapse at otu, Species, Genus, Family.
#' @param fillin.levels Whether or not to fill in `NA` values with the collapsed taxa name. Default is `FALSE`.
#' @param criteria an expression that evaluates to TRUE/FALSE, whether to collapse at a particular level. Create this using calculated stats for each taxa (see Details)
#' @return phyloseq object or data frame (depending on what was supplied), representing the data, after criteria-based taxonomic binning.
#'
#' @examples
#' # original phyloseq
#' cid.phy
#'
#' # after default binning
#' phy.collapse.bins(cid.phy)
#'
#' # customized binning
#' phy.collapse.bins(cid.phy, level = 5,
#' criteria = mean.pctseqs < 0.001 & n.detectable <= 2)
#' @rdname phy.collapse.bins.old
#' @export
phy.collapse.bins.old.phyloseq <- function(phy,
level=length(rank_names(phy)),
fillin.levels=FALSE,
criteria=max.pctseqs<=0.001 | pct.detectable<=0.005) {
# phy=cid.phy;criteria <- quo(max.pctseqs<=0.001 | pct.detectable<=0.005);level=7;fillin.levels=FALSE
criteria <- enquo(criteria)
phy <- suppressMessages(prune_unused_taxa(phy))
taxranks <- rank_names(phy)
otu <- get.otu.melt(phy,sample_data=FALSE,tax_data=FALSE)
tax <- get.tax(phy)
objset <- phy.collapse.base.old(otu=otu,tax=tax,taxranks=taxranks,level=level,criteria=!!criteria,fillin.levels=fillin.levels)
new.tax <- objset$tax
new.otu <- objset$otu %>%
pivot_wider(id_cols=otu,
names_from=sample,
values_from=numseqs,
values_fn = sum,
values_fill=0)
new.phy <- phyloseq(set.otu(new.otu),set.tax(new.tax),sample_data(phy))
return(new.phy)
}
#' @param data data frame, formatted as `get.otu.melt`. Note, it must have columns `otu`, `sample`, `numseqs`, `pctseqs`, and all values in `taxranks`.
#' @param taxranks character vector of taxonomic ranks in `data`.
#' @param sample_vars character vector of column names in `data` that convey sample-level information. These will be retained as such after binning.
#' @param sample_id name of sample column identifier. Default is "sample".
#' @param taxa_id name of taxon column identifier. Default is "otu".
#' @rdname phy.collapse.bins.old
#' @examples
#' # You can also enter the data in long form (rows=sample*taxa, such as that produced by get.otu.melt).
#' otu <- get.otu.melt(cid.phy)
#' otu.bin <- phy.collapse.bins(otu,
#' taxranks = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "taxon"),
#' sample_vars = c("Sample_ID", "Patient_ID", "day", "day.old", "Consistency"),
#' level = 5,
#' criteria = mean.pctseqs < 0.001 & n.detectable <= 2)
#' n_distinct(otu$otu)
#' n_distinct(otu.bin$otu)
#' @export
phy.collapse.bins.old.data.frame <- function(data,
taxranks=c("Superkingdom","Phylum","Class","Order","Family","Genus","Species"),
level=length(taxranks),
fillin.levels=FALSE,
sample_id="sample",
taxa_id="otu",
criteria=max.pctseqs<=0.001 | pct.detectable<=0.005,
sample_vars=NULL) {
# data <- get.otu.melt(cid.phy);taxranks <- rank_names(cid.phy);level=4;fillin.levels=FALSE;criteria=quo(max.pctseqs<=0.001 | pct.detectable<=0.005);sample_vars=NULL
criteria <- enquo(criteria)
needvars <- c(taxa_id,sample_id,"numseqs","pctseqs",taxranks)
if (!all(needvars %in% names(data))) {
stop(str_glue("YTError: vars not found in data: {paste(setdiff(needvars,names(data)),collapse=',')}"))
}
data <- data %>% rename(sample=!!sym(sample_id),otu=!!sym(taxa_id))
rows.are.distinct <- is.distinct(data,otu,sample)
if (!rows.are.distinct) {
stop(str_glue("YTError: rows are not distinct across (sample_id x taxa_id)!"))
}
tax <- data %>% select(otu,!!!syms(taxranks)) %>% unique()
sample_vars <- setdiff(sample_vars,"sample")
if (is.null(sample_vars)) {
message("Attempting to determine sample vars...\n")
}
samp <- data %>% select(sample,!!!syms(sample_vars)) %>% unique()
otu <- data %>% select(otu,sample,numseqs,pctseqs)
# phy.collapse()
objset <- phy.collapse.base.old(otu=otu,tax=tax,taxranks=taxranks,level=level,criteria=!!criteria,fillin.levels=fillin.levels)
new.data <- objset$otu %>%
left_join(objset$tax,by="otu") %>%
left_join(samp,by="sample") %>%
rename(!!sym(taxa_id):=otu,!!sym(sample_id):=sample)
return(new.data)
}
#' Get YT Palette
#' @param tax either a data.frame, phyloseq, or tax_table
#' @param use.cid.colors whether to use classic CID colors
#' @return a color palette that can be used in \code{ggplot2}
#' @examples
#' @author Ying Taur
#' @export
get.yt.palette <- function(tax,use.cid.colors=TRUE) {
requireNamespace("phyloseq",quietly=TRUE)
message("YTNote: get.yt.palette() and get.yt.palette2 are deprecated. Try using get.tax.palette().")
if (class(tax)[1] %in% c("phyloseq","taxonomyTable")) {
tax <- get.tax(tax.obj)
}
ranks <- c("Superkingdom","Phylum","Class","Order","Family","Genus","Species")
if (!all(ranks %in% names(tax))) {
stop("YTError: need to have taxon levels: Superkingdom, Phylum, Class, Order, Family, Genus, Species")
}
tax.dict <- tax[,ranks] %>% distinct()
#bacteria are shades of gray by default
tax.dict$color <- rep(shades("gray"),length.out=nrow(tax.dict))
#proteobacteria: red
proteo <- tax.dict$Phylum=="Proteobacteria"
tax.dict$color[proteo] <- rep(shades("red",variation=0.4),length.out=sum(proteo))
#bacteroidetes: cyan
bacteroidetes <- tax.dict$Phylum=="Bacteroidetes"
tax.dict$color[bacteroidetes] <- rep(shades("#2dbfc2",variation=0.4),length.out=sum(bacteroidetes))
#actinobacteria: purple
actino <- tax.dict$Phylum=="Actinobacteria"
tax.dict$color[actino] <- rep(shades("purple",variation=0.4),length.out=sum(actino))
#firmicutes:
firm <- tax.dict$Phylum=="Firmicutes"
tax.dict$color[firm] <- rep(shades("#8f7536",variation=0.3),length.out=sum(firm))
#within firmicutes, color bacilli
bacilli <- tax.dict$Class=="Bacilli"
tax.dict$color[bacilli] <- rep(shades("#3b51a3",variation=0.2),length.out=sum(bacilli))
#within firmicutes, color blautia
blautia <- tax.dict$Genus=="Blautia"
tax.dict$color[blautia] <- rep(shades("#f69ea0",variation=0.2),length.out=sum(blautia))
if (use.cid.colors) {
cid.colors.new <- c("Enterococcus"="#129246","Streptococcus"="#a89e6a","Staphylococcus"="#f1eb25")
cid <- cid.colors.new[match(tax.dict$Genus,names(cid.colors.new))]
tax.dict$color <- ifelse(is.na(cid),tax.dict$color,cid)
}
tax.palette <- structure(tax.dict$color,names=as.character(tax.dict$Species))
tax.palette
}
#' Get YT Palette 2
#' @param tax either a data.frame, phyloseq, or tax_table
#' @param use.cid.colors whether to use classic CID colors
#' @return a color palette that can be used in \code{ggplot2}
#' @examples
#' @author Ying Taur
#' @export
get.yt.palette2 <- function (tax) {
requireNamespace("phyloseq",quietly=TRUE)
message("YTNote: get.yt.palette() and get.yt.palette2 are deprecated. Try using get.tax.palette().")
if (is(tax,"phyloseq") | is(tax,"taxonomyTable")) {
tax <- get.tax(tax)
}
ranks <- c("Superkingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
if (!all(ranks %in% names(tax))) {
missing.vars <- setdiff(ranks,names(tax))
stop("YTError: need to have taxon levels: Superkingdom, Phylum, Class, Order, Family, Genus, Species; missing: ",paste(missing.vars,collapse=","))
}
tax.dict <- tax[, ranks] %>% distinct()
tax.dict$color <- rep(shades("gray", variation=0.25),length.out = nrow(tax.dict))
proteo <- tax.dict$Phylum == "Proteobacteria"
tax.dict$color[proteo] <- rep(shades("red", variation = 0.4), length.out = sum(proteo))
actino <- tax.dict$Phylum == "Actinobacteria"
tax.dict$color[actino] <- rep(shades("#A77097", variation = 0.25), length.out = sum(actino))
bacteroidetes <- tax.dict$Phylum == "Bacteroidetes"
tax.dict$color[bacteroidetes] <- rep(shades("#51AB9B", variation = 0.25), length.out = sum(bacteroidetes))
clost <- tax.dict$Order == "Clostridiales"
tax.dict$color[clost] <- rep(shades("#9C854E", variation = 0.25), length.out = sum(clost))
lachno <- tax.dict$Family == "Lachnospiraceae"
tax.dict$color[lachno] <- rep(shades("#EC9B96", variation = 0.25), length.out = sum(lachno))
rumino <- tax.dict$Family == "Ruminococcaceae"
tax.dict$color[rumino] <- rep(shades("#9AAE73", variation = 0.25), length.out = sum(rumino))
cid.colors.new <- c(Enterococcus = "#129246", Streptococcus = "#9FB846", Staphylococcus = "#f1eb25" , Lactobacillus="#3b51a3")
cid <- cid.colors.new[match(tax.dict$Genus, names(cid.colors.new))]
tax.dict$color <- ifelse(is.na(cid), tax.dict$color, cid)
tax.palette <- structure(tax.dict$color, names = as.character(tax.dict$Species))
tax.palette
}
#' Collapse phyloseq into smaller bins
#'
#' Collapse phyloseq object into smaller bins, using specified criteria based on the data.
#'
#' These variables are available for criteria, for each taxon:
#'
#' * `n.detectable` number of samples where taxa abundance is above 0.
#'
#' * `pct.detectable` percent of samples where taxa abundance is above 0.
#'
#' * `mean.pctseqs` mean relative abundance for a taxon.
#'
#' * `median.pctseqs` median relative abundance for a taxon.
#'
#' * `max.pctseqs` highest relative abundance of sequences for a taxon (across all samples)
#'
#' * `min.pctseqs` lowest relative abundance of sequences for a taxon (across all samples)
#'
#' * `total.numseqs` total number of sequences for a taxon (across all samples)
#'
#' * `max.numseqs` highest number of sequences for a taxon (across all samples)
#'
#' * `min.numseqs` lowest number of sequences for a taxon (across all samples)
#'
#' * `n.samps` total number of samples (regardless of abundance)
#'
#' * `n.rows` total number of rows for a taxon
#'
#' @param phy phyloseq object to be collapsed
#' @param level number of tax levels to evaluate and collapse by, starting with `otu` and moving up. For instance, level=4 means collapse at otu, Species, Genus, Family.
#' @param fillin.levels Whether or not to fill in `NA` values with the collapsed taxa name. Default is `FALSE`.
#' @param criteria an expression that evaluates to TRUE/FALSE, whether to collapse at a particular level. Create this using calculated stats for each taxa (see Details)
#'
#' @return collapsed phyloseq object
#' @export
#'
#' @examples
#' phy.binned <- phy.collapse.bins(cid.phy)
#' phy.binned2 <- phy.collapse.bins(cid.phy, level=5,
#' criteria=mean.pctseqs<0.001 & n.detectable<=2)
phy.collapse.bins.old <- function(phy,
level=length(rank_names(phy)),
fillin.levels=FALSE,
criteria=max.pctseqs<=0.001 | pct.detectable<=0.005) {
# criteria <- quo(max.pctseqs<=0.001 | pct.detectable<=0.005);level=7;fillin.levels=TRUE
criteria <- enquo(criteria)
ntaxa.orig <- ntaxa(phy)
phy <- suppressMessages(prune_unused_taxa(phy))
nsamps <- nsamples(phy)
taxranks <- rank_names(phy)
allranks <- c(taxranks,"strain")
# look at criteria, determine necessary calculations in make.tax
allcalcs <- rlang::exprs(n.detectable=sum(pctseqs>0),
pct.detectable=n.detectable / nsamps, # pct.detectable=mean(pctseqs>0),
mean.pctseqs=sum(pctseqs) / nsamps,
median.pctseqs=median(c(pctseqs,rep(0,length.out=nsamps-n()))), # median.pctseqs=median(pctseqs),
max.pctseqs=max(pctseqs),
min.pctseqs=ifelse(nsamps==n(),min(pctseqs),0), # min.pctseqs=min(pctseqs),
total.numseqs=sum(numseqs),
max.numseqs=max(numseqs),
min.numseqs=ifelse(nsamps==n(),min(numseqs),0), # min.numseqs=min(numseqs),
n.samps=nsamps,
n.rows=nsamps)
# some calcs depend on other lines, determine the dependencies
depends <- allcalcs %>% imap(~all.vars(.x) %>% intersect(names(allcalcs)) %>% c(.y))
calcvars <- depends[all.vars(criteria)] %>% unname() %>% simplify()
# subset of allcalc that is needed
calcs <- allcalcs[names(allcalcs) %in% calcvars]
make.otu <- function(ss,tt,level) {
by1 <- paste(allranks,level,sep="_")
by2 <- paste(allranks,level+1,sep="_")
ss %>%
inner_join(tt,by=by1) %>%
group_by(!!!syms(by2),
sample) %>%
summarize(pctseqs=sum(pctseqs),
numseqs=sum(numseqs),
.groups="drop")
}
make.tax <- function(ss,level) {
# level=1
by1 <- paste(allranks,level,sep="_")
by2 <- paste(allranks,level+1,sep="_")
collapse_var <- str_glue("collapse{level}")
rank <- length(allranks)+1-level
parent.groups <- by1[1:(rank-1)]
new.tax.var.exprs <- seq_along(by1) %>% setNames(by2) %>%
map(~{
if (.x<rank) {
expr(!!sym(by1[.x]))
} else if (.x==rank) {
parent <- rank-1
expr(ifelse(!!sym(collapse_var),
paste("<miscellaneous>",!!sym(by1[parent])),
!!sym(by1[.x])))
} else {
expr(ifelse(!!sym(collapse_var),
NA_character_,
!!sym(by1[.x])))
}
})
tt <- ss %>%
group_by(!!!syms(by1)) %>%
summarize(!!!calcs,
.groups="drop") %>%
mutate(!!sym(collapse_var):=!!criteria,
!!!new.tax.var.exprs) %>%
group_by(!!!syms(parent.groups)) %>%
mutate(n.collapse=sum(!!sym(collapse_var)),
nrows=n()) %>%
ungroup() %>%
mutate(!!sym(collapse_var):=!!sym(collapse_var) & n.collapse>1,
!!!new.tax.var.exprs) %>%
# filter(!!sym(collapse_var)) %>%
select(!!sym(collapse_var),
!!!syms(by1),!!!syms(by2))
return(tt)
}
otu <- get.otu.melt(phy,sample_data=FALSE,tax_data = FALSE)
tax <- get.tax(phy) %>% mutate(strain=otu) %>%
rename_with(.fn=~paste(.x,"1",sep="_"),.cols=all_of(allranks))
# each iteration checks criteria and collapses one level
ss <- tax %>% inner_join(otu,by="otu")
taxmap.raw <- tax
for (i in 1:level) {
tt <- make.tax(ss,i)
ss <- make.otu(ss,tt,i)
byvar <- paste(allranks,i,sep="_")
taxmap.raw <- taxmap.raw %>% left_join(tt,by=byvar)
}
by.tax <- paste(allranks,i+1,sep="_") %>% setNames(allranks) %>% map(~expr(!!sym(.x)))
taxmap <- taxmap.raw %>% mutate(otu,!!!by.tax)
# coll_vars <- paste0("collapse",1:level) %>% rev()
# xx=taxmap %>% mutate(asdf=coalesce_indicators(!!!syms(coll_vars),else.value="other",first.hit.only=TRUE))
new.tax.otu <- otu %>%
left_join(taxmap,by="otu") %>%
pivot_wider(id_cols=c(!!!syms(allranks)),
names_from=sample,
values_from=numseqs,
values_fn = sum,
values_fill=0) %>%
mutate(otu=paste2(!!!syms(allranks),sep="|")) %>% arrange(otu)
if (fillin.levels) {
for (i in seq_along(taxranks)[-1]) {
var <- taxranks[i]
allvars <- taxranks[i:1]
new.tax.otu <- new.tax.otu %>%
mutate(!!sym(var):=coalesce(!!!syms(allvars)))
}
}
new.tax <- new.tax.otu %>% select(otu,!!!syms(taxranks))
new.otu <- new.tax.otu %>% select(otu,!!!syms(sample_names(phy)))
new.phy <- phyloseq(set.otu(new.otu),set.tax(new.tax),sample_data(phy))
ntaxa.final <- ntaxa(new.phy)
message(str_glue("Collapsed taxa, {ntaxa.orig} to {ntaxa.final}"))
return(new.phy)
}
#' Read OTU table from text file
#'
#' Used for uparse pipeline to read in otu table.
#'
#' Assumes tab-delimited file, with 'OTUID' to specify otu column.
#' @param otu.file text file to be read, containing otu table data
#' @param otu.row.names specify column listing OTU names. This is passed to \code{read.delim}; i.e. can be vector of accual row names, single number of column, or character string name of the column.
#' @return Dataframe containing otu table
#' @export
read.otu.table <- function(otu.file,row.names="OTUId") {
warning("YTWarning: Please note that this function is deprecated, uparse/mothur functions have been taken offsite.")
otu <- read.delim(otu.file,header=TRUE,check.names=FALSE,row.names=row.names) %>%
rownames_to_column("otu")
return(otu)
}
#' Read Tree File (Label-Fix)
#'
#' Read in tree file created by uparse pipeline.
#'
#' Functions like \code{read.tree} have trouble reading uparse-generated trees, because the OTU names contain semicolons(;), e.g. "\code{'OTU_2080;size=5;'}". This function temporarily replaces \code{;} with \code{__}, reads in successfully, and reverts back to names with semicolons. Also, removes the single quotes (\code{\'}) from the name. If read in this way, the tree can be quickly merged into a phyloseq object.
#'
#' @param tree.file Newick-formatted text file
#' @return Returns a \code{phylo} object, with original labels.
#' @examples
#' b <- import_biom("uparse/total.8.otu-tax.biom")
#' tr <- read.tree.uparse("uparse/total.10.tree")
#' phy <- merge_phyloseq(b,tr)
#' @author Ying Taur
#' @export
read.tree.uparse <- function(tree.file) {
warning("YTWarning: Please note that this function is deprecated, uparse/mothur functions have been taken offsite.")
tree.text <- scan(tree.file,what=character(),quiet=TRUE)
#replace ; with __, and place single quotes around, if not already there. (qiime places quotes, mothur does not)
new.tree.text <- gsub("'?(OTU_[0-9]+);(size=[0-9]+);'?","'\\1__\\2__'",tree.text)
#new.tree.text <- gsub("(OTU_[0-9]+);(size=[0-9]+);","\\1__\\2__",tree.text)
tr <- ape::read.tree(text=new.tree.text)
tr$tip.label <- gsub("(OTU_[0-9]+)__(size=[0-9]+)__","\\1;\\2;",tr$tip.label)
tr$node.label <- gsub("(OTU_[0-9]+)__(size=[0-9]+)__","\\1;\\2;",tr$node.label)
#tr$tip.label <- gsub("'?(OTU_[0-9]+)__(size=[0-9]+)__'?","\\1;\\2;",tr$tip.label)
#tr$node.label <- gsub("'?(OTU_[0-9]+)__(size=[0-9]+)__'?","\\1;\\2;",tr$node.label)
return(tr)
}
#' Read in mothur taxonomy file.
#'
#' @param tax.file mothur taxonomy file to be read.
#' @return Returns a data frame containing taxonomy
#' @author Ying Taur
#' @export
read.mothur.taxfile <- function(tax.file) {
warning("YTWarning: Please note that this function is deprecated, uparse/mothur functions have been taken offsite.")
taxlevels <- c("Kingdom","Phylum","Class","Order","Family","Genus","Species")
tbl <- read.delim(tax.file,header=FALSE,row.names=NULL) %>% rename(otu=V1,taxonomy=V2) %>%
mutate(taxonomy=sub(";$","",taxonomy)) %>%
separate(taxonomy,taxlevels,sep=";",remove=FALSE)
for (lvl in taxlevels) {
pctvar <- paste0(lvl,".pid")
tbl[[pctvar]] <- as.numeric(str_extract(tbl[[lvl]],middle.pattern("\\(","[0-9.]+","\\)")))
tbl[[lvl]] <- sub("\\([0-9.]+\\)","",tbl[[lvl]])
tbl[[lvl]] <- sub("[kpcofgs]_+","",tbl[[lvl]])
}
return(tbl)
}
#' Import UPARSE pipeline data and create phyloseq
#'
#' Reads folder and looks for key files used to create phyloseq object.
#'
#' @param dirpath directory path (character) specifying folder where uparse data is.
#' @param otu.file otu table file. Default is 'total.6.otu-table.txt'
#' @param tax.file tax file (blastn). Default is 'total.5.repset.fasta.blastn.refseq_rna.txt'. Specify \code{NULL} to skip.
#' @param repseq.file rep seq file (fasta format). Defaul is 'total.5.repset.fasta' Specify \code{NULL} to skip.
#' @param tree.file phylo tree file (newick file). Default is 'total.10.tree' Specify \code{NULL} to skip.
#' @return phyloseq object containing the specified UPARSE data.
#' @author Ying Taur
#' @export
read.uparse.data <- function(dirpath,
otu.file="total.6.otu-table.txt",
tax.file="total.5.repset.fasta.blastn.refseq_rna.txt",
repseq.file="total.5.repset.fasta",
tree.file="total.10.tree") {
requireNamespace("phyloseq",quietly=TRUE)
warning("YTWarning: Please note that this function is deprecated, uparse/mothur functions have been taken offsite.")
# dirpath="uparse";otu.file="total.6.otu-table.txt";tax.file="total.5.repset.fasta.blastn.refseq_rna.txt";repseq.file="total.5.repset.fasta";tree.file="total.10.tree"
if (!dir.exists(dirpath)) stop("YTError: This directory doesn't exist: ",dirpath)
# dirpath="uparse"
otu.file <- file.path(dirpath,otu.file)
if (!file.exists(otu.file)) stop("YTError not found: ",otu.file)
phy <- read.otu.table(otu.file) %>% set.otu()
if (!is.null(tax.file)) {
tax.file <- file.path(dirpath,tax.file)
if (!file.exists(tax.file)) stop("YTError not found: ",tax.file)
tax <- read.blastn.file(tax.file) %>% set.tax()
phy <- merge_phyloseq(phy,tax)
}
if (!is.null(repseq.file)) {
repseq.file <- file.path(dirpath,repseq.file)
if (!file.exists(repseq.file)) stop("YTError not found: ",repseq.file)
repseq <- phyloseq::import_qiime(refseqfilename=repseq.file)
phy <- merge_phyloseq(phy,repseq)
}
if (!is.null(tree.file)) {
tree.file <- file.path(dirpath,tree.file)
if (!file.exists(tree.file)) stop("YTError not found: ",tree.file)
tree <- read.tree.uparse(tree.file)
phy <- merge_phyloseq(phy,tree)
}
return(phy)
}
#' Plot Principal Components Analysis
#'
#' Plots PCA from distance matrix data.
#'
#' @param dist distance matrix to be plotted.
#' @param data logical, if \code{TRUE}, returns a data frame of PCA axes instead of the plot. Default is \code{FALSE}.
#' @param prefix character, an optional prefix text for PCA variable names. E.g. if \code{"unifrac"} is used, \code{"PCA1"} becomes \code{"unifrac.PCA1"}.
#' @return Returns a \code{ggplot2} graph of PCA1 and PCA2.
#' @examples
#' @author Ying Taur
#' @export
pca.plot <- function(dist,data=FALSE,prefix=NA) {
pca <- prcomp(dist)
pca.axes <- data.frame(pca$x,stringsAsFactors=FALSE)
pca.loadings <- summary(pca)$importance["Proportion of Variance",]
pca.labels <- paste0(sub("PC","PCA",names(pca.loadings))," (",percent(pca.loadings)," variation explained)")
for (i in 1:length(pca.labels)) {
label(pca.axes[,i]) <- pca.labels[i]
}
pca.axes$sample <- row.names(pca.axes)
if (data) {
names(pca.axes) <- sub("^PC",paste2(prefix,"PCA",sep="."),names(pca.axes))
return(pca.axes)
} else {
g <- ggplot(pca.axes) +
geom_point(aes(x=PC1,y=PC2,color=sample,size=3)) +
geom_text(aes(x=PC1,y=PC2,label=sample),size=3,vjust=1.4) +
theme(aspect.ratio=1)
return(g)
}
}
#' LEfSe prep
#'
#' Create the initial file needed for LEfSe (LDA Effect Size) analysis.
#'
#' This function performs the analysis using the following steps.
#' (1) Creates lefse.txt from phyloseq data, a tab-delimited file in the format input required by LEfSe.
#' (2) Provides the command line for running the analysis, assuming you have the scripts locally.
#' @param phy the phyloseq object containing data
#' @param class variable to be tested by LEfSe. This must be a variable in sample_data(phy)
#' @param subclass variable to perform subclass testing. This step is skipped if it is not specified.
#' @param subject variable referring to the subject level designation. This is only necessary if multiple samples per subject.
#' @param anova.alpha alpha level of the kruskal-wallis testing. Default is 0.05
#' @param wilcoxon.alpha alpha level at which to perform wilcoxon testing of subclass testing. Default is 0.05.
#' @param lda.cutoff Cutoff LDA to be reported. Default is 2.0.
#' @param wilcoxon.within.subclass Set whether to perform Wilcox test only among subclasses with the same name (Default FALSE)
#' @param mult.test.correction Can be {0,1,2}. Set the multiple testing correction options. 0 no correction (more strict, default), 1 correction for independent comparisons, 2 correction for independent comparison
#' @param one.against.one for multiclass tasks, sets whether testing is performed one-against-one (TRUE - more strict) or one-against-all (FALSE - less strict)
#' @param levels Taxonomic levels to be tested. Default is to test all levels: rank_names(phy)
#' @return Returns data
#' @examples
#' lefse.tbl <- lefse(phy1,class="CDI",subclass="Sex")
#' @export
lefse.prep <- function(phy,class,subclass=NA,
subject=NA,
anova.alpha = 0.05,
wilcoxon.alpha = 0.05,
lda.cutoff = 2,
wilcoxon.within.subclass = FALSE,
one.against.one = FALSE,
n_boots = 30,
min_c = 10,
f_boots = 0.67,
mult.test.correction = 0,
by_otus=FALSE,
levels=phyloseq::rank_names(phy)) {
requireNamespace(c("phyloseq","data.table"),quietly=TRUE)
# pkgs <- c("splines","stats4","survival","mvtnorm","modeltools","coin","MASS")
# missing.pkgs <- setdiff(pkgs,installed.packages()[,"Package"])
# if (length(missing.pkgs)>0) {
# warning("YTWarning: R packages are needed for the LEFSE scripts to work: ",paste(missing.pkgs,collapse=", "))
# }
warning("YTWarning: Please note that this function is deprecated, and only does the formatting step.
Use this with Docker or Conda image, or consider using lda.effect.")
keepvars <- c(class,subclass,subject,"sample")
keepvars <- unique(keepvars[!is.na(keepvars)])
samp <- get.samp(phy)[,keepvars]
if (by_otus) { #perform by otu only
otu <- get.otu.melt(phy,sample_data=FALSE)
otu.levels <- otu %>% mutate(taxon=otu) %>%
group_by(sample,taxon) %>% summarize(pctseqs=sum(pctseqs)) %>%
mutate(taxon=gsub(" ","_",taxon))
} else { #divide by taxonomy
otu <- get.otu.melt(phy,sample_data=FALSE)
otu.list <- lapply(1:length(levels),function(i) {
lvls <- levels[1:i]
lvl <- levels[i]
otu.level <- otu
otu.level$taxon <- do.call(paste,c(lapply(lvls,function(l) otu[[l]]),sep="|"))
otu.level$rank <- lvl
otu.level2 <- otu.level %>% group_by(sample,taxon,rank) %>% summarize(pctseqs=sum(pctseqs)) %>% ungroup()
return(otu.level2)
})
otu.levels <- bind_rows(otu.list) %>%
mutate(taxon=gsub(" ","_",taxon))
}
otu.tbl <- otu.levels %>%
dcast(sample~taxon,value.var="pctseqs",fill=0) %>%
left_join(samp,by="sample") %>%
select_(.dots=c(keepvars,lazyeval::interp(~everything())))
if (is.na(subject) | subject!="sample") {
otu.tbl <- otu.tbl %>% select(-sample)
}
tbl <- otu.tbl %>% t()
write.table(tbl,"lefse.txt",quote=FALSE,sep="\t",col.names=FALSE)
opt.class <- paste("-c",which(keepvars %in% class))
opt.subclass <- ifelse(is.na(subclass),"",paste("-s",which(keepvars %in% subclass)))
opt.subject <-ifelse(is.na(subject),"",paste("-u",which(keepvars %in% subject)))
format.command <- paste("format_input.py lefse.txt lefse.in",opt.class,opt.subclass,opt.subject,"-o 1000000")
# system(format.command)
# -m {f,s} set the policy to adopt with missin values: f removes
# the features with missing values, s removes samples
# with missing values (default f)
# -n int set the minimum cardinality of each subclass
# (subclasses with low cardinalities will be grouped
# together, if the cardinality is still low, no pairwise
# comparison will be performed with them)
lefse.command <- paste("run_lefse.py lefse.in lefse.res",
"-a",anova.alpha,
"-w",wilcoxon.alpha,
"-l",lda.cutoff,
"-e",as.numeric(wilcoxon.within.subclass),
"-y",as.numeric(one.against.one),
"-b",n_boots,
"--min_c",min_c,
"-f",f_boots,
"-s",mult.test.correction)
message("Commands: ")
message(format.command)
message(lefse.command)
message("plot_res.py lefse.res lefse_lda.png")
message("plot_cladogram.py lefse.res lefse_clado.pdf --format pdf")
return(list(format=format.command,
lefse=lefse.command,
plot1="plot_res.py lefse.res lefse_lda.png",
plot2="plot_cladogram.py lefse.res lefse_clado.pdf --format pdf"))
}
# from yingtools.R --------------------------------------------------------
copy.as.sql.old <- function(x,copy.clipboard=TRUE,fit=TRUE,width=getOption("width")-15) {
#converts x to R-code.
if (is.vector(x)) {
x <- as.character(x)
sql <- paste0("(",paste0("'",x,"'",collapse=","),")")
if (fit) {
sql <- fit(sql,width=width,copy.clipboard=FALSE)
}
} else if (is.data.frame(x)) {
# select '12345678' as mrn, 12 as number, '2016-01-01' as trans_dte
# from idb.oms_ord_catalog where OOC_MSTR_ITEM_GUID = '1000001000074005'
# union all
# select '12345679' as mrn, 13 as number, '2016-01-01' as trans_dte
# from idb.oms_ord_catalog where OOC_MSTR_ITEM_GUID = '1000001000074005'
# union all
# select '12345668' as mrn, 12 as number, '2016-01-01' as trans_dte
# from idb.oms_ord_catalog where OOC_MSTR_ITEM_GUID = '1000001000074005'
# union all
# select '12345448' as mrn, 14 as number, '2016-01-01' as trans_dte
# from idb.oms_ord_catalog where OOC_MSTR_ITEM_GUID = '1000001000074005'
#add quotations if necessary
format.value <- function(col) {
if (is.numeric(col)) {
newcol <- as.character(col)
} else {
newcol <- paste0("'",as.character(col),"'")
}
return(newcol)
}
# data2 <- mutate_all(x,funs(format.value))
data2 <- mutate_all(x,format.value)
for (var in names(data2)) {
data2[[var]] <- paste(data2[[var]],"as",var)
}
sql.values <- apply(data2,1,function(x) {
paste(x,collapse=",")
})
sql <- paste("select",sql.values,"from idb.oms_ord_catalog where OOC_MSTR_ITEM_GUID = '1000001000074005'",collapse="\nunion all\n")
}
if (copy.clipboard) {
copy.to.clipboard(sql)
}
return(sql)
}
#' Sample N Groups
#'
#' Sample groups from a grouped data frame
#' @param grouped_df the grouped data frame to be sampled
#' @param size number of groups to sample
#' @return a subset of the grouped data frame
#' @examples
#' gdf <- mtcars %>% group_by(gear,carb)
#' sample_n_groups(gdf,3)
sample_n_groups_OLD <- function(grouped_df, size) {
dplyr::group_data(grouped_df) %>%
dplyr::sample_n(size) %>%
dplyr::select(-.rows) %>%
dplyr::inner_join(grouped_df,by=dplyr::group_vars(grouped_df)) %>%
dplyr::group_by(!!!groups(grouped_df))
}
#' Extract any text within quotes.
#'
#' Works like \code{str_extract_all}, but is used to extract quoted text within text. This comes for example text a character string contains code itself, like a python list.
#'
#' This is more difficult than you might think. \code{str_extract_all(text,middle.pattern("\"",".*","\""))}
#' doesn't work because (1) it includes stuff on either side of the quote, and (2) it will fail if there are quotes inside the text (which look like \code{\\\"})within the quoted text.
#' So you need to extract based on \code{\"} but ignore \code{\\\"}, and only extract stuff between pairs of quotes.
#' @param text character vector with quotes to be extracted.
#' @param convert.text.quotes logical indicating whether or not to convert \\\" to \" after converting.
#' @examples
#' # Should be a 3 item python list, with middle item being empty.
#' python.list <- "[\"no quotes here, ok?\",\"\",\"I like to put \\\"things\\\" in quotes\"]"
#' #This doesn't work....
#' str_extract_all(python.list,middle.pattern("\"",".*","\""))
#' #This also doesn't work...
#' str_extract_all(python.list,middle.pattern("\"","[^\"]*","\""))
#' #Even this doesn't work
#' str_extract_all(python.list,middle.pattern("(?<!\\\\)\\\"",".*","(?<!\\\\)\\\""))
#' #But: use this function to get it done.
#' str_extract_all_quotes(python.list)
#' @author Ying Taur
#' @export
str_extract_all_quotes <- function(text,convert.text.quotes=TRUE) {
#text="\"\", \"tRNA acetyltransferase TAN1\""
quote.pattern <- "(?<!\\\\)\\\""
quote.list <- lapply(text,function(x) {
quote.pos <- gregexpr(quote.pattern,x,perl=TRUE)[[1]]
if (quote.pos[1]==-1) {
return(NULL)
}
if (length(quote.pos) %% 2!=0) stop("YTError: Found an odd number of quotes in this character string:\n",x)
quote.pairs <- split(quote.pos,cumsum(rep(1:0,length.out=length(quote.pos))))
within.quotes <- sapply(quote.pairs,function(y) substr(x,y[1]+1,y[2]-1))
if (convert.text.quotes) {
within.quotes <- gsub("\\\\\"","\\\"",within.quotes)
}
return(within.quotes)
})
return(quote.list)
}
#not sure I need this
fit <- function(x,width=100,copy.clipboard=TRUE) {
#width=100;copy.clipboard=TRUE
cr.pattern <- "(?<!\\\\)\\n"
multi.line <- grepl(cr.pattern,x,perl=TRUE)
if (multi.line) {
lines <- str_split(x,cr.pattern)[[1]]
out <- paste(sapply(lines,function(x) fit(x,width=width,copy.clipboard=FALSE)),collapse="\n")
} else {
#find quotes (cannot be preceded by backslash)
if (nchar(x)<=width) {
out <- x
} else {
quote.pos <- gregexpr("(?<!\\\\)\\\"",x,perl=TRUE)[[1]]
if (quote.pos[1]==-1) {
within.quotes <- NULL
} else {
if (length(quote.pos) %% 2!=0) stop("YTError: Found an odd number of quotes in this character string:\n",x)
quote.pairs <- split(quote.pos,cumsum(rep(1:0,length.out=length(quote.pos))))
#mark characters that are within quotes.
within.quotes <- lapply(quote.pairs,function(x) x[1]:x[2])
within.quotes <- stack(within.quotes)$values
}
comma.pos <- gregexpr(",",x)[[1]]
if (comma.pos[1]==-1) {
out <- x
} else {
valid.cr.pos <- setdiff(comma.pos,within.quotes)
min.valid.cr.pos <- min(valid.cr.pos)
if (min.valid.cr.pos>=width) {
new.cr <- min.valid.cr.pos
} else {
chars <- strsplit(x,"")[[1]]
cumsum.chars <- cumsum(chars!="\\")
valid.crs.length <- cumsum.chars[valid.cr.pos]
new.cr <- max(valid.cr.pos[valid.crs.length<=width])
}
first.half <- substr(x,1,new.cr)
second.half <- substr(x,new.cr+1,nchar(x))
out <- paste(first.half,fit(second.half,width=width,copy.clipboard=FALSE),sep="\n")
}
}
}
if (copy.clipboard) {
copy.to.clipboard(out)
}
return(out)
}
#' Make Table
#'
#' Creates a summary table (data frame) variables from the data.
#'
#' This was written to create a "Table 1" of a manuscript.
#' @param data Data frame containing data to be described.
#' @param vars character vector of variables within \code{data} to be summarized.
#' @param by Optional, variable name (character) by which to summarize the data. Each separate value will be a column of data in the table.
#' @param showdenom logical, whether to show denominator in the cells.
#' @param fisher.test fisher logical, whether or not to calculate Fisher exact tests. Only performed if \code{by} is also specified.
#' @return Returns a data frame formatted to be summary table.
#' @examples
#' make.table(mtcars,c("cyl","gear"))
#' make.table(mtcars,c("cyl","gear"),by="vs",showdenom=TRUE)
#' @author Ying Taur
#' @export
make.table <- function(data,vars,by=NULL,showdenom=FALSE,fisher.test=TRUE) {
message("Note, make.table is deprecated, consider using make_table")
all.vars <- unique(c(vars,by))
if (any(all.vars %!in% names(data))) {stop("YTError, variable not found in data frame: ",paste(setdiff(c(vars,by),names(data)),collapse=", "))}
data <- data[,all.vars,drop=FALSE]
if (!is.null(by)) {
if (by %in% vars) {warning("YTWarning, ",paste(intersect(by,vars),collapse=",")," is listed in both 'vars' and 'by'!")}
}
factorize <- function(x,ifany=TRUE,as.string=TRUE) {
if (!is.factor(x)) {x <- factor(x)}
if (ifany & !any(is.na(x))) {return(x)}
ll <- levels(x)
if (!any(is.na(ll))) {ll <- c(ll, NA)}
x <- factor(x, levels = ll, exclude = NULL)
if(as.string) {levels(x)[is.na(levels(x))] <- "NA"}
return(x)
}
data <- data %>% mutate_all(factorize)
get.column <- function(subdata) {
#subdata=data
denom <- nrow(subdata)
subtbl <- plyr::adply(vars,1,function(var) {
subdata %>% group_by_(value=var) %>% tally() %>% complete(value,fill=list(n=0)) %>%
mutate(var=var,value=ifelse(!is.na(value),as.character(value),"NA"),denom=denom,pct=n/denom)
},.id=NULL)
if (showdenom) {
subtbl <- subtbl %>% mutate(lbl=paste0(n,"/",denom," (",percent(pct),")"))
} else {
subtbl <- subtbl %>% mutate(lbl=paste0(n," (",percent(pct),")"))
}
#combine var and value pairs. the combined variable is saved as factor to preserve the order during spread
subtbl <- subtbl %>% unite(var_value,var,value,sep="==") %>% mutate(var_value=factor(var_value,levels=var_value))
return(subtbl)
}
tbl <- get.column(data) %>% mutate(column="total")
if (!is.null(by)) {
#run get.column function for each subgroup
sub.tbl <- data %>% group_by_(column=by) %>% do(get.column(.)) %>% ungroup()
#recode subgroup values to include variable name
levels(sub.tbl$column) <- paste0(by,"=",levels(sub.tbl$column))
#combines total and subgroups. use factor levels to preserve subgroup order when spread is performed.
tbl <- tbl %>% bind_rows(sub.tbl) %>%
mutate(column=factor(column,levels=c("var","value",levels(sub.tbl$column),"total")))
}
#reshape into final columns using spread command. then re-separate the var_value into separate variables
tbl.all <- tbl %>% dplyr::select(var_value,column,lbl) %>% spread(column,lbl) %>% separate(var_value,c("var","value"),sep="==")
if (fisher.test & !is.null(by)) {
fisher.pval <- sapply(vars,function(var) {
if (n_distinct(data[[var]])==1) {
warning("YTWarning: ",var," does not vary. Skipping Fisher test.")
return(NA_real_)
}
ftest <- fisher.test(data[[var]],data[[by]])
ftest$p.value
})
tbl.all$fisher <- ""
tbl.all$fisher[match(names(fisher.pval),tbl.all$var)] <- formatC(fisher.pval,format="f",digits=3)
}
return(tbl.all)
}
#' Get Rows (optimized for timeline plots) OLD
#'
#' Given timeline event data with event type labels and start/stop times, calculate rows.
#' If requested, this will attempt to save vertical plot space by placing two event types on the same row, where possible.
#' @param start vector of event start times (numeric or Date).
#' @param stop vector of event stop times (numeric or Date).
#' @param row vector of event types. Can be original row assignments or event labels.
#' @param by optional grouping variable (vector or list of vectors), where events of the same group will be kept to together. Default is \code{NULL}
#' @param min.gap minimum allowable gap between two different event types, if they are to be placed on the same row. Default is \code{Inf}: no row merging, \code{0} tries to perform as much merging as possible.
#' @return Returns a vector of row number assignments for each time event.
#' @author Ying Taur
get.row.OLD <- function(start,stop,row,by=NULL,min.gap=Inf) {
# start=medssub$start_day;stop=medssub$stop_day;row=medssub$y.row;by=list(medssub$abx_class,medssub$med_class3);min.gap=0
if (min.gap<0) {
stop("YTError: min.gap must be greater than 0")
}
if (length(start)==0|length(stop)==0) {return(NA_integer_)}
if (!is.null(by)) {
d <- data.frame(start,stop,row,by)
by.list <- setdiff(names(d),c("start","stop","row"))
dd <- d %>%
mutate(orig.order=1:n()) %>%
group_by_(.dots=by.list) %>%
mutate(newrow=get.row(start,stop,row,min.gap=min.gap)) %>%
ungroup() %>%
arrange_(.dots=c(by.list,"newrow"))
dd$newrow2 <- do.call(paste,dd[,c(by.list,"newrow")])
dd <- dd %>%
mutate(#newrow2=paste(by,newrow),
newrow2=factor(newrow2,levels=unique(newrow2)),
newrow2=as.numeric(newrow2)) %>%
arrange(orig.order)
return(dd$newrow2)
}
d <- data.frame(start,stop,row)
d.collapse <- d %>% group_by(row) %>%
summarize(start=min(start),stop=max(stop)) %>% ungroup() %>%
mutate(y.row1=row_number(stop),
y.row2=row_number(start)-n())
d.row.test <- plyr::adply(0:(nrow(d.collapse)-1),1,function(overlap) {
d.test <- d.collapse %>%
mutate(y.row2=y.row2+overlap,
y.row3=ifelse(y.row2>=1,y.row2,y.row1))
overlap.check <- d.test %>% filter(y.row3<=overlap) %>%
group_by(y.row3) %>% filter(n()==2) %>%
arrange(start) %>%
summarize(start1=start[1],stop1=stop[1],start2=start[2],stop2=stop[2]) %>%
mutate(gap=start2-stop1,
overlaps=start2-stop1<=min.gap)
data.frame(overlap,n.rows=n_distinct(d.test$y.row3),
gap=suppressWarnings(min(overlap.check$gap)))
},.id=NULL)
d.use.row <- d.row.test %>% filter(gap>=min.gap) %>% arrange(n.rows,desc(gap)) %>% slice(1)
d.final <- d.collapse %>%
mutate(y.row2=y.row2+d.use.row$overlap,
y.row3=ifelse(y.row2>=1,y.row2,y.row1),
y.row3=dense_rank(y.row3))
newrow <- d.final$y.row3[match(d$row,d.final$row)]
return(newrow)
}
#' Select 2
#'
#' Basically \code{dplyr::select}, but ignores variables that aren't found in the data frame.
#'
#' @param data Data frame
#' @param ... Comma separated list of unquoted expressions. You can treat variable names like they are positions. Use positive values to select variables; use negative values to drop variables.
#' @return Returns \code{data}, but grouped by times and other variables.
#' @author Ying Taur
#' @export
select2 <- function(data,...) {
select_vars <- quos(...)
select_var_names <- sapply(select_vars,as_name)
select_vars_keep <- select_vars[select_var_names %in% names(data)]
data %>% select(!!!select_vars_keep)
}
#' Cox Proportional Hazards Regression (TAKE 2)
#'
#' STILL WRITING THIS
#' @export
cox.old <- function(data, yvar, ... , starttime=NULL, return.split.data=FALSE,args5=list(cens.model="cox",model="fg")) {
requireNamespace(c("coxphf","cmprsk","timereg","riskRegression"),quietly=TRUE)
yvar <- enquo(yvar)
starttime <- enquo(starttime)
xvars <- quos(...)
# yvar=sym("vre.bsi");xvars=syms("agebmt");starttime=sym(NULL)
yvarday <- quo_name(yvar) %>% paste0("_day") %>% sym()
is.td <- function(var) {
var <- enquo(var)
vardayname <- quo_name(var) %>% paste0("_day")
has_name(data,vardayname)
}
xvars.td <- xvars[sapply(xvars,is.td)]
if (length(xvars.td)>0) {
xvarsdays.td <- xvars.td %>% sapply(quo_name) %>% paste0("_day") %>% syms()
} else {
xvarsdays.td <- syms(NULL)
}
timevars <- c(yvarday,xvarsdays.td)
data <- data %>% mutate_at(vars(!!yvar,!!!xvars.td),as.numeric)
if (quo_is_null(starttime)) {
# data <- data %>% mutate(.y=!!yvar,.tstart=-10000,.tstop=!!yvarday)
# .tstart is pmin of all time vars, because coxphf can't handle -Inf as tstart.
mintime <- data %>% select(!!yvarday,!!!timevars) %>% min(na.rm=TRUE)
start <- min(mintime-1,0)
message("Setting start time as: ",start)
data <- data %>% mutate(.y=!!yvar,.tstart=start,.tstop=!!yvarday) %>%
mutate_at(vars(.tstart,.tstop,!!!timevars),function(x) x-start)
} else {
data <- data %>% mutate(.y=!!yvar,.tstart=!!starttime,.tstop=!!yvarday)
}
splitline <- function(data,xvar) {
xvar <- enquo(xvar)
xvarday <- quo_name(xvar) %>% paste0("_day") %>% sym()
data.nochange <- data %>% filter(!!xvar==0|is.na(!!xvar))
data.split <- data %>% filter(!!xvar==1,.tstart<!!xvarday,!!xvarday<.tstop)
data.xafter <- data %>% filter(!!xvar==1,.tstop<=!!xvarday)
data.xbefore <- data %>% filter(!!xvar==1,!!xvarday<=.tstart)
data.nochange.new <- data.nochange
data.xbefore.new <- data.xbefore
data.xafter.new <- data.xafter %>% mutate(!!xvar:=0)
data.split.new1 <- data.split %>% mutate(.tstop=!!xvarday,!!xvar:=0,.y=0)
data.split.new2 <- data.split %>% mutate(.tstart=!!xvarday,!!xvar:=1)
newdata <- bind_rows(data.nochange.new,data.xbefore.new,data.xafter.new,data.split.new1,data.split.new2) %>%
select(-!!xvarday)
return(newdata)
}
data2 <- data
for (xvar in xvars.td) {
data2 <- data2 %>% splitline(!!xvar)
}
if (return.split.data) {
return(data2)
}
is.competing <- !all(pull(data,!!yvar) %in% c(0,1,NA))
has.timevarying <- length(xvars.td)>0 & nrow(data2)>nrow(data)
leftside <- "Surv(.tstart,.tstop,.y)"
rightside <- xvars %>% sapply(quo_name) %>% paste(collapse=" + ")
model <- paste(leftside,rightside,sep=" ~ ")
formula <- as.formula(model)
#result 1, regular cox
src <- tibble(xvar="<error>",method="coxph")
if (is.competing) {
src <- tibble(xvar="<competing>",method="coxph")
} else {
tryCatch({
results <- coxph(formula,data=data2)
sr <- summary(results)
src <- sr$conf.int %>% as.data.frame() %>% rownames_to_column("var") %>%
as_tibble() %>%
select(xvar=var,haz.ratio=`exp(coef)`,lower.ci=`lower .95`,upper.ci=`upper .95`) %>%
mutate(p.value=sr$coefficients[,"Pr(>|z|)"],method="coxph")
},error=function(e) {
})
}
#result 2
src2 <- tibble(xvar="<error>",method="coxphf.F")
if (is.competing) {
src2 <- tibble(xvar="<competing>",method="coxphf.F")
} else {
tryCatch({
results2 <- coxphf(formula,data=data2,firth=F)
sr2 <- summary(results2)
src2 <- tibble(xvar=names(sr2$coefficients),
haz.ratio=exp(sr2$coefficients),
lower.ci=sr2$ci.lower,
upper.ci=sr2$ci.upper,
p.value=sr2$prob,
method="coxphf.F")
},error=function(e) {
})
}
#result 3
src3 <- tibble(xvar="<error>",method="xxx")
if (is.competing) {
src3 <- tibble(xvar="<competing>",method="coxphf.T")
} else {
tryCatch({
results3 <- coxphf(formula,data=data2,firth=T)
sr3 <- summary(results3)
src3 <- tibble(xvar=names(sr3$coefficients),
haz.ratio=exp(sr3$coefficients),
lower.ci=sr3$ci.lower,
upper.ci=sr3$ci.upper,
p.value=sr3$prob,
method="coxphf.T")
},error=function(e) {
})
}
#result 4
src4 <- tibble(xvar="<error>",method="xxx")
if (has.timevarying) {
src4 <- tibble(xvar="<timevarying>",method="crr")
} else {
tryCatch({
cov <- paste0("~",rightside) %>% as.formula() %>% model.matrix(data=data2)
cov <- cov[,-1,drop=FALSE]
results4 <- data2 %>% with(crr(.tstop,.y,cov1=cov))
sr4 <- summary(results4)
src4 <- sr4$conf.int %>% as.data.frame() %>% rownames_to_column("var") %>%
as_tibble() %>%
select(xvar=var,haz.ratio=`exp(coef)`,lower.ci=`2.5%`,upper.ci=`97.5%`) %>%
mutate(p.value=sr4$coef[,"p-value"],method="crr")
},error=function(e) {
})
}
#results 5, Fine gray riskRegression
src5 <- tibble(xvar="<error>",method="xxx")
if (has.timevarying) {
src5 <- tibble(xvar="<timevarying>",method="riskRegression")
} else {
tryCatch({
leftside.b <- "Hist(.tstop, .y)"
rightside <- xvars %>% sapply(quo_name) %>% paste(collapse=" + ")
model <- paste(leftside.b,rightside,sep=" ~ ")
formula <- as.formula(model)
print(formula)
r5 <- FGR(formula,data=data2,cause=1)
sr5 <- summary(r5)
src5 <- cbind(sr5$coef,sr5$conf.int) %>% as.data.frame() %>% rownames_to_column("var") %>%
select(xvar=var,haz.ratio=`exp(coef)`,lower.ci=`2.5%`,upper.ci=`97.5%`,p.value=`p-value`) %>%
mutate(method="riskRegression")
},error=function(e) {
})
}
# #results6 timereg
# leftside.b <- "Event(.tstart, .tstop, .y)"
# rightside.b <- xvars %>% sapply(quo_name) %>% paste0("const(",.,")",collapse=" + ")
# model.b <- paste0(leftside.b," ~ ",rightside.b)
# args <- c(list(formula=as.formula(model.b),data=data2,cause=1),args5)
# results6 <- do.call(comp.risk,args)
# sr6 <- coef(results6) %>% as.data.frame() %>% rownames_to_column("var")
# src6 <- tibble(xvar=sr6$var,haz.ratio=exp(sr6$Coef.),lower.ci=exp(sr6$`lower2.5%`),upper.ci=exp(sr6$`upper97.5%`),p.value=sr6$`P-val`,method="timereg")
d <- bind_rows(src,src2,src3,src4,src5)
d
# list(coxph=results,coxphf.F=results2,coxphf.T=results3,
# crr=results4,data=d)
}
#' Cox Proportional Hazards Regression
#'
#' Analyzes survival data by Cox regression.
#'
#' Convenience function for survival analysis. Typically uses the \code{coxphf} function.
#'
#' @param ... variable names in the regression
#' @param starttime character column name for start times (either point to zero or indicate left censor times). Default is "tstart".
#' @param data survival data.
#' @param addto if specified, add results to this data.frame of results. Default is NULL
#' @param as.survfit if TRUE, return the survival fit object (use for kaplan-meier stuff).
#' @param firth whether or not to perform Firth's penalized likelihood. Default is TRUE.
#' @param formatted whether to format the data.frame of results. Default is TRUE
#' @param logrank whether to calculate log rank p-value. Default is FALSE
#' @param coxphf.obj whether to return cox results object (rather than regression table). Default is FALSE.
#' @param return.split.data whether to return data after split (do this to split time-dependent variables and run Cox manually). Default is FALSE
#' @return a regression table with the survival results
#' @author Ying Taur
#' @export
stcox <- function( ... ,starttime="tstart",data,addto,as.survfit=FALSE,firth=TRUE,formatted=TRUE,logrank=FALSE,coxphf.obj=FALSE,return.split.data=FALSE) {
requireNamespace("coxphf",quietly=TRUE)
data <- data.frame(data)
y <- c(...)[1]
xvars <- c(...)[-1]
y.day <- paste0(y,"_day")
#xvars that are time-dependent
td.xvars <- xvars[paste(xvars,"_day",sep="") %in% names(data)]
if (length(td.xvars)>0) {
td.xvars.day <- paste(td.xvars,"_day",sep="")
} else {
td.xvars.day <- NULL
}
all.vars <- unique(c(y,y.day,xvars,td.xvars.day,starttime))
#if data is large, this speeds up a lot
data <- data[,all.vars]
#define y, s.start and s.stop
data$y <- data[,y]
data$s.start <- data[,starttime]
data$s.stop <- data[,y.day]
data <- subset(data,s.start<s.stop)
#### check for missing x values.
for (x in xvars) {
missing.x <- is.na(data[,x])
if (any(missing.x)) {
print(paste0(" NOTE - missing values in ",x,", removing ",length(sum(missing.x))," values."))
data <- data[!is.na(data[,x]),]
}
}
splitline <- function(x) {
#time dep xvars where x=1 (xvar.split are vars from td.xvars where splitting needs to be done)
xvar.split <- td.xvars[c(x[,td.xvars]==1)]
xvarday.split <- td.xvars.day[c(x[,td.xvars]==1)]
if (length(xvar.split)==0) {
return(x)
} else {
#cutpoints=time dep where xvars=1, and are between s.start and s.stop
cutpoints <- unlist(c(x[,xvarday.split])) #days at which an xvar==1
cutpoints <- cutpoints[x$s.start<cutpoints & cutpoints<x$s.stop] #eliminate days on s.start or s.stop
cutpoints <- unique(cutpoints)
#sort cutpoints and add s.start and s.stop at ends.
cutpoints <- c(x$s.start,cutpoints[order(cutpoints)],x$s.stop)
#set up new.x, with s.start and s.stop, y=0 by default
new.x <- data.frame(s.start=cutpoints[-length(cutpoints)],s.stop=cutpoints[-1],y=0)
#the last row of new.x takes value of y, the rest are y=0
new.x[nrow(new.x),"y"] <- x$y
#determine values of time dep x's (td.xvars) for each time interval. first, x=0 by default
new.x[,td.xvars] <- 0
for (i in 1:length(xvar.split)) {
xvar <- xvar.split[i]
xvarday <- xvarday.split[i]
new.x[new.x$s.start>=x[,xvarday.split[i]],xvar.split[i]] <- 1
}
remaining.vars <- setdiff(names(x),names(new.x))
new.x[,remaining.vars] <- x[,remaining.vars]
return(new.x)
}
}
if (length(td.xvars)>0) {
data <- plyr::adply(data,1,splitline)
}
if (return.split.data) {
return(data)
}
#calculate model
leftside <- "survival::Surv(s.start,s.stop,y)"
rightside <- paste(xvars,collapse=" + ")
model <- paste(leftside,rightside,sep=" ~ ")
formula <- as.formula(model)
for (x in xvars) { #check for nonvarying predictors
xvalues <- unique(data[,x])
if (length(xvalues)==1) {
stop("YTError: This predictor does not vary across observations: ",x," is always equal to ",xvalues)
}
}
if (as.survfit) {
#return a survfit object
return(survival::survfit(formula,data=data))
} else if (logrank) {
#output logrank test
results <- summary(coxph(formula,data=data))
return(results$logtest[3])
} else {
results <- coxphf::coxphf(formula,data=data,firth=firth)
if (coxphf.obj) {
return(results)
}
results.table <- data.frame(
model=model,
yvar=y,
xvar=names(results$coefficients),
haz.ratio=exp(results$coefficients),
lower.ci=results$ci.lower,
upper.ci=results$ci.upper,
p.value=results$prob,
row.names=NULL,stringsAsFactors=FALSE)
#mark time-dependent xvars with "(td)". note that this just looks for var in td.xvars;
#if variable has level specifications, then it won't work. (could fix this with reg expr instead)
if (length(td.xvars)>0) {
results.table$xvar[results.table$xvar %in% td.xvars] <- sapply(results.table$xvar[results.table$xvar %in% td.xvars],function(x) paste0(x,"(td)"))
}
if (formatted) {
results.table$signif <- as.character(cut(results.table$p.value,breaks=c(-Inf,0.05,0.20,Inf),labels=c("****","*","-")))
results.table$haz.ratio <- format(round(results.table$haz.ratio,2),nsmall=2)
results.table$lower.ci <- format(round(results.table$lower.ci,2),nsmall=2)
results.table$upper.ci <- format(round(results.table$upper.ci,2),nsmall=2)
results.table$p.value <- format(round(results.table$p.value,3),nsmall=3)
results.table <- plyr::adply(results.table,1,function(x) {
x$haz.ratio <- paste0(x$haz.ratio," (",x$lower.ci," - ",x$upper.ci,")")
return(x)
})
results.table <- subset(results.table,select=c(model,yvar,xvar,haz.ratio,p.value,signif))
}
if (missing(addto)) {
return(results.table)
} else {
return(rbind(addto,results.table))
}
}
}
#' Univariate Cox Proportional Hazards
#'
#' Perform univariate survival, then multivariate on significant variables.
#' @param yvars column name of survival endpoint.
#' @param xvars column names of predictors.
#' @param tstart column name of start variable.
#' @param data the data frame to be analyzed
#' @param firth whether to perform Firth's penalized likelihood correction.
#' @param multi whether to perform multivariate modelling of signficant univariate predictors
#' @param multi.cutoff if multivariate is done, the P-value cutoff for inclusion into the multivariate model.
#' @return A regression table containing results
#' @export
univariate.stcox <- function(yvar,xvars,starttime="tstart",data,firth=TRUE,multi=FALSE,multi.cutoff=0.2) {
# yvar="dead.180";xvars=c("age","race.group","detect.trop","imm.med.group2");starttime="tstart";data=pt;firth=F;multi=TRUE;multi.cutoff=0.2;referrent=FALSE
results.list <- lapply(xvars,function(xvar) {
print(xvar)
tryCatch({
stcox(yvar,xvar,starttime=starttime,data=data,firth=firth)
},error=function(e) {
warning(e$message)
data.frame(model=paste0("Surv(s.start,s.stop,y) ~ ",xvar),yvar=yvar,xvar=paste0(xvar,"[ERROR non-varying]"),haz.ratio=NA,p.value=NA,signif=NA)
})
})
results.table <- results.list %>% bind_rows()
if (multi) {
multi.signif <- sapply(results.list,function(tbl) {
any(tbl$p.value<=multi.cutoff)
})
multivars <- xvars[multi.signif]
message("Multivariate model:")
if (length(multivars)>0) {
message(paste0(multivars,collapse=","))
multi.table <- stcox(yvar,multivars,starttime=starttime,data=data,firth=firth)
names(multi.table) <- recode2(names(multi.table),c("haz.ratio"="multi.haz.ratio","p.value"="multi.p.value","signif"="multi.signif"))
multi.table <- multi.table %>% select(-model)
results.table <- results.table %>% select(-model)
combined.table <- results.table %>% left_join(multi.table,by=c("yvar","xvar")) %>%
mutate_at(vars(multi.haz.ratio,multi.p.value,multi.signif),function(x) ifelse(is.na(x),"",x))
# return(list(uni=results.table,multi=multi.table))
n.events <- sum(data[[yvar]])
n.multivars <- length(multivars)
print(paste0(round(n.events/n.multivars,3)," events per multivariable (",n.events,"/",n.multivars,", consider overfitting if less than 10)"))
return(combined.table)
} else {
print("No variables in multivariate!")
results.table <- subset(results.table,select=-model)
results.table$multi.haz.ratio <- ""
results.table$multi.p.value <- ""
results.table$multi.signif <- ""
return(results.table)
}
}
return(results.table)
}
#' Create Survival Data Frame
#'
#' @param f.survfit the survival data
#' @param time0 time0 can be specified. It must be <= first event
#' @return Returns survival data frame
#' @keywords keyword1 keyword2 ...
#' @author Ying Taur
#' @export
createSurvivalFrame <- function(f.survfit,time0=0) {
# define custom function to create a survival data.frame
#YT edit: time0 can be specified. it must be <= first event. if it isn't use first event as time0
time0 <- min(time0,f.survfit$time[1])
# initialise frame variable
f.frame <- NULL
# check if more then one strata
if (length(names(f.survfit$strata))==0) {
f.frame <- data.frame(time=f.survfit$time, n.risk=f.survfit$n.risk, n.event=f.survfit$n.event, n.censor = f.survfit$n.censor, surv=f.survfit$surv, upper=f.survfit$upper, lower=f.survfit$lower)
# create first two rows (start at 1)
f.start <- data.frame(time=c(time0, f.frame$time[1]), n.risk=c(f.survfit$n, f.survfit$n), n.event=c(0,0), n.censor=c(0,0), surv=c(1,1), upper=c(1,1), lower=c(1,1))
# add first row to dataset
f.frame <- rbind(f.start, f.frame)
# remove temporary data
rm(f.start)
# create data.frame with data from survfit
} else { #multiple strata
# create vector for strata identification
f.strata <- NULL
for(f.i in 1:length(f.survfit$strata)){
# add vector for one strata according to number of rows of strata
f.strata <- c(f.strata, rep(names(f.survfit$strata)[f.i], f.survfit$strata[f.i]))
}
# create data.frame with data from survfit (create column for strata)
f.frame <- data.frame(time=f.survfit$time, n.risk=f.survfit$n.risk, n.event=f.survfit$n.event, n.censor = f.survfit
$n.censor, surv=f.survfit$surv, upper=f.survfit$upper, lower=f.survfit$lower, strata=factor(f.strata))
# remove temporary data
rm(f.strata)
# create first two rows (start at 1) for each strata
for(f.i in 1:length(f.survfit$strata)){
# take only subset for this strata from data
f.subset <- subset(f.frame, strata==names(f.survfit$strata)[f.i])
# create first two rows (time: time0, time of first event) YT edit, not time 0 but
f.start <- data.frame(time=c(time0, f.subset$time[1]), n.risk=rep(f.survfit[f.i]$n, 2), n.event=c(0,0), n.censor=c(0,0), surv=c(1,1), upper=c(1,1), lower=c(1,1), strata=rep(names(f.survfit$strata)[f.i],2))
# add first two rows to dataset
f.frame <- rbind(f.start, f.frame)
# remove temporary data
rm(f.start, f.subset)
}
# reorder data
f.frame <- f.frame[order(f.frame$strata, f.frame$time), ]
# rename row.names
rownames(f.frame) <- NULL
}
# return frame
return(f.frame)
}
#' At-Risk Table from Survival Data Frame
#'
#' @param t.breaks vector of times to calculate at-risk
#' @param sf survival frame
#' @param minus.epsilon.last.t means we substract a small amt from last timepoint, because otherwise it's all NA.
#' @author Ying Taur
#' @export
survival.frame.atrisk.table <- function(t.breaks,sf,minus.epsilon.last.t=TRUE,melt=FALSE,row.name.xloc) {
#t.breaks=0:3*365;minus.epsilon.last.t=TRUE;melt=TRUE;row.name.xloc=-200
epsilon <- (max(t.breaks) - min(t.breaks)) / 1000000
t.breaks2 <- ifelse(t.breaks==max(t.breaks),t.breaks-epsilon,t.breaks)
tbl <- data.frame(lapply(t.breaks2,function(t) {
survival.frame.info(t,sf,"n.risk")
}))
names(tbl) <- paste0("time.",t.breaks)
#the factor is to keep factor order same as order of table
tbl <- data.frame(strata=factor(row.names(tbl),levels=row.names(tbl)),tbl,row.names=NULL)
if (melt) {
#default for xlocation
if (missing(row.name.xloc)) {
row.name.xloc <- -200
}
tbl[,paste0("time.",row.name.xloc)] <- tbl$strata
measure.vars <- grep("time\\.",names(tbl),value=TRUE)
atrisk.melt <- melt(tbl,measure.vars=measure.vars)
atrisk.melt$x <- as.numeric(sub("^time\\.","",atrisk.melt$variable))
atrisk.melt$y <- as.numeric(atrisk.melt$strata)
atrisk.melt$label <- atrisk.melt$value
return(atrisk.melt)
} else {
return(tbl)
}
}
#' Survival Frame Info
#'
#' Given survival.frame, and time, provide info in the survivalframe.
#' @param t time
#' @param sf survival frame
#' @param infotype type of info needed. Needs to be a variable in survival frame.
#' @return info needed from survival frame
#' @author Ying Taur
#' @export
survival.frame.info <- function(t,sf,infotype) {
#given survival.frame, and time, provide info in the survivalframe.
#infotype is the name of variable: "n.risk","surv",etc.
if (!(infotype %in% names(sf))) {
print("Error, infotype needs to be a variable in survival.frame")
return(NULL)
}
if (!("strata" %in% names(sf))) {
sf$strata <- "num.at.risk"
}
sf <- sf[order(sf$strata),]
daply(sf,"strata",function(x) {
before.times <- x$time<=t
if (all(before.times)) {
return(NA)
} else {
sub.x <- x[before.times,]
return(sub.x[order(sub.x$time,decreasing=TRUE)[1],infotype])
}
})
}
#' Generate Kaplan-Meier curve in ggplot2
#'
#' Creates a Kaplan-Meier curve which can be used like a geom in ggplot2.
#'
#' Use this to make Kaplan-Meier curves in ggplot2. Utilizes the \code{geom_step} function to draw.
#' \code{yingtools::stcox} function is used to generate data from a survival frame.
#'
#' @param yvar character, used to indicate the variable set to be used as survival endpoint of interest.
#' For example, if \code{yvar="var1"} is specified, then \code{data} should have \code{"var1"} will represent
#' whether the endpoint occurred (logical or 0-1), and \code{"var1_day"} will represent the time at which
#' the event occurred (or didn't occur).
#' @param xvar character, indicating the variable within \code{data} that will contain the groups by which curves will be generated.
#' @param data data frame containing the survival data.
#' @param starttime character, specifying the column within \code{data} that indicates the survival start time.
#' If there is no left censoring, then this would refer to a vector of 0's. Default is \code{"tstart"}
#' @param flip.y logical, indicating whether or not to flip the y-axis. \code{flip.y = FALSE} by default (curve is downwards).
#' @param size line thickness for the survival curve.
#' @return A ggplot2 geom object with Kaplan-Meier curve.
#' @examples
#' library(ggplot2)
#' ggplot() + geom_kaplanmeier("dead","intensity",data=cid.patients)
#' @author Ying Taur
#' @export
geom_kaplanmeier <- function(yvar,xvar=NULL,data,starttime="tstart",flip.y=FALSE,size=NULL,logrank=FALSE,logrank.pos=NULL,logrank.fontsize=5) {
if (is.null(xvar)) {
data$one <- 1
sf <- createSurvivalFrame(stcox(yvar,"one",starttime=starttime,data=data,as.survfit=TRUE))
} else {
sf <- createSurvivalFrame(stcox(yvar,xvar,starttime=starttime,data=data,as.survfit=TRUE))
sf[,xvar] <- sub("^.+=","",sf$strata)
if (is.factor(data[,xvar])) {
sf[,xvar] <- factor(sf[,xvar],levels=levels(data[,xvar]))
}
}
if (flip.y) {
sf$surv <- 1 - sf$surv
}
if (is.null(xvar)) {
if (is.null(size)) {
g <- geom_step(data=sf,aes_string(x="time",y="surv"))
} else {
g <- geom_step(data=sf,aes_string(x="time",y="surv"),size=size)
}
g <- list(g,ylim(0,1))
} else {
if (is.null(size)) {
g <- geom_step(data=sf,aes_string(x="time",y="surv",color=xvar,group=xvar))
} else {
g <- geom_step(data=sf,aes_string(x="time",y="surv",color=xvar,group=xvar),size=size)
}
g <- list(g,ylim(0,1))
if (logrank) {
#function to find best x,y for text
find_best_spot <- function(plot) {
gb <- ggplot_build(plot)
xlim <- gb$layout$panel_params[[1]]$x.range
ylim <- gb$layout$panel_params[[1]]$y.range
xrange <- xlim[2]-xlim[1]
yrange <- ylim[2]-ylim[1]
xs <- seq(xlim[1],xlim[2],length.out=50)
ys <- seq(ylim[1],ylim[2],length.out=50)
d.data <- lapply(gb$data,function(data) {
d.pts <- tibble()
if (c("x","y") %allin% names(data)) {
newdata <- data %>% select(x=x,y=y)
d.pts <- d.pts %>% bind_rows(newdata)
}
if (c("xend","yend") %allin% names(data)) {
newdata <- data %>% select(x=x,y=y)
d.pts <- d.pts %>% bind_rows(newdata)
}
if (c("xmin","xmax","ymin","ymax") %allin% names(data)) {
newdata1 <- data %>% select(x=xmin,y=ymin)
newdata2 <- data %>% select(x=xmax,y=ymin)
newdata3 <- data %>% select(x=xmin,y=ymax)
newdata4 <- data %>% select(x=xmax,y=ymax)
d.pts <- d.pts %>% bind_rows(newdata1,newdata2,newdata3,newdata4)
}
return(d.pts)
}) %>% bind_rows() %>%
filter(between(x,xlim[1],xlim[2]),between(y,ylim[1],ylim[2]))
d.box1 <- tibble(x=xs) %>% crossing(y=ylim)
d.box2 <- tibble(y=ys) %>% crossing(x=xlim)
d <- bind_rows(d.box1,d.box2,d.data)
pts <- tibble(xx=xs) %>% crossing(yy=ys) %>%
crossing(d) %>%
mutate(dist=sqrt(abs((xx-x)/xrange)^2+abs((yy-y)/yrange)^2)) %>%
group_by(xx,yy) %>%
summarize(min.dist=min(dist)) %>%
ungroup() %>%
slice(which.max(min.dist))
return(tibble(x=pts$xx,y=pts$yy))
}
if (is.null(logrank.pos)) {
gg <- ggplot() + g
bestpos <- find_best_spot(gg)
logrank.pos <- c(bestpos$x,bestpos$y)
}
g <- list(g,geom_logrank(yvar=yvar,xvar=xvar,data=data,starttime=starttime,pos=logrank.pos,logrank.fontsize=logrank.fontsize))
}
}
return(g)
}
#' Generate label for Log-rank test results in ggplot2
#'
#' Adds the p-value for a log-rank test to a ggplot2 graph.
#' @author Ying Taur
#' @export
geom_logrank <- function(yvar,xvar,data,starttime="tstart",pos,logrank.fontsize=5) {
if (length(pos)!=2) {
stop("YTError: Logrank position should be a vector of size 2: c(x,y)")
}
logrank <- stcox(yvar=yvar,yvar=xvar,data=data,starttime=starttime,logrank=TRUE)
logrank <- paste0("Log-rank\nP = ",formatC(logrank,format="f",digits=3))
annotate("text",x=pos[1],y=pos[2],label=logrank,size=logrank.fontsize)
}
#' Logistic Regression
#'
#' Performs univariate or multivariate logistic regression
#'
#' Logistic regression is for prediction of yes/no outcomes.
#'
#' @return A logistic regression table containing predictors, odds ratios, confidence limits, and p-values.
#' @examples
#' # logistic regression predicting vs with mpg, cyl, and disp:
#' # specify yvar and xvar in model:
#' logistic("vs",c("mpg","cyl","disp"),data=mtcars)
#' # specify model:
#' logistic(vs~mpg+cyl+disp,data=mtcars)
#' @author Ying Taur
#' @export
logistic <- function(x,...) UseMethod("logistic")
#' @rdname logistic
#' @param yvar Y-variable of interest (column name within data). Should be either logical or 0-1.
#' @param xvar X-variable(s) of interest (vector of column names within data). A vector of length=1 will perform a univariate analysis, length>1 will perform a multivariate analysis.
#' @param data data frame containing the data.
#' @param firth Whether to apply Firth's penalized likelihood correction. Default is \code{FALSE}
#' @param formatted logical specifying whether to format the data in a table. Default is \code{TRUE}.
#' @param digits number of significant digits in results. Default is 3.
#' @export
logistic.character <- function(yvar, xvars ,data,firth=FALSE,formatted=TRUE,digits=3) {
# y <- c(...)[1]
# x <- paste(c(...)[-1],collapse="+")
# model <- paste(y,x,sep="~")
model <- paste0(yvar,"~",paste(xvars,collapse="+"))
logistic(as.formula(model),data=data,firth=firth,formatted=formatted,digits=digits)
}
#' @rdname logistic
#' @param formula formula on which to perform logistic regression.
#' @export
logistic.formula <- function(formula, data=sys.parent(), firth=FALSE,formatted=TRUE,digits=3) {
requireNamespace("logistf",quietly=TRUE)
results <- logistf::logistf(formula, data=data, firth=firth)
results.table <- data.frame(
model=gsub("\"| ","",paste(deparse(results$formula),collapse="")),
yvar=as.character(results$formula)[2],
xvar=results$terms,
odds.ratio=exp(results$coefficients),
lower.ci=exp(results$ci.lower),
upper.ci=exp(results$ci.upper),
p.value=results$prob,
row.names=NULL,stringsAsFactors=FALSE)
#get rid of intercept line, keep above vars only
results.table <- subset(results.table,xvar!="(Intercept)",select=c(model,yvar,xvar,odds.ratio,lower.ci,upper.ci,p.value))
if (formatted) {
results.table$signif <- cut(results.table$p.value,breaks=c(-Inf,0.05,0.20,Inf),labels=c("****","*","-"))
numvars <- sapply(results.table,is.numeric)
results.table[,numvars] <- sapply(results.table[,numvars],function(x) formatC(x,format="f",digits=digits))
results.table <- plyr::adply(results.table,1,function(x) {
x$odds.ratio <- paste0(x$odds.ratio," (",x$lower.ci," - ",x$upper.ci,")")
return(x)
})
results.table <- subset(results.table,select=c(model,yvar,xvar,odds.ratio,p.value,signif))
}
return(results.table)
}
#' Univariate Logistic Regression
#'
#' Perform logistic regression analysis on a group of predictors, and optionally perform multivariate analysis on significant univariate predictors.
#'
#' @param yvar ...param1.description...xxx
#' @param xvars ...param2.description...
#' @param data data frame containing the data.
#' @param firth Whether to apply Firth's penalized likelihood correction. Default is \code{FALSE}
#' @param multi whether to contruct a multivariate model using univariate predictors. Default is \code{FALSE}
#' @param multi.cutoff P-value cutoff at which a univariate predictor is included in the multivariate. Default is \code{0.2}.
#' @param digits number of significant digits in results. Default is 3.
#' @return A logistic regression table containing predictors, odds ratios, confidence limits, and p-values.
#' @examples
#' univariate.logistic("vs",c("mpg","cyl","disp","am","gear"),data=mtcars,multi=TRUE)
#' @author Ying Taur
#' @export
univariate.logistic <- function(yvar,xvars,data,firth=FALSE,multi=FALSE,multi.cutoff=0.2,digits=3) {
# yvar="vs";xvars=c("mpg","cyl","disp","hp","drat","wt","qsec","am","gear","carb");data=mtcars;firth=F;multi=T;multi.cutoff=0.2;digits=3
# results.table <- data.frame()
# for (xvar in xvars) {
# print(xvar)
# results.table <- logistic(yvar,xvar,data=data,firth=firth,addto=results.table,digits=digits)
# }
results.table <- lapply(xvars,function(xvar) {
print(xvar)
logistic(yvar,xvar,data=data,firth=firth,digits=digits)
}) %>% bind_rows()
if (multi) {
multivars <- results.table$xvar[results.table$p.value<=multi.cutoff]
multivars <- unique(multivars)
multivars <- sapply(multivars,function(x) {
xvars[sapply(xvars,function(y) {
y==x | grepl(paste0("^",y),x) & sub(y,"",x) %in% as.character(unique(c(data[,y],levels(data[,y]))))
})]
})
print("multivariate model: ")
print(paste0(multivars,collapse=", "))
multi.table <- logistic(yvar,multivars,data=data,firth=firth,digits=digits)
names(multi.table) <- c("model","yvar","xvar","multi.odds.ratio","multi.p.value","multi.signif")
multi.table <- subset(multi.table,select=-model)
results.table <- subset(results.table,select=-model)
combined.table <- merge(results.table,multi.table,all.x=TRUE)
results.table <- combined.table[order(factor(combined.table$xvar,levels=results.table$xvar)),]
results.table <- data.frame(lapply(results.table,function(x) ifelse(is.na(x),"",as.character(x))))
}
return(results.table)
}
#' Determines if tstart-tstop occurs anywhere within interval.
#' @export
occurs.within <- function(tstart,tstop,start.interval,stop.interval) {
message("YTNote: occurs.within() was renamed to overlaps()")
tstop>=start.interval & stop.interval>=tstart
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.