R/summaryfactorlist4.R

summary.factorlist4 <- function(df, dependent, explanatory,  cont="mean", p=FALSE, na.include=FALSE,
																column=FALSE, total_col=FALSE, orderbytotal=FALSE, glm.id=FALSE,
																na.to.missing = TRUE){
	s <- Hmisc:::summary.formula(as.formula(paste(dependent, "~", paste(explanatory, collapse="+"))), data = df,
											 method="reverse", overall=FALSE,
											 test=TRUE,na.include=na.include)
	# Column vs row proportions for factor variables
	df.out = plyr::ldply(s$stats, function(x){
		if(dim(x)[2] == 13){ #hack to get continuous vs categorical. Wouldn't work for factor with 13 levels
			# Continuous variables
			# Mean (SD)
			if (cont == "mean"){
				a = paste0(round(x[1,12], 1), " (", round(x[1,13], 1), ")")
				b = paste0(round(x[2,12], 1), " (", round(x[2,13], 1), ")")
				c = paste0(round(x[3,12], 1), " (", round(x[3,13], 1), ")")
				d = paste0(round(x[4,12], 1), " (", round(x[4,13], 1), ")")
				row1_name = dimnames(x)[[2]][12]
				row2_name = dimnames(x)[[2]][13]
			}

			# Median (IQR)
			if (cont == "median"){
				a = paste0(x[1,6], " (", x[1,8]-x[1,4], ")")
				b = paste0(x[2,6], " (", x[2,8]-x[2,4], ")")
				c = paste0(x[3,6], " (", x[3,8]-x[3,4], ")")
				d = paste0(x[4,6], " (", x[4,8]-x[4,4], ")")
				row1_name = "Median"
				row2_name = "IQR"}

			col1_name = dimnames(x)[[1]][1]
			col2_name = dimnames(x)[[1]][2]
			col3_name = dimnames(x)[[1]][3]
			col4_name = dimnames(x)[[1]][4]
			df.out = data.frame(paste0(row1_name, " (", row2_name, ")"), a, b, c, d)
			names(df.out) = c("levels", col1_name, col2_name, col3_name,col4_name)
		} else {
			# Factor variables
			row_name = dimnames(x)$w
			col1_name = dimnames(x)$g[1]
			col2_name = dimnames(x)$g[2]
			col3_name = dimnames(x)$g[3]
			col4_name = dimnames(x)$g[4]
			col1 = x[,1]
			col2 = x[,2]
			col3 = x[,3]
			col4 = x[,4]
			total = col1 + col2 + col3 + col4
			if (column == FALSE) {
				col1_prop = (col1/apply(x, 1, sum))*100 # row margin
				col2_prop = (col2/apply(x, 1, sum))*100
				col3_prop = (col3/apply(x, 1, sum))*100
				col4_prop = (col4/apply(x, 1, sum))*100
			} else {
				col1_prop = (col1/sum(col1))*100 # column margin
				col2_prop = (col2/sum(col2))*100
				col3_prop = (col3/sum(col3))*100
				col4_prop = (col4/sum(col4))*100
				total_prop = (total/sum(total))*100
			}
			a = paste0(col1, " (", sprintf("%.1f", round(col1_prop, 1)), ")") #sprintf to keep trailing zeros
			b = paste0(col2, " (", sprintf("%.1f", round(col2_prop, 1)), ")")
			c = paste0(col3, " (", sprintf("%.1f", round(col3_prop, 1)), ")")
			d = paste0(col4, " (", sprintf("%.1f", round(col4_prop, 1)), ")")
			if (total_col == FALSE){
				df.out = data.frame(row_name, a, b, c, d)
				names(df.out) = c("levels", col1_name, col2_name, col3_name, col4_name)
			} else if (total_col == TRUE & column == FALSE) {
				df.out = data.frame(row_name, a, b, c, d, total, total)
				names(df.out) = c("levels", col1_name, col2_name, col3_name, col4_name, "Total", "index_total")
			} else if (total_col == TRUE & column == TRUE) {
				df.out = data.frame(row_name, a, b, c, d, paste0(total, " (", sprintf("%.1f", round(total_prop, 1)), ")"), total)
				names(df.out) = c("levels", col1_name, col2_name, col3_name, col4_name, "Total", "index_total")
			}
		}
		return(df.out)
	})
	# Keep original order
	df.out$index = 1:dim(df.out)[1]

	if (p == TRUE){
		a = plyr::ldply(s$testresults, function(x) sprintf("%.3f",round(x[[1]], 3)))
		names(a) = c(".id", "pvalue")
		df.out = merge(df.out, a, by=".id")
	}

	# Add back in actual labels
	df.labels = data.frame(".id" = names(s$stats), "label" = s$labels)
	df.out.labels = merge(df.out, df.labels, by = ".id")
	if (orderbytotal==FALSE){
		df.out.labels = df.out.labels[order(df.out.labels$index),] # reorder columns
	} else {
		df.out.labels = df.out.labels[order(-df.out.labels$index_total),] # reorder columns
	}

	# Reorder columns and remove unnecessary columns
	# Reorder
	label_index = which(names(df.out.labels) == "label")
	not_label_index = which(names(df.out.labels) != "label")
	df.out.labels = df.out.labels[,c(label_index,not_label_index)]

	# Remove
	id_index = which(names(df.out.labels) == ".id")
	index_index = which(names(df.out.labels) == "index")
	index_total_index = which(names(df.out.labels) == "index_total")
	df.out.labels = df.out.labels[,-c(id_index, index_index, index_total_index)]

	# Remove duplicate labels
	df.out.labels = rm_duplicate_labels(df.out.labels, na.to.missing = na.to.missing)
	return(df.out.labels)
}
ewenharrison/summarizer documentation built on May 16, 2019, 9:41 a.m.