R/statsort.r

statsort <- function (sort, ref, method = "Quartiles", measurements = NULL, cutoff = 1.5, sessiontempdir = NULL, output_options = c(TRUE,TRUE)) {
	print("Outlier analysis started")
	upperfile = "upper.csv"
	lowerfile = "lower.csv"
	nonoutliersfile = "non-outliers.csv"
	cutoffmax <- cutoff[2]
	cutoff <- cutoff[1]
	nocut <- FALSE
	if(cutoffmax == cutoff) {nocut <- TRUE}

	direc <- OsteoSort:::analytical_temp_space(output_options, sessiontempdir) #creates temporary space 
	sd <- paste(sessiontempdir, direc, sep="/")

	sortdata <- array(NA,c(length(sort[,1]),4)) 
	sortdata[,1] <- as.matrix(sort[["id"]]) 
	sortdata[,2] <- tolower(sort[["Side"]]) 
	sortdata[,3] <- tolower(sort[["Element"]])
	sortdata[,4] <- sort[[paste(measurements)]]
	sort <- sortdata
	sort <- na.omit(sort)

	if(ref[1] == "Custom") {
		slope <- as.numeric(ref[2])
		intercept <- as.numeric(ref[3])
		pointestimate <- array(NA,c(length(sort[,1]),4))
		for(i in 1:length(sort[,1])) {
			pointestimate[i,1] <- sort[i,1] #id name
			pointestimate[i,2] <- sort[i,2] #Side name
			pointestimate[i,3] <- sort[i,3] #Bone name
			pointestimate[i,4] <- round( as.numeric(sort[i,4]) * slope + intercept, digits=2)
		}
	} else {
		refdata <- array(NA,c(length(ref[,1]),5)) 
		refdata[,1] <- as.matrix(ref[["id"]]) 
		refdata[,2] <- tolower(ref[["Side"]]) 
		refdata[,3] <- tolower(ref[["Stature"]])
		refdata[,4] <- tolower(ref[["Element"]])
		refdata[,5] <- ref[[paste(measurements)]]
		ref <- na.omit(refdata)
		sort_left <- sort[sort[,2] == "left",]
		sort_right <- sort[sort[,2] == "right",]
		ref_left <- ref[ref[,2] == "left",]
		ref_right <- ref[ref[,2] == "right",]
		if(nrow(sort_left) >= 1) {
			OLSL <- lm(as.numeric(X3) ~ as.numeric(X5), data = data.frame(ref_left))
		}
		if(nrow(sort_right) >= 1) {
			OLSR <- lm(as.numeric(X3) ~ as.numeric(X5), data = data.frame(ref_right))
		}
		pointestimate <- array(NA,c(length(sort[,1]),4))
		for(i in 1:nrow(sort)) {
			if(sort[i,2] == "left") {
				model <- OLSL
			}
			if(sort[i,2] == "right") {
				model <- OLSR
			}
			pointestimate[i,1] <- sort[i,1] #id name
			pointestimate[i,2] <- sort[i,2] #Side name
			pointestimate[i,3] <- sort[i,3] #Bone name
			pointestimate[i,4] <- round(predict(model, data.frame(X5 = sort[i,4])), digits=2)
		}
	}

	s <- sd(as.numeric(pointestimate[,4]))
	m <- mean(as.numeric(pointestimate[,4]))
	me <- median(as.numeric(pointestimate[,4]))
	IQQ <- quantile(as.numeric(pointestimate[,4]))[4] -  quantile(as.numeric(pointestimate[,4]))[2]
	
	if(method == "Standard_deviation") {
		standarddeviation <- sd(as.numeric(pointestimate[,4])) 
		meann <- mean(as.numeric(pointestimate[,4]))
		upper <- meann + standarddeviation * cutoff
		lower <- meann - standarddeviation * cutoff
		uppermax <- meann + standarddeviation * cutoffmax
		lowermax <- meann - standarddeviation * cutoffmax
		plotme <- mean(as.numeric(pointestimate[,4]))
	}
	if(method == "Quartiles") {
		Q1 <- quantile(as.numeric(pointestimate[,4]))[2]
		Q3 <- quantile(as.numeric(pointestimate[,4]))[4]
		IQ <- Q3 - Q1
		upper <- Q3 + IQ * cutoff
		lower <- Q1 - IQ * cutoff
		uppermax <- Q3 + IQ * cutoffmax
		lowermax <- Q1 - IQ * cutoffmax
		plotme <- median(as.numeric(pointestimate[,4]))
	}
	
	outlierdfupper <- array(NA,c(length(sort[,1]),4))
	outlierdflower <- array(NA,c(length(sort[,1]),4))
	nonoutliersdf <- array(NA,c(length(sort[,1]),4))

	for(i in 1:length(sort[,1])) {
		if(nocut) {
			if(as.numeric(pointestimate[i,4]) > upper) {
				outlierdfupper[i,1:4] <- as.matrix(pointestimate[i,1:4])
			}
			if(as.numeric(pointestimate[i,4]) < lower) {
				outlierdflower[i,1:4] <- as.matrix(pointestimate[i,1:4])
			}
			if(as.numeric(pointestimate[i,4]) >= lower && as.numeric(pointestimate[i,4]) <= upper) {
				nonoutliersdf[i,1:4] <- as.matrix(pointestimate[i,1:4])
			}
		}
		if(!nocut) {
			if(as.numeric(pointestimate[i,4]) > upper && as.numeric(pointestimate[i,4]) < uppermax) {
				outlierdfupper[i,1:4] <- pointestimate[i,1:4]
			}
			if(as.numeric(pointestimate[i,4]) < lower && as.numeric(pointestimate[i,4]) > lowermax) {
				outlierdflower[i,1:4] <- pointestimate[i,1:4]
			}
			if(as.numeric(pointestimate[i,4]) >= lower && as.numeric(pointestimate[i,4]) <= upper || as.numeric(pointestimate[i,4]) <= lowermax && as.numeric(pointestimate[i,4]) >= uppermax) {
				nonoutliersdf[i,1:4] <- pointestimate[i,1:4]
			}
		}
	}

	colnames(nonoutliersdf) <- c("id", "Side", "Element", "Point Estimate")
	colnames(outlierdflower) <- c("id", "Side", "Element", "Point Estimate")
	colnames(outlierdfupper) <- c("id", "Side", "Element", "Point Estimate")
	
	if(all(is.na(nonoutliersdf))) {nonoutliersdf <- NULL}
	if(all(is.na(outlierdflower))) {outlierdflower <- NULL}
	if(all(is.na(outlierdfupper))) {outlierdfupper <- NULL}

	if(!is.null(upperfile)) {
		if(!all(is.null(outlierdfupper))) { #skips if all NA (no outliers)
			outlierdfupper <- na.omit(outlierdfupper)
			if(output_options[1] && nrow(outlierdfupper) > 0) {
				no <- OsteoSort:::output_function(hera1 = list(outlierdfupper, upperfile), method = "OS", type = "csv", fpath=sd)
			}
		}
	}
	if(!is.null(lowerfile)) {
		if(!all(is.null(outlierdflower))) {
			outlierdflower <- na.omit(outlierdflower)
			if(output_options[1] && nrow(outlierdflower) > 0) {
				no <- OsteoSort:::output_function(hera1 = list(outlierdflower, lowerfile), method = "OS", type = "csv", fpath=sd)
			}
		}
	}
	if(!is.null(nonoutliersfile)) {
		if(!all(is.null(nonoutliersdf))) {
			nonoutliersdf <- na.omit(nonoutliersdf)
			if(output_options[1] && nrow(nonoutliersdf) > 0) {
				no <- OsteoSort:::output_function(hera1 = list(nonoutliersdf, nonoutliersfile), method = "OS", type = "csv", fpath=sd)
			}
		}
	}
	if(output_options[2]) {
		no <- OsteoSort:::output_function(hera1 = list(as.numeric(pointestimate[,4]), measurements, plotme, upper, lower, lowermax, uppermax, nocut), method = "OS", type = "plot", fpath=sd)
	}

	print("Outlier analysis completed")
	return(list(direc,outlierdflower,outlierdfupper,nonoutliersdf,round(m, digits = 2),round(s, digits=2),round(me, digits=2),round(IQQ, digits=2)))
}
jjlynch2/OsteoSort documentation built on March 9, 2024, 1:48 a.m.