R/Interpret.r

Defines functions lmean lsd interpf input.interp f.iter interpret is.na.ext fill.na.merge

Documented in fill.na.merge interpret

# INTERPRET ################### interpret takes a vector and makes a data.frame out of it (to be used in e.g. make.ovariable).
### It also changes abbreviations into probability samples.
# Lognormal distribution parametrization functions
lmean <- function(parmean, parsd) {return(log(parmean)-log(1+(parsd^2)/(parmean^2))/2)}
lsd <- function(parmean, parsd) {return(log(1+(parsd^2)/(parmean^2)))}

# Actual interpretation function. Takes already pre-processed information and returns a distribution. This is a separate function because
# it isn't vectorizable (easily anyway).
interpf <- function(
	n, 
	res.char, 
	brackets.pos, 
	brackets.length, 
	minus.length, 
	minus.exists, 
	plusminus, 
	plusminus.length, 
	plusminus.pos,
	doublePoint, 
	minus.relevant, 
	fromzero, 
	dbug = FALSE
	) {

	if(doublePoint[1] > 0) {
		tempArgs <- sort(as.numeric(unlist(strsplit(res.char, "\\:"))))
		if(dbug) cat("Triangular distribution. \n")
		if (n == 0) {
			if (tempArgs[1] + tempArgs[3] == 2 * tempArgs[2]) {
				return(tempArgs[2])
			} else if (tempArgs[1] + tempArgs[3] > 2 * tempArgs[2]) {
				return(tempArgs[1] + ((tempArgs[3]-tempArgs[1])(tempArgs[2]-tempArgs[1])/2)^(1/2))
			} else {
				return(tempArgs[3] - ((tempArgs[3]-tempArgs[1])(tempArgs[3]-tempArgs[2])/2)^(1/2))
			}
		} else {
			return(rtriangle(n,tempArgs[1],tempArgs[3],tempArgs[2]))
		}
	}
	if(brackets.pos > 0) {
		n.minus.inside.brackets <- sum(minus.relevant > brackets.pos & minus.relevant < brackets.pos + brackets.length)
		imean <- as.numeric(substr(res.char, 1, brackets.pos - 1))
		if (n == 0) {
			return(imean)
		}
		if(n.minus.inside.brackets == 1) {
			ici <- c(as.numeric(substr(res.char, brackets.pos + 1, minus.relevant[minus.relevant > brackets.pos] - 1)), as.numeric(substr(res.char, 
				minus.relevant[minus.relevant > brackets.pos] + 1, brackets.pos + brackets.length - 2)))
			if((ici[2] - imean) / (imean - ici[1]) < 1.5) {
				if(dbug) cat("Normal distribution. \n")
				isd <- sum(abs(ici - imean) / 2) / qnorm(0.975)
				return(rnorm(n, imean, isd))
			} else {
				if(dbug) cat("Lognormal distribution. \n")
				isd <- sum(abs(log(ici) - log(imean)) / 2) / qnorm(0.975)
				return(exp(rnorm(n, log(imean), isd)))
			}
		}
		if(n.minus.inside.brackets %in% c(2,3)) {
			ici <- c(as.numeric(substr(res.char, brackets.pos + 1, minus.relevant[minus.relevant > brackets.pos][2] - 1)), as.numeric(substr(res.char, 
				minus.relevant[minus.relevant > brackets.pos][2] + 1, brackets.pos + brackets.length - 2)))
			isd <- sum(abs(ici - imean) / 2) / qnorm(0.975)
			if(dbug) cat("Normal distribution. \n")
			return(rnorm(n, imean, isd))
		}
		warning(paste("Unable to interpret \"", res.char, "\"", sep = ""))
		return(rep(NA, n))
	}
	if(plusminus.pos > 0) {
		imean <- as.numeric(substr(res.char, 1, plusminus.pos - 1))
		if (n == 0) {
			return(imean)
		}
		if(dbug) cat("Normal distribution. \n")
		return(rnorm(n, imean, as.numeric(substr(res.char, plusminus.pos + plusminus.length, nchar(res.char)))))
	}
	if(minus.exists) {
		if(length(minus.relevant) == 1) {
			a <- as.numeric(substr(res.char, 1, minus.relevant - 1))
			b <- as.numeric(substr(res.char, minus.relevant + 1, nchar(res.char)))
			if(a / b >= 1/100 | a == 0) {
				if(dbug) cat("Uniform distribution. \n")
				if (n == 0) {
					return((a+b)/2)
				}
				return(runif(n, a, b))
			} else {
				if(dbug) cat("Loguniform distribution. \n")
				if (n == 0) {
					return(exp((log(a)+log(b))/2))
				}
				return(exp(runif(n, log(a), log(b))))
			}
		}
		if(length(minus.relevant) %in% c(2,3)) { # If there is more than one '-' we're porbably dealing with negative boundaries. (More than 3 will produce NAs.)
			if(dbug) cat("Uniform distribution. \n")
			# Assume that negative number is always first.
			a <- as.numeric(substr(res.char, 1, minus.relevant[2] - 1))
			b <- as.numeric(substr(res.char, minus.relevant[2] + 1, nchar(res.char)))
			if (n == 0) {
				return((a+b)/2)
			}
			return(runif(n, a, b))
		}
	}
	if(sum(unlist(strsplit(res.char, ""))==";") > 0) {
		if(dbug) cat("Discrete random samples. \n")
		a <- as.numeric(unlist(strsplit(res.char, ";")))
		if (n == 0) {
			return(mean(a))
		}
		return(sample(a, n, replace = TRUE))
	}
	if(fromzero[[1]][1] == 1) {
		temp <- interpret(
			paste("0-", substr(res.char, 2, nchar(res.char)), sep = ""), n, dbug = dbug
		)
		return(
			temp$Result
		)
	}
	warning(paste("Unable to interpret \"", res.char, "\"", sep = ""))
	return(rep(NA, n))
}

# The following function processes character strings and loops the interpretation function.
input.interp <- function(res.char, n = 1000, dbug = FALSE) {
	res.char <- gsub(" ", "", res.char)
	res.char <- gsub(",", ".", res.char)
	plusminus <- gregexpr(paste("\\+-|", rawToChar(as.raw(177)), sep = ""), res.char)
	plusminus.length <- as.numeric(unlist(sapply(plusminus, attributes)))
	plusminus.pos <- unlist(plusminus)
	minus <- gregexpr("-", res.char)
	e <- gregexpr("e-|E-", res.char) # ignore negative signs in exponents when data is given in form 1e-27
	for (i in 1:length(minus)){
		minus[[i]] <- minus[[i]][!(minus[[i]] %in% (e[[i]] + 1))]
	}
	minus.length <- sapply(minus, length)
	minus.exists <- unlist(minus)[cumsum(c(0, minus.length[-length(minus.length)])) + 1] > 0
	brackets <- gregexpr("\\(.*\\)", res.char) # matches for brackets "(...)"
	brackets.length <- as.numeric(unlist(sapply(brackets, attributes)[1,]))
	brackets.pos <- unlist(brackets)
	doublePoint <- gregexpr(":", res.char)
	fromzero <- gregexpr("<", res.char)
	out <- list()
	for(i in 1:length(res.char)) {
		if(res.char[i] %in% c("NA") | nchar(gsub(" ", "", res.char[i])) == 0) {
			out[[i]] <- NA
		} else {
			val <- suppressWarnings(as.numeric(res.char[i]))
			if(is.na(val)) {
				minus.relevant <- unlist(minus)[(cumsum(c(0, minus.length)) + 1)[i]:cumsum(minus.length)[i]]
				out[[i]] <- interpf(n, res.char[i], brackets.pos[i], brackets.length[i], minus.length[i], minus.exists[i], plusminus[[i]], 
					plusminus.length[i], plusminus.pos[i], doublePoint[[i]], minus.relevant, fromzero[i], dbug
				)
			} else out[[i]] <- val
		}
	}
	if (any(sapply(out, length) > 1)) {
		for(i in 1:length(res.char)) {
			if (length(out[[i]]) == 1) {
				out[[i]] <- rep(out[[i]], n)
			}
		}
	}
	out
}

# Assisting function for data.frame wrapper.
f.iter <- function(x) {
	1:x
}

# Data.frame wrapper for the functions.
interpret <- function(idata, N = NULL, rescol = "Result", dbug = FALSE, ...) {
	if (length(N) == 0) N <- get("N", envir = openv) # use custom environment variable N if not given
	if (!is.data.frame(idata)) idata <- as.data.frame(idata)
	if (ncol(idata) == 0) stop("Empty data.frame!")
	if (!rescol %in% colnames(idata)) stop(paste("No \"", rescol, "\" column found", sep = ""))
	temp <- input.interp(idata[[rescol]], N, dbug)
	temp.lengths <- sapply(temp, length)
	if (ncol(idata) == 1) {
		out <- list()
		out[[rescol]] <- unlist(temp)
		out <- as.data.frame(out)
	} else {
		out <- data.frame(idata[rep(1:nrow(idata), times = temp.lengths),])
		out[[rescol]] <- unlist(temp)
	}
	if (prod(temp.lengths) > 1) {
		dim(temp.lengths) <- length(temp.lengths)
		out$Iter<- c(apply(temp.lengths, 1, f.iter))
	}
	return(out)
}

setGeneric("interpret")

setMethod(
	f = "interpret",
	signature = signature(idata = "character"),
	definition = function(idata, N = NULL, dbug = FALSE) {
		callGeneric(data.frame(Result = idata), N = N, dbug = dbug)
	}
)

setMethod(
	f = "interpret",
	signature = signature(idata = "numeric"),
	definition = function(idata, N = NULL, dbug = FALSE) {
		return(data.frame(Iter = 1:length(idata), Result = idata))
	}
)

#setMethod(
#		f = "interpret",
#		signature = signature(idata = "interpret"),
#		definition = function(idata, N = 1000, dbug = FALSE) {
#			callGeneric(data.frame(Result = idata), N = N, dbug = dbug)
#		}
#)

# Interpreting empty locations in indices
# fillna takes a data.frame and fills the cells with NA with each level in that column.
# object is the data.frame, marginals is a vector of columns (either column names or positions) that are to be filled.
# This version of fillna accepts column positions (as the previous version) and also column names in marginals.

fillna <- function (object, marginals) {
	a <- dropall(object)
	if(!is.numeric(marginals)) marginals <- match(marginals, colnames(object))
	for (i in marginals) {
		a[[i]] <- as.factor(a[[i]])
		a1 <- a[!is.na(a[[i]]), ]
		a2 <- a[is.na(a[[i]]), ][-i]
		addition <- data.frame(A = levels(a[[i]]))
		colnames(addition) <- colnames(a)[i]
		a2 <- merge(addition, a2)
		a <- rbind(a1, a2)
	}
	return(a)
}

is.na.ext <- function(x){
	a <- is.na(x) | x == "NA" | nchar(gsub(" ", "", x)) == 0 | x == "*"
	return(a)
}

# Fill NAs in matching columns in x with union of locations in x and y
# Compare to fill.na which replaces NA with own 

fill.na.merge <- function(x, y) {
	common <- intersect(colnames(x@output), colnames(y@output))
	locs <- list()
	# Loop through common columns
	for (i in common) {
		testx <- is.na.ext(x@output[i])
		testy <- is.na.ext(y@output[i])
		# For x
		if (any(testx)) {
			locs[[i]] <- union(levels(as.factor(x@output[[i]])), levels(as.factor(y@output[[i]])))
			
			if (length(locs[[i]]) > 1) {
				# Duplicate rows with wildcards
				temp <- ifelse(testx, length(locs[[i]]), 1)
				ind <- rep(1:length(temp), temp)
				x@output <- x@output[ind,]
				# Insert locations to duplicated rows
				duplicates <- rep(testx, temp)
				temp <- as.character(x@output[[i]])
				# Since number of duplicates is fixed to length of locs this essentially repeats locs 
				# as many times as there are wildcards in this particular index at the moment. 
				temp[duplicates] <- locs[[i]]
				x@output[[i]] <- factor(temp)
			}
		}
		# For y
		if (any(testy)) {
			if (length(locs[[i]]) == 0) {
				locs[[i]] <- union(levels(as.factor(x@output[[i]])), levels(as.factor(y@output[[i]])))
			}
			
			if (length(locs[[i]]) > 1) {
				# Duplicate rows with wildcards
				temp <- ifelse(testy, length(locs[[i]]), 1)
				ind <- rep(1:length(temp), temp)
				y@output <- y@output[ind,]
				# Insert locations to duplicated rows
				duplicates <- rep(testy, temp)
				temp <- as.character(y@output[[i]])
				temp[duplicates] <- locs[[i]]
				y@output[[i]] <- factor(temp)
			}
		}
	}
	return(list(x, y))
}

#interpret("500(490-5000)", N = 2, dbug= TRUE)

#interpret("1;2;3;4", N = 20, dbug = TRUE)

#interpret("<9", N = 4, dbug = TRUE)

Try the OpasnetUtils package in your browser

Any scripts or data that you put into this service are public.

OpasnetUtils documentation built on May 2, 2019, 12:39 p.m.