R/misc.R
In hdd: Easy Manipulation of Out of Memory Data Sets

Documented in getHdd_extract.cap setHdd_extract.cap

#----------------------------------------------#
# Author: Laurent Berge
# Date creation: Thu Oct 17 22:33:52 2019
# ~: Misc funs used in HDD
#----------------------------------------------#


####
#### Setters/Getters ####
####


#' Sets/gets the size cap when extracting hdd data
#'
#' Sets/gets the default size cap when extracting HDD variables with \code{\link[hdd]{cash-.hdd}} or when importing full HDD data sets with \code{\link[hdd]{readfst}}.
#'
#' @param sizeMB Size cap in MB. Default is 3000.
#'
#' @details
#' In \code{\link[hdd]{readfst}}, if the expected size of the data set exceeds the cap then, 
#' in interactive mode, a confirmation is asked. When not in interactive mode, no confirmation is asked. 
#' This can also be bypassed by using the argument \code{confirm}.
#'
#' @return
#' The size cap, a numeric scalar.
#'
#' @examples
#'
#' # Toy example with iris data
#' # We first create a hdd dataset with approx. 100KB
#' hdd_path = tempfile() # => folder where the data will be saved
#' write_hdd(iris, hdd_path)
#' for(i in 1:10) write_hdd(iris, hdd_path, add = TRUE)
#'
#' base_hdd = hdd(hdd_path)
#' summary(base_hdd) # => 11 files
#'
#' # we can extract the data from the 11 files with '$':
#' pl = base_hdd$Sepal.Length
#'
#' #
#' # Illustration of the protection mechanism:
#' #
#'
#' # By default when extracting a variable with '$'
#' # and the size exceeds the cap (default is greater than 3GB)
#' # a confirmation is needed.
#' # You can set the cap with setHdd_extract.cap.
#'
#' # Following code asks a confirmation:
#' setHdd_extract.cap(sizeMB = 0.005) # new cap of 5KB
#' try(pl <- base_hdd$Sepal.Length)
#'
#' # To extract the variable without changing the cap:
#' pl = base_hdd[, Sepal.Length] # => no size control is performed
#'
#' # Resetting the default cap
#' setHdd_extract.cap()
#'
#'
setHdd_extract.cap = function(sizeMB = 3000){

	check_arg(sizeMB, "numeric scalar GT{0}")

	options("hdd_extract.cap" = sizeMB)
}

#' @rdname setHdd_extract.cap
"getHdd_extract.cap"

getHdd_extract.cap = function(){

	x = getOption("hdd_extract.cap")
	if(length(x) != 1 || !is.numeric(x) || is.na(x) || x < 0){
		stop("The value of getOption(\"hdd_extract.cap\") is currently not legal. Please use function setHdd_extract.cap to set it to an appropriate value.")
	}

	x
}


####
#### HDD utilities ####
####


object_size = function(x){
	if(inherits(x, "hdd")){
		res = tail(x$.size_cum, 1)
	} else {
		res = utils::object.size(x)
	}
	res
}

numID = function(x){
	if(ncol(x) == 1){
		x = x[[1]]
	} else {
		# we recreate the ids using rowid
		vars = names(x)
		x[, "xxOBSxx" := 1:.N]
		setorderv(x, vars)

		x[, "xxNEWIDxx" := cumsum(sign(1 - c(-100, diff(rowidv(x, vars)))))]
		setorderv(x, "xxOBSxx")
		x = x[["xxNEWIDxx"]]
	}
	return(x)
}


find_split = function(x){
	# This function to find where to cut a file.
	# we do not want the same value to be in two different files

	x = numID(x)

	n = length(x)
	obs_mid = round(n/2)
	v_left = x[obs_mid]
	v_right = x[obs_mid + 1]

	if(v_left != v_right){
		return(obs_mid)
	} else {
		# then we need to split differently
		if(v_left != x[n]){
			obs_mid = obs_mid + which.max(x[(obs_mid+1):n] != v_left) - 1
		} else if(v_left != x[1]){
			obs_mid = which.max(x[1:obs_mid] == v_left) - 1
		} else {
			# stop("No middle value found. The two files could not be split, revise the code to allow merging the files automatically.")
			return(NULL)
		}
		return(obs_mid)
	}
}

find_n_split = function(x, key, nfiles){
	# finds where to cut files.
	# we do not want the same key value to be in two different files
	# nfiles: the new number of files
	# returns a vector of the beginning observations

	nfiles_origin = length(x$.nrow)
	n_all = x$.row_cum[nfiles_origin]

	# preliminary starting point of each file
	start = floor(seq(1, n_all, by = n_all/nfiles))
	start = c(start[1:nfiles], n_all)

	# increment: 5% of file size
	DELTA = max(floor(n_all/nfiles * 0.05), 1)

	# We find the right starting point of each file
	for(i in 2:nfiles){
		start_tmp = start[i]
		id = numID(x[start_tmp + 0:1, j = key, with = FALSE])
		id_left = id[1]

		if(id[1] != id[2]){
			# OK! => nothing to do
			next
		} else {
			# we go to the right
			k = 1
			while(TRUE){
				if(start_tmp + k*DELTA > start[i + 1]){
					stop("The hdd file with key could not be split in ", nfiles, " documents (because of too many identical keys). Reduce the number of documents.")
				}

				id = numID(x[start_tmp + c(0, k*DELTA), key, with = FALSE])
				if(id[1] != id[2]){
					# we find the right point!
					id = numID(x[start_tmp + c(0, ((DELTA*(k-1)+1):(DELTA*k))), key, with = FALSE])
					start_new = start_tmp + DELTA*(k-1) + which.max(id[-1] != id[1])
					start[i] = start_new
					break
				} else {
					k = k + 1
				}
			}
		}
	}

	start_final = start[1:nfiles]
	return(start_final)
}

obs = function(x, file){
	# Finds the observation numbers of a hdd document by file

	if(!inherits(x, "hdd")){
		stop("x must be a hdd file.")
	}

	if(missing(file)){
		stop("file must be provided.")
	}

	n = length(x$.nrow)
	check_arg(file, "integer vector GT{0}")
	if(any(file > n)){
		stop("file cannot exceed ", n, ".")
	}


	row_cum = x$.row_cum
	end = row_cum
	start = (1 + c(0, row_cum))

	# creation of the vector
	res = list()
	index = 0
	for(i in file){
		index = index + 1
		res[[index]] = start[i]:end[i]
	}

	unlist(res)
}


clean_path = function(x){
	# we just want proper /
	x = gsub("\\", "/", x, fixed = TRUE)
	gsub("/+", "/", x)
}

####
#### Other Utilities ####
####


is_numeric_in_char = function(x){
  res = tryCatch(as.numeric(x), warning = function(x) "not numeric")
  !identical(res, "not numeric")
}

format_difftime = function(x){
  # x: number of seconds or difftime or time

  if(is.character(x)){
    if(is_numeric_in_char(x)){
      # x: number of seconds
      x = as.numeric(x)
    } else {
      # When the data is not conform:
      # - should there be an error?
      # - should I return NA?

      return(rep("(difftime: NA)", length(x)))
    }
  }

  res = character(length(x))

  for(i in seq_along(x)){
    xi = x[i]
    
    if(inherits(xi, "POSIXt")){
      xi = Sys.time() - xi
    }
    
    if(inherits(xi, "difftime")){
      xi = as.double(xi, units = "secs")
    }
    
    if(xi > 3600){
			n_hour = xi %/% 3600
			rest_s = floor(xi %% 3600)
			n_min = rest_s %/% 60
			res[i] = paste0(n_hour, " hour", ifelse(n_hour > 1, "s", ""), 
			                " ", sprintf("%02i", n_min), " min")
		} else if(xi > 60){
			n_min = xi %/% 60
			n_sec = floor(xi %% 60)
			res[i] = paste0(n_min, " min ", sprintf("%02i", n_sec), " sec")
		} else if(xi > 0.9){
			res[i] = paste0(fsignif(xi, 2, 1), "s")
		} else if(xi > 1e-3){
			res[i] = paste0(fsignif(xi * 1000, 2, 0), "ms")
		} else {
			res[i] = "<1 ms"
		}
	}
  
  res
}

addCommas = function(x){

	addCommas_single = function(x){
		# Cette fonction ajoute des virgules pour plus de
		# visibilite pour les (tres longues) valeurs de vraisemblance

		# This is an internal function => the main is addCommas

		if(!is.finite(x) || log10(abs(x)) < 0) return(as.character(x))

		s = sign(x)
		x = abs(x)
		decimal = x - floor(x)
		if (decimal > 0){
			dec_string = substr(decimal, 2, 4)
		} else {
			dec_string = ""
		}

		entier = sprintf("%.0f", floor(x))
		quoi = rev(strsplit(entier, "")[[1]])
		n = length(quoi)
		sol = c()
		for (i in 1:n) {
			sol = c(sol, quoi[i])
			if (i%%3 == 0 && i != n) sol = c(sol, ",")
		}
		res = paste0(ifelse(s == -1, "-", ""), paste0(rev(sol), collapse = ""),
					 dec_string)
		res
	}

	sapply(x, addCommas_single)
}



formatTable = function(x, d=2, r=1){
	# This function takes in a data.frame and formats all the columns

	# Checks:
	if(checkVector(x) || is.matrix(x)){
		x_format = as.data.frame(x)
	} else if(is.data.table(x)){
		x_format = copy(x)
	} else if(!is.data.frame(x)){
		stop("Argument 'x' must be a data.frame!")
	} else {
		x_format = x
	}

	isChar = !sapply(x_format, is.numeric)

	for(i in which(isChar)){
		x_format[[i]] = .cleanPCT(as.character(x_format[[i]]))
	}

	# the formatting of numbers
	for(i in which(!isChar)){
		x_format[[i]] = numberFormat(x_format[[i]], d=d, r=r)
	}

	return(x_format)
}


checkVector = function(x){
	# it seems that when you subselect in data.table
	# sometimes it does not yield a vector
	# so i cannot use is.vector to check the consistency

	if(is.vector(x)){
		return(TRUE)
	} else {
		if(any(class(x) %in% c("integer", "numeric", "character", "factor", "Date")) && is.null(dim(x))){
			return(TRUE)
		}
	}
	return(FALSE)
}


mysignif = function(x, d=2, r=1){

	# The core function
	mysignif_single = function(x, d, r){
		if(is.na(x)) return(NA)

		if(abs(x)>=10**(d-1)) return(round(x, r))
		else return(signif(x, d))
	}

	# the return
	sapply(x, mysignif_single, d=d, r=r)
}

numberFormat = function(x, d=2, r=1){
	numb_char = as.character(x)
	quiHigh = (abs(x) >= 1e4 & !is.na(x))
	if(sum(quiHigh) > 0){
		numb_char[quiHigh] = addCommas(mysignif(x[quiHigh], d=d, r=r))
	}

	if(sum(!quiHigh) > 0){
		numb_char[!quiHigh] = as.character(mysignif(x[!quiHigh], d=d, r=r))
	}

	numb_char
}

.cleanPCT = function(x){
	# changes % into \% => to escape that character in Latex
	gsub("%", "\\%", x, fixed = TRUE)
}


osize = function(x){
	size = as.numeric(object_size(x))
	n = log10(size)

	if(n < 3){
		# cat(size, " Octets.\n")
		res = paste0(size, " Bytes.")
	} else if(n < 6){
		# cat(mysignif(size/1000, 20, 2), " Ko.\n")
		res = paste0(mysignif(size/1000, 3, 0), " KB.")
	} else {
		# cat(addCommas(mysignif(size/1000000, 20, 2)), " Mo.\n")
		res = paste0(addCommas(mysignif(size/1000000, 3, 0)), " MB.")
	}

	class(res) = "osize"

	res
}

ggrepl = function(pattern, x){
	x[grepl(pattern, x, perl = TRUE)]
}


deparse_long = function (x){
	dep_x = deparse(x)
	if (length(dep_x) == 1) {
		return(dep_x)
	} else {
		return(paste(gsub("^ +", "", dep_x), collapse = ""))
	}
}


is_r_check = function(){
	any(grepl("_R_CHECK", names(Sys.getenv()), fixed = TRUE))
}