R/descriptive.R
In clickR: Semi-Automatic Preprocessing of Messy Data with Change Tracking for Dataset Cleaning

Documented in antimoda cluster_var descriptive forge GK_assoc good2go ipboxplot kill.factors kurtosis mine.plot moda moda_cont mtapply nearest outliers peek prop_may prop_min scale_01 search_scripts skewness unforge

#' Computes kurtosis
#'
#' @description Calculates kurtosis of a numeric variable
#' @param x A numeric variable
#' @return kurtosis value
#' @importFrom stats sd
kurtosis <- function(x) {
  m4 <- mean((x-mean(x, na.rm=T))^4, na.rm=T)
  kurt <- m4/(sd(x, na.rm=T)^4)-3
  kurt
}

#' Computes skewness
#'
#' @description Calculates skewness of a numeric variable
#' @param x A numeric variable
#' @return skewness value
#' @importFrom stats sd
skewness <-  function(x) {
  m3 <- mean((x-mean(x, na.rm=T))^3, na.rm=T)
  skew <- m3/(sd(x, na.rm=T)^3)
  skew
}

#' Estimates number of modes
#'
#' @description Estimates the number of modes
#' @param x A numeric variable
#' @return Estimated number of modes.
#' @importFrom stats density
moda_cont <- function(x){
  if(length(na.omit(x))>1){
    maxima <- density(x, na.rm=TRUE)$y[diff(diff(density(x, adjust=1, na.rm=T)$y)>=0)<0]
    modas <- length(maxima[maxima >= max(maxima)*1/3])
    return(paste(modas, " ", sep=""))
  }
  else return(NA)
}

#' Scales data between 0 and 1
#'
#' @description Escale data to 0-1
#' @param x A numeric variable
#' @return Scaled data
scale_01 <- function(x) (x-min(x))/(max(x)-min(x))

#' Internal function for descriptive()
#'
#' @description Finds positions for substitution of characters in Distribution column
#' @param x A numeric value between 0-1
#' @param to Range of reference values
#' @return The nearest position to the input value
nearest <- function(x, to=seq(0, 1, length.out = 30)) {
  which.min(abs(to - x))
}

#' Get mode
#'
#' @description Returns the most repeated value
#' @param x A categorical variable
#' @return The mode
moda <- function(x){names(sort(-table(x)))[1]}

#' Get anti-mode
#'
#' @description Returns the least repeated value
#' @param x A categorical variable
#' @return The anti-mode (least repeated value)
antimoda <- function(x){names(sort(table(x)))[1]}

#' Gets proportion of most repeated value
#'
#' @description Returns the proportion for the most repeated value
#' @param x A categorical variable
#' @param ignore.na Should NA values be ignored for computing proportions?
#' @return A proportion
prop_may <- function(x, ignore.na=TRUE) {sort(-table(x))[1]/-(length(x)-ignore.na*sum(is.na(x)))}

#' Gets proportion of least repeated value
#'
#' @description Returns the proportion for the least repeated value
#' @param x A categorical variable
#' @param ignore.na Should NA values be ignored for computing proportions?
#' @return A proportion
prop_min <- function(x, ignore.na=TRUE){sort(table(x))[1]/(length(x)-ignore.na*sum(is.na(x)))}


#' Computes Goodman and Kruskal's tau
#'
#' @description Returns Goodman and Kruskal's tau measure of association between two categorical variables
#' @param x A categorical variable
#' @param y A categorical variable
#' @return Goodman and Kruskal's tau
#' @export
#' @examples
#' data(infert)
#' GK_assoc(infert$education, infert$case)
#' GK_assoc(infert$case, infert$education) #Not the same
GK_assoc <- function(x, y){
  Nij <- table(x, y)
  vx <- 1 - sum(rowSums(Nij/sum(Nij))^2)
  vy <- 1 - sum(colSums(Nij/sum(Nij))^2)
  d <- 1 - sum(rowSums((Nij/sum(Nij))^2)/rowSums(Nij/sum(Nij)))
  tau <- (vy - d)/vy
  return(tau)
}

#' Detailed summary of the data
#'
#' @description Creates a detailed summary of the data
#' @param x A data.frame
#' @param z Number of decimal places
#' @param ignore.na If TRUE NA values will not count for relative frequencies calculations
#' @param by Factor variable definining groups for the summary
#' @param print Should results be printed?
#' @return Summary of the data
#' @importFrom stats density dist family median na.omit quantile sd var
#' @export
#' @examples
#' descriptive(iris)
#' descriptive(iris, by="Species")
descriptive <- function(x, z=3, ignore.na=TRUE, by=NULL, print=TRUE){
  #Data.frame
  if(!is.data.frame(x)){
    x<-data.frame(x)
  }
  x<-x[, !sapply(x, function(x) all(is.na(x))), drop=FALSE]

  if(!is.null(by) && by %in% names(x)){
    by_v <- x[, by]
    x_sin <- x[, !colnames(x) == by, drop=FALSE]
    if (length(x_sin) == 0){
      descriptive(x, z, ignore.na, by=NULL)
      stop("Only one variable in the data. Can't be used as grouping variable")
    }
    x_sin <- x_sin[!is.na(by_v), ]
    by_v <- by_v[!is.na(by_v)]
    by_v <- factor(by_v)
    niveles <- levels(by_v)
    cat("Summary by ", by, ":", sep="")
    cat("\n")
    cat("-------------------------------")
    cat("\n")
    for (i in 1:length(niveles)){
      x_g <- x_sin[by_v == niveles[i], ]
      cat("Level ", by, ": ", niveles[i], sep="")
      cat("\n")
      descriptive(x=x_g, z=z, ignore.na=ignore.na, by=NULL)
      cat("\n")
      cat("-------------------------------")
      cat("\n")
    }
  } else{
    #Splitter (Splits data.frame: Numeric and categorical part)
    nums <- sapply(x, class)

    #Numeric summary
    resumen <- function(y){
      resumen1 <- round(c(min(y, na.rm=TRUE),
                          quantile(y, probs=0.25, na.rm=TRUE),
                          median(y, na.rm=TRUE),
                          quantile(y, probs=0.75, na.rm=T),
                          max(y, na.rm=TRUE),
                          mean(y, na.rm=TRUE),
                          sd(y, na.rm=TRUE),
                          kurtosis(y),
                          skewness(y)),z)
      names(resumen1) <- c("Min", "1st Q.", "Median", "3rd Q.", "Max", "Mean", "SD", "Kurtosis", "Skewness")
      distribution <- c("|", rep("-", 28), "|")
      scaled_Y <- scale_01(y)
      tryCatch(distribution[(nearest((resumen1["1st Q."] - resumen1["Min"])/
                                       (resumen1["Max"] - resumen1["Min"]))+1):
                              (nearest((resumen1["3rd Q."] - resumen1["Min"])/
                                         (resumen1["Max"] - resumen1["Min"]))-1)] <- "#", error = function(e) NA)
      tryCatch(distribution[nearest((resumen1["1st Q."] - resumen1["Min"])/
                                      (resumen1["Max"] - resumen1["Min"]))] <- "[", error = function(e) NA)
      tryCatch(distribution[nearest((resumen1["3rd Q."] - resumen1["Min"])/
                                      (resumen1["Max"] - resumen1["Min"]))] <- "]", error = function(e) NA)
      tryCatch(distribution[nearest((resumen1["Median"] - resumen1["Min"])/
                                      (resumen1["Max"] - resumen1["Min"]))] <- ":", error = function(e) NA)
      return(data.frame(t(resumen1), Modes=moda_cont(y), NAs=sum(is.na(y)),
                        Distribution = paste(distribution, collapse=""), check.names = FALSE, stringsAsFactors = FALSE))
    }
    #Character and factor summary
    resumen2 <- function(w){
      resumen2 <- c(length(table(w)),
                    abbreviate(paste(na.omit(names(sort(-table(w)))[1:5]), collapse="/"),
                               minlength = min(20, nchar(paste(na.omit(names(sort(-table(w)))[1:5]), collapse="/"))),
                               named=FALSE),
                    moda(w), round(prop_may(w, ignore.na = ignore.na),z), antimoda(w),
                    round(prop_min(w, ignore.na = ignore.na),z), sum(is.na(w)))
      names(resumen2)<- c("N. Classes", "Classes", "Mode", "Prop. mode", "Anti-mode", "Prop. Anti-mode", "NAs")
      data.frame(t(resumen2), check.names = FALSE, stringsAsFactors = FALSE)
    }
    #Date summary
    resumen3<-function(y){
      resumen3 <- c(as.character(min(y, na.rm=TRUE)),
                    as.character(as.Date(quantile(as.numeric(y), probs=0.25, na.rm=TRUE),
                                         origin=as.Date("1970-01-01"))),
                    as.character(median(y, na.rm=TRUE)),
                    as.character(as.Date(quantile(as.numeric(y), probs=0.75, na.rm=TRUE),
                                         origin=as.Date("1970-01-01"))),
                    as.character(max(y, na.rm=TRUE)),
                    as.character(mean(y, na.rm=TRUE)),
                    round(sd(y, na.rm=TRUE), z),
                    round(kurtosis(as.numeric(y)), z),
                    round(skewness(as.numeric(y)), z))
      names(resumen3) <- c("Min", "1st Q.", "Median", "3rd Q.", "Max", "Mean", "SD", "Kurtosis", "Skewness")
      distribution <- c("|", rep("-", 28), "|")
      scaled_Y <- scale_01(as.numeric(y))
      tryCatch(distribution[(nearest((as.numeric(as.Date(resumen3["1st Q."])) -
                                        as.numeric(as.Date(resumen3["Min"]))) /
                                      (as.numeric(as.Date(resumen3["Max"])) -
                                        as.numeric(as.Date(resumen3["Min"])))) + 1) :
                            (nearest((as.numeric(as.Date(resumen3["3rd Q."])) -
                                        as.numeric(as.Date(resumen3["Min"]))) /
                                      (as.numeric(as.Date(resumen3["Max"])) -
                                        as.numeric(as.Date(resumen3["Min"])))) - 1)] <- "#", error = function(e) NA)
      tryCatch(distribution[nearest((as.numeric(as.Date(resumen3["1st Q."])) -
                                       as.numeric(as.Date(resumen3["Min"]))) /
                                     (as.numeric(as.Date(resumen3["Max"])) -
                                       as.numeric(as.Date(resumen3["Min"]))))] <- "[", error = function(e) NA)
      tryCatch(distribution[nearest((as.numeric(as.Date(resumen3["3rd Q."])) -
                                       as.numeric(as.Date(resumen3["Min"]))) /
                                     (as.numeric(as.Date(resumen3["Max"])) -
                                       as.numeric(as.Date(resumen3["Min"]))))] <- "]", error = function(e) NA)
      tryCatch(distribution[nearest((as.numeric(as.Date(resumen3["Median"])) -
                                       as.numeric(as.Date(resumen3["Min"]))) /
                                     (as.numeric(as.Date(resumen3["Max"])) -
                                       as.numeric(as.Date(resumen3["Min"]))))] <- ":", error = function(e) NA)
      return(data.frame(t(resumen3), NAs=sum(is.na(y)), Distribution=paste(distribution, collapse=""),
                        check.names = FALSE, stringsAsFactors = FALSE))
    }

    #Results
    if(print){
    cat(paste("Data frame with", dim(x)[1], "observations and", dim(x)[2], "variables."))
    cat("\n")
    cat("\n")
    }
    if(any(nums %in% c("numeric", "integer"))){
      summary1 <- do.call(rbind, lapply(x[,nums %in% c("numeric", "integer"), drop=FALSE], resumen))
      if(print){
        cat("Numeric variables (", sum(nums %in% c("numeric", "integer")), ")", sep="")
        cat("\n")
        print(summary1)
      }
    } else { summary1 <- NULL}

    if(any(nums %in% c("factor", "character", "logical"))){
      summary2 <- do.call(rbind, lapply(x[, nums %in% c("factor", "character", "logical"), drop=FALSE], resumen2))
      if(print){
        cat("\n")
        cat("Categorical variables (", sum(nums %in% c("factor", "character", "logical")), ")", sep="")
        cat("\n")
        print(summary2, quote=FALSE)
      }
    } else { summary2 <- NULL}

    if(any(nums %in% c("Date"))){
      summary3 <- do.call(rbind, lapply(x[, nums %in% c("Date"), drop=FALSE], resumen3))
      if(print){
        cat("\n")
        cat("Date variables (", sum(nums %in% c("Date")), ")", sep="")
        cat("\n")
        print(summary3, quote=FALSE)
      }
    } else { summary3 <- NULL}

    invisible(list(numeric = summary1, character = summary2, date = summary3))
  }
}

#' Clustering of variables
#'
#' @description Displays associations between variables in a data.frame in a heatmap with clustering
#' @param x A data.frame
#' @param margins Margins for the plot
#' @return A heatmap with the variable associations
#' @importFrom stats lm heatmap xtabs
#' @importFrom grDevices colorRampPalette
#' @export
#' @examples
#' cluster_var(iris)
#' cluster_var(mtcars)
cluster_var <- function(x, margins=c(8,1)){
  data <- x
  if(any(sapply(data, is.numeric))){
    associations <- sapply(data[, sapply(data, is.numeric)], function(x) sapply(data, function(y) suppressWarnings(summary(lm(x ~ y))$r.squared)))
    heatmap(associations, col=colorRampPalette(c("gray", "darkred"))(25), scale="none", margins=margins, breaks=seq(0, 1, length.out = 26))
  }
  if (sum(!sapply(data, is.numeric))>1){
    associations2 <- sapply(data[, !sapply(data, is.numeric)], function(x) sapply(data[, !sapply(data, is.numeric)], function(y) GK_assoc(x, y)))
    rownames(associations2)<-colnames(associations2)
    heatmap(associations2, col=colorRampPalette(c("gray", "darkred"))(25), scale="none", margins=margins, breaks=seq(0, 1, length.out = 26))
  }
}

#' Mine plot
#'
#' @description Creates a heatmap-like plot for exploring the data
#' @param x A data.frame
#' @param fun A function that evaluates a vector and returns a logical vector
#' @param spacing Numerical separation between lines at the y-axis
#' @param sort If TRUE, variables are sorted according to their results
#' @param show.x Should the x-axis be plotted?
#' @param show.y Should the y-axis be plotted?
#' @param ... further arguments passed to order()
#' @importFrom graphics par image mtext axis
#' @importFrom methods hasArg
#' @export
#' @examples
#' mine.plot(airquality)   #Displays missing data
#' mine.plot(airquality, fun=outliers)   #Shows extreme values
mine.plot <- function(x, fun=is.na, spacing=5, sort=F, show.x=TRUE, show.y=TRUE, ...){
  x<-as.data.frame(x)
  if(sort){
    orden <- order(sapply(x, function(x) sum(fun(x))), ...)
    x <- x[,orden]
  }
  old.warn <- options(warn = -1)
  pad <- ceiling(dim(x)[2]/30)
  old.par <- par(mar=c(8, 4.5, 6, 4))
  image(t(sapply(x, fun)), xaxt="n", yaxt="n", col=colorRampPalette(c("lightcyan4", "darkred"))(2), ...)
  if(show.x){
    axis(1, at=seq(0, 1, length=dim(x)[2]), labels=paste(names(x), "\n", "(", sapply(x, class), ")", sep=""), las=2, lwd=0, cex.axis=0.8)
  }
  if(show.y){
    axis(2, at = seq(0, dim(x)[1], by = spacing)/dim(x)[1], labels = seq(0, dim(x)[1], by = spacing), las = 1, cex.axis = 0.6)
  }
  for(i in 1:pad){
    axis(3, at=seq(0, 1, length=dim(x)[2])[seq(0 + i, dim(x)[2], by=pad)],
         labels=sapply(x, function(x) round(100*sum(fun(x))/length(x)))[seq(0+i, dim(x)[2], by=pad)], cex.axis=0.6, lwd=0, line=-1+i/2)
  }
  if(!hasArg("main")) mtext(paste("%", as.character(substitute(fun))), 3, line=max(pad/1.5, 2.5), cex=1.2)
  options(old.warn)
  output1 <- do.call(rbind, lapply(1:ncol(x), function(y){
    variable <- names(x)[y]
    out <- fun(x[,y])
    id <- which(out)
    value <- x[,y][out]
    if(ttrue(any(out))) data.frame(variable = variable, id = id, value = as.character(value))
  }))
  output2 <- sapply(x, function(x) round(sum(fun(x))/length(x), 2))
  return(list(list = output1, summary = output2))
  par(old.par)
}

#' outliers
#'
#' @description Function for detecting outliers based on the boxplot method
#' @param x A vector
#' @param threshold Threshold (as multiple of the IQR) to consider an observation as outlier
#' @export
#' @examples
#' outliers(iris$Petal.Length)
#' outliers(airquality$Ozone)
outliers <- function(x, threshold=1.5){
  if(any(c("Date", "POSIXt") %in% class(x))){
    quantiles <- quantile(as.POSIXct(x), probs=c(0.25, 0.75), na.rm=TRUE)
    unclass(as.POSIXct(x)) %<NA% (unclass(quantiles["25%"]) - threshold*(unclass(quantiles["75%"])-unclass(quantiles["25%"]))) |
      unclass(as.POSIXct(x)) %>NA% (unclass(quantiles["75%"]) + threshold*(unclass(quantiles["75%"])-unclass(quantiles["25%"])))
  } else{
    if(is.numeric(x)){
      quantiles <- quantile(x, probs=c(0.25, 0.75), na.rm=TRUE)
      x %<NA% (quantiles["25%"] - threshold*(quantiles["75%"]-quantiles["25%"])) |
        x %>NA% (quantiles["75%"] + threshold*(quantiles["75%"]-quantiles["25%"]))
    } else rep(NA, length(x))
  }
}


#' Improved boxplot
#'
#' @description Creates an improved boxplot with individual data points
#' @param formula Formula for the boxplot
#' @param boxwex Width of the boxes
#' @param ... further arguments passed to beeswarm()
#' @importFrom grDevices rgb
#' @export
#' @examples
#' ipboxplot(Sepal.Length ~ Species, data=iris)
#' ipboxplot(mpg ~ gear, data=mtcars)
ipboxplot<-function(formula, boxwex=0.6, ...){
  boxplot(formula, las=1, cex.axis=1.2, cex.lab=1.2, boxwex=boxwex, ...)
  beeswarm::beeswarm(formula, pch=16, col=rgb(50, 50, 50, 150, maxColorValue=255), add=T, ...)
}

#' Multiple tapply
#'
#' @description Modification of the tapply function to use with data.frames. Consider using aggregate()
#' @param x A data.frame
#' @param group Grouping variable
#' @param fun Function to apply by group
#' @export
#' @examples
#' mtapply(mtcars, mtcars$gear, mean)
mtapply <- function(x, group, fun){
  if(is.null(dim(x))) tapply(x, group, fun)
  else sapply(split(x, group), function(x) sapply(x, function(x) fun(x)))
}

#' Peek
#'
#' @description Takes a peek into a data.frame returning a concise visualization about it
#' @param x A data.frame
#' @param n Number of rows to include in output
#' @param which Columns to include in output
#' @importFrom utils head
#' @export
#' @examples
#' peek(iris)
peek <- function(x, n=10, which=1:ncol(x)){
  class <- sapply(x[,which], class)
  range <- paste("(", sapply(x[,which], function(x) {
    ifelse(class(x) %in% c("character", "factor"), length(unique(x)),
           ifelse(class(x) %in% c("Date") | is.numeric(x), paste(round(range(x, na.rm=TRUE),2), collapse="; "),
                  ""))
  }
  ), ")", sep="")
  blank <- rep("", length=length(class))
  output <- rbind(as.matrix(head(x[,which], n)), rep("", ncol(x[,which])), class, range, blank, blank)
  rownames(output)[(n+4):(n+5)]<-""
  cat("Data frame with ", nrow(x), " rows (showing ", length(which), " of ", ncol(x), " variables) \n \n")
  print(output, quote = FALSE)
}

#' Kill factors
#'
#' @description Changes factor variables to character
#' @param dat A data.frame
#' @param k Maximum number of levels for factors
#' @export
#' @examples
#' d <- data.frame(Letters=letters[1:20], Nums=1:20)
#' d$Letters
#' d <- kill.factors(d)
#' d$Letters
kill.factors <- function(dat, k=10){
  filter <- sapply(dat, function(x) is.factor(x) & length(levels(x))>k)
  dat[filter] <- lapply(dat[filter], as.character)
  return(dat)
}

#' Good to go
#'
#' @description Loads all libraries used in scripts inside the selected path
#' @param path Path where the scripts are located
#' @param info List the libraries found?
#' @param load Should the libraries found be loaded?
#' @export
good2go <- function(path = getwd(), info = TRUE, load = TRUE){
  files <- list.files(path = path, pattern = "\\.R$")
  libraries <- do.call(c, lapply(files, function(x){
    x <- readLines(x)
    x[grepl("library\\(.+\\)|require\\(.+\\)", x)]
  }))
  p_list <- lapply(libraries, function(y){
    kk <- regexpr("\\(.+\\)", y)
    substr(y, kk+1, kk+attr(kk, "match.length")-2)
  })
  if (load)
    lapply(p_list[!duplicated(p_list)], function(x) tryCatch(require(x, character.only = TRUE,
                                                                     quietly = TRUE), error=function(e) warning("Error loading ", x)))
  if (info)
    print(paste("Packages:", paste(p_list[!duplicated(p_list)], collapse = ", ")),
          quote = FALSE)
}

#' Forge
#'
#' @description Reshapes a data frame from wide to long format
#' @param data data.frame
#' @param affixes Affixes for repeated measures
#' @param force.fixed Variables with matching affix to be excluded
#' @param var.name Name for the new created variable (repetitions)
#' @export
#' @examples
#' #Data frame in wide format
#' df1 <- data.frame(id = 1:4, age = c(20, 30, 30, 35), score1 = c(2,2,3,4),
#'                   score2 = c(2,1,3,1), score3 = c(1,1,0,1))
#' df1
#' #Data frame in long format
#' forge(df1, affixes= c("1", "2", "3"))
#'
#' #Data frame in wide format with two repeated measured variables
#' df2 <- data.frame(df1, var1 = c(15, 20, 16, 19), var3 = c(12, 15, 15, 17))
#' df2
#' #Missing times are filled with NAs
#' forge(df2, affixes = c("1", "2", "3"))
#'
#' #Use of parameter force.fixed
#' df3 <- df2[, -7]
#' df3
#' forge(df3, affixes=c("1", "2", "3"))
#' forge(df3, affixes=c("1", "2", "3"), force.fixed = c("var1"))
forge <- function(data, affixes, force.fixed = NULL, var.name = "time"){
  data_ord <- data[, order(names(data))]
  indices <- data.frame(sapply(affixes, function(x) grepl(x, names(data_ord))))
  if(!is.null(force.fixed)){
    positions <- which(names(data_ord) %in% force.fixed)
    indices[positions, ] <- FALSE
  }
  message("Repetitions for each variable: \n \n",
          sort(table(unlist(lapply(indices, function(x){
            gsub(paste(affixes, collapse = "|"), "", names(data_ord)[x])
            }
            ))), decreasing = TRUE))
  listas <- lapply(indices, function(x) {
    setNames(data_ord[, x, drop = FALSE], gsub(paste(affixes,
      collapse = "|"), "", names(data_ord)[x]))
  })
  variables <- unique(unlist(lapply(listas, names)))
  listas <- lapply(listas, function(x) {
    df <- data.frame(x, matrix(NA, nrow = nrow(data), ncol = length(variables[!variables %in%
                                                                                names(x)])))
    names(df) <- c(names(x), variables[!variables %in% names(x)])
    df
  })
  long <- do.call("rbind", listas)
  fixed <- data_ord[, !apply(indices, 1, any), drop=FALSE]
  out <- data.frame(fixed[, na.omit(match(names(data), names(fixed))), drop=FALSE][rep(1:nrow(data),
    length(affixes)), ,drop=FALSE], long, affix = rep(affixes, each = nrow(data)))
  names(out)[ncol(out)] <- var.name
  out
}

#' Un-Forge
#'
#' @description Reshapes a data frame from long to wide format
#' @param data data.frame
#' @param origin Character vector with variable names in data containing the values to be assigned to the different new variables
#' @param variables Variable in data containing the variable names to be created
#' @param prefix Vector with prefixes for the new variable names
#' @export
#' @examples
#' #Data frame in wide format
#' df1 <- data.frame(id = 1:4, age = c(20, 30, 30, 35), score1 = c(2,2,3,4),
#'                   score2 = c(2,1,3,1), score3 = c(1,1,0,1))
#' df1
#' #Data frame in long format
#' df2 <- forge(df1, affixes= c("1", "2", "3"))
#' df2
#' #Data frame in wide format again
#' df3 <- unforge(df2, "score", "time", prefix="score")
#'
unforge <- function(data, origin, variables, prefix=origin){
  splitted <- split(data, data[variables])
  out_data <- cbind(splitted[[1]][,!names(data) %in% c(origin, variables), drop=FALSE],
                    setNames(do.call(cbind, lapply(split(data, data[variables]), function(x) x[origin])),
                             paste(rep(prefix, length(unique(data[,variables]))),
                                   rep(names(splitted), each=length(origin)), sep="_")))
  out_data
}

#' Search scripts
#'
#' @description Searches for strings in R script files
#' @param string Character string to search
#' @param path Character vector with the path name
#' @param recursive Logical. Should the search be recursive into subdirectories?
#' @return A list with each element being one of the files containing the search string
#' @export
search_scripts <- function(string, path=getwd(), recursive=TRUE){
  files <- list.files(path=path, pattern="\\.R$", recursive = recursive, full.names=TRUE)
  listado <- lapply(files, function(x) {
    x <- readLines(x, warn=FALSE)
    x[grepl(string, x)]
  })
  names(listado) <- files
  listado[sapply(listado, length) != 0]
}
Any scripts or data that you put into this service are public.
clickR documentation built on April 3, 2025, 9:22 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
clickR
Semi-Automatic Preprocessing of Messy Data with Change Tracking for Dataset Cleaning

R/descriptive.R
In clickR: Semi-Automatic Preprocessing of Messy Data with Change Tracking for Dataset Cleaning

Defines functions search_scripts unforge forge good2go kill.factors peek mtapply ipboxplot outliers mine.plot cluster_var descriptive GK_assoc prop_min prop_may antimoda moda nearest scale_01 moda_cont skewness kurtosis

Documented in antimoda cluster_var descriptive forge GK_assoc good2go ipboxplot kill.factors kurtosis mine.plot moda moda_cont mtapply nearest outliers peek prop_may prop_min scale_01 search_scripts skewness unforge

Try the clickR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

clickR Semi-Automatic Preprocessing of Messy Data with Change Tracking for Dataset Cleaning

R/descriptive.R In clickR: Semi-Automatic Preprocessing of Messy Data with Change Tracking for Dataset Cleaning

Defines functions search_scripts unforge forge good2go kill.factors peek mtapply ipboxplot outliers mine.plot cluster_var descriptive GK_assoc prop_min prop_may antimoda moda nearest scale_01 moda_cont skewness kurtosis

Documented in antimoda cluster_var descriptive forge GK_assoc good2go ipboxplot kill.factors kurtosis mine.plot moda moda_cont mtapply nearest outliers peek prop_may prop_min scale_01 search_scripts skewness unforge

Try the clickR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

clickR
Semi-Automatic Preprocessing of Messy Data with Change Tracking for Dataset Cleaning

R/descriptive.R
In clickR: Semi-Automatic Preprocessing of Messy Data with Change Tracking for Dataset Cleaning