explora: Explore and Visualize Data

Documented in Summary_explora

Summary_explora <- function(dataset) {
  require(moments)
  nums <- sapply(dataset, is.numeric)
  H <- dataset[ , nums]


  # ------------------------------------------------------------------------------------------------
  # ------------------------------------------------------------------------------------------------
  # ################################### Shapiro test
  Normal_log    <- apply(H, 2, function(x) {
    if((length(na.omit(x)) > 3 & length(na.omit(x)) < 5000) &&
       (length(unique(x)) > 2) &&
       (min(x,na.rm = T) >= 0))
      { if(min(x,na.rm = T) == 0) {x <- log((x)+1)} else {x <- log((x))}
      x.res  <- shapiro.test(na.omit(x))
      W <- round(x.res$statistic,3)
      P <- symnum(x.res$p.value, legend = F, corr = FALSE, na = FALSE,
                  cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 100), symbols = c("***", "**", "*", ".", " "))
      return(paste(W, P, sep = ' '))
    } else {return('----')}
    })
  Normal_exp    <- apply(H, 2, function(x) {
    if((length(na.omit(x)) > 3 & length(na.omit(x)) < 5000) &&
       (length(unique(x)) > 2))
    {
      x <- exp((x))
      x.res  <- shapiro.test(na.omit(x))
      W <- round(x.res$statistic,3)
      P <- symnum(x.res$p.value, legend = F, corr = FALSE, na = FALSE,
                  cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 100), symbols = c("***", "**", "*", ".", " "))
      return(paste(W, P, sep = ' '))
    } else {return('----')}
  })

  Normal_raw    <- apply(H, 2, function(x) {
    if((length(na.omit(x)) > 3 & length(na.omit(x)) < 5000) &&
       (length(unique(x)) > 2))
    {
      x.res  <- shapiro.test(na.omit(x))
      W <- round(x.res$statistic,3)
      P <- symnum(x.res$p.value, legend = F, corr = FALSE, na = FALSE,
                  cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 100), symbols = c("***", "**", "*", ".", " "))
      return(paste(W, P, sep = ' '))
    } else {return('----')}
  })

  # ------------------------------------------------------------------------------------------------
  # ------------------------------------------------------------------------------------------------
  N_outliers <- apply(H, 2, function(x) {
    x <- na.omit(x)
    qnt <- quantile(x, probs=c(.25, .75))
    H <- 1.5 * IQR(x, na.rm = T)
    y <- x
    y[x < (qnt[1] - H)] <- NA
    y[x > (qnt[2] + H)] <- NA
    sum(length(which(is.na(y))))
  })
  # ------------------------------------------------------------------------------------------------
  # ------------------------------------------------------------------------------------------------




  IntegerContiguous <- apply(H, 2, function(x) {
    min.r <- min(x, na.rm=T)
    max.r <- max(x, na.rm=T)
    d <- diff(sort(unique(x)))
    d.r <- ifelse(mean(d, na.rm=T) == 1, 'Yes', 'No')
    return(d.r)
  })

  Min       <- apply(H, 2, function(x) min(x,na.rm = T))
  Max       <- apply(H, 2, function(x) max(x,na.rm = T))
  unique    <- apply(H, 2, function(x) length(unique(x)))
  N         <- apply(H, 2, function(x) length(na.omit(x)))
  N_NA      <- apply(H, 2, function(x) sum(length(which(is.na(x)))))
  Mean      <- apply(H, 2, function(x) mean(x,na.rm = T))
  Median    <- apply(H, 2, function(x) median(x,na.rm = T))
  SD        <- apply(H, 2, function(x) sd(x,na.rm = T))
  SEM        <- apply(H, 2, function(x) sd(x, na.rm = T)/sqrt(length(na.omit(x))))
  CV        <- apply(H, 2, function(x) sd(x, na.rm = T)/mean(x, na.rm = T))


  df <- data.frame(
    N          = N,
    NAs       = N_NA,
    Min        = round(Min,2),
    Mean       = round(Mean,2),
    Median     = round(Median,2),
    Max        = round(Max,2),
    SD         = round(SD,2),
    SEM        = round(SEM,2),
    CV         = round(CV,2),
    IntContig  = IntegerContiguous,
    UniqueValues     = unique,
    W_raw = Normal_raw,
    W_log = Normal_log,
    W_exp = Normal_exp,
    N_outliers = N_outliers
    )
  rownames(df) <- colnames(H)
  return(df)

}
# Summary_explora(mtcars)