explora: Explore and Visualize Data

Documented in check.norm Norm_log_10 save.QQPlots

check.norm <- function(x) {
  require(moments)
  x <- x
  x.res      <- shapiro.test(x)
  x.log      <- log(x)
  x.log.res  <- shapiro.test(x.log)
  x.sqrt     <- sqrt(x)
  x.sqrt.res <- shapiro.test(x.sqrt)

  # https://stackoverflow.com/questions/4787332/how-to-remove-outliers-from-a-dataset
  # super thanks!!!
  remove_outliers <- function(x) {
    qnt <- quantile(x, probs=c(.25, .75), na.rm = T)
    H <- 1.5 * IQR(x, na.rm = T)
    y <- x
    y[x < (qnt[1] - H)] <- NA
    y[x > (qnt[2] + H)] <- NA
    y
  }

  y          <- remove_outliers(x)
  N.rem      <- length(x)-length(na.omit(y))
  y.res      <- shapiro.test(y)
  y.log      <- log(y)
  y.log.res  <- shapiro.test(y.log)
  y.sqrt     <- sqrt(y)
  y.sqrt.res <- shapiro.test(y.sqrt)

  par(mfrow = c(3,3))
  #################################################### normal
  qqnorm(x,main = 'normal (b)'); qqline(x)
  legend('topleft', bty = 'n', paste(
    paste('W = ', round(x.res$statistic,3)),
    paste('p = ', round(x.res$p.value,5)), sep = '\n')
  )
  qqnorm(x.log,main = 'log (b)'); qqline(x.log)
  legend('topleft', bty = 'n', paste(
    paste('W = ', round(x.log.res$statistic,3)),
    paste('p = ', round(x.log.res$p.value,5)), sep = '\n')
  )
  qqnorm(x.sqrt,main = 'sqrt (b)'); qqline(x.sqrt)
  legend('topleft', bty = 'n', paste(
    paste('W = ', round(x.sqrt.res$statistic,3)),
    paste('p = ', round(x.sqrt.res$p.value,5)), sep = '\n')
  )

  #################################################### normal
  qqnorm(y,main = 'normal (b) \n(NoOut)'); qqline(y)
  legend('topleft', bty = 'n', paste(
    paste('W = ', round(y.res$statistic,3)),
    paste('p = ', round(y.res$p.value,5)), sep = '\n')
  )
  qqnorm(y.log,main = 'log (b) \n(NoOut)'); qqline(y.log)
  legend('topleft', bty = 'n', paste(
    paste('W = ', round(y.log.res$statistic,3)),
    paste('p = ', round(y.log.res$p.value,5)), sep = '\n')
  )
  qqnorm(y.sqrt,main = 'sqrt (b) \n(NoOut)'); qqline(y.sqrt)
  legend('topleft', bty = 'n', paste(
    paste('W = ', round(y.sqrt.res$statistic,3)),
    paste('p = ', round(y.sqrt.res$p.value,5)), sep = '\n')
  )


  # par(mfrow = c(1,1))

  df <- data.frame(
    Transform = c(
      '-',
      'log',
      'sqrt',
      '-',
      'log',
      'sqrt'),
    Outliers = c((rep('-', 3)), rep('removed', 3)),
    Method = rep('before',6),
    N.out.rm = c((rep(0, 3)), rep(N.rem, 3)),
    Skewness = c(
      round(moments::skewness(x, na.rm = T),3),
      round(moments::skewness(x.log, na.rm = T),3),
      round(moments::skewness(x.sqrt, na.rm = T),3),
      round(moments::skewness(y, na.rm = T),3),
      round(moments::skewness(y.log, na.rm = T),3),
      round(moments::skewness(y.sqrt, na.rm = T),3)),
    Kurtosis = c(
      round(moments::kurtosis(x, na.rm = T),3),
      round(moments::kurtosis(x.log, na.rm = T),3),
      round(moments::kurtosis(x.sqrt, na.rm = T),3),
      round(moments::kurtosis(y, na.rm = T),3),
      round(moments::kurtosis(y.log, na.rm = T),3),
      round(moments::kurtosis(y.sqrt, na.rm = T),3)),
    W = c(
      round(x.res$statistic,3),
      round(x.log.res$statistic,3),
      round(x.sqrt.res$statistic,3),
      round(y.res$statistic,3),
      round(y.log.res$statistic,3),
      round(y.sqrt.res$statistic,3)),
    p = c(
      round(x.res$p.value,5),
      round(x.log.res$p.value,5),
      round(x.sqrt.res$p.value,5),
      round(y.res$p.value,5),
      round(y.log.res$p.value,5),
      round(y.sqrt.res$p.value,5)))

  # df <- df[order(-df$p, -df$W),]
  # print(      'method:            before')
  # print(paste('Outliers removed: ', N.rem))
  df.before <- df

  ######################################################################################
  ######################################################################################

  x          <- x
  x.res      <- shapiro.test(x)
  x.log      <- log(x)
  x.log.res  <- shapiro.test(x.log)
  x.sqrt     <- sqrt(x)
  x.sqrt.res <- shapiro.test(x.sqrt)

  # https://stackoverflow.com/questions/4787332/how-to-remove-outliers-from-a-dataset
  # super thanks!!!
  remove_outliers <- function(x) {
    qnt <- quantile(x, probs=c(.25, .75), na.rm = T)
    H <- 1.5 * IQR(x, na.rm = T)
    y <- x
    y[x < (qnt[1] - H)] <- NA
    y[x > (qnt[2] + H)] <- NA
    y
  }

  y          <- remove_outliers(x)
  y.res      <- shapiro.test(y)
  y.log      <- remove_outliers(log(y))
  y.log.res  <- shapiro.test(y.log)
  y.sqrt     <- remove_outliers(sqrt(y))
  y.sqrt.res <- shapiro.test(y.sqrt)


  #################################################### normal
  qqnorm(y,main = 'normal (a) \n(NoOut)'); qqline(y)
  legend('topleft', bty = 'n', paste(
    paste('W = ', round(y.res$statistic,3)),
    paste('p = ', round(y.res$p.value,5)), sep = '\n')
  )
  qqnorm(y.log,main = 'log (a) \n(NoOut)'); qqline(y.log)
  legend('topleft', bty = 'n', paste(
    paste('W = ', round(y.log.res$statistic,3)),
    paste('p = ', round(y.log.res$p.value,5)), sep = '\n')
  )
  qqnorm(y.sqrt,main = 'sqrt (a) \n(NoOut)'); qqline(y.sqrt)
  legend('topleft', bty = 'n', paste(
    paste('W = ', round(y.sqrt.res$statistic,3)),
    paste('p = ', round(y.sqrt.res$p.value,5)), sep = '\n')
  )


  par(mfrow = c(1,1))

  df <- data.frame(
    Transform = c(
      'log',
      'sqrt'),
    Outliers = rep('removed',2),
    Method = rep('after',2),
    N.out.rm = c(length(x.log)-length(na.omit(y.log)),
                 length(x.sqrt)-length(na.omit(y.sqrt))),
    Skewness = c(round(moments::skewness(y.log, na.rm = T),3),
                 round(moments::skewness(y.sqrt, na.rm = T),3)),
    Kurtosis = c(round(moments::kurtosis(y.log, na.rm = T),3),
                 round(moments::kurtosis(y.sqrt, na.rm = T),3)),
    W = c(
      round(y.log.res$statistic,3),
      round(y.sqrt.res$statistic,3)),
    p = c(
      round(y.log.res$p.value,5),
      round(y.sqrt.res$p.value,5))
  )

  df.after <- df
  df <- rbind(df.after, df.before)
  df <- df[order(-df$p, -df$W),]
  df


}







#################################################################################################################################.
# -------------------------------------------------------------------------------------------------------------------------------.
#                                        Norm_log_10.
# -------------------------------------------------------------------------------------------------------------------------------.
#################################################################################################################################.
Norm_log_10 <- function(x) {
  require(moments)

  x.res      <- shapiro.test(x)
  x.log      <- log(x)
  x.log.res  <- shapiro.test(x.log)
  x.log10     <- log10(x)
  x.log10.res <- shapiro.test(x.log10)

  par(mfrow=c(1,3),oma = c(0, 0, 2, 0))

  qqnorm(x,main = 'normal'); qqline(x)
  legend('topleft', bty = 'n', paste(
    paste('W = ', round(x.res$statistic,3)),
    paste('p = ', round(x.res$p.value,5)), sep = '\n')
  )
  qqnorm(x.log,main = 'log'); qqline(x.log)
  legend('topleft', bty = 'n', paste(
    paste('W = ', round(x.log.res$statistic,3)),
    paste('p = ', round(x.log.res$p.value,5)), sep = '\n')
  )
  qqnorm(x.log10,main = 'log10'); qqline(x.log10)
  legend('topleft', bty = 'n', paste(
    paste('W = ', round(x.log10.res$statistic,3)),
    paste('p = ', round(x.log10.res$p.value,5)), sep = '\n')
  )
  PlotTitle <- deparse(substitute(x))
  mtext(text = PlotTitle, outer = T, cex = 1)

  par(mfrow=c(1,1),oma = c(0, 0, 0, 0))
}
# Norm_log_10(mtcars$mpg)



#################################################################################################################################.
# -------------------------------------------------------------------------------------------------------------------------------.
#                                        SaveQQplots
# -------------------------------------------------------------------------------------------------------------------------------.
#################################################################################################################################.
save.QQPlots <- function(dataset, measures, pic_ParentFolder, plotType) {
  originalWD <- getwd()

  if(missing(plotType)) stop('\n\n\nHey! plotType must either "3" ("Norm_log_10" function) or "9" ("check.norm" plots)')
  else {
    if(plotType == 3) {
      TheFun <- as.function(Norm_log_10)
      width = 10; height = 4
      }
    if(plotType == 9)  {
      TheFun <- as.function(check.norm)
      width = 10; height = 10
    }
  }

  for(i in 1:length(measures)) {

    IVname <- measures[i]
    parent_folder <- paste(originalWD, pic_ParentFolder, sep = '/')
    dir.create(file.path(parent_folder), showWarnings = T,recursive = T)
    setwd(file.path(parent_folder))
    Mname <- measures[i]
    filename_int <- paste(Mname,'.tiff', sep = '')
    png(filename = filename_int, width = width, height = height, units = 'in', res = 72)
    TheFun(dataset[, Mname])
    title(main = Mname, outer = F)
    dev.off()
    setwd(originalWD)
  }
}
# save.QQPlots(dataset = mtcars, measures = c('mpg', 'disp'), pic_ParentFolder = 'myFolder', plotType = 3)




# Norm_log_10(mtcars$mpg)
#
# sadfg <- x
#
# x <- c(0.98, 1.73, 0.83, 1.54, 0.85, 1.16, 1.03, 0.61, 1.17, 1.66, 0.71, 1.25, 0.58, 0.94,
#        2.35, NA, 0.76, 2.48, 1.12, 0.93, 1.14, 0.78, 1.01, 1.03, 1.14, 1.03, 0.72, 1.95, 0.85,
#        0.69, 0.74, 1.14, 1.55, 0.93, 0.78, 0.75, 1.61, 1, 0.57, 0.39, 0.8, 0.92, 0.8, 0.96, 0.55,
#        0.84, 0.87, 1.02, 1.1, 1.11, 0.63, 1.42, 0.77, 1.73, 1.5, 0.67, 0.75, 1.7, 1.21, 1.26, 0.97,
#        0.79, 2.96, 1.14, 0.53, 0.65, 1.96, 0.57, NA, 0.51, 0.98, 0.35, 1.02, 1.55, 1.18)
# check.norm(x)
#
# log(x)
# log10(x)
# shapiro.test(log10(x))
# shapiro.test(log(x))