R/r-batch-eda-utilities.R
In analysisPipelines: Compose Interoperable Analysis Pipelines & Put Them in Production

Documented in bivarPlots CheckColumnType correlationMatPlot getDatatype ignoreCols multiVarOutlierPlot outlierPlot univarCatDistPlots

######################################################################################################
# Title: Functions for various exploratory analysis operations on data frames
# Author: Sanjay, Dheekshitha PS
# Created on: June 14, 2018
# Description: Consolidated file for all functionsto be used out of the box for exploratory analysis
######################################################################################################


########################
# FUNCTION DEFINITIONS #
########################
#' @name ignoreCols
#' @title Ignores the columns in the loaded dataframe object
#' @details The columns selected are removed from the object
#' @param data the dataframe object that needs to be loaded
#' @param columns the names of columns to be ignored from dataframe object
#' @return Updated dataframe object
#' @family Package EDA Utilites functions
#' @examples
#' ignoreCols(data = iris, columns = "Species")
#' @export
ignoreCols <- function(data, columns){
  tryCatch({
    if(all(columns %in% colnames(data))){
      return(data[, setdiff(colnames(data), columns), drop = F])
    }else{
      mismatch <- colnames(data)[!all(columns %in% colnames(data))]
      stop(paste0("Columns ", paste0(mismatch, collapse = ", "), " are not present in the dataset"))
    }
  }, error = function(e){
    stop(e)
  }, warning = function(e){
    warning(e)
  })
}

# Univariate Categoric Distribution function
#' @name univarCatDistPlots
#' @title Univariate Categoric Distribution
#' @details A univariate distribution graph on the selected categorical columns from the dataframe
#' @param data the dataset where the column on which the plot is to be generated is present
#' @param uniCol the name of column on which the plot needs to be generated
#' @param priColor the primary color for the plots
#' @param optionalPlots A Flag for optional plots
#' @return A univariate categoric distribution plot
#' @family Package EDA Utilites functions
#' @examples
#' univarCatDistPlots(data = iris, uniCol = "Species")
#' @export
univarCatDistPlots <- function(data, uniCol, priColor = "blue", optionalPlots = 0){
  levels(data[[uniCol]]) <- c(levels(data[[uniCol]]), "NA")
  data[[uniCol]][is.na(data[[uniCol]])] <- "NA"
  # data <- data %>% dplyr::group_by({{ uniCol }}) %>% dplyr::summarise(count = dplyr::n())
  y <- data[[uniCol]]
  # data %>>% dplyr::arrange(.data$count) -> data

  catPlot <- ggplot2::ggplot(data,
                             ggplot2::aes(x = !! rlang::sym(uniCol))) +
    ggplot2::geom_bar(fill = priColor,alpha=0.7) +
    ggplot2::xlab(uniCol) +
    ggplot2::ylab("Frequency") + ggplot2::theme_bw() +
    ggplot2::theme(
      axis.title = ggplot2::element_text(size = 16),panel.grid.major.y=ggplot2::element_blank(),panel.border=ggplot2::element_rect(size=0.1)
    ) +
    ggplot2::coord_flip()
  if(optionalPlots){
    catPlot <- plotly::plot_ly(y = y, x=data[["count"]],type="bar",orientation='h',color = I(priColor)) %>%
      plotly::layout(title=paste0("Frequency Histogram for ",uniCol),
                     xaxis=list(title = "Frequency"),
                     yaxis=list(title = uniCol))


  }
  return(catPlot)
}

#Outlier Plot Function
#' @name outlierPlot
#' @title Outlier detection plot
#' @details Outlier are to be identified on the selected column from the dataframe
#' @param data the dataframe that needs to be loaded
#' @param method the method on which outliers are to be identified
#' @param columnName the name of column for which the outliers are identified
#' @param cutoffValue the cut off value to define the threshold for outliers
#' @param priColor the primary color for the plots
#' @param optionalPlots A Flag for optional plots
#' @return Outliers plot object
#' @family Package EDA Utilites functions
#' @examples
#' \dontrun{
#' outlierPlot(data = iris, columnName = "Sepal.Length")
#' }
#' @export
outlierPlot <- function(data, method = "iqr", columnName, cutoffValue = 0.05, priColor = "blue", optionalPlots = 0){


  if(TRUE %in% unique(is.na(data[,columnName]))){
    data <- data[-which(is.na(data[,columnName])),]
  }

  if(method == "iqr"){
    lower <- stats::quantile(data[, columnName], .25,na.rm = T) - 1.5*(stats::IQR(data[, columnName], na.rm = T))
    upper <- stats::quantile(data[,columnName],.75,na.rm = T) + 1.5*(stats::IQR(data[,columnName],na.rm = T))
    data$Outlier <- data[,columnName] > upper | data[,columnName] < lower

    outlierPlotObj <- ggplot2::ggplot(data, ggplot2::aes(x="", y = data[,columnName])) +
      ggplot2::geom_boxplot(fill = priColor,alpha=0.7) +
      ggplot2::theme_bw() +
      ggplot2::theme(panel.border=ggplot2::element_rect(size=0.1),panel.grid.minor.x=ggplot2::element_blank(),panel.grid.major.x=ggplot2::element_blank(),legend.position = "bottom") +ggplot2::ylab(columnName) + ggplot2::xlab("")

  }
  if(method == "percentile"){
    lower <- stats::quantile(data[,columnName],cutoffValue,na.rm = T)
    upper <- stats::quantile(data[,columnName],(1-cutoffValue),na.rm = T)
    data$Outlier <- data[,columnName] > upper | data[,columnName] < lower

    Outlier<-data$Outlier
    Value<-data[,columnName]
    outlierPlotObj <- ggplot2::ggplot(data) +
      ggplot2::geom_histogram(ggplot2::aes(x = Value, fill = as.name("Outlier")),
                              bins=30, alpha=0.7) +
      ggplot2::scale_fill_manual(values = c(priColor, "red"),breaks=c("FALSE", "TRUE"),
                                 labels=c("Normal", "Outlier"),name = "Status") +
      ggplot2::theme_bw() +
      ggplot2::theme(panel.border=ggplot2::element_rect(size=0.1),panel.grid.minor.x=ggplot2::element_blank(),panel.grid.major.x=ggplot2::element_blank(),legend.position = "bottom") +
      ggplot2::xlab(columnName)

  }
  if(method == "z_score"){

    lower <- mean(data[,columnName],na.rm = T) - (cutoffValue*(stats::sd(data[,columnName],na.rm = T)))
    upper <- mean(data[,columnName],na.rm = T) + (cutoffValue*(stats::sd(data[,columnName],na.rm = T)))
    data$Outlier <- data[,columnName] > upper | data[,columnName] < lower

    data$zScore <- scale(data[,columnName],center = T, scale = T)
    Zscore<-as.vector(data$zScore)
    y<-data[,columnName]
    outlierPlotObj <-
      ggplot2::ggplot(data, ggplot2::aes(x = Zscore, y = y)) +
      ggplot2::geom_point(ggplot2::aes(color = as.name("Outlier")), alpha=0.7)+
      ggplot2::scale_color_manual("Status", values = c("TRUE" = "red","FALSE" =priColor))+
      ggplot2::ylab(columnName)+
      ggplot2::theme_bw() +
      ggplot2::theme(panel.border=ggplot2::element_rect(size=0.1),panel.grid.minor.x=ggplot2::element_blank(),panel.grid.major.x=ggplot2::element_blank(),legend.position = "bottom") +
      ggplot2::xlab("Z-score")+
      ggplot2::geom_vline(xintercept = (cutoffValue),linetype = "dashed")+
      ggplot2::geom_vline(xintercept = -(cutoffValue), linetype = "dashed")

  }
  #conditionToBe
  if(optionalPlots)
  {
    outlierPlotObj <- plotly::ggplotly(outlierPlotObj)
    outlierPlotObj$x$layout$margin$l <- outlierPlotObj$x$layout$margin$l + 30
    outlierPlotObj$x$layout$margin$b <- outlierPlotObj$x$layout$margin$b + 3

  }

  return(outlierPlotObj)

}

#Mutlivariate Outlier Plot Function
#' @name multiVarOutlierPlot
#' @title Multi-Variate Outlier Plot
#' @details Multivaraite outlier plot using the selected columns from the dataframe
#' @param data the dataframe that needs to be loaded
#' @param depCol the name of column which is to be identified as dependent column
#' @param indepCol the name of an independent column
#' @param sizeCol the name of column used to define the size of point in plots
#' @param priColor the primary color for the plots
#' @param optionalPlots A Flag for optional plots
#' @param cutoffValue A p-alue cutoff for detecting outliers
#' @return Outliers plot
#' @family Package EDA Utilites functions
#' @examples
#' \dontrun{
#' multiVarOutlierPlot(data = iris, depCol = "Sepal.Length",
#'    indepCol = "Sepal.Width", sizeCol = "Petal.Length")
#' }
#' @export
multiVarOutlierPlot <- function(data, depCol, indepCol, sizeCol, priColor = "blue", optionalPlots = 0,
                                cutoffValue = 0.05){
  if(TRUE %in% unique(is.na(data[,depCol])))
  {
    data <- data[-which(is.na(data[,depCol])== T),]
  }

  indep_form <- paste(indepCol, collapse = "+")
  form <- paste(depCol, indep_form, sep = "~")
  form <- stats::formula(form)

  lmObject <- lm(form,data)
  limit <- nrow(data)
  outlierDetect <- car::outlierTest(lmObject, cutoffValue, n.max = limit)
  outlierDetect <- data.frame(outlierDetect[c(1,2,3)])
  outlierDetect <- round(outlierDetect, 4)
  colnames(outlierDetect) <- c("Studentized Residuals","P-Value", "P-Value(Bonferroni Correction)")
  data$Outlier <- ifelse(rownames(data) %in% rownames(outlierDetect),"Outlier","Normal")
  outlierTable <- data[data$Outlier == "Outlier", ]
  outlierTable <- cbind(outlierDetect,outlierTable)

  x<-data[,indepCol]
  y<-data[,depCol]
  size<-data[,sizeCol]
  outlierPlot <- ggplot2::ggplot(data, ggplot2::aes(x = x,y = y), alpha=0.6)+
    ggplot2::geom_point(ggplot2::aes(color = as.name("Outlier"), size = size), alpha=0.7)+
    ggplot2::scale_color_manual("",values = c("Outlier" = "red", "Normal" = priColor))+
    ggplot2::labs(title = paste(depCol,"vs",indepCol)) +  ggplot2::theme_bw() +
    ggplot2::theme(panel.border=ggplot2::element_rect(size=0.1),panel.grid.minor.x=ggplot2::element_blank(),legend.position = "bottom") +
    ggplot2::ylab(depCol) +
    ggplot2::xlab(indepCol)
  #conditionToBe
  if(optionalPlots)
  {
    outlierPlot <- plotly::ggplotly(outlierPlot,tooltip=c("all"))
    outlierPlot$x$layout$margin$l <- outlierPlot$x$layout$margin$l + 30
    outlierPlot$x$layout$margin$b <- outlierPlot$x$layout$margin$b + 3
  }
  return(outlierPlot)
}

## Bivariate Plots
#' @name bivarPlots
#' @title Bi-Variate Plot
#' @details A bivariate distribution graph on the selected columns from the dataframe.Selected two columns are on two axis' and a plot is generated
#' @param dataset the dataframe that needs to be loaded
#' @param select_var_name_1 the name of first column on which the plot needs to be generated
#' @param select_var_name_2 the name of second column on which the plot needs to be generated
#' @param priColor the primary color for the plots
#' @param secColor A secondary color for the plots
#' @return Bivariate plot
#' @family Package EDA Utilites functions
#' @examples
#' bivarPlots(dataset = iris, select_var_name_1 = "Sepal.Length",
#'  select_var_name_2 = "Sepal.Width")
#' @export
bivarPlots <- function(dataset, select_var_name_1, select_var_name_2, priColor = "blue", secColor= "black") {

  numeric_cols <- unlist(getDatatype(dataset)['numeric_cols'])
  cat_cols <- unlist(getDatatype(dataset)['cat_cols'])

  if (select_var_name_1 %in% numeric_cols && select_var_name_2 %in% numeric_cols) {
    x = dataset[, select_var_name_1]
    y = dataset[, select_var_name_2]
    bivarPlot <-
      ggplot2::ggplot(dataset, ggplot2::aes(x, y)) +
      ggplot2::geom_point(color = priColor, alpha = 0.7) +
      ggplot2::geom_smooth(method = lm, color = secColor) +
      ggplot2::xlab(select_var_name_1) +
      ggplot2::ylab(select_var_name_2) + ggplot2::theme_bw() +
      ggplot2::ggtitle(paste(
        'Bivariate plot for',
        select_var_name_1,
        'and',
        select_var_name_2,
        sep = ' '
      )) +
      ggplot2::theme(
        plot.title = ggplot2::element_text(hjust = 0.5, size = 10),
        axis.text = ggplot2::element_text(size = 10),
        axis.title = ggplot2::element_text(size = 10)
      )



  } else if (select_var_name_1 %in% cat_cols &&
             select_var_name_2 %in% cat_cols) {
    new_df <- dataset %>% dplyr::group_by_(.dots=c(select_var_name_1,select_var_name_2)) %>% dplyr::summarise(n = dplyr::n())
    colfunc <- grDevices::colorRampPalette(c(priColor, "white" , secColor))
    colorvar <- length(unique(new_df[[select_var_name_2]]))
    a=as.vector(as.character(unique(new_df[[select_var_name_1]])))
    y=new_df[[select_var_name_1]]
    label=new_df[[select_var_name_2]]
    bivarPlot <-ggplot2::ggplot(new_df, ggplot2::aes(x = y, y= .data$n, fill = label)) +
      ggplot2::geom_bar(position = "dodge", stat = "identity",alpha=0.9) +
      ggplot2::guides(fill=ggplot2::guide_legend(title=select_var_name_2)) +
      ggplot2::coord_flip()+
      ggplot2::xlab(select_var_name_1) +
      ggplot2::ylab("count") + ggplot2::theme_bw() +
      ggplot2::ggtitle(paste('Bivariate plot for',select_var_name_1,'and',select_var_name_2,sep=' '))+
      ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 10),axis.text = ggplot2::element_text(size=10),
                     axis.title=ggplot2::element_text(size=10),legend.position="bottom",axis.text.x=ggplot2::element_text(angle=45, hjust=1))+ ggplot2::scale_fill_manual(values = colfunc(colorvar))


  } else {
    cols <- c(select_var_name_1, select_var_name_2)
    cat_col <- cols[which(cols %in% cat_cols)]
    num_col <- cols[which(cols %in% numeric_cols)]
    a = as.vector(as.character(unique(dataset[[cat_col]])))
    y = dataset[[cat_col]]
    x = dataset[[num_col]]
    bivarPlot <-
      ggplot2::ggplot(dataset, ggplot2::aes(x = y, y = x)) +
      ggplot2::geom_point(color = priColor, alpha = 0.7) +
      ggplot2::coord_flip() +
      ggplot2::xlab(cat_col) +
      ggplot2::ylab(num_col) + ggplot2::theme_bw() +
      ggplot2::ggtitle(paste(
        'Bivariate plot for',
        select_var_name_1,
        'and',
        select_var_name_2,
        sep = ' '
      )) +
      ggplot2::theme(
        plot.title = ggplot2::element_text(hjust = 0.5, size = 10),
        axis.text = ggplot2::element_text(size = 10),
        axis.title = ggplot2::element_text(size = 10)
      )
  }

  return(bivarPlot)
}

## Correlation Matrix
#' @name correlationMatPlot
#' @title Correlation Matrix Plot
#' @description A correlation matrix is created and plotted across all the columns in the dataset
#' @param dataset the dataset that needs to be loaded
#' @param methodused methods to be used for computing correlation
#' @return Correlation Matrix graph
#' @family Package EDA Utilites functions
#' @examples
#' correlationMatPlot(dataset = iris)
#' @export
correlationMatPlot <- function(dataset, methodused = "everything"){
  numeric_cols <- getDatatype(dataset)['numeric_cols']
  cormatrix <- base::round(stats::cor(dataset[,unlist(numeric_cols),drop=F], use = methodused),3)
  return(R.devices::capturePlot(corrplot::corrplot(cormatrix,
                                                   method = "color",
                                                   outline = T,
                                                   addgrid.col = "darkgray",
                                                   # order="hclust",
                                                   addrect = 4,
                                                   rect.col = "black",
                                                   rect.lwd = 5,
                                                   cl.pos = "b",
                                                   tl.col = "black",
                                                   tl.cex = 1,
                                                   cl.cex = 1.5,
                                                   addCoef.col = "black",
                                                   number.digits = 2,
                                                   number.cex = 0.75,
                                                   type = "lower",
                                                   col = grDevices::colorRampPalette(c("red","white","green"))(200))))
}


##################
# MISC FUNCTIONS #
##################

## Return the column type

#' @name CheckColumnType
#' @title Check for type of column
#' @details Checking for type of columns in the datavector
#' @param dataVector a data vector of a column
#' @return column Type
#' @family Package EDA Utilites functions
#' @examples
#' CheckColumnType(iris$Sepal.Length)
#' @export
CheckColumnType <- function(dataVector) {
  #Check if the column type is "numeric" or "character" & decide type accordDingly
  if (class(dataVector) == "integer" || class(dataVector) == "numeric") {
    columnType <- "numeric"
  } else { columnType <- "character" }
  #Return the result
  return(columnType)
}

## Get numeric and categoric
#' @name getDatatype
#' @title Get Data Type
#' @details Based on the datatype the columns are seperated into categorical and numerical columns
#' @param dataset a dataset which needs to be loaded
#' @return list with \code{numeric_cols} and \code{cat_cols}
#' @family Package EDA Utilites functions
#' @examples getDatatype(iris)
#' @export
getDatatype <- function(dataset){
  numeric_cols <- colnames(dataset)[unlist(sapply(dataset,FUN = function(x){ CheckColumnType(x) == "numeric"}))]
  cat_cols <- colnames(dataset)[unlist(sapply(dataset,FUN = function(x){CheckColumnType(x) == "character"|| CheckColumnType(x) == "factor"}))]
  return(list("numeric_cols"=numeric_cols , "cat_cols"=cat_cols))
}

Any scripts or data that you put into this service are public.

analysisPipelines documentation built on July 1, 2020, 7:09 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

analysisPipelines
Compose Interoperable Analysis Pipelines & Put Them in Production

R/r-batch-eda-utilities.R
In analysisPipelines: Compose Interoperable Analysis Pipelines & Put Them in Production

Defines functions getDatatype CheckColumnType correlationMatPlot bivarPlots multiVarOutlierPlot outlierPlot univarCatDistPlots ignoreCols

Documented in bivarPlots CheckColumnType correlationMatPlot getDatatype ignoreCols multiVarOutlierPlot outlierPlot univarCatDistPlots

Try the analysisPipelines package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

analysisPipelines Compose Interoperable Analysis Pipelines & Put Them in Production

R/r-batch-eda-utilities.R In analysisPipelines: Compose Interoperable Analysis Pipelines & Put Them in Production

Defines functions getDatatype CheckColumnType correlationMatPlot bivarPlots multiVarOutlierPlot outlierPlot univarCatDistPlots ignoreCols

Documented in bivarPlots CheckColumnType correlationMatPlot getDatatype ignoreCols multiVarOutlierPlot outlierPlot univarCatDistPlots

Try the analysisPipelines package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

analysisPipelines
Compose Interoperable Analysis Pipelines & Put Them in Production

R/r-batch-eda-utilities.R
In analysisPipelines: Compose Interoperable Analysis Pipelines & Put Them in Production