# R/explore.R

#' Generating pie charts based on frequency of elements in all qualitative features
#' 
#' This function develops pie charts of frequency of all Qualitative Variables
#' 
#' @param lsVS is a list generated by verticalSplit function in this same package
#' @return a list of pie charts
#' @author Saurabh
#' @details
#' The input to this function is output of verticalSplit function in this package
#' It outputs a list containing pie chart of frequency of all qualitative variables. 
#' Output must be stored in a variable.
#' And you could access each single pie chart as outputList[1] or outputList[2]
#' @import rCharts,reshape2
#' 
#library(plyr)
############################## External function
piePlots <- function(lsVS) {
  # Builds one pie chart per frequency table returned by runFacsummary().
  #
  # lsVS: list passed straight to runFacsummary() (documented as the output
  #       of verticalSplit()).
  # Returns a flat list of hPlot() chart objects, one per frequency table,
  # and an empty list when runFacsummary() yields no tables.

  # The frequency tables do not change between iterations, so compute them
  # once instead of once per chart.
  freq_tables <- runFacsummary(lsVS)

  # seq_along() gives an empty sequence for zero tables; 1:length(x) would
  # iterate over c(1, 0) and fail.
  charts <- lapply(seq_along(freq_tables), function(i) {
    # Single-bracket indexing keeps the one-element list, which is the form
    # the table has always been handed to data.frame() in.
    counts <- data.frame(freq_tables[i])
    names(counts) <- c("Var", "Freq")
    # hPlot() is presumably provided by rCharts (imported in the roxygen
    # header above) -- NOTE(review): confirm.
    hPlot(x = "Var", y = "Freq", data = counts, type = "pie")
  })

  charts
}

#######################################BAR CHRT
##################################################### External function
#' Generating bar plots based on frequency of elements in all qualitative features
#' 
#' This function develops bar plots of frequency of all Qualitative Variables
#' 
#' @param lsVS is a list generated by verticalSplit function in this same package
#' @return a list of bar plots
#' @author Saurabh
#' @details
#' The input to this function is output of verticalSplit function in this package
#' It outputs a list containing bar plots of frequency of all qualitative variables. 
#' Output must be stored in a variable.
#' And you could access each single bar plots as outputList[1] or outputList[2]
#' @import rCharts,reshape2
#'

barPlots <- function(lsVS) {
  # Builds one bar chart per frequency table returned by runFacsummary().
  #
  # lsVS: list passed straight to runFacsummary() (documented as the output
  #       of verticalSplit()).
  # Returns a list with one entry per frequency table; each entry is itself
  # a one-element list holding the mPlot() chart object. An empty list is
  # returned when runFacsummary() yields no tables.

  # The frequency tables do not change between iterations, so compute them
  # once instead of once per chart.
  freq_tables <- runFacsummary(lsVS)

  # seq_along() gives an empty sequence for zero tables; 1:length(x) would
  # iterate over c(1, 0) and fail.
  charts <- lapply(seq_along(freq_tables), function(i) {
    # Single-bracket indexing keeps the one-element list, which is the form
    # the table has always been handed to data.frame() in.
    counts <- data.frame(freq_tables[i])
    names(counts) <- c("Var", "Freq")
    # mPlot() is presumably provided by rCharts (imported in the roxygen
    # header above) -- NOTE(review): confirm.
    p1 <- mPlot(
      x = "Var", y = list("Freq"), data = counts, type = "Bar",
      labels = list("Count")
    )
    p1$set(hideHover = "auto")
    # Existing callers receive each chart wrapped in its own list; keep that
    # shape.
    list(p1)
  })

  charts
}
############################           Internal function
#'
#' Calculating coefficient of variation
#' 
#' 
#' This function calculates the coefficient of variation (standard deviation divided by mean) of a quantitative variable
#' 
#' @param var any numeric vector
#' @return Numeric coefficient of variation
#' @author Saurabh
#' @details
#' It's an internal function

CoefV <- function(var) {
  # Ratio of the standard deviation to the mean; missing values are dropped
  # from both before dividing.
  spread <- sd(var, na.rm = TRUE)
  centre <- mean(var, na.rm = TRUE)
  spread / centre
}
#################################################  Internal function
#'
#' Univariate exploration of all 
#' Quantitative Variables in a Dataset
#' 
#' This function calculates various statistics of the quantitative variables in your data.
#' The function calculates coef. of variation, min, max, standard deviation, variance,
#' various quantiles, mean, median, skewness and kurtosis; it also counts the NA's and the number of unique values.
#' @param df a data frame (your dataset)
#' @return rbound values, can be efficiently converted to data frame
#' @author Saurabh
#' @details
#' It's an internal function
#' @import moments
#############################################################
# CarIntT<-function(df){
#   names<-names(df)
#   dfin<-1:dim(df)[1]
#   dfot<-1:dim(df)[1]
#   typ<-sapply(df, class)
#   typ<-as.character(typ)
#   ##integer df
#   for(i in 1:length(names)){
#     if(typ[i]=="integer"||typ[i]=="numeric"){
#       dfin<-cbind(dfin,df[i])
#     }else{
#       dfot<-cbind(dfot,df[i])
#     }
#   }
# }
####################################### Internal function
mysummary <- function(df) {
  # Univariate statistics for every quantitative column of `df`.
  #
  # df: a data frame.
  # Returns a numeric matrix with one column per quantitative variable and
  # one row per statistic, in this order: CoefVmy, minmy, maxmy, sdmy, varmy,
  # q0my, q25my, q50my, q75my, q90my, q95my, q100my, meanmy, medianmy,
  # skewnessmy, kurtosismy, naChk, numUnq. When `df` has no quantitative
  # column, a matrix with those 18 rows and zero columns is returned, so the
  # result can still be cbind()-ed with other reports.

  # A column is quantitative only when its class is exactly "integer" or
  # "numeric"; columns carrying several classes are left out.
  exact_class_is_quant <- function(col) {
    cls <- class(col)
    length(cls) == 1L && cls %in% c("integer", "numeric")
  }
  is_quant <- vapply(df, exact_class_is_quant, logical(1))
  dfin <- data.frame(df[is_quant])

  row_labels <- c(
    "CoefVmy", "minmy", "maxmy", "sdmy", "varmy", "q0my", "q25my", "q50my",
    "q75my", "q90my", "q95my", "q100my", "meanmy", "medianmy", "skewnessmy",
    "kurtosismy", "naChk", "numUnq"
  )

  # Without this guard the statistics below would be computed on an empty
  # list and round() would fail.
  if (ncol(dfin) == 0L) {
    return(matrix(
      numeric(0),
      nrow = length(row_labels), ncol = 0L,
      dimnames = list(row_labels, NULL)
    ))
  }

  # Apply `fun` to every quantitative column and round the result. vapply()
  # guarantees exactly one number per column, whatever the data look like.
  per_col <- function(fun, ..., digits = 3) {
    round(vapply(dfin, fun, numeric(1), ...), digits)
  }

  # skewness() and kurtosis() are presumably provided by the moments package
  # (imported in the roxygen header) -- NOTE(review): confirm.
  report <- rbind(
    CoefVmy = per_col(CoefV, digits = 4),
    minmy = per_col(min, na.rm = TRUE),
    maxmy = per_col(max, na.rm = TRUE),
    sdmy = per_col(sd, na.rm = TRUE),
    varmy = per_col(var, na.rm = TRUE),
    q0my = per_col(quantile, probs = 0, na.rm = TRUE),
    q25my = per_col(quantile, probs = 0.25, na.rm = TRUE),
    q50my = per_col(quantile, probs = 0.5, na.rm = TRUE),
    q75my = per_col(quantile, probs = 0.75, na.rm = TRUE),
    q90my = per_col(quantile, probs = 0.9, na.rm = TRUE),
    q95my = per_col(quantile, probs = 0.95, na.rm = TRUE),
    q100my = per_col(quantile, probs = 1, na.rm = TRUE),
    meanmy = per_col(mean, na.rm = TRUE),
    medianmy = per_col(median, na.rm = TRUE),
    skewnessmy = per_col(skewness, na.rm = TRUE),
    kurtosismy = per_col(kurtosis, na.rm = TRUE),
    # Counted per column directly: going through sapply() collapsed the NA
    # counts into a single total for one-row data.
    naChk = vapply(dfin, function(col) sum(is.na(col)), numeric(1)),
    # Counted per column directly: sapply(dfin, unique) turns into a matrix
    # when every column has the same number of unique values, and the
    # follow-up length() call then reported 1 for every column.
    numUnq = vapply(dfin, function(col) length(unique(col)), numeric(1))
  )
  report
}

###############################################Internal function
#'
#' Univariate exploration of all 
#' Qualitative Variables in a Dataset
#' 
#' This function calculates the frequency of various qualitative values
#' in the data 
#' @param df a data frame (your dataset)
#' @return List of frequency of all internal values of all qualitative values
#' @author Saurabh
#' @details
#' Its an internal function
#' 
facsummary <- function(df) {
  # Frequency tables for every qualitative column of `df`.
  #
  # df: a data frame.
  # Returns a named list with one table() result per column whose class is
  # not exactly "integer" or "numeric"; an empty list when there is no such
  # column.

  # Same rule as before: only a single class of "integer" or "numeric"
  # counts as quantitative, everything else is tabulated.
  exact_class_is_quant <- function(col) {
    cls <- class(col)
    length(cls) == 1L && cls %in% c("integer", "numeric")
  }
  is_quant <- vapply(df, exact_class_is_quant, logical(1))
  dfot <- data.frame(df[!is_quant])

  # lapply() always returns a list. sapply() silently simplified the result
  # to a matrix (or a vector) whenever all tables had the same length, which
  # broke every caller that expects one list element per variable.
  lapply(dfot, table)
}

###################################### External function I/P from vertical split
#'
#' Executes mysummary function on complete list output by verticalSplit function
#' 
#' This function calculates various statistics of the quantitative variables in your data.
#' The function calculates coef. of variation, min, max, standard deviation, variance,
#' various quantiles, mean, median, skewness and kurtosis; it also counts the NA's and the number of unique values.
#' @param lsVS a list of dataframes produced by verticalSplit function
#' @return rbound values, can be efficiently converted to data frame
#' @author Saurabh
#' @details
#' It's an internal function
#' @import moments,plyr
runSummary <- function(lsVS) {
  # Runs mysummary() on every data frame in `lsVS` and binds the per-chunk
  # reports side by side.
  #
  # lsVS: list of data frames (documented as the output of verticalSplit()).
  # Returns the column-bound mysummary() reports; for a one-element list this
  # is just that element's report. An empty list yields NULL.

  # Iterate over however many chunks were supplied. The previous fixed loop
  # over 1:5 failed for two to four chunks and ignored any chunk past the
  # fifth.
  reports <- lapply(seq_along(lsVS), function(i) {
    # Single-bracket indexing keeps a one-element list for ldply(), matching
    # the way each chunk has always been converted.
    mysummary(ldply(lsVS[i], data.frame))
  })

  do.call(cbind, reports)
}

## lsVS = List from vertical split
####################################### External Function i/p from Vertical Split
#' Executes facsummary function on complete list output by verticalSplit function
#'  
#' This function calculates the frequency of various qualitative values
#' in the data 
#' @param lsVS a list of dataframes produced by verticalSplit function
#' @return List of frequency of all internal values of all qualitative values
#' @author Saurabh
#' @details
#' Its an internal function
#' @import plyr
runFacsummary <- function(lsVS) {
  # Runs facsummary() on every data frame in `lsVS` and concatenates the
  # per-chunk results into one list.
  #
  # lsVS: list of data frames (documented as the output of verticalSplit()).
  # Returns the concatenated facsummary() results; for a one-element list the
  # single facsummary() result is returned untouched. An empty list yields
  # NULL.

  # Iterate over however many chunks were supplied. The previous fixed loop
  # over 1:5 failed for two to four chunks and ignored any chunk past the
  # fifth.
  reports <- lapply(seq_along(lsVS), function(i) {
    # Single-bracket indexing keeps a one-element list for ldply(), matching
    # the way each chunk has always been converted.
    facsummary(ldply(lsVS[i], data.frame))
  })

  # A lone report is handed back exactly as facsummary() produced it.
  if (length(reports) == 1L) {
    return(reports[[1L]])
  }

  # c() on the reports is what the nested append() calls amounted to.
  do.call(c, reports)
}

#################################  ###########Internal function outputs lsVS
#' Splitting of large datasets to more manageable data frames  
#' 
#' 
#' This function splits a large dataset column-wise into more manageable chunks of data frames
#' and returns the resulting data frames in the form of a list of data frames
#' @param df a data frame (your dataset)
#' @return List of data frames created by dividing original data
#' @author Saurabh
#' @details
#' It's an internal function
verticalSplit <- function(df) {
  # Splits a wide data frame column-wise into five data frames.
  #
  # df: a data frame.
  # Returns list(df) when `df` has 30 named columns or fewer; otherwise a
  # list of five data frames, each holding a contiguous run of the columns.
  col_total <- length(names(df))
  if (!(col_total > 30)) {
    return(list(df))
  }

  # Sorting the column positions' residues modulo 5 gives five runs of
  # near-equal length; split() turns those runs into groups of positions.
  col_pos <- 1:col_total
  group_key <- sort(rank(col_pos) %% 5)
  col_groups <- split(col_pos, group_key)

  lapply(1:5, function(k) {
    df[, as.vector(unlist(col_groups[k]))]
  })
}
############################################################ i/p is o/p of quantVarSumm

#' Generating a scatter plot of the coefficient of variation of all quantitative variables
#'
#' This function generates a scatter plot of the coefficient of variation,
#' with one point per quantitative variable.
#'
#' @param quantSummary is output of quantVarSumm function from explore package
#' @return a Scatter Plot
#' @author Saurabh Jaju
#' @details
#' The first row of \code{quantSummary} (the coefficient of variation of each
#' variable) is plotted against the position of the variable in the summary.
#' Points are coloured by variable name.
#' @export
#' @import rCharts,reshape2
covarianceSPlot <- function(quantSummary) {
  summ <- data.frame(quantSummary)
  # Row 1 of the summary holds the coefficient of variation.
  coef_var <- as.numeric(summ[1, ])
  # The column names index / variance / feature are referenced by the plot
  # call below and therefore stay as they are. seq_along() keeps the index
  # the same length as the values even when the summary has no columns.
  df <- data.frame(
    index = seq_along(coef_var),
    variance = coef_var,
    feature = names(summ)
  )
  rPlot(variance ~ index, data = df, color = "feature", type = "point")
}

################################################ROUGH
# df1 <- ldply(df[1], data.frame)
# df2 <- ldply(df[2], data.frame)
# df3 <- ldply(df[3], data.frame)
# df4 <- ldply(df[4], data.frame)
# df5 <- ldply(df[5], data.frame)
################################################################# External catering functions 
##############User allowed to call these
#' Generates a detailed univariate statistical summary of quantitative data
#' 
#' 
#' This function calculates various statistics of the quantitative features of input dataset
#' 
#' @param df your data frame
#' @return rbound vectors of input data statistics 
#' @author Saurabh Jaju
#' @details
#' The function first divides the dataset into manageable smaller dataframes with \code{verticalSplit}
#' Then calculates summary statistics like
#' Coef. of variance, minimum, maximum, standard deviation, variance,
#' various quantiles, mean, median, skewness, kurtosis with \code{runSummary}
#' it also checks if data contains NA's and number of unique values.
#'    
#' @export
#' @import moments,plyr
#' 

quantVarSumm <- function(df) {
  # Split the columns into chunks, then summarise the chunks.
  runSummary(verticalSplit(df))
}

#' Generates a detailed univariate frequency summary of qualitative features in input dataset
#' 
#' The function first divides the dataset into manageable smaller dataframes with \code{verticalSplit}
#' This function provides frequency count of each unique entry in each qualitative feature with \code{runFacsummary}
#' 
#' @param df your data frame
#' @return list of variable name, unique entry, count(frequency)  
#' @author Saurabh Jaju
#' @details
#' The function first divides the dataset into manageable smaller dataframes
#' Then calculates frequency of all unique values in all qualitative features 
#' And returns it in the form of a list
#'    
#' @export
#' @import plyr
#' 
qualiVarSumm <- function(df) {
  # Split the columns into chunks, then tabulate the chunks.
  runFacsummary(verticalSplit(df))
}

#' Generating BarPlots 
#' 
#' The function first divides the dataset into manageable smaller dataframes with \code{verticalSplit}
#' This function generates BarPlots of frequency of all unique values qualitative variables with \code{barPlots}
#' 
#' @param df your dataframe
#' @return a list of barplots 
#' @author Saurabh Jaju
#' @details
#' It outputs a list containing BarPlots of frequency of all unique values qualitative variables. 
#' Output must be stored in a variable.
#' And you could access each single bar Plot as 
#' outputList<-freqBarPlots(df)
#' outputList[1] or outputList[2]....
#' @export
#' @import rCharts,reshape2
#' 

freqBarPlots <- function(df) {
  # Split the columns into chunks, then hand the chunks to barPlots().
  barPlots(verticalSplit(df))
}

#' Generating PieCharts 
#' The function first divides the dataset into manageable smaller dataframes with \code{verticalSplit}
#' This function generates PieCharts of frequency of all unique values qualitative variables with \code{piePlots}
#' 
#' @param df your dataframe
#' @return a list of PieCharts
#' @author Saurabh Jaju
#' @details
#' It outputs a list containing PieCharts of frequency of all qualitative variables. 
#' Output must be stored in a variable.
#' And you could access each single pie chart as 
#' outputList<-freqPiePlots(df)
#' outputList[1] or outputList[2]....
#' @export
#' @import rCharts,reshape2
#' 
freqPiePlots <- function(df) {
  # Split the columns into chunks, then hand the chunks to piePlots().
  piePlots(verticalSplit(df))
}
# saurabhJaju/explore- documentation built on May 29, 2019, 3:19 p.m.