#'This function is used to check some common errors of a raw dataset and return a suitable dataset for feeding in the imputer later on. Note that this would not guarantee the dataset is fully cleaned.
#' @param rawdata A data frame. (a raw dataset)
#' @param levels.tol Tolerant proportion of the number of levels to the number of observations in a dataset. Default: 0.2
#' @export
data_clean<-function(rawdata,levels.tol=0.2){
names<-colnames(rawdata)
if(!is.data.frame(rawdata)){
warning("rawdata is not a data frame. It is now converted to a data frame instead.")
rawdata=as.data.frame(rawdata)
}
#location of NaN in numeric columns only
nan.loc<-do.call(cbind,lapply(rawdata,is.nan))
if(any(nan.loc)){
col.idx=which(nan.loc,arr.ind = T)[,2]
nan.col=names[unique(col.idx)]
msg1=paste("There exists at least one entry coded as NaN in the following numeric variable(s): ", paste(nan.col,collapse=";"),
".",sep="")
msg2=paste("It is now coverted to NA instead.")
warning(paste(msg1,msg2,sep="\n"))
#replace NaN in numeric columns with NA
rawdata[nan.loc]<-NA
}
if(any(rawdata=="NaN",na.rm=TRUE)){
col.idx=which(rawdata=="NaN",arr.ind=T)[,2]
nan.col=names[unique(col.idx)]
msg1=paste("There exists at least one entry coded as NaN in the following factor or character variable(s): ", paste(nan.col,collapse=";"),
".",sep="")
msg2=paste("It is now coverted to NA instead.")
warning(paste(msg1,msg2,sep="\n"))
#replace NaN in charater/factor columns with NA
rawdata[rawdata=="NaN"]<-NA
#drop NaN level from factor variables
rawdata=droplevels(rawdata)
}
inf.loc<-do.call(cbind,lapply(rawdata,is.infinite))
if(any(inf.loc)){
col.idx=which(inf.loc,arr.ind = T)[,2]
inf.col=names[unique(col.idx)]
msg1=paste("There exists at least one entry coded as Inf or -Inf in the following variable(s): ", paste(inf.col,collapse=";"),
".",sep="")
msg2=paste("It is now coverted to NA instead.")
warning(paste(msg1,msg2,sep="\n"))
#replace Inf or -Inf in numeric columns with NA
rawdata[inf.loc]<-NA
}
if(any(rawdata=="Inf",na.rm=TRUE) | any(rawdata=="-Inf",na.rm=TRUE)){
col.idx=c(which(rawdata=="Inf",arr.ind=T)[,2],which(rawdata=="-Inf",arr.ind=T)[,2])
inf.col=names[unique(col.idx)]
msg1=paste("There exists at least one entry coded as Inf or -Inf in the following factor or character variable(s): ", paste(inf.col,collapse=";"),
".",sep="")
msg2=paste("It is now coverted to NA instead.")
warning(paste(msg1,msg2,sep="\n"))
#replace Inf or -Inf in charater/factor columns with NA
rawdata[rawdata=="Inf"]<-NA
rawdata[rawdata=="-Inf"]<-NA
#drop NaN level from factor variables
rawdata=droplevels(rawdata)
}
if(any(rawdata=="")){
col.idx=which(rawdata=="",arr.ind=T)[,2]
empty.col=names[unique(col.idx)]
msg1=paste("There exists at least one entry coded as empty cell \"\" in the following variable(s): ", paste(empty.col,collapse=";"),
".",sep="")
msg2=paste("Empty cells are now coverted to NA instead.")
warning(paste(msg1,msg2,sep="\n"))
#replace NaN in charater/factor columns with NA
rawdata[rawdata==""]<-NA
#drop NaN level from factor variables
rawdata=droplevels(rawdata)
}
var.type<-sapply(rawdata,class)
if(any(var.type=="character")){
idx=which(var.type=="character")
chr.col=names[idx]
if(length(chr.col)==1){
msg1=paste("The following variable is of character type: ", paste(chr.col),
".",sep="")
msg2=paste("It is now converted to factor instead.")
}else{
msg1=paste("The following variables are of character type: ", paste(chr.col,collapse=";"),
".",sep="")
msg2=paste("They are now converted to factor instead.")
}
warning(paste(msg1,msg2,sep="\n"))
rawdata<-as.data.frame(unclass(rawdata),stringsAsFactors = TRUE)
}
idx<-which(sapply(rawdata, nlevels) == 1)
if(length(idx)>=1){
single.col=names[idx]
msg1=paste("Factor variable(s) with only one level: ", paste(single.col,collapse=";"),
".",sep="")
msg2=paste("A factor must has at least two levels. Any factor with only one level is now removed from the dataset. ")
warning(paste(msg1,msg2,sep="\n"))
rawdata<-rawdata[, -idx]
}
num.levels<-sapply(rawdata, nlevels)
num.row=nrow(rawdata)
num.tol=round(num.row*levels.tol)
idx<- which(num.levels>num.tol)
if(length(idx)>0){
toomany.col=names[idx]
msg1=paste("The number of observations in this dataset is ", paste(num.row,".",sep=""),sep="")
msg2=paste("However, some factor variables have more than ",paste(num.tol,"levels."),sep="")
msg3=paste("Please check the following variables and consider removing them from the dataset: ", paste(toomany.col,collapse=";"),
".",sep="")
warning(paste(msg1,msg2,msg3,sep="\n"))
}
rawdata
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.