# R/final_whole.R

#' Screen a data set for common data-quality problems
#'
#' Coerces `x` to a data frame and runs five checks, showing each tabular
#' result with `View()` and each chart with `print()`:
#'
#' 1. cells whose string length is rare within their column,
#' 2. cells whose guessed data type differs from the type guessed for the
#'    whole column,
#' 3. missing cells and cells holding a common "null" placeholder,
#' 4. a pairs plot of the non-character columns and one bar chart per
#'    character column,
#' 5. cells whose first character does not follow the column's dominant
#'    capitalization.
#'
#' The function calls `str_length()`, `str_sub()`, `str_to_lower()`,
#' `str_locate()`, `guess_parser()`, `ggplot()`/`aes()`/`geom_bar()`/`labs()`
#' and `ggpairs()` unqualified, so the packages providing them must already be
#' attached or imported.
#'
#' @param x Object coercible with `as.data.frame()`.
#' @param rfreq Threshold for check 1: a string length is reported when its
#'   relative frequency within the column is strictly below `rfreq`.
#' @return The value of the final `View()` call; the function is used for its
#'   side effects (viewer tabs and plots).
Betsy_Addin <- function(x, rfreq) {
  x <- as.data.frame(x)

  n_rows <- nrow(x)
  n_cols <- ncol(x)
  colname <- colnames(x)
  grid_dim <- c(n_rows, n_cols)

  # Apply `f` to every column and arrange the results as an n_rows x n_cols
  # array. as.vector(mode = ) turns the NULL that unlist() yields for a
  # zero-column input into a typed zero-length vector.
  per_cell <- function(f, mode) {
    array(
      as.vector(unlist(lapply(x, f), use.names = FALSE), mode = mode),
      dim = grid_dim
    )
  }

  # Character rendering of every cell (factor labels rather than codes) and
  # the matching missing-value mask, both computed from the input itself.
  cell_text <- per_cell(as.character, "character")
  is_missing <- per_cell(is.na, "logical")

  # Build a report with one line per TRUE cell of `mask` (column-major order):
  # row index, column index, column name and cell text, plus the matching
  # entries of `extra` when supplied. `labels` gives the output column names.
  flagged_cells <- function(mask, labels, extra = NULL) {
    hits <- which(mask, arr.ind = TRUE)
    report <- data.frame(
      row = unname(hits[, 1]),
      col = unname(hits[, 2]),
      colname = colname[hits[, 2]],
      value = cell_text[hits],
      stringsAsFactors = FALSE
    )
    if (!is.null(extra)) {
      report[["extra"]] <- extra[hits]
    }
    names(report) <- labels
    report
  }

  # 1. Rare string lengths ----------------------------------------------------

  str_len <- array(str_length(as.vector(cell_text)), dim = grid_dim)
  rare_length <- array(FALSE, dim = grid_dim)

  for (j in seq_len(n_cols)) {
    # table() ignores NA, so frequencies are relative to the non-missing cells.
    counts <- table(str_len[, j])
    rel_freq <- as.vector(counts) / sum(counts)
    rare <- as.numeric(names(counts))[rel_freq < rfreq]
    # %in% never matches an NA length, so missing cells are not reported here,
    # and every cell with a rare length is kept (not just the last one).
    rare_length[, j] <- str_len[, j] %in% rare
  }

  String_Lengths_Summary <- flagged_cells(
    rare_length,
    c("row", "col", "colname", "value", "string length"),
    extra = str_len
  )
  View(String_Lengths_Summary)

  # 2. Cells whose guessed type differs from their column's ---------------------

  # guess_parser() is given text explicitly rather than relying on it to
  # coerce numeric or factor input.
  col_class <- vapply(
    seq_len(n_cols),
    function(j) guess_parser(cell_text[, j]),
    character(1)
  )

  # The guess for a single cell depends only on its text, so each distinct
  # value is parsed once and the result is mapped back onto the grid.
  cell_class <- array(NA_character_, dim = grid_dim)
  known_text <- cell_text[!is_missing]
  distinct_text <- unique(known_text)
  distinct_class <- vapply(
    distinct_text, guess_parser, character(1),
    USE.NAMES = FALSE
  )
  cell_class[!is_missing] <- distinct_class[match(known_text, distinct_text)]

  wrong_type <- !is.na(cell_class) & cell_class != col_class[col(cell_class)]

  Data_Types_Summary <- flagged_cells(
    wrong_type,
    c("row", "col", "colname", "value", "parsed_type"),
    extra = cell_class
  )
  View(Data_Types_Summary)

  # 3. Missing values and null placeholders ------------------------------------

  # Values commonly used to encode "no data".
  numeric_null_list <- c(999, 9999, 99999, 999999, 99998, 9998, 998)
  string_null_list <- c("null", "na", "n/a", "n\\a", "nul")
  character_null_list <- c(".", "*")

  # %in% compares the cell text with the character form of the numeric codes.
  possible_null <- array(
    cell_text %in% numeric_null_list |
      str_to_lower(as.vector(cell_text)) %in% string_null_list |
      cell_text %in% character_null_list,
    dim = grid_dim
  )

  NA_Values_Summary <- flagged_cells(
    is_missing | possible_null,
    c("Row", "Column", "Column Name", "Value")
  )

  # Per-column count of truly missing cells; `percentage` is the count divided
  # by the number of rows, i.e. a fraction in [0, 1], not multiplied by 100.
  null_count <- colSums(is_missing)
  na_share <- data.frame(
    count = null_count,
    percentage = null_count / n_rows,
    var = colname,
    stringsAsFactors = FALSE
  )

  View(NA_Values_Summary)
  print(
    ggplot(na_share, aes(var, percentage)) +
      geom_bar(stat = "identity") +
      labs(title = "Percentage of NA values per Column")
  )

  # 4. Distribution plots -------------------------------------------------------

  char_cols <- which(col_class == "character")
  other_cols <- which(col_class != "character")

  # Non-character columns are collected in one matrix, so they share a single
  # storage type once converted back to a data frame.
  numeric_matrix <- matrix(data = NA, nrow = n_rows, ncol = length(other_cols))
  for (i in seq_along(other_cols)) {
    numeric_matrix[, i] <- x[, other_cols[[i]]]
  }
  numeric_df <- as.data.frame(numeric_matrix)

  # Pairs plot of the non-character columns; skipped when there are none.
  if (length(other_cols) > 0) {
    print(ggpairs(
      numeric_df,
      title = "Generalied Pairs Plots for Numeric Variables",
      columnLabels = colname[other_cols]
    ))
  }

  # One bar chart of value counts per character column.
  for (j in char_cols) {
    plot_df <- data.frame(value = cell_text[, j], stringsAsFactors = FALSE)
    print(
      ggplot(plot_df) +
        geom_bar(aes(x = value)) +
        labs(
          title = paste(
            "Distribution of Character Variable named '", colname[[j]], "'"
          ),
          x = colname[[j]]
        )
    )
  }

  # 5. Capitalization of the first character -----------------------------------

  # Only the columns guessed as "character" are inspected.
  char_text <- cell_text[, char_cols, drop = FALSE]
  first_letter <- str_sub(as.vector(char_text), start = 1, end = 1)

  # A cell counts as upper case when its first character matches [A-Z]; any
  # other non-missing first character (including digits, punctuation and the
  # empty string) counts as lower case.
  has_letter <- array(!is.na(first_letter), dim = dim(char_text))
  is_upper <- array(
    !is.na(str_locate(first_letter, "[A-Z]")[, 1]),
    dim = dim(char_text)
  )

  # Upper case is the column norm unless lower case is strictly more common.
  upper_is_norm <- colSums(is_upper) >= colSums(has_letter & !is_upper)
  off_norm <- has_letter & (is_upper != upper_is_norm[col(is_upper)])

  # Map the result back onto the full grid so the reported column index,
  # column name and value all refer to the columns of `x`.
  miscapitalized <- array(FALSE, dim = grid_dim)
  miscapitalized[, char_cols] <- off_norm

  Capitalization_Summary <- flagged_cells(
    miscapitalized,
    c("Row", "Column", "Column Name", "Value")
  )

  View(Capitalization_Summary)
}
# tcassidey/Betsy documentation built on May 31, 2019, 7:28 a.m.