# Betsy_Addin: run a series of data-quality checks on a data set and show
# the results in the data viewer and as plots.
#
# Arguments:
#   x      Data set to inspect; it is coerced with as.data.frame().
#   rfreq  Single number. A string length whose relative frequency within
#          its column is strictly below this threshold is reported as an
#          outlier.
#
# Side effects, in order:
#   1. View(String_Lengths_Summary)  - cells whose string length is rare.
#   2. View(Data_Types_Summary)      - cells whose guessed parser type
#                                      differs from the type guessed for the
#                                      whole column.
#   3. View(NA_Values_Summary)       - NA cells and placeholder nulls
#                                      (999, "n/a", ".", ...), followed by a
#                                      bar chart of the NA share per column.
#   4. A ggpairs() plot of the non-character columns and one bar chart per
#      character column.
#   5. View(Capitalization_Summary)  - cells whose first letter's case
#                                      differs from the column majority.
#
# Value: whatever the final View() call returns; the function is called for
# its side effects.
#
# NOTE(review): readr documents guess_parser() as taking a character vector;
# confirm that callers pass character columns (e.g. data read as text).
Betsy_Addin <- function(x, rfreq) {
  stopifnot(is.numeric(rfreq), length(rfreq) == 1L, !is.na(rfreq))

  x <- as.data.frame(x)
  col_names <- colnames(x)

  # ---- 1. String-length outliers ----
  String_Lengths_Summary <- .betsy_string_length_outliers(x, rfreq)
  View(String_Lengths_Summary)

  # ---- 2. Data-type outliers ----
  # Parser type guessed for every column as a whole; reused by steps 4 and 5.
  col_types <- vapply(
    seq_len(ncol(x)),
    function(j) guess_parser(x[[j]]),
    character(1)
  )
  Data_Types_Summary <- .betsy_type_outliers(x, col_types)
  View(Data_Types_Summary)

  # ---- 3. Missing values and placeholder nulls ----
  flags <- .betsy_null_flags(x)
  null_idx <- which(flags$possible | flags$actual, arr.ind = TRUE)
  NA_Values_Summary <- matrix(
    c(
      null_idx[, 1],
      null_idx[, 2],
      col_names[null_idx[, 2]],
      x[null_idx]
    ),
    ncol = 4,
    dimnames = list(NULL, c("Row", "Column", "Column Name", "Value"))
  )
  # Only true NA cells count towards the per-column percentage.
  na_count <- colSums(flags$actual)
  null_rates <- data.frame(
    count = na_count,
    percentage = na_count / nrow(x),
    var = col_names,
    stringsAsFactors = FALSE
  )
  View(NA_Values_Summary)
  print(
    ggplot(null_rates, aes(var, percentage)) +
      geom_bar(stat = "identity") +
      labs(title = "Percentage of NA values per Column")
  )

  # ---- 4. Plots for non-character and character columns ----
  char_cols <- which(col_types == "character")
  other_cols <- which(col_types != "character")

  # Columns are copied into plain matrices, so every column of a matrix is
  # coerced to one common type before plotting.
  character_matrix <- matrix(NA, nrow = nrow(x), ncol = length(char_cols))
  for (i in seq_along(char_cols)) {
    character_matrix[, i] <- as.character(x[[char_cols[[i]]]])
  }
  numeric_matrix <- matrix(NA, nrow = nrow(x), ncol = length(other_cols))
  for (i in seq_along(other_cols)) {
    numeric_matrix[, i] <- x[[other_cols[[i]]]]
  }

  character_df <- as.data.frame(character_matrix)
  colnames(character_df) <- col_names[char_cols]
  numeric_df <- as.data.frame(numeric_matrix)

  # Skip the pairs plot when there is nothing to plot.
  if (ncol(numeric_df) > 0) {
    print(ggpairs(
      numeric_df,
      title = "Generalied Pairs Plots for Numeric Variables",
      columnLabels = col_names[other_cols]
    ))
  }

  col_vec <- col_names[char_cols]
  for (i in seq_along(char_cols)) {
    print(
      ggplot(character_df) +
        geom_bar(aes(x = character_df[, i])) +
        labs(
          title = paste(
            "Distribution of Character Variable named '", col_vec[[i]], "'"
          ),
          x = col_vec[[i]]
        )
    )
  }

  # ---- 5. Capitalization outliers (character columns only) ----
  cap_idx <- which(
    .betsy_capitalization_mismatch(character_matrix),
    arr.ind = TRUE
  )
  # Translate positions within the character-only matrix back to columns
  # of x, so the reported column number and name refer to the input data.
  cap_cols <- char_cols[cap_idx[, 2]]
  Capitalization_Summary <- as.data.frame(
    matrix(
      c(
        cap_idx[, 1],
        cap_cols,
        col_names[cap_cols],
        character_matrix[cap_idx]
      ),
      ncol = 4
    ),
    stringsAsFactors = FALSE
  )
  colnames(Capitalization_Summary) <- c(
    "Row", "Column", "Column Name", "Value"
  )
  View(Capitalization_Summary)
}

# Build the table of cells whose string length is rare within their column.
# A length is rare when (cells with that length) / (non-NA cells in the
# column) is below rfreq. Every cell with a rare length is listed; NA cells
# are never listed.
.betsy_string_length_outliers <- function(x, rfreq) {
  col_names <- colnames(x)
  empty <- data.frame(
    row = integer(0),
    col = integer(0),
    colname = character(0),
    value = character(0),
    string_length = integer(0),
    stringsAsFactors = FALSE
  )
  per_column <- lapply(seq_len(ncol(x)), function(j) {
    values <- x[[j]]
    lens <- str_length(values)
    counts <- table(lens)
    rel_freq <- as.vector(counts) / sum(counts)
    rare_lens <- as.numeric(names(counts))[rel_freq < rfreq]
    rows <- which(lens %in% rare_lens)
    # Order by length first, then by row, within each column.
    rows <- rows[order(lens[rows], rows)]
    data.frame(
      row = rows,
      col = rep(j, length(rows)),
      colname = rep(col_names[[j]], length(rows)),
      value = as.character(values[rows]),
      string_length = lens[rows],
      stringsAsFactors = FALSE
    )
  })
  out <- do.call(rbind, c(list(empty), per_column))
  rownames(out) <- NULL
  colnames(out) <- c("row", "col", "colname", "value", "string length")
  out
}

# Build the table of cells whose individually guessed parser type differs
# from the type guessed for the whole column (col_types, one per column).
# NA cells are not parsed and are never listed.
.betsy_type_outliers <- function(x, col_types) {
  col_names <- colnames(x)
  empty <- data.frame(
    row = integer(0),
    col = integer(0),
    colname = character(0),
    value = character(0),
    parsed_type = character(0),
    stringsAsFactors = FALSE
  )
  per_column <- lapply(seq_len(ncol(x)), function(j) {
    values <- x[[j]]
    cell_types <- vapply(
      seq_len(nrow(x)),
      function(i) {
        if (is.na(values[i])) NA_character_ else guess_parser(values[i])
      },
      character(1)
    )
    # Comparisons involving NA yield NA, which which() drops.
    rows <- which(cell_types != col_types[[j]])
    data.frame(
      row = rows,
      col = rep(j, length(rows)),
      colname = rep(col_names[[j]], length(rows)),
      value = as.character(values[rows]),
      parsed_type = cell_types[rows],
      stringsAsFactors = FALSE
    )
  })
  out <- do.call(rbind, c(list(empty), per_column))
  rownames(out) <- NULL
  colnames(out) <- c("row", "col", "colname", "value", "parsed_type")
  out
}

# Flag NA cells ("actual") and non-NA cells holding a common placeholder
# for a missing value ("possible"). Returns a list of two logical matrices
# with the same dimensions as x.
.betsy_null_flags <- function(x) {
  numeric_null_list <- c(999, 9999, 99999, 999999, 99998, 9998, 998)
  # "n\\a" is the three characters n, backslash, a.
  string_null_list <- c("null", "na", "n/a", "n\\a", "nul")
  character_null_list <- c(".", "*")

  actual <- matrix(FALSE, nrow = nrow(x), ncol = ncol(x))
  possible <- matrix(FALSE, nrow = nrow(x), ncol = ncol(x))
  for (j in seq_len(ncol(x))) {
    values <- x[[j]]
    is_missing <- is.na(values)
    is_placeholder <- values %in% numeric_null_list |
      str_to_lower(values) %in% string_null_list |
      values %in% character_null_list
    actual[, j] <- is_missing
    possible[, j] <- is_placeholder & !is_missing
  }
  list(actual = actual, possible = possible)
}

# For each column of a character matrix, find the cells whose first
# character disagrees with the column's majority case. A first character
# matching [A-Z] counts as upper case, any other non-NA first character as
# lower case; ties go to upper case. NA cells are never flagged. Returns a
# logical matrix with the same dimensions as character_matrix.
.betsy_capitalization_mismatch <- function(character_matrix) {
  mismatch <- matrix(
    FALSE,
    nrow = nrow(character_matrix),
    ncol = ncol(character_matrix)
  )
  for (j in seq_len(ncol(character_matrix))) {
    first_letter <- str_sub(character_matrix[, j], start = 1, end = 1)
    present <- !is.na(first_letter)
    # str_locate() gives an NA start when the pattern is absent or the
    # input is NA, so is_upper is FALSE for both.
    is_upper <- !is.na(str_locate(first_letter, "[A-Z]")[, 1])
    n_upper <- sum(is_upper)
    n_lower <- sum(present & !is_upper)
    majority_upper <- !(n_lower > n_upper)
    mismatch[, j] <- present & (is_upper != majority_upper)
  }
  mismatch
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.