Nothing
col_function_type = list(spanning_header="spanning_header",spanning_data="spanning_data",aggregate="aggregate",metadata="metadata",empty="empty",empty_with_header="empty_with_header",data="data")
is_spanning_header = function(i,table_types){
if(table_types[1,i,drop=F]==types$empty ||
!all(which(table_types[,i,drop=F]!=types$empty) %in% which(table_types[,min(i+1,ncol(table_types)),drop=F]!=types$empty))){
#|| any(diff(which(table_types[,i]!=types$empty))==1)
return(F)
}
sum(table_types[,i,drop=F]==types$empty)/(nrow(table_types)) > mean(apply(table_types,2,function(x){sum(x==types$empty)/(nrow(table_types))}))
}
is_spanning_data = function(i,table_types,table){
if(table_types[1,i,drop=F]==types$empty || !any(table_types[,i,drop=F]==types$empty)){
return(F)
}
empty = sapply(2:nrow(table),function(j){table_types[j,i,drop=F]==types$empty})
cell_equality = sapply(2:nrow(table),function(j){sum(table[j,,drop=F]==table[j-1,,drop=F],na.rm=T)})
cell_distance = sapply(2:nrow(table),function(j){sum(RecordLinkage::levenshteinDist(unlist(table[j,,drop=F]),unlist(table[j-1,,drop=F])),na.rm=T)})
# duplications = data.frame(matrix(ncol=3,nrow=nrow(table)-1))
# duplications[[1]] = empty
# duplications[[2]] = cell_equality
# duplications[[3]] = cell_distance
#all(cell_equality[empty]>(mean(cell_equality)+sd(cell_equality)) & cell_distance[empty]<(mean(cell_distance)-sd(cell_distance)))
all(cell_equality[empty]>(mean(cell_equality)) & cell_distance[empty]<(mean(cell_distance)))
}
is_aggregate = function(i,table_types){
if(colnames(table_types)[i]!=types$total){
return(F)
}
#all data fields should be numeric punct or empty
all(table_types[,i,drop=F]==types$numeric || table_types[,i,drop=F]==types$empty || table_types[,i,drop=F]==types$punctuation)
}
is_metadata = function(i,table_types){
(colnames(table_types)[i]==types$empty | colnames(table_types)[i]==types$punctuation) &
sum(colnames(table_types)==types$empty | colnames(table_types)==types$punctuation)/ncol(table_types)<0.5 &
sum(table_types[,i,drop=F]==types$empty | table_types[,i,drop=F]==types$punctuation)/nrow(table_types)>0.8
}
is_empty = function(i,table_types){
all(table_types[,i,drop=F]==types$empty | table_types[,i,drop=F]==types$punctuation) && colnames(table_types)[i,drop=F]==types$empty
}
is_empty_with_header = function(i,table_types){
all(table_types[,i,drop=F]==types$empty | table_types[,i,drop=F]==types$punctuation) && colnames(table_types)[i,drop=F]!=types$empty
}
add_hypothesis.col_function_hypotheses = function(hypotheses,confidence,col_functions){
for(i in seq_along(hypotheses)){
if(all(hypotheses[[i]]$col_functions==col_functions)){
hypotheses[[i]]$confidence = hypotheses[[i]]$confidence + confidence
return(hypotheses)
}
}
hypothesis = structure(list(col_functions=col_functions,confidence=confidence),class=c("col_function_hypothesis","hypothesis"))
hypotheses[[length(hypotheses)+1]] = hypothesis
return(hypotheses)
}
detect = function(table,configuration){
if(class(table)[1]!="data.frame") return(stop(paste("Can not handle data type:",class(table)[1])))
#create row function votes
n = ncol(table)
table_types = get_data_types(table)
#list of row function hypotheses and votes
hypotheses = create_hypothesis_list("col_function")
#create first hypothesis
col_functions = rep(col_function_type$data,ncol(table))
for(i in seq_along(table_types)){
if(is_empty(i,table_types)){
col_functions[[i]] = col_function_type$empty
}
if(is_empty_with_header(i,table_types)){
col_functions[[i]] = col_function_type$empty_with_header
}
if(is_aggregate(i,table_types)){
col_functions[[i]] = col_function_type$aggregate
}
}
hypotheses = add_hypothesis(hypotheses,confidence=1,col_functions=col_functions)
#create second hypothesis (with spanning header, spanning_data and metadata)
col_functions = rep(col_function_type$data,ncol(table))
for(i in seq_along(table_types)){
if(is_spanning_header(i,table_types) && ifelse(i>1,!any(col_functions[1:(i-1)]==col_function_type$data),T)){
col_functions[[i]] = col_function_type$spanning_header
}
if(configuration$interpolate_spanning_column_data_cells && is_spanning_data(i,table_types,table)){
col_functions[[i]] = col_function_type$spanning_data
}
if(is_metadata(i,table_types)){
col_functions[[i]] = col_function_type$metadata
}
if(is_empty(i,table_types)){
col_functions[[i]] = col_function_type$empty
}
if(is_empty_with_header(i,table_types)){
col_functions[[i]] = col_function_type$empty_with_header
}
}
hypotheses = add_hypothesis(hypotheses,confidence=1,col_functions=col_functions)
hypotheses = normalize_confidence(hypotheses)
return(hypotheses)
}
get_desc = function(hypothesis){
return(paste("non-data columns:",paste(which(hypothesis$col_functions!=col_function_type$data),collapse=",")))
}
parse = function(table, hypothesis, errorHandler, configuration){
col_functions = hypothesis$col_functions
result = list()
if(length(col_functions)!=ncol(table)){
result$error= "Number of column functions is not equal not number of columns."
return(list(result))
}
remove = c()
result$edits = 0
for(i in seq_along(hypothesis$col_functions)){
if(hypothesis$col_functions[i]==col_function_type$spanning_header && configuration$interpolate_spanning_column_header_cells ||
(hypothesis$col_functions[i]==col_function_type$spanning_data && configuration$interpolate_spanning_column_data_cells)){
last = ""
for(j in 1:nrow(table[,i,drop=F])){
if(table[j,i,drop=F]!=""){
last = table[j,i,drop=F]
}else{
table[j,i] = last
result$edits = result$edits + 1
}
}
}
if((hypothesis$col_functions[i]==col_function_type$aggregate && configuration$remove_aggregates) ||
hypothesis$col_functions[i]==col_function_type$metadata ||
hypothesis$col_functions[i]==col_function_type$empty ||
(hypothesis$col_functions[i]==col_function_type$empty_with_header && configuration$remove_named_empty_cols)){
result$metadata = c(result$metadata,unlist(apply(table[,remove,drop=F],2,function(x){paste(x[x!=""])})))
remove = c(remove,i)
}
}
if(length(remove)==ncol(table)){
table = NULL
}else if(length(remove)>0){
table = table[,-remove,drop=F]
}
result$intermediate = table
return(list(result))
}
register_parsing_step(level="col_function",detect,parse,get_desc)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.