# Compute the quality features of one extraction result.
#
# Args:
#   result: list-like object. The fields read here are `table` (expected to be
#           a data.frame), `warnings`, `edits`, `moves`, `confidence` and
#           `cells`.
#
# Returns:
#   A named list that always has these ten entries, in this order:
#     warnings        - length of result$warnings
#     edits, moves    - copied from the result (NA when the field is absent)
#     confidence      - sum of result$confidence, ignoring NA
#     total_cells     - result$cells (NA when the field is absent)
#     typed_cells     - cells in non-character columns that are neither NA
#                       nor the empty string
#     empty_header    - number of columns whose name is ""
#     empty_cells     - number of cells that are NA or the empty string
#     non_latin_chars - number of substitution markers iconv inserts when the
#                       pasted table is converted from utf8 to latin1
#     row_col_ratio   - 1 when the table has more rows than columns, else 0
#   Every entry is NA when the result has no table or the table has no rows.
#   Otherwise each entry is clamped to be at least 0.
get_result_features = function(result){
  features = list(warnings=NA,edits=NA,moves=NA,confidence=NA,total_cells=NA,typed_cells=NA,empty_header=NA,empty_cells=NA,non_latin_chars=NA,row_col_ratio=NA)
  table = result$table
  if(is.null(table) || nrow(table)==0){
    return(features)
  }

  # Assigning NULL to a list entry deletes the entry, which would change the
  # length and order of the feature vector; keep the NA placeholder instead.
  value_or_na = function(value){
    if(is.null(value)) NA else value
  }

  # Number of cells that are NA or the empty string, counted column by column
  # so that no column is coerced to another type first.
  count_blank = function(columns){
    per_column = vapply(columns,function(column){
      sum(is.na(column) | as.character(column)=="")
    },numeric(1))
    sum(per_column)
  }

  # is.character() gives exactly one logical per column, even for columns
  # that carry several classes (e.g. POSIXct), unlike sapply(table, class).
  is_text = vapply(table,is.character,logical(1))
  typed_columns = table[,!is_text,drop=FALSE]

  #measure table shape
  features$warnings = length(result$warnings)
  features$edits = value_or_na(result$edits)
  features$moves = value_or_na(result$moves)
  features$confidence = sum(result$confidence,na.rm=TRUE)
  features$total_cells = value_or_na(result$cells)
  features$typed_cells = ncol(typed_columns)*nrow(table) - count_blank(typed_columns)
  features$empty_header = sum(colnames(table)=="")
  features$empty_cells = count_blank(table)
  # gregexpr() returns -1 when there is no match, hence the "> 0" filter.
  features$non_latin_chars = sum(gregexpr("NONLATINCHARACTER",iconv(paste(table,collapse=" "), "utf8", "latin1", sub="NONLATINCHARACTER"), fixed=TRUE)[[1]] > 0)
  features$row_col_ratio = as.integer(nrow(table)>ncol(table))

  # Clamp every feature to be non-negative (NA stays NA).
  features = lapply(features,function(x){max(x,0)})
  return(features)
}
# Rank a group of results by a weighted sum of min-max normalized features.
#
# Args:
#   group_results: list of results; each one is passed to `rank_function`.
#   rank_function: function returning the features of one result as a list
#                  (or vector) of scalars. It must return the same number of
#                  features for every result.
#   weights:       numeric vector; weights[i] multiplies the i-th feature.
#
# Returns:
#   Integer vector of indices into `group_results`, ordered by ascending
#   rating (order()). A result with a missing (NA) feature gets an NA rating
#   and is therefore placed last. A feature that is NA for every result is
#   ignored. An empty `group_results` yields integer(0).
rank_quality = function(group_results,rank_function=get_result_features,weights=c(1,1,1,-1,-1,-1,1,1,1,-1)){
  n_results = length(group_results)
  if(n_results == 0){
    return(integer(0))
  }

  # Collect all feature vectors first, then build the matrix in one step
  # (one row per result, one column per feature).
  feature_rows = lapply(group_results,function(result){
    unlist(rank_function(result))
  })
  n_features = length(feature_rows[[1]])
  if(any(lengths(feature_rows) != n_features)){
    stop("rank_function must return the same number of features for every result")
  }
  feature_matrix = matrix(as.numeric(unlist(feature_rows,use.names=FALSE)),nrow=n_results,ncol=n_features,byrow=TRUE)

  # Commented-out weight notes left by the original author (unused):
  #1 1:0.061617706 2:0.14636832 3:0.17539515 4:-0.59766531 5:0.38812715 6:-0.069931857 7:-0.17779298 8:0.013805295 9:0.12391482 10:0.64217299 11:-0.21883665 #
  #1 1:0.0049145184 2:0.19123393 3:0.44251943 4:-0.98470479 5:0.85696524 6:0.22609122 7:0.031004677 8:0.020101082 9:0.19626725 10:1.3443941 11:-0.1704682
  #weights = list(cells=0.10168045,typed_cells=-0.39059928,empty_header=1.0380306,edits=0.98764759,confidence=-0.44670057)
  #weights=list(ncol=-0.0049145184,nrow=-0.19123393,cells=-0.44251943,typed_cells=0.98470479,empty_header=-0.85696524,numerics_header=-0.22609122,na_fields=-0.031004677,non_latin_chars=-0.020101082,metadata_fields=-0.19626725,edits=-1.3443941,confidence=0.1704682)
  #weights = list(cells=1,typed_cells=1,empty_header=1,edits=1,confidence=1)
  #weights = c(-1,-1,-1,-1,-1)

  quality_ratings = numeric(n_results)
  for(i in seq_len(n_features)){
    column = feature_matrix[,i]
    observed = column[!is.na(column)]
    if(length(observed) == 0){
      # No result has this feature, so it cannot discriminate between them.
      next
    }
    lowest = min(observed)
    spread = max(observed) - lowest
    if(isTRUE(spread > 0)){
      normalized = (column - lowest)/spread
    }else{
      # Constant feature: contributes 0, but missing values stay missing.
      normalized = numeric(n_results)
      normalized[is.na(column)] = NA
    }
    quality_ratings = quality_ratings + weights[i] * normalized
  }

  # order() puts NA ratings last by default.
  return(order(quality_ratings))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.