# Load the inputs and build `information`: one row per distinct `id1` value of
# `cora_gold`, with how often it occurs and how that count changes from the
# previous `id1`.
load("../data/cora_gold.RData")
load("../data/cora.RData")

# initial set up
n_cora <- nrow(cora) # total number of rows in cora
cora_ids <- as.numeric(seq_len(n_cora)) # 1, 2, ..., n_cora (kept as doubles)

cora_gold2 <- cora_gold[order(cora_gold$id1), ] # sort cora gold ascending by id1
tabs <- table(cora_gold2$id1) # frequency of every id1 value
matched_names <- as.numeric(names(tabs)) # the id1 values themselves
tabs <- unname(tabs) # drop the names from the frequency table

# Change in frequency between consecutive id1 values; the first id1 has no
# predecessor, so its entry is padded with 0.
diffs <- c(0, diff(as.vector(tabs)))

# Initial gold data frame: id1 value, its frequency, and the frequency change.
information <- data.frame(matched_names, Freq = as.vector(tabs), diffs)

# One unique-id slot per row of `information` (previously a hard-coded 1697,
# which only works when the table happens to have exactly that many rows).
unique_id <- rep(0, nrow(information))
unique_id[1] <- 1
unique_id_value <- 1 # last unique id handed out so far
# First pass: give every id1 in `information` a unique (group) id.
# The first row gets id 1. Every later row starts a new id when its frequency
# did not drop relative to the previous row (diffs >= 0); otherwise it reuses
# the current id.
# NOTE(review): presumably a drop in frequency means "same group of matching
# records as the previous id1" -- confirm against how cora_gold lists pairs.
# Vectorised with cumsum() so it also works when `information` has one row
# (a `2:nrow(information)` loop would iterate over c(2, 1) and fail there).
starts_new_id <- c(TRUE, information$diffs[-1] >= 0)
unique_id <- cumsum(as.numeric(starts_new_id))
unique_id_value <- unique_id[length(unique_id)] # highest id handed out so far
information$unique_id <- unique_id # add unique id into data frame
# Find the id1 values that occur exactly once and derive a candidate partner
# id for each of them.
unnamed_tabs <- table(cora_gold2$id1)
name <- names(unnamed_tabs) # id1 value (as character) behind each frequency

# All id1 values with a frequency of 1 (selected in one vectorised step
# instead of growing a vector inside a 1:length() loop).
ones_id <- name[as.vector(unnamed_tabs) == 1]

# Candidate partner of each frequency-1 id: the id directly after it.
# NOTE(review): this assumes the matching record is always `id + 1` -- confirm.
ids_missing <- as.numeric(ones_id) + 1

# All cora ids that never occur as an id1.
missing_total <- setdiff(cora_ids, information$matched_names)

# Candidates that really are absent from id1 ("missing, but known" ids).
already_have_existing_ids <- intersect(missing_total, ids_missing)

# length(ids_missing) can be larger than length(already_have_existing_ids):
# intersect() drops every candidate that already occurs as an id1 in
# `information` or that is not one of `cora_ids`. For those frequency-1 ids
# the `id + 1` guess does not point at a missing id, so no partner is recorded
# for them here.
# TODO(review): check whether the true partner of such ids should be looked up
# in cora_gold instead of being guessed.
# Give each "missing, but known" id the unique id of the frequency-1 id just
# before it (id - 1).
# match() does the lookup for all ids at once; `matched_names` comes from
# table names, so each value occurs at most once and the first match is the
# only match.
source_rows <- match(already_have_existing_ids - 1, information$matched_names)
# Every id here was built as "frequency-1 id + 1", so a failed lookup means the
# inputs are inconsistent; stop instead of silently misaligning the vectors.
stopifnot(!anyNA(source_rows))
uniques <- information$unique_id[source_rows]

# update cora ids known and corresponding unique ids
current_cora_ids <- c(information$matched_names, already_have_existing_ids)
current_unique_ids <- c(information$unique_id, uniques)
# Last set of missing ids: cora ids that have not been assigned a unique id
# yet. They were not linked to anything above, so each gets an id of its own.
last_set_of_missing_ids <- setdiff(cora_ids, current_cora_ids)

# Hand out the next `n_remaining` consecutive unique ids. seq_len() yields an
# empty sequence when nothing is left, whereas `1:length(x)` would run twice
# for an empty `x` and create two ids that belong to no cora id.
n_remaining <- length(last_set_of_missing_ids)
unique_id_set2 <- unique_id_value + seq_len(n_remaining)
unique_id_value <- unique_id_value + n_remaining # keep the running counter current
# Stack the ids that were resolved above with the leftover ids, in the same
# order as their unique ids.
finalized_cora_ids <- c(current_cora_ids, last_set_of_missing_ids)
finalized_unique_ids <- c(current_unique_ids, unique_id_set2)

# Final lookup table for the updated gold standard: one row per cora id with
# its unique id, sorted by cora id.
cora_gold_update <- data.frame(
  cora_id = finalized_cora_ids,
  unique_id = finalized_unique_ids
)
cora_gold_update <- cora_gold_update[order(cora_gold_update$cora_id), ]
# nrow(cora_gold_update)
save(cora_gold_update, file = "../data/cora_gold_update.RData")
#remaining_cora_ids <- setdiff(cora_ids, set2_cora_ids)
#
#length(set2_cora_ids) + length(remaining_cora_ids)
#
#length(cora_ids)
#
#unique_id_set2 <- c()
#
#
#for (i in 1:length(remaining_cora_ids)){
# unique_id_value <- unique_id_value + 1
# unique_id_set2 <- c(unique_id_set2, unique_id_value)
#}
#
#nrow(information) + length(unique_id_set2)
#
#
#cora_id_gold <- c(information$matched_names, remaining_cora_ids)
#unique_id_gold <- c(information$unique_id, unique_id_set2)
#
#almost_finalize_gold <- data.frame(cora_id_gold, unique_id_gold)
#
#
#almost_finalize_gold <- almost_finalize_gold[order(almost_finalize_gold$cora_id_gold),]
#existing_ids <- unique(information$unique_id)
#
#for (i in 2:nrow(information)){
# if (information[i,"unique_id"]-information[i-1,"unique_id"]==1){
# unique_values_set2 <- c(unique_values_set2, (information[i-1,"matched_names"]+1))
# unique_id_set2 <- c(unique_id_set2, information[i-1,"unique_id"])
# }
#}
#unique_values_set2
#intersect(information$matched_names,unique_values_set2)
#
#
#unique_values_set3 <- c(information$matched_names, unique_values_set2)
#unique_id_set3 <- c(information$unique_id, unique_id_set2)
#
#remaining_unique_values_set4 <- setdiff(cora_ids, unique_values_set3)
#unique_id_set4 <- c()
#
#
#for (i in 1:length(remaining_unique_values_set4)){
# unique_id_value <- unique_id_value + 1
# unique_id_set4 <- c(unique_id_set4, unique_id_value)
#}
#
#unique_cora_id_final <- c(unique_values_set3, remaining_unique_values_set4)
#unique_matched_id_final <- c(unique_id_set3, unique_id_set4)
#length(unique_cora_id_final)
#
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.