#evaluates prediction by using density idea of developed and underdeveloped
#config(cl="ucsd")
overlay_complete <- function(n=1, init=2007, end=2012, by=3,cum=FALSE, min_prod=1, what_agg='wgh_jfrac', agg_fun=sum, list_prod=NaN, what_eval='share', min_to_grw =-1, min_to_dev=-1, pl=FALSE, pl_roc=FALSE, pl_min_dens=0.15, pl_node_sc=2.5, pl_min_size=0.7, pl_num_lab = 5 )
{
#n number of sample to conduct the analysis. n=-1 implies over all the producers
#init initial year to take the data
#end final year to take the data
#by number of years to split the data in between init and end
#pl true or false if you want to generete an overlay map for each producer
#min_prod number of publications required in the complete interval of time
#list_prod a vector of IDs of producers to be analyzed c('xhCWdtMAAAAJ', 'YirSp_cAAAAJ')
#mit_to_grw minimum share/production-value for a node to be considered as growing(emerging) area. This value should be less than min_to_dev and growing areas are > min_to_grw and < min_to_dev
#min_to_dev minimum share/production-value for a node to be considered as developed. -1 will be understood as mean of shares.
#dens_int number of intervals for density between 0.000001 and 1
#pl_min_dens the minimum density required to be presented/evaluated over the plot of the producer
#pl_node_sc a number that scales the size of the nodes
#pl_min_size minimum value for the size of the nodes
#pl_num_lab number of developed nodes to be labeled in the plot
#pred implies which is the transition to evaluate posible values are 'i_a', 'i_u', 'u_d' it uses initial for each state.
#what_eval rca, prop, ai what should the function evaluate to
#what_agg which is the name of the column in the raw data that you want to aggregate? author_count, wgh_jfr
#cum should the aggregation be accumulative since the very beginning. TRUE for positive
############get data
data_to_use <- get_data_to_overlay(n=n, init=init, end=end, by=by,cum=cum, min_prod=min_prod, what_agg=what_agg, agg_fun=agg_fun, list_prod=list_prod)
data_to_use_copy <<- data_to_use
##overlay_data and compute densities
overlay_data_3(data_to_use, what_eval=what_eval, min_to_grw =min_to_grw, min_to_dev=min_to_dev, pl=pl, pl_roc = pl_roc)
}
#' @title Get data to overlay
#' @description this function read data in intervals, aggregate it and computes measures of normalization of the data as RCA. Also subset and filter per a list or value of producers
#' @param data_to_use Data computed previously with get_data_to_overlay().
#' @param nodes a data frame with all information of nodes. It must be the same used by rs_to_eval
#' @param produces a data frame with information of producers. It must include columns id and name
#' @param what_eval name of the column of the dataframe data_to_use to use in the evaluation
#' @param min_to_grw minimum value to define the stat growing
#' @param min_to_dev minimum value to define the state of developing
#' @param pl_base plot
#' @param pl_base_mst plot base with mst
#' @param pl To plot or not to plot,
#' @param pl_roc to plot or not to plot roc curve
#' @param pl_min_dens
#' @param pl_node_sc=2.5
#' @param pl_min_size=0.7,
#' @param pl_num_lab
#' @param rs_to_eval An iGraph to overlay on data
#' @param rs_mean_degree
#' @param cex
#' @param pl_w_pdf
#' @param pl_seed
#' @param dens_links dens_links Which links to use in the evaluation of density. 'all' 'bench' just links in benchmark, 'filter' for filtered links
#' @examples
#' overlay_data_3()
#' @return a Data Frame to overlay.
#' @export
overlay_data_3 <- function(data_to_use, nodes, producers=NULL,what_eval='share', min_to_grw =-1, min_to_dev=-1, pl_base=TRUE, pl_base_mst=TRUE, pl=TRUE, pl_roc=TRUE, pl_min_dens=0.15, pl_node_sc=2.5, pl_min_size=0.7, pl_num_lab = 5, rs_to_eval=NULL, rs_mean_degree=4, cex=1, pl_w_pdf=FALSE, pl_seed=69, dens_links='all')
{
############overlay data benchmark magp
#print("Loading Base Map")
#use this options a will, fire a will
#if(dens_links=='all')
# g_orig <- get_benchmark_map(rs = rs_to_eval, mean_degree=rs_mean_degree, pl=pl_base, pl_mst = pl_base_mst, cex=cex, pl_seed=pl_seed) #does not have filtered links, evaluate everithing
# if(dens_links=='filter')
# g_orig <- delete.edges(g_orig, E(g_orig)[E(g_orig)$used < 1 | is.na(E(g_orig)$used)]) #to evaluate the poded tree
# #print("Edges to deleeeete")
# #print(E(g_orig)$weight_bench[is.na(E(g_orig)$weight_bench)])
# if(dens_links=='bench')
# g_orig <- delete.edges(g_orig, E(g_orig)[is.na(E(g_orig)$weight_bench)] ) #to evaluate only nodes that are in the benchmark map
#
evaluation_data_total <- data.frame() #to store final results
id_producers <- unique(data_to_use$producer)
intervals <- unique(data_to_use$interval)
for(producer in id_producers)
{
print(paste("Producer ", producer))
for(interval in intervals)
{
print(paste(" Interval ", interval))
#load data interval
i <- match(interval, intervals) #index of parameter interval
#year_s <- year_start[i]; year_e <- year_end[i] #USING NOT AGGREGATION of time
#get original blank graph
#g <- g_orig
data_producer <- data_to_use[data_to_use$producer==producer & data_to_use$interval == interval,]
data_producer$category <- strtoi(data_producer$category)
#data_p <<- data_producer
#droplevels(data_producer$id_category)
#defining first form options for nodes
g$producer <- producer
g$interval <- interval
g$what_eval <- what_eval
dev_nodes_flag <- FALSE
grw_nodes_flag <- FALSE
und_nodes_flag <- FALSE
V(g)$developed <- NA
V(g)$undeveloped <- NA
V(g)$growing <- NA
V(g)$density <- 0
V(g)$density_2 <- 0 #using all the matrix of similarities
active_nodes <- NA
grw_nodes <- NA
und_nodes <- NA
dev_nodes <- NA
sum_production <- NA
variety <- NA
measures <- NA
#verify that info of producer for this time is not empty
if(nrow(data_producer)>0)
{
#ACTIVE NODES ---------------------------
active_nodes <- as.vector(data_producer[,'category']) #which areas are present in the data of the producer
active_nodes <- strtoi(active_nodes) #converts to integer
nodes_info_act <- nodes[nodes$Id %in% active_nodes,]
nodes_info_act <- merge(nodes_info_act, data_producer, by.x='Id', by.y='category')
#####ORDERING
# print("Nodes ID")
# print(nodes$Id)
# print("Active nodes")
# print(active_nodes)
# print("V(g)$name")
# print(V(g)$name)
active_nodes_g <- V(g)$name[V(g)$name %in% active_nodes] #TO GET THE order that provides iGraph
# print("Active_nodes_g")
# print(active_nodes_g)
nodes_info_act <- nodes_info_act[match(active_nodes_g, nodes_info_act$Id),] #reordering
V(g)$share[V(g)$name %in% active_nodes]<- as.numeric(nodes_info_act[,'share'])
V(g)$value_eval[V(g)$name %in% active_nodes]<- as.numeric(nodes_info_act[,what_eval])
V(g)$active <- 0
V(g)$inactive <- 1
V(g)$active[V(g)$name %in% active_nodes] <- 1
V(g)$inactive[V(g)$name %in% active_nodes] <- 0 #inactive means zero production
V(g)$names_actives[V(g)$name %in% active_nodes] <- as.character(nodes_info_act$subd_name)
#V(g)$size[is.na(V(g)$size)] <- pl_min_size #deliting size of
# vg_df_actives <<- as.data.frame(list( name=V(g)$name, shares=V(g)$shares, size=V(g)$size,names_actives=V(g)$names_actives,color= V(g)$color), stringsAsFactors = FALSE)
#defining DEVELOPED nodes
cut_off_to_dev <- min_to_dev
cut_off_to_grw <- min_to_grw
quant_eval <- quantile(data_producer[,what_eval], na.rm = TRUE)
if(min_to_dev == -1)#if not defined value
{
#cut_off_to_dev <- mean(data_producer[,'share']) #take the mean by default, useful for personal producers
cut_off_to_dev <- quant_eval[4]
}
dev_nodes_df <- data_producer[data_producer[what_eval]>= cut_off_to_dev,] #condition of development nodes
dev_nodes_df <- dev_nodes_df[order(dev_nodes_df[,what_eval], decreasing = TRUE),] #ordering for what eval
if(min_to_grw == -1) #if default value
{
cut_off_to_grw <- quant_eval[2]
}
g$cut_off_to_grw <- cut_off_to_grw
g$cut_off_to_dev <- cut_off_to_dev
grw_nodes_df <- data_producer[data_producer[what_eval]>= cut_off_to_grw & data_producer[what_eval] < cut_off_to_dev,] #grwoing nodes
und_nodes_df <- data_producer[data_producer[what_eval]< cut_off_to_grw,] #undeveloped nodes
if(nrow(und_nodes_df)>0)
{
und_nodes <- as.vector(und_nodes_df[,'category'])
#ensuring for scimago
und_nodes <- strtoi(und_nodes)
und_nodes_flag <- TRUE
nodes_info_und <- nodes[nodes$Id %in% und_nodes,]
nodes_info_und <- merge(nodes_info_und, data_producer, by.x='Id', by.y='category')
#ordering
und_nodes_g <- V(g)$name[V(g)$name %in% und_nodes]
nodes_info_und <- nodes_info_und[match(und_nodes_g, nodes_info_und$Id), ]
V(g)$undeveloped <- NA #only active nodes could be evaluated
V(g)$undeveloped[V(g)$name %in% active_nodes] <- 0 #only active nodes could be evaluated
V(g)$undeveloped[(V(g)$name %in% und_nodes)] <- 1 #is it UNoccupied
}
if(nrow(grw_nodes_df)>0)
{
grw_nodes <- as.vector(grw_nodes_df[,'category'])
grw_nodes <- strtoi(grw_nodes)
grw_nodes_flag <- TRUE
nodes_info_grw <- nodes[nodes$Id %in% grw_nodes,]
nodes_info_grw <- merge(nodes_info_grw, data_producer, by.x='Id', by.y='category')
V(g)$growing <- NA
V(g)$growing[V(g)$name %in% active_nodes] <- 0
V(g)$growing[V(g)$name %in% grw_nodes] <- 1
}
if(nrow(dev_nodes_df)>0)
{
dev_nodes <- as.vector(dev_nodes_df[,'category'])
dev_nodes <- strtoi(dev_nodes)
V(g)$developed <- NA #only active nodes could be evaluated
V(g)$developed[V(g)$name %in% active_nodes] <- 0 #only active nodes could be evaluated
V(g)$developed[V(g)$name %in% dev_nodes] <- 1 #is it occupied
}
}#end no data production
else
{
#DEFINE ZERO for a period without data, this is needed because otherwise it is impossible to plot the graph
print(paste("NO DATA HERE!! For producer:", producer, "INterval:", interval))
#go to next interval since there is no data
#flag_no_data_user <- TRUE
break #jump next interval
}
####MEASURING DENSITies
#V(g)$sum_prox <- graph.strength(g) #weighted degree used to calculate densities ORIGINAL
#V(g)$sum_prox <- graph.strength(g, mode = "in") #weighted degree used to calculate densities MODIFIED
V(g)$sum_prox <- graph.strength(g, mode = "out") #weighted degree used to calculate densities MODIFIED
#g <- delete.edges(g, E(g)[E(g)$used < 1])
#which are the nodes you want to asume as active or developed.
for(pred in c('act', 'dev')) #activation or developing
{
E(g)$weight_bool <- 0
if(pred == 'act')
{
V(g)$dev_temp <- V(g)$active
#print(paste("Sum Act Temp"))
#print(sum(V(g)$dev_temp))
#print(V(g)$dev_temp)
}
if(pred=='dev')
{
V(g)$dev_temp <- V(g)$developed
V(g)$dev_temp[is.na(V(g)$dev_temp)] <- 0
#print(paste("Sum DEV Temp"))
#print(sum(V(g)$dev_temp))
#print(V(g)$dev_temp)
}
#V(g)$dev_temp[is.na(V(g)$dev_temp)] <- 0
#E(g)$weight_bool[ E(g)[from (V(g)$name[V(g)$dev_temp ==1 ])] ] <- 1 #make 0 those edges that are not connecting occupaid nodes, and 1 those edges connected to nodes active. NOTE that, by using this technnique,for active nodes, density will be 1
E(g)$weight_bool[ E(g)[to(V(g)$name[V(g)$dev_temp ==1 ])] ] <- 1 #make 0 those edges that are not connecting occupaid nodes, and 1 those edges connected to nodes active. NOTE that, by using this technnique,for active nodes, density will be 1
#print(paste("LINKS IN ", pred))
#print(sum(E(g)$weight_bool))
E(g)$weight_occ <- E(g)$weight * E(g)$weight_bool #copy weight for active links
V(g)$sum_prox_occ <- graph.strength(g,weights=E(g)$weight_occ) #
if(pred =='act')
{
V(g)$dens_act <- V(g)$sum_prox_occ/V(g)$sum_prox #densities
V(g)$dens_act[V(g)$name %in% dev_nodes] <- 0 #cleanning density of active nodes that is 1.
}
if(pred =='dev')
{
V(g)$dens_dev <- V(g)$sum_prox_occ/V(g)$sum_prox #densities
V(g)$dens_dev[V(g)$name %in% dev_nodes] <- 0 #cleanning density of developed nodes that is 1.
}
}#end for type of prediction
evaluation<-''
if(i > 1) #there is a previous state
{
#defining states transition means that became developed
V(g)$ts_dragons <- V(g_prev)$inactive * V(g)$inactive
V(g)$ts_introduction <- V(g_prev)$inactive * V(g)$undeveloped
V(g)$ts_growth <- V(g_prev)$undeveloped * V(g)$growing
V(g)$ts_decline <- V(g_prev)$developed * V(g)$undeveloped
#totally binary from inactive to active
V(g)$ts_activated <- V(g_prev)$inactive * V(g)$active
V(g)$ts_transition <- V(g_prev)$undeveloped * V(g)$developed
V(g)$ts_maturity <- V(g_prev)$growing * V(g)$developed
#print("values..")
#verify_g<<- data.frame(V(g)$occupied, V(g)$name, V(g)$label,V(g)$active, V(g_prev)$active, V(g_prev)$inactive, V(g)$st_transition)#
prev_interval <- intervals[i-1]
#evaluation_roc <- data.frame(producer, prev_interval, interval, V(g)$name, V(g)$Label, V(g_prev)$active, V(g_prev)$inactive, V(g_prev)$undeveloped, V(g_prev)$growing, V(g_prev)$developed, V(g)$inactive, V(g)$active, V(g)$undeveloped, V(g)$growing,V(g)$developed, V(g)$ts_dragons, V(g)$ts_introduction,V(g)$ts_growth,V(g)$ts_maturity ,V(g)$ts_decline, V(g)$ts_activated, V(g)$ts_transition )#
evaluation_roc <- data.frame(what_eval, producer, prev_interval, V(g_prev)$value_eval, V(g_prev)$dens_act, V(g_prev)$dens_dev, interval, V(g)$value_eval, V(g)$name, V(g_prev)$active, V(g)$active, V(g)$ts_activated, V(g_prev)$undeveloped, V(g)$developed, V(g)$ts_transition, V(g_prev)$growing, V(g)$developed, V(g)$ts_maturity) #
#print(evaluation_data)
evaluation_data_total <- rbind(evaluation_data_total, evaluation_roc)
#print(paste("interval:", i, " Added user:", producer))
#label for plot
if(pl==TRUE)
{
#pending plot if needed
#Pass evaluation
par(mfrow=c(1,1)) #starts template for roc plots
plot_overlay_data(g_prev, pl_min_dens=pl_min_dens, pl_node_sc=pl_node_sc, pl_min_size=pl_min_size, pl_num_lab = pl_num_lab, cex=cex, pl_w_pdf = pl_w_pdf)
plot_overlay_data(g, pl_min_dens=pl_min_dens, pl_node_sc=pl_node_sc, pl_min_size=pl_min_size, pl_num_lab = pl_num_lab, cex=cex, pl_w_pdf = pl_w_pdf)
#print("Here plotting!")
}#end plotting
if(pl_roc == TRUE)
{
#print("Ploting ROC Curves")
roc_results_prod <- plot_rocs(data_eval_roc = evaluation_roc, producers=producers, pl=TRUE)
}
}#end loop i >1 means state with a previous state to evaluate
#save for next iteration
g_prev <- g
#g_kk <<- g
prev_num_act <- length(na.omit(active_nodes)) #any production
}#end interval
#if(flag_no_data_user == TRUE)
# next #jump next user if no data for user in interval
}#end producers
#evaluating data of
print(evaluation_data_total)
str(evaluation_data_total)
#file_n <- file.path(path_interval_overlay,paste("DATA_EVALUATION_ROC",interval_label,"min_prod",min_prod,"samp",n, "trans", pred, ".csv"))
#write.csv(x=evaluation_data_total, file=file_n, row.names=FALSE)
#print(paste("wrote file", file_n ))
#data_eval_roc <<- evaluation_data_total
return(evaluation_data_total)
}
#' @title Overlay data of one or several producers
#' @description this function read data processed with Overlay Data 3 and then plots an overlaid map according several configurations
#' @param data_to_use Data computed previously with get_data_to_overlay().
#' @param nodes a data frame with all information of nodes. It must be the same used by rs_to_eval
#' @param produces a data frame with information of producers. It must include columns id and name
#' @param what_eval name of the column of the dataframe data_to_use to use in the evaluation
#' @param min_to_grw minimum value to define the stat growing
#' @param min_to_dev minimum value to define the state of developing
#' @param pl_base plot
#' @param pl_base_mst plot base with mst
#' @param pl To plot or not to plot,
#' @param pl_roc to plot or not to plot roc curve
#' @param pl_min_dens
#' @param pl_node_sc=2.5
#' @param pl_min_size=0.7,
#' @param pl_num_lab
#' @param rs_to_eval An iGraph to overlay on data
#' @param rs_mean_degree
#' @param cex
#' @param pl_w_pdf
#' @param pl_seed
#' @param dens_links dens_links Which links to use in the evaluation of density. 'all' 'bench' just links in benchmark, 'filter' for filtered links
#' @examples
#' overlay_data_3()
#' @return a Data Frame to overlay.
#' @export
overlay_producers <- function(data_to_use, nodes, producers=NULL, id_producers = NULL, pl_base=TRUE, pl_base_mst=TRUE, pl=TRUE, pl_roc=TRUE, pl_min_dens=0.15, pl_node_sc=2.5, pl_min_size=0.7, pl_num_lab = 5, rs_to_eval=NULL, rs_mean_degree=4, cex=1, pl_w_pdf=FALSE, pl_seed=69, dens_links='all')
{
evaluation_data_total <- data.frame() #to store final results
if(is.null(id_producers)){
id_producers <- unique(data_to_use$producer) #all the producers on the dataframe
}
intervals <- unique(data_to_use$interval)
for(producer in id_producers)
{
print(paste("Producer ", producer))
for(interval in intervals)
{
print(paste(" Interval ", interval))
#load data interval
i <- match(interval, intervals) #index of parameter interval
#year_s <- year_start[i]; year_e <- year_end[i] #USING NOT AGGREGATION of time
#get original blank graph
#g <- g_orig
data_producer <- data_to_use[data_to_use$producer==producer & data_to_use$interval == interval,]
data_producer$category <- strtoi(data_producer$V.g..name) #INTEGER!!!
#data_p <<- data_producer
#droplevels(data_producer$id_category)
#defining first form options for nodes
g$producer <- producer
g$interval <- interval
g$what_eval <- data_to_use$what_eval[[1]]
g$cut_off_to_grw <- 0.5
g$cut_off_to_dev <- 1
cut_off_to_grw <-0.5
cut_off_to_dev <- 1
dev_nodes_flag <- FALSE
grw_nodes_flag <- FALSE
und_nodes_flag <- FALSE
V(g)$developed <- data_producer$V.g..developed
V(g)$active <- data_producer$V.g..active
active_nodes = data_producer[data_producer$V.g..active==1,"category"]
active_nodes <- strtoi(active_nodes) #converts to integer
grw_nodes_df <- data_producer[data_producer$V.g..value_eval>= cut_off_to_grw &data_producer$V.g..value_eval < cut_off_to_dev,] #grwoing nodes
und_nodes_df <- data_producer[data_producer$V.g..value_eval< cut_off_to_grw,] #undeveloped nodes
if(nrow(und_nodes_df)>0)
{
und_nodes <- as.vector(und_nodes_df[,'category'])
#ensuring for scimago
und_nodes <- strtoi(und_nodes)
nodes_info_und <- nodes[nodes$Id %in% und_nodes,]
nodes_info_und <- merge(nodes_info_und, data_producer, by.x='Id', by.y='category')
#ordering
und_nodes_g <- V(g)$name[V(g)$name %in% und_nodes]
nodes_info_und <- nodes_info_und[match(und_nodes_g, nodes_info_und$Id), ]
V(g)$undeveloped <- NA #only active nodes could be evaluated
V(g)$undeveloped[V(g)$name %in% active_nodes] <- 0 #only active nodes could be evaluated
V(g)$undeveloped[(V(g)$name %in% und_nodes)] <- 1 #is it UNoccupied
}
if(nrow(grw_nodes_df)>0)
{
grw_nodes <- as.vector(grw_nodes_df[,'category'])
grw_nodes <- strtoi(grw_nodes)
nodes_info_grw <- nodes[nodes$Id %in% grw_nodes,]
nodes_info_grw <- merge(nodes_info_grw, data_producer, by.x='Id', by.y='category')
V(g)$growing <- NA
V(g)$growing[V(g)$name %in% active_nodes] <- 0
V(g)$growing[V(g)$name %in% grw_nodes] <- 1
}
#V(g)$undeveloped <-
#V(g)$growing <- data_producer$
#V(g)$density <- data_producer$V.g..value_eval
V(g)$density_2 <- 0 #using all the matrix of similarities
active_nodes <- NA
grw_nodes <- NA
und_nodes <- NA
dev_nodes <- NA
sum_production <- NA
variety <- NA
measures <- NA
#verify that info of producer for this time is not empty
if(nrow(data_producer)>0)
{
#ACTIVE NODES ---------------------------
#active_nodes <- as.vector(data_producer[,'category']) #which areas are present in the data of the producer
active_nodes = data_producer[data_producer$V.g..active==1,"category"]
active_nodes <- strtoi(active_nodes) #converts to integer
nodes_info_act <- nodes[nodes$Id %in% active_nodes,]
nodes_info_act <- merge(nodes_info_act, data_producer, by.x='Id', by.y='category')
#####ORDERING
# print("Nodes ID")
# print(nodes$Id)
# print("Active nodes")
# print(active_nodes)
# print("V(g)$name")
# print(V(g)$name)
active_nodes_g <- V(g)$name[V(g)$name %in% active_nodes] #TO GET THE order that provides iGraph
# print("Active_nodes_g")
# print(active_nodes_g)
nodes_info_act <- nodes_info_act[match(active_nodes_g, nodes_info_act$Id),] #reordering
V(g)$share[V(g)$name %in% active_nodes]<- as.numeric(nodes_info_act[,'size']) #share in the original
V(g)$value_eval[V(g)$name %in% active_nodes]<- as.numeric(nodes_info_act[,"V.g..value_eval"])
V(g)$names_actives[V(g)$name %in% active_nodes] <- as.character(nodes_info_act$subd_name)
#V(g)$size[is.na(V(g)$size)] <- pl_min_size #deliting size of
# vg_df_actives <<- as.data.frame(list( name=V(g)$name, shares=V(g)$shares, size=V(g)$size,names_actives=V(g)$names_actives,color= V(g)$color), stringsAsFactors = FALSE)
}#end no data production
else
{
#DEFINE ZERO for a period without data, this is needed because otherwise it is impossible to plot the graph
print(paste("NO DATA HERE!! For producer:", producer, "INterval:", interval))
#go to next interval since there is no data
#flag_no_data_user <- TRUE
break #jump next interval
}
#pending plot if needed
#Pass evaluation
par(mfrow=c(1,1)) #starts template for roc plots
# plot_overlay_data(g_prev, pl_min_dens=pl_min_dens, pl_node_sc=pl_node_sc, pl_min_size=pl_min_size, pl_num_lab = pl_num_lab, cex=cex, pl_w_pdf = pl_w_pdf)
plot_overlay_data(g, pl_min_dens=pl_min_dens, pl_node_sc=pl_node_sc, pl_min_size=pl_min_size, pl_num_lab = pl_num_lab, cex=cex, pl_w_pdf = pl_w_pdf, producers=producers)
#print("Here plotting!")
if(pl_roc == TRUE)
{
#print("Ploting ROC Curves")
roc_results_prod <- plot_rocs(data_eval_roc = evaluation_roc, producers=producers, pl=TRUE)
}
prev_num_act <- length(na.omit(active_nodes)) #any production
}#end interval
#if(flag_no_data_user == TRUE)
# next #jump next user if no data for user in interval
}#end producers
#evaluating data of
#file_n <- file.path(path_interval_overlay,paste("DATA_EVALUATION_ROC",interval_label,"min_prod",min_prod,"samp",n, "trans", pred, ".csv"))
#write.csv(x=evaluation_data_total, file=file_n, row.names=FALSE)
#print(paste("wrote file", file_n ))
#data_eval_roc <<- evaluation_data_total
return(evaluation_data_total)
}
#used to plot overlay maps over benchmark maps
plot_graph_overlay <- function(g,title=paste('Overlay Map -',taxo), subtitle='', file_name=paste('Overlay Map -',taxo), legend=NULL, l_pt.bg,l_col, cex=1, pl_w_pdf=FALSE )
{
info_credit <- 'info'
info_template<- 'fr'
info_interval <- '2001'
plot.igraph(g,
sub=list(paste(subtitle),cex=0.7*cex),
vertex.label.cex=0.6*cex,
vertex.label.font=0,
vertex.label.family='Helvetica',
vertex.label.color='black',
edge.label.cex=0.5*cex,
edge.label.font=0,
edge.label.family='Helvetica',
main=list(title,cex=0.9*cex),
asp=FALSE)
#dev.off()
legend("bottomleft", # location of the legend on the heatmap plot
legend = legend,
pch=21, merge=FALSE,
#pch=19, merge=FALSE,
pt.bg = l_pt.bg,
col = l_col,
#lty= c(1,1,1,1,NA), # line style
lty= NA, # line style
lwd = 1, # line width
#lwd = NA, # line width
pt.cex = 0.8*cex,
box.col = "lightgrey",
#fill = "white"
cex= 0.6*cex
)
if(pl_w_pdf==TRUE)
{
dev_file_name <- file.path(path_interval_overlay, paste(file_name,'.pdf', sep=''))
#dev.print(pdf, file=dev_file_name, widht=6, height=3 );
pdf(dev_file_name, width=16, height=12, family='Helvetica', pointsize=8)
#, edge.label.color='black'
plot.igraph(g,
sub=list(paste(subtitle), cex=0.8*cex),
vertex.label.cex=1.4*cex,
vertex.label.font=0,
vertex.label.family='Helvetica',
vertex.label.color='black',
edge.label.cex=0.5*cex,
edge.label.font=0,
edge.label.family='Helvetica',
main=title,
asp=FALSE)
legend("bottomleft", # location of the legend on the heatmap plot
legend = legend,
pch=21, merge=FALSE,
pt.bg = l_pt.bg,
col = l_col,
#lty= c(1,1,1,1,NA), # line style
lty = NA,
lwd = 0, # line width
pt.cex = 4,
box.col = "lightgrey",
#fill = "white"
cex= 2
)
dev.off()
#dev.copy(pdf,filename=dev_file_name, family='Helvetica');
#dev.off ();
}
}
#plots data overlaid and evaluated using density and transitions
#g graph containing transitions ande previous states
plot_overlay_data <- function(g, pl_min_dens=0.15, pl_node_sc=2.5, pl_min_size=0.7, pl_num_lab = 5, prop_to_lab=0.2, cex=1, pl_w_pdf=FALSE, producers=producers)
{
#color palette <- INactive, Active, Growing, Advantage, Opportunity
#green palette
#color_palette <- c('#edf8e9', '#bae4b3', '#74c476', '#238b45','white','blue' )
#yellow-red heat palette
#color_palette <- c('#ededed', '#fecc5c', '#fd8d3c','#e31a1c', 'white', 'green')
color_palette <- c('#ededed', '#fecc5c', '#fd8d3c','#e31a1c', 'white', 'blue')
node.colors <- c("white", rev(heat.colors(3)), "white") # from empty to developed 4 states
edge.colors <- c('#ededed', '#ededed', '#ededed', "#bcbcbc", "#bcbcbc" ) #from inactive, connecting active, connecting developed
frame.colors <- c('#ededed', '#ededed', '#ededed', "#bcbcbc", "black") #from base, to recommended, to True Positive
#defining some values
num_dragons <- length(V(g)$ts_dragons[V(g)$ts_dragons==1 & !is.na(V(g)$ts_dragons)])
num_introduction <- length(V(g)$ts_introduction[V(g)$ts_introduction==1 & !is.na(V(g)$ts_introduction)])
num_growth <- length(V(g)$ts_growth[V(g)$ts_growth==1 & !is.na(V(g)$ts_growth)])
num_maturity <- length(V(g)$ts_maturity[V(g)$ts_maturity==1 & !is.na(V(g)$ts_maturity)])
num_decline <- length(V(g)$ts_decline[V(g)$ts_decline==1 & !is.na(V(g)$ts_decline)] )
num_activated <- length(V(g)$ts_activated[V(g)$ts_activated==1 & !is.na(V(g)$ts_activated)] )
num_transition <- length(V(g)$ts_transition[V(g)$ts_transition==1 & !is.na(V(g)$ts_transition)] )
#setting default values for vertices
V(g)$color <- node.colors[1]
V(g)$frame.color <- frame.colors[1]
V(g)$size <- pl_min_size
#print(paste("HEREE MIn size", pl_min_size))
#V(g)$label_orig <- V(g)$label #to export
V(g)$Label = as.character(nodes$subd_name[match(V(g)$name, nodes$Id)]) #watch out with the name of the column!!
V(g)$label <- ''
#setting default values for edges
E(g)$color <- edge.colors[2] #coloring edges connecting active nodes
#ACTIVE NODES ######
active_nodes <- V(g)$name[V(g)$active == 1]
V(g)$color[V(g)$name %in% active_nodes] <- node.colors[2]
#V(g)$share[V(g)$name %in% active_nodes]<- as.numeric(nodes_info_act[,'share'])
max_share <- max(V(g)$share, na.rm=TRUE); min_share <- min(V(g)$share, na.rm=TRUE)
#print(paste("max", max_share, "min", min_share))
#V(g)$size[V(g)$name %in% active_nodes] <- V(g)$share[V(g)$name %in% active_nodes]
V(g)$size[V(g)$name %in% active_nodes] <- (((V(g)$share[V(g)$name %in% active_nodes] - min_share )/(max_share-min_share) ) + pl_min_size) * pl_node_sc
#UNDEVELOPED NODES
und_nodes <- V(g)$name[V(g)$undeveloped == 1]
#GROWING NODES #####
grw_nodes <- V(g)$name[V(g)$growing == 1]
V(g)$color[V(g)$name %in% grw_nodes] <- node.colors[3]
#DEVELOPED NODES #######
dev_nodes <- V(g)$name[V(g)$developed == 1]
V(g)$color[V(g)$name %in% dev_nodes] <-node.colors[4]
if(length(dev_nodes)>0)
{
#labeling all edges from recommended nodes
#E(g)$label[ E(g)[from (V(g)[V(g)$density>pl_min_dens]) ] ] <- E(g)$weight[ E(g)[from (V(g)[V(g)$density>pl_min_dens]) ] ] #bug here, something happen
E(g)$color[E(g)$weight_bool==1] <- edge.colors[4] #coloring edges connecting active nodes
#if(length(dev_nodes) < (prop_to_lab * length(rownames(nodes))) )
# V(g)$label = as.character(nodes$subd_name[match(V(g)$name, nodes$Id)]) #watch out with the name of the column!!
#else
#{
#print("Filtering labels...")
nod_max_com <- get_max_com(g) #get the nodes with max degree per community
#print(nod_max_com)
V(g)$label <- ''
#V(g)$label[V(g)$name %in% nod_max_com] <- as.character(nodes$subd_name[match(nod_max_com, nodes$Id)])
nodes_to_label <- V(g)$name[V(g)$name %in% intersect(nod_max_com,dev_nodes)] #to get the order of iGraph
V(g)$label[V(g)$name %in% intersect(nod_max_com,dev_nodes)] <- as.character(nodes$subd_name[match(nodes_to_label, nodes$Id)])
#}
}
#LEGEND AND TITLES ###
#MUST USE AS.CHARACTER, OTHERWISE it will take an integer resulting in extrange colors
#V(g)$color[V(g)$name %in% active_nodes_density_lab] <- as.character(nodes_info_density$color)
num_act <- length(na.omit(active_nodes)) #any production
num_dev <- length(na.omit(dev_nodes)) #developed area
num_grw <- length(na.omit(grw_nodes)) #growing area
num_und <- length(na.omit(und_nodes)) #underdeveloped area
num_ina <- vcount(g)-num_act
measures <- paste('Undeveloped <', round(g$cut_off_to_grw, 4), '< Growing Areas <', round(g$cut_off_to_dev, 4), 'Developed Areas', '\n')
#titles
prod_name <- get_prod_name(g$producer, producers)
prod_domain <- get_prod_domain(g$producer, producers)
taxo="UCSD"
dataset= "GScholar"
title <- paste( prod_name," ", g$interval, '\nTaxonomy: ', taxo, ' - Overlay Data: ', dataset, " - Evaluating: ", toupper(g$what_eval),sep="")
#file_name <- paste('OverlayMap', taxo, prod_domain,g$interval, g$what_eval, sep='_')
file_name <- paste(prod_name,g$interval, g$what_eval, sep='_')
file_name <- paste(file_name,".pdf",sep="")
#subtitle <- paste('Layout: Frughtermand Rengold', '| Size: Share of authorships', '| Color: Areas of Science (original colors) \n' , measures, '\n', evaluation)
subtitle <- paste('Layout: Fruchterman–Reingold', '| Size: Share of authorships', '| Color: Values of ', toupper(g$what_eval), " \n Agg. function: ", "sum", '\n' , measures)
#exporting graph to a dataframe
#V(g)$cut_off_to_dev <- cut_off_to_dev
#vg_final<<- data.frame( V(g)$name, V(g)$label, V(g)$color, V(g)$active,V(g)$inactive,V(g)$size, V(g)$shares, V(g)$cut_off_to_dev, V(g)$developed, V(g)$undeveloped )#
par(lend = 1) # square line ends for the color legend
legend = c(paste("Inactive", num_ina), paste("Nascent", num_und), paste("Intermediate", num_grw), paste("Developed", num_dev)) # category labels
plot_graph_overlay(g, title = title , subtitle=subtitle, file_name=file_name, legend=legend, l_pt.bg=node.colors, l_col=frame.colors, cex=cex, pl_w_pdf=pl_w_pdf )
#plotting smart, use own to use previous defined size
out_dir = "/Users/mguevara/Dropbox/UPLA/INVESTIGACION/proyectos/BRAZIL-RSPACE/DATA/OUTPUT/UNIVERSITIES/OVERLAYS/41"
plot_graph_smart_2(g=g, nodes=nodes, main=title, sub='',legend=legend, v_col="own", v_size = "own", cex = cex, v_label = 'no', file_name = file.path(out_dir, file_name))
}
#plots data overlaid and evaluated using density and transitions
#g graph containing transitions ande previous states
plot_overlay_data_densities <- function(g, pl_min_dens=0.15, pl_node_sc=2.5, pl_min_size=0.7, pl_num_lab = 5, prop_to_lab=0.2, cex=1, pl_w_pdf=FALSE, producers)
{
#color palette <- INactive, Active, Growing, Advantage, Opportunity
#green palette
#color_palette <- c('#edf8e9', '#bae4b3', '#74c476', '#238b45','white','blue' )
#yellow-red heat palette
#color_palette <- c('#ededed', '#fecc5c', '#fd8d3c','#e31a1c', 'white', 'green')
color_palette <- c('#ededed', '#fecc5c', '#fd8d3c','#e31a1c', 'white', 'blue')
node.colors <- c("white", rev(heat.colors(3)), "white") # from empty to developed 4 states
edge.colors <- c('#ededed', '#ededed', '#ededed', "#bcbcbc", "#bcbcbc" ) #from inactive, connecting active, connecting developed
frame.colors <- c('#ededed', '#ededed', '#ededed', "#bcbcbc", "black") #from base, to recommended, to True Positive
#defining some values
num_dragons <- length(V(g)$ts_dragons[V(g)$ts_dragons==1 & !is.na(V(g)$ts_dragons)])
num_introduction <- length(V(g)$ts_introduction[V(g)$ts_introduction==1 & !is.na(V(g)$ts_introduction)])
num_growth <- length(V(g)$ts_growth[V(g)$ts_growth==1 & !is.na(V(g)$ts_growth)])
num_maturity <- length(V(g)$ts_maturity[V(g)$ts_maturity==1 & !is.na(V(g)$ts_maturity)])
num_decline <- length(V(g)$ts_decline[V(g)$ts_decline==1 & !is.na(V(g)$ts_decline)] )
num_activated <- length(V(g)$ts_activated[V(g)$ts_activated==1 & !is.na(V(g)$ts_activated)] )
num_transition <- length(V(g)$ts_transition[V(g)$ts_transition==1 & !is.na(V(g)$ts_transition)] )
#setting default values for vertices
V(g)$color <- node.colors[1]
V(g)$frame.color <- frame.colors[1]
V(g)$size <- pl_min_size
#print(paste("HEREE MIn size", pl_min_size))
#V(g)$label_orig <- V(g)$label #to export
V(g)$Label = as.character(nodes$subd_name[match(V(g)$name, nodes$Id)]) #watch out with the name of the column!!
V(g)$label <- ''
#setting default values for edges
E(g)$color <- edge.colors[2] #coloring edges connecting active nodes
#ACTIVE NODES ######
active_nodes <- V(g)$name[V(g)$active == 1]
V(g)$color[V(g)$name %in% active_nodes] <- node.colors[2]
#V(g)$share[V(g)$name %in% active_nodes]<- as.numeric(nodes_info_act[,'share'])
max_share <- max(V(g)$share, na.rm=TRUE); min_share <- min(V(g)$share, na.rm=TRUE)
#print(paste("max", max_share, "min", min_share))
#V(g)$size[V(g)$name %in% active_nodes] <- V(g)$share[V(g)$name %in% active_nodes]
V(g)$size[V(g)$name %in% active_nodes] <- (((V(g)$share[V(g)$name %in% active_nodes] - min_share )/(max_share-min_share) ) + pl_min_size) * pl_node_sc
#UNDEVELOPED NODES
und_nodes <- V(g)$name[V(g)$undeveloped == 1]
#GROWING NODES #####
grw_nodes <- V(g)$name[V(g)$growing == 1]
V(g)$color[V(g)$name %in% grw_nodes] <- node.colors[3]
#DEVELOPED NODES #######
dev_nodes <- V(g)$name[V(g)$developed == 1]
V(g)$color[V(g)$name %in% dev_nodes] <-node.colors[4]
if(length(dev_nodes)>0)
{
#labeling all edges from recommended nodes
#E(g)$label[ E(g)[from (V(g)[V(g)$density>pl_min_dens]) ] ] <- E(g)$weight[ E(g)[from (V(g)[V(g)$density>pl_min_dens]) ] ] #bug here, something happen
E(g)$color[E(g)$weight_bool==1] <- edge.colors[4] #coloring edges connecting active nodes
#if(length(dev_nodes) < (prop_to_lab * length(rownames(nodes))) )
# V(g)$label = as.character(nodes$subd_name[match(V(g)$name, nodes$Id)]) #watch out with the name of the column!!
#else
#{
#print("Filtering labels...")
nod_max_com <- get_max_com(g) #get the nodes with max degree per community
#print(nod_max_com)
V(g)$label <- ''
#V(g)$label[V(g)$name %in% nod_max_com] <- as.character(nodes$subd_name[match(nod_max_com, nodes$Id)])
nodes_to_label <- V(g)$name[V(g)$name %in% intersect(nod_max_com,dev_nodes)] #to get the order of iGraph
V(g)$label[V(g)$name %in% intersect(nod_max_com,dev_nodes)] <- as.character(nodes$subd_name[match(nodes_to_label, nodes$Id)])
#}
}
#RECOMMENDED NODES #### (density > 0)
#adding labels to nodes
active_nodes_density_lab <- V(g)$name[V(g)$dens_dev>0] #nodes that you have to LABEL becausse of the density.
nodes_info_density <- nodes[nodes$Id %in% active_nodes_density_lab,]
#V(g)$label[V(g)$name %in% active_nodes_density_lab] <- paste(strtrim(nodes_info_density$subd_name,5), as.character(round(V(g)$density[V(g)$name %in% active_nodes_density_lab],2))) #label with name and value of density
#V(g)$label[V(g)$name %in% active_nodes_density_lab] <- paste( as.character(round(V(g)$density[V(g)$name %in% active_nodes_density_lab],2))) #label with name and value of density
#V(g)$size[V(g)$name %in% active_nodes_density_lab] <- V(g)$density[V(g)$name %in% active_nodes_density_lab] #size of recommended nodes, according density
#V(g)$size[V(g)$name %in% active_nodes_density_lab] <- pl_min_size
#create instead a color map from white to black for recommended values according to density
#density_nodes_g <- V(g)$name[V(g)$name %in% active_nodes_density_lab] #TO GET THE order that provides iGraph
#nodes_info_act <- nodes_info_act[match(active_nodes_g, nodes_info_act$Id),] #reordering
#V(g)$color[V(g)$name %in% active_nodes_density_lab] <- color_palette[5]
V(g)$frame.color[V(g)$name %in% active_nodes_density_lab] <- frame.colors[5] #color_palette[6]
#V(g)$color[V(g)$name %in% active_nodes_density_lab] <- node.colors[5] #color_palette[6]
#LEGEND AND TITLES ###
#MUST USE AS.CHARACTER, OTHERWISE it will take an integer resulting in extrange colors
#V(g)$color[V(g)$name %in% active_nodes_density_lab] <- as.character(nodes_info_density$color)
num_act <- length(na.omit(active_nodes)) #any production
num_dev <- length(na.omit(dev_nodes)) #developed area
num_grw <- length(na.omit(grw_nodes)) #growing area
num_und <- length(na.omit(und_nodes)) #underdeveloped area
num_ina <- vcount(g)-num_act
measures <- paste('Undeveloped <', round(g$cut_off_to_grw, 4), '< Growing Areas <', round(g$cut_off_to_dev, 4), 'Developed Areas', '\n')
num_rcm <- length(active_nodes_density_lab)
densities <- ""
#densities <- paste("Transition:" ,"Undeveloped to Developed."," Recommended nodes (connected to developed nodes) for the next period (dark gray):",as.character(num_rcm))
measures <- paste(measures,densities, sep=" ")
#titles
prod_name <- get_prod_name(g$producer, producers)
prod_domain <- get_prod_domain(g$producer, producers)
dataset = "Google Scholar"
title <- paste( prod_name," ", g$interval, '\nBase Map: ', ' - Overlay Data: ', dataset, " - Evaluating: ", toupper(g$what_eval),sep="")
file_name <- paste('OverlayMap', prod_domain,g$interval, g$what_eval, sep='_')
#subtitle <- paste('Layout: Frughtermand Rengold', '| Size: Share of authorships', '| Color: Areas of Science (original colors) \n' , measures, '\n', evaluation)
subtitle <- paste('Layout: Fruchterman–Reingold', '| Size: Share of authorships', '| Color: Values of ', toupper(g$what_eval), " \n Agg. function: ", "sum", '\n' , measures)
#exporting graph to a dataframe
#V(g)$cut_off_to_dev <- cut_off_to_dev
#vg_final<<- data.frame( V(g)$name, V(g)$label, V(g)$color, V(g)$active,V(g)$inactive,V(g)$size, V(g)$shares, V(g)$cut_off_to_dev, V(g)$developed, V(g)$undeveloped )#
par(lend = 1) # square line ends for the color legend
legend = c(paste("Inactive", num_ina), paste("Undeveloped", num_und), paste("Growing", num_grw), paste("Developed", num_dev), paste("Opportunity", num_rcm)) # category labels
plot_graph_overlay(g, layout='fr', title = title , subtitle=subtitle, file_name=file_name, legend=legend, l_pt.bg=node.colors, l_col=frame.colors, cex=cex, pl_w_pdf=pl_w_pdf )
}
#' @title Get data to overlay
#' @description this function read data in intervals, aggregate it and computes measures of normalization of the data as RCA. Also subset and filter per a list or value of producers
#' @param n Number of samples to use. -1 for the complete number of producers.
#' @param init an integer pointing the initial year
#' @param end an integer pointing the final year
#' @param by an integer describing the time window
#' @param cum if FALSE it uses mobile time windows, otherwise it uses allways the same init year.
#' @param min_prod minimum production to filter producers
#' @param what_agg name of column to aggregate 'wgh_jfrac',
#' @param agg_fun function to aggregate, usually sum,
#' @param list_prod default NaN. A finite list of producers
#'
#' @examples
#' get_data_to_overlay()
#' @return a Data Frame to overlay.
#' @export
get_data_to_overlay <- function(data_interval, n=10, init=NaN, end=NaN, by=1, cum=FALSE, min_prod=1, year="year", producer, category, value, agg_fun=mean, list_prod=NaN )
{
#data_raw=load_raw_data(), init=NaN, end=NaN, year = 'year', producer, category, value="value", agg='mean'
#GET years automatically
if(is.na(init)){
init <- min(data_interval[year])
print("Defined min year to ")
print(init)
}
if(is.na(end)){
end <- max(data_interval[year])
print("Defined max year to ")
print(end)
}
#print("Loading Interval DATA...")
data <- subset(data_interval, year>= init & year<= end)
#print(summary(data))
#get sample of producers FILTER
#data_interval <- load_data_interval(init = init, end = end, agg=NaN) #without aggregation
num_years <- end-init
#changing names columns
colnames(data_interval)[match(year,colnames(data_interval))] <- 'year'
colnames(data_interval)[match(producer,colnames(data_interval))] <- 'producer'
colnames(data_interval)[match(category,colnames(data_interval))] <- 'category'
colnames(data_interval)[match(value,colnames(data_interval))] <- 'value'
print(names(data_interval))
#if there is no a list of target producers
if(is.na(list_prod)[[1]])
{
#totals_producers <- aggregate(authorship_count~id_author , data_interval, FUN=sum)
print(length(data_interval$value))
print(length(data_interval$producer))
totals_producers <- aggregate(value~producer, data_interval, FUN=sum)
#producers accomplish min production
#producers <- totals_producers$id_author[totals_producers$authorship_count> (num_years * min_prod)]
producers_ids <- totals_producers$producer[totals_producers$value> (num_years * min_prod)] #FILTER HERE!!!!
if(n==-1)
{
n <- length(producers_ids)
}
#producers <- unique(data_interval[,1])
#print(paste("Number producers:",length(producers)))
producers_sample <- producers_ids[sample(length(producers_ids), n,replace = FALSE)]
}#end sampling producers
else
{
producers_sample <- list_prod
}
#print(paste("Number producers SAMPLE:",length(producers_sample)))
#data with producers choosen
data_sample <- subset(data_interval, producer %in% producers_sample) #watch out with the name of the column of producers.
if(is.na(init)){
init <- min(data_sample[year])
print("Defined min year for sample to ")
print(init)
}
if(is.na(end)){
end <- max(data_sample[year])
print("Defined max year for sample to ")
print(end)
}
##get intervals
intervals <- get_intervals(years=data_sample$year, year_ini=init, year_fin=end,win=by)
#aggregating data for interval, producer, category, value, and calculating Shares, RCAs
data_to_use <- data.frame()
for(interval in intervals$interval)
{
i <- match(interval, intervals$interval) #index of parameter interval
if(cum == TRUE)
year_s <- init
else
year_s <- intervals$year_start[i]
year_e <- intervals$year_end[i] #simplify names #USING LOOONG AGGREGATION of time
print(year_s)
print(year_e)
data_int_raw <- subset(data_sample, year >= year_s & year <= year_e)
#data_int <- aggregate(x=data_int_raw['authorship_count'], by=list(id_author=data_int_raw$id_author, id_category = data_int_raw$subdiscipline_id), FUN=agg_fun)
if(nrow(data_int_raw)<1)
{
print("Not enough data in interval")
print(interval)
print("I did not added to the final dataframe...")
}
else
{
data_int <- aggregate(x=data_int_raw['value'], by=list(producer=data_int_raw$producer, category = data_int_raw$category), FUN=agg_fun) #totalof production of a producer in a category
total_global <- sum(data_int$value)
data_int <- data.frame(interval, data_int)
#data_prod_total <- aggregate(x=data_int['authorship_count'], by=list(id_author=data_int$id_author), FUN=sum) #finding total production by producer
data_prod_total <- aggregate(x=data_int['value'], by=list(producer=data_int$producer), FUN=sum) #finding total production by producer
#print(data_prod_total)
data_int <- merge(data_int,data_prod_total, by='producer') #adding total production to the table
#total per category
colnames(data_int)[colnames(data_int)=='value.x'] <- 'value'
colnames(data_int)[colnames(data_int)=='value.y'] <- 'total_producer'
#data_int['share'] <- data_int$authorship_count.x / data_int$authorship_count.y #computing shares
data_int['share'] <- data_int$value / data_int$total_producer #computing shares
#aggregate
data_categ_total <- aggregate(x=data_int['value'], by=list( category = data_int$category), FUN=agg_fun) #total of that category
data_int <- merge(data_int, data_categ_total, by='category')
colnames(data_int)[colnames(data_int)=='value.x'] <- 'value'
colnames(data_int)[colnames(data_int)=='value.y'] <- 'total_category'
data_int['total_global'] <- total_global
data_int['share_category'] <- data_int$total_category / total_global
data_int['rca'] <- data_int$share/data_int$share_category
data_int['rca_bool'] <- data_int['rca']
data_int$rca_bool[data_int$rca_bool < 1] <- 0
data_int$rca_bool[data_int$rca_bool != 0] <- 1
data_to_use <- rbind(data_to_use,data_int)
}
}
#colnames(data_to_use)[colnames(data_to_use) == 'id_author'] <- 'id_producer'
return(data_to_use)
#________________________
}#end function
#evaluate roc curves
#evaluate roc for each producer
#' @title Evaluate roc curves
#' @description evaluate roc curves for each producer
#' @param data_eval_roc a data frame processed previously with plot roc
#' @param producers a data frame with producers
#' @examples
#' plot_rocs()
#' @return a Data Frame to overlay.
#' @export
plot_rocs <- function(data_eval_roc, producers=NULL, pl=FALSE)
{
library("MESS") #getting area under the curve
intervals_pred <- unique(data_eval_roc$interval)
data_eval_roc_act <- data.frame()
roc_total <- data.frame()
for(prod in unique(data_eval_roc$producer))
{
roc_prod <- data.frame()
for(inter in intervals_pred)
{
#analyzing ROC curves
#print(prod)
#print(inter)
data_eval_prod <- subset(data_eval_roc, producer==prod & interval==inter)
prev_interval <- data_eval_prod$prev_interval[1]
roc_int <- data.frame(prod, prev_interval, inter)
#print(nrow(data_prev_inactive))
#roc_int['producer'] <- prod
#roc_int['interval'] <- inter
roc_int['auc_I_A'] <- NA; roc_int['n_I_A'] <- NA; roc_int['tp_I_A'] <- NA
roc_int['auc_U_D'] <- NA; roc_int['n_U_D'] <- NA; roc_int['tp_U_D'] <- NA
roc_int['auc_G_D'] <- NA; roc_int['n_G_D'] <- NA; roc_int['tp_G_D'] <- NA
data_prev_inactive <- subset(data_eval_prod, V.g_prev..active==0)
if(pl==TRUE)
par(mfrow=c(2,3)) #starts template for roc plots
############EVALUATING INACTIVE TO ACTIVE
if(nrow(data_prev_inactive)==0) #no previous InActive data
{
print(paste("producer:" , prod, "Has no previous INactive data in interval", inter))
}
else
{
data_eval_sor <- data_prev_inactive[order(-data_prev_inactive$V.g_prev..dens_act),]
data_eval_roc_act <- rbind(data_eval_roc_act, data_eval_sor) #aggregate to see data used to evaluate
res_ts <- plot_roc_interval(data_eval_sor, positive="V.g..active", trans="Inactive to Active", col='brown', pl=pl)
roc_int['auc_I_A'] <- res_ts[1]
roc_int['n_I_A'] <- res_ts[2]
roc_int['tp_I_A'] <- res_ts[3]
}#end else no data
##############EVALUATING GROWING TO DEVELOPED
data_prev_growing <- subset(data_eval_prod, V.g_prev..growing==1)
if(nrow(data_prev_growing)==0) #no previous InActive data
{
print(paste("producer:" , prod, "Has no previous UNDEVELOPED data in interval", inter))
}
else
{
data_eval_sor <- data_prev_growing[order(data_prev_growing$V.g_prev..dens_dev, decreasing = TRUE),]
data_eval_roc_act <- rbind(data_eval_roc_act, data_eval_sor) #aggregate to see data used to evaluate
res_ts <- plot_roc_interval(data_eval_sor, positive="V.g..developed", trans="Growing to Developed", col='orange', pl=pl)
roc_int['auc_G_D'] <- res_ts[1]
roc_int['n_G_D'] <- res_ts[2]
roc_int['tp_G_D'] <- res_ts[3]
}#end else No previous data
##############EVALUATING UNDEV TO DEVELOPED
data_prev_undeveloped <- subset(data_eval_prod, V.g_prev..undeveloped==1)
if(nrow(data_prev_undeveloped)==0) #no previous InActive data
{
print(paste("producer:" , prod, "Has no previous UNDEVELOPED data in interval", inter))
}
else
{
data_eval_sor <- data_prev_undeveloped[order(data_prev_undeveloped$V.g_prev..dens_dev, decreasing = TRUE),]
data_eval_roc_act <- rbind(data_eval_roc_act, data_eval_sor) #aggregate to see data used to evaluate
res_ts <- plot_roc_interval(data_eval_sor, positive="V.g..developed", trans="Undeveloped to Developed", pl=pl)
roc_int['auc_U_D'] <- res_ts[1]
roc_int['n_U_D'] <- res_ts[2]
roc_int['tp_U_D'] <- res_ts[3]
}#end else No previous data
if(pl==TRUE)
{
#labaling entitlining main template of roc curves
prev_interval <- data_eval_prod$prev_interval[1]
what_eval <- toupper(data_eval_prod$what_eval[1])
mtext(paste(get_prod_name(prod, producers), "| From: ", prev_interval, "To:", inter, "| Evaluating:", what_eval), side=3, outer=TRUE, line=-3, cex=0.5)
}
roc_prod <- rbind(roc_prod, roc_int) #accumulates intervals for the same producer
}#end for interval
roc_total <- rbind(roc_total, roc_prod) #accumulates producers
}#end for producer
#roc_kk<<- roc_total
#data_eval_roc_used <<- data_eval_roc_act
return(roc_total)
}
#ploting ROC curves for density evaluatio
plot_roc_interval <- function(data_eval_sor, positive, trans="", col="skyblue", pl= TRUE )
{
#print("HEEEYYYY")
prev_interval <- data_eval_sor$prev_interval[1]
interval <- data_eval_sor$interval[1]
prod <- as.character(droplevels(data_eval_sor$producer[[1]]))
what_eval <- data_eval_sor$what_eval[1]
#setting default or null values
auc_val <- NA #area under the curve
n_to_eval <- NA #n of sample to evaluate
n_tp <- NA #n of true positives
#changing NA per 0
data_eval_sor[, positive][is.na(data_eval_sor[, positive])] <- 0
if(sum(!data_eval_sor[,positive], na.rm=TRUE) != 0 && sum(data_eval_sor[1:(length(data_eval_sor)-1),positive], na.rm=TRUE)!= 0 ) #if the last value is positive AUC is divergent and an error is produced!!!!
{
x <- cumsum(!data_eval_sor[,positive]) / sum(!data_eval_sor[,positive], na.rm=TRUE)
y <- cumsum(data_eval_sor[,positive]) / sum(data_eval_sor[,positive], na.rm=TRUE)
#plot(x,y)
#print(paste(x,y))
n_to_eval <- nrow(data_eval_sor) #n of sample to evaluate
n_tp <- sum(data_eval_sor[,positive]) #n of true positives
auc_val <- auc(x,y, type = 'spline') #area under the curve #this gives error if there is only one TP (y value) and it is at the end!!!.
if(pl == TRUE)
{
plot(x, y,
#main=paste(get_prod_name(prod), trans, toupper(what_eval), sep=" | "),
#sub=paste(" From: ", prev_interval, "To:", interval),
sub=paste(paste(trans)),
xlab="False Positives",
ylab = "True Positives",
asp = 1,
col = col,
cex.sub =1.2,
xlim = c(0,1),
ylim = c(0,1)
)
lines(c(0,1), c(0,1), col="lightgray")
#abline(0, 1, col = "lightgray") #diagonal
legend("bottomright",
legend=c(paste("AUC=",round(auc_val,3)), paste("N to EVAL=", n_to_eval), paste("TP= ",n_tp ), paste("FP= ",(n_to_eval-n_tp) )),
cex=0.6)
}
}
else
{
print(paste("No possible evaluation for", prod, " In interval ", interval, " Transition", trans))
auc_val <- NA
}
return(c(auc_val, n_to_eval, n_tp))
}
#' @title Get the name of a producer
#' @description this function returns the name of a producer based on an id defined by id_prod.
#' @param id_prod an identifier for a producer.
#' @param producers a dataframe with columns information of producers, it must contain column id and ideally name.
#' @examples
#' get_prod_name()
#' @return a name for the producer, based on its id. It returns the id in case the column name was not found.
#' @export
get_prod_name <- function(id_prod, producers=NULL)
{
if(is.null(producers)==TRUE)
{
return(id_prod)
}
else{
return(prod_name <- as.character(producers$name[producers$id==id_prod]))
}
}
get_prod_domain <- function(id_prod, producers=NULL)
{
if(is.null(producers)==TRUE)
{
return(id_prod)
}
else{
return(prod_domain <- as.character(producers$i.domain[producers$id==id_prod]))
}
}
#' @export
plot_box <- function(data_auc_total)
{
colors <- c('brown','orange','skyblue')
#linetype <- c(1,1, c(1:(n_eval-2)))
#plotchar <- c(1:n_eval)
producers_ax <- unique(data_auc_total$prod)
prev_int <- data_auc_total$prev_interval[1]
interval <- data_auc_total$inter[[1]]
#data_auc_total$map <- strtrim(data_auc_total$map, 4)
#starting evaluation plots
par(mfrow=c(1,3)) #starts template for roc plots
boxplot(auc_I_A~map, data=data_auc_total, ylab='AUC Inactive to Active', border=colors[1])
#aov_I_A <- aov(auc_I_A~map, data=data_auc_total)
#print("ANOVA Analysis for Inactive to Active transition")
#print(summary(aov_I_A))
boxplot(auc_G_D~map, data=data_auc_total, ylab='AUC Growing to Developed', border=colors[2])
#aov_G_D <- aov(auc_G_D~map, data=data_auc_total)
#print("ANOVA Analysis for Growing to Developed transition")
#print(summary(aov_G_D))
boxplot(auc_U_D~map, data=data_auc_total, ylab='AUC Undeveloped to Developed', border=colors[3])
#aov_U_D <- aov(auc_U_D~map, data=data_auc_total)
#print("ANOVA Analysis for Undeveloped to Developed transition")
#print(summary(aov_U_D))
#plot.new()
#legend <- paste(maps_eval[1:n_eval,'map'], '|',
#round(maps_eval[1:n_eval,'auc_I_A'],3), '|',
#round(maps_eval[1:n_eval,'auc_G_D'],3), '|',
#round(maps_eval[1:n_eval,'auc_U_D'],3))
#legend("topleft", legend=legend, title="Map | avg(AUC_I_A) | avg(AUC_G_D) | avg(AUC_U_D)")
#print(legend)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.