R/plot_hsim.R

Defines functions plot_hsim

Documented in plot_hsim

#' Plot the result of simulate_hindex
#'
#' Plot the result of a simulation computed by simulate_hindex.
#'
#' @param simdata The result of a simulation returned
#' by \code{\link{simulate_hindex}}.
#' @param plot_hindex If this parameter is set to TRUE, the h-index values are
#' plotted.
#' @param plot_halpha If this parameter is set to TRUE, the h-alpha values are
#' plotted.
#' @param plot_toppapers If this parameter is set to TRUE, the numbers of
#' top-10\% papers are plotted.
#' @param plot_mindex If this parameter is set to TRUE, the mindex values are
#' plotted.
#' @param subgroups If this parameter is set to TRUE, the subgroups in simdata
#' are considered for grouping plotting the index values separately for each of
#' these groups.
#' @param group_boundaries Alternative to subgroups for specifying groups of
#' scientists for plotting the index values separately for these groups. Here,
#' the groups are specified based on the initial h-index of the agents. group_boundaries
#' must be a list of vectors or a vector of integers specifying the groups.
#' If a list is specified, each element must be a vector of length 2
#' representing the lower and the upper bound for the initial h-index (if the
#' boundaries are included in the corresponding intervals is specified by the
#' exclude_group_boundaries parameter).
#' If a vector of integers is specified, each element in group_boundaries
#' separates two groups such that all agents with an initial h-index below
#' this boundary (and equal to or above any lower boundary; if
#' exclude_group_boundaries is set to TRUE, the initial h-index has to be
#' above any lower boundary) are in the first group, and all agents with
#' an initial h-index equal to or above this boundary (and below any higher
#' boundary) are in the second group.
#' @param plot_group_diffs If this parameter is specified, the difference
#' between the groups that are specified by group_boundaries is plotted.
#' @param exclude_group_boundaries If this parameter is set to TRUE, the
#' scientists are grouped such that those scientists whose initial h-index
#' is equal to a boundary are not included.
#'
#' @return A ggplot object (\code{\link[ggplot2]{ggplot}}).
#'
#' @export
#' @importFrom foreach "%do%"
#'
#' @examples
#' set.seed(123)
#' simdata <- simulate_hindex(runs = 2, n = 20, periods = 3)
#' plot_hsim(simdata, plot_hindex = TRUE, plot_halpha = TRUE)
plot_hsim <- function(simdata, plot_hindex = FALSE, plot_halpha = FALSE,
                      plot_toppapers = FALSE,
                      plot_mindex = FALSE,
                      subgroups = FALSE,
                      group_boundaries = NULL,
                      exclude_group_boundaries = FALSE,
                      plot_group_diffs = FALSE) {

  if (!plot_hindex & !plot_halpha & !plot_toppapers & !plot_mindex) {
    stop('at least one of the parameters plot_hindex, plot_halpha,
         plot_toppapers, plot_mindex must be TRUE')
  }

  if (subgroups && (!is.null(group_boundaries) || exclude_group_boundaries)) {
    warning('if subgroups is set to TRUE, group_boundaries and exclude_group_boundaries
            have no effect')
  }

  if (typeof(simdata) != 'list' ||
      length(unique(lapply(simdata, length))) != 1 ||
      length(unique(unlist(
        lapply(simdata[1:4], function(currentRun) {lapply(currentRun, length)})
      ))) != 1 ||
      length(simdata[[1]][[1]][[1]]) != length(simdata[[5]][[1]]) ||
	  length(unique(unlist(
		lapply(simdata$subgroup, function(SubgroupsCurrentRun) {length(unique(SubgroupsCurrentRun))})
	  ))) != 1 # verify that in each run has the same number of subgroups
  ) {
    # check if each run has same no of periods, and each period has
    # same no of returned lists (hindex values, h alpha values, toppapers, etc.)
    stop('structure of simdata not correct; make sure to use the result
         returned by simulate_hindex')
  }

  # groups -> initial h values (e.g. low vs high...)
  if (!is.null(group_boundaries)) {
    if (typeof(group_boundaries) == 'list') {

      # use vector for each group indicating lower and upper boundary

      groups <- length(group_boundaries)

      groupBoundariesOrdered <- lapply(group_boundaries, function(currentEl) {
        if (!typeof(currentEl) %in% c('double', 'integer') ||
            length(currentEl) != 2) {
          stop('if group_boundaries are given as a list, each element
               has to be a vector of length 2')
        }
        if (currentEl[1] <= currentEl[2]) {
          return(currentEl)
        } else {
          return(c(currentEl[2], currentEl[1]))
        }
      })

      groupOrder <- order(unlist(lapply(groupBoundariesOrdered, `[`, 1)))
      groupBoundariesOrdered <- groupBoundariesOrdered[groupOrder]

    } else if (typeof(group_boundaries) %in% c('double', 'integer')) {

      # use numeric to indicate the threshold for determining groups

      group_boundaries_ordered <-
        unique(group_boundaries[order(group_boundaries)])
      groups <- length(group_boundaries_ordered) + 1

      groupBoundariesOrdered <- lapply(1:groups, FUN = function(currentGroup) {
        if (currentGroup == 1) {
          return(c(0, group_boundaries_ordered[1] - .Machine$double.eps))
        } else if (currentGroup == groups) {
          return(c(group_boundaries_ordered[currentGroup - 1], Inf))
        } else {
          (c(group_boundaries_ordered[currentGroup - 1],
             group_boundaries_ordered[currentGroup] - .Machine$double.eps))
        }
      })

    } else if (typeof(group_boundaries) == 'character') {

      # use function to determine the threshold between groups

      if (group_boundaries == 'median') {
        threshold <- stats::median(simdata$h[[1]][[1]])
        groups <- 2
        groupBoundariesOrdered <- list(c(0, threshold), c(threshold, Inf))
      } else {
        stop('function for determining groups not supported')
      }

    } else {
      stop('if group_boundaries is specified,
           it must be either a list or a vector')
    }
  } else {
    groups <- 1
  }

  if (subgroups) {
    groups <- length(unique(simdata$subgroup[[1]]))
  }

  # plot_group_diffs: plot the difference of index/indices for two groups
  # two groups have to be specified for this
  if (plot_group_diffs) {
    if (is.null(group_boundaries) && !subgroups) {
      stop('in order to plot differences between groups of scientists with
           different initial h-index, group_boundaries must be specified')
    }
    if (groups != 2) {
      stop('differences between groups of scientists is only possible for two groups')
    }
  }

  if (plot_hindex) {

    nPeriods <- length(simdata$h[[1]])

    hRunIndex <- NULL
    hMeansRuns <- foreach::foreach(hRunIndex = 1:length(simdata$h),
                                   .combine = '+') %do% {

	   hRun <- simdata$h[[hRunIndex]]

       if (groups > 1) {
         if (subgroups) {
           groupsScientists <- lapply(1:groups, function(currentGroup) {which(simdata$subgroup[[hRunIndex]] == currentGroup)})
         } else {
           groupsScientists <- lapply(1:groups, function(currentGroup) {
             # TODO
             if (exclude_group_boundaries) {
               currentScientists <- which(hRun[[1]] >
                                            groupBoundariesOrdered[[currentGroup]][1] &
                                            hRun[[1]] < groupBoundariesOrdered[[currentGroup]][2])
               if (length(currentScientists) <= 0) {
                 warning(paste('no scientist in group', currentGroup, sep = ' '))
               }
             } else {
               currentScientists <- which(hRun[[1]] >=
                                            groupBoundariesOrdered[[currentGroup]][1] &
                                            hRun[[1]] <= groupBoundariesOrdered[[currentGroup]][2])
               if (length(currentScientists) <= 0) {
                 warning(paste('no scientist in group', currentGroup, sep = ' '))
               }
             }
             return(currentScientists)
           })
         }
       } else {
         groupsScientists <- list(1:length(hRun[[1]]))
       }

       hPeriod <- NULL
       runMeans <- foreach::foreach(hPeriod = hRun, .combine = 'cbind') %do% {

         # get mean for each group
         runPeriodGroupMeans <- vapply(1:groups, FUN = function(currentGroup) {
           mean(hPeriod[groupsScientists[[currentGroup]]])
         }, FUN.VALUE = double(1))
         return(t(runPeriodGroupMeans))

       }

       return(runMeans)

    }

    hMeansRuns <- hMeansRuns / length(simdata$h)
    vals <- t(hMeansRuns)
    period <-rep(1:nPeriods, each = groups)
    hInitGroup <- as.integer(round(rep(1:groups, nPeriods)))
    indexType <- 'h-index'

    plotData <- data.frame(vals = vals, period = period,
                           hInitGroup = hInitGroup, indexType = indexType)

    if (plot_group_diffs) {
      diffs <- vals[c(FALSE, TRUE)] - vals[c(TRUE, FALSE)]
      plotData <- rbind(plotData,
                        data.frame(vals = diffs, period = 1:nPeriods,
                                   hInitGroup = groups + 1, indexType = indexType))
    }

  }

  if (plot_halpha) {

    nPeriods <- length(simdata$h[[1]])

    hAlphaRunIndex <- 0
    hAlphaMeansRuns <- foreach::foreach(hAlphaRunIndex = 1:length(simdata$h_alpha),
      .combine = '+') %do% {

       # hAlphaRunIndex <- hAlphaRunIndex + 1

       if (groups > 1) {
         if (subgroups) {
           groupsScientists <- lapply(1:groups, function(currentGroup) {which(simdata$subgroup[[hAlphaRunIndex]] == currentGroup)})
         } else {
           groupsScientists <- lapply(1:groups, function(currentGroup) {
             # TODO
             if (exclude_group_boundaries) {
               currentScientists <- which(simdata$h[[hAlphaRunIndex]][[1]] >
                                            groupBoundariesOrdered[[currentGroup]][1] &
                                            simdata$h[[hAlphaRunIndex]][[1]] <
                                            groupBoundariesOrdered[[currentGroup]][2])
               if (length(currentScientists) <= 0) {
                 warning(paste('no scientist in group', currentGroup, sep = ' '))
               }
             } else {
               currentScientists <- which(simdata$h[[hAlphaRunIndex]][[1]] >=
                                            groupBoundariesOrdered[[currentGroup]][1] &
                                            simdata$h[[hAlphaRunIndex]][[1]] <=
                                            groupBoundariesOrdered[[currentGroup]][2])
               if (length(currentScientists) <= 0) {
                 warning(paste('no scientist in group', currentGroup, sep = ' '))
               }
             }
             return(currentScientists)
           })
         }
       } else {
         groupsScientists <- list(1:length(simdata$h[[1]][[1]]))
       }

       hAlphaPeriod <- NULL
       runMeans <- foreach::foreach(hAlphaPeriod = simdata$h_alpha[[hAlphaRunIndex]], .combine = 'cbind') %do% {

         # get mean for each group
         runPeriodGroupMeans <- vapply(1:groups, FUN = function(currentGroup) {
           mean(hAlphaPeriod[groupsScientists[[currentGroup]]])
         }, FUN.VALUE = double(1))
         return(t(runPeriodGroupMeans))

       }

       return(runMeans)

    }

    hAlphaMeansRuns <- hAlphaMeansRuns / length(simdata$h_alpha)
    vals <- t(hAlphaMeansRuns)
    period <-rep(1:nPeriods, each = groups)
    hInitGroup <- as.integer(round(rep(1:groups, nPeriods)))
    indexType <- 'h-alpha'

    if (plot_hindex) {
      plotData <- rbind(plotData, data.frame(vals = vals, period = period,
                                             hInitGroup = hInitGroup, indexType = indexType))
    } else {
      plotData <- data.frame(vals = vals, period = period,
                             hInitGroup = hInitGroup, indexType = indexType)
    }

    if (plot_group_diffs) {
      diffs <- vals[c(FALSE, TRUE)] - vals[c(TRUE, FALSE)]
      plotData <- rbind(plotData,
                        data.frame(vals = diffs, period = 1:nPeriods,
                                   hInitGroup = groups + 1, indexType = indexType))
    }

  }

  if (plot_toppapers) {

    nPeriods <- length(simdata$h[[1]])

    toppapersRunIndex <- 0
    toppapersMeansRuns <- foreach::foreach(toppapersRunIndex = 1:length(simdata$top10_papers),
                                        .combine = '+') %do% {

                                          # toppapersRunIndex <- toppapersRunIndex + 1

                                          if (groups > 1) {
                                            if (subgroups) {
                                              groupsScientists <- lapply(1:groups, function(currentGroup) {which(simdata$subgroup[[toppapersRunIndex]] == currentGroup)})
                                            } else {
                                              groupsScientists <- lapply(1:groups, function(currentGroup) {
                                                # TODO
                                                if (exclude_group_boundaries) {
                                                  currentScientists <- which(simdata$h[[toppapersRunIndex]][[1]] >
                                                                               groupBoundariesOrdered[[currentGroup]][1] &
                                                                               simdata$h[[toppapersRunIndex]][[1]] <
                                                                               groupBoundariesOrdered[[currentGroup]][2])
                                                  if (length(currentScientists) <= 0) {
                                                    warning(paste('no scientist in group', currentGroup, sep = ' '))
                                                  }
                                                } else {
                                                  currentScientists <- which(simdata$h[[toppapersRunIndex]][[1]] >=
                                                                               groupBoundariesOrdered[[currentGroup]][1] &
                                                                               simdata$h[[toppapersRunIndex]][[1]] <=
                                                                               groupBoundariesOrdered[[currentGroup]][2])
                                                  if (length(currentScientists) <= 0) {
                                                    warning(paste('no scientist in group', currentGroup, sep = ' '))
                                                  }
                                                }
                                                return(currentScientists)
                                              })
                                            }
                                          } else {
                                            groupsScientists <- list(1:length(simdata$h[[1]][[1]]))
                                          }

                                          toppapersPeriod <- NULL
                                          runMeans <- foreach::foreach(toppapersPeriod = simdata$top10_papers[[toppapersRunIndex]], .combine = 'cbind') %do% {

                                            # get mean for each group
                                            runPeriodGroupMeans <- vapply(1:groups, FUN = function(currentGroup) {
                                              mean(toppapersPeriod[groupsScientists[[currentGroup]]])
                                            }, FUN.VALUE = double(1))
                                            return(t(runPeriodGroupMeans))

                                          }

                                          return(runMeans)

                                        }

    toppapersMeansRuns <- toppapersMeansRuns / length(simdata$top10_papers)
    vals <- t(toppapersMeansRuns)
    period <-rep(1:nPeriods, each = groups)
    hInitGroup <- as.integer(round(rep(1:groups, nPeriods)))
    indexType <- 'toppapers'

    if (plot_hindex || plot_halpha) {
      plotData <- rbind(plotData, data.frame(vals = vals, period = period,
                                             hInitGroup = hInitGroup, indexType = indexType))
    } else {
      plotData <- data.frame(vals = vals, period = period,
                             hInitGroup = hInitGroup, indexType = indexType)
    }

    if (plot_group_diffs) {
      diffs <- vals[c(FALSE, TRUE)] - vals[c(TRUE, FALSE)]
      plotData <- rbind(plotData,
                        data.frame(vals = diffs, period = 1:nPeriods,
                                   hInitGroup = groups + 1, indexType = indexType))
    }

  }

  if (plot_mindex) {

    nPeriods <- length(simdata$h[[1]])

    mindexRunIndex <- 0
    mindexMeansRuns <- foreach::foreach(mindexRunIndex = 1:length(simdata$mindex),
                                           .combine = '+') %do% {

                                             # mindexRunIndex <- mindexRunIndex + 1

                                             if (groups > 1) {
                                               if (subgroups) {
                                                 groupsScientists <- lapply(1:groups, function(currentGroup) {which(simdata$subgroup[[mindexRunIndex]] == currentGroup)})
                                               } else {
                                                 groupsScientists <- lapply(1:groups, function(currentGroup) {
                                                   # TODO
                                                   if (exclude_group_boundaries) {
                                                     currentScientists <- which(simdata$h[[mindexRunIndex]][[1]] >
                                                                                  groupBoundariesOrdered[[currentGroup]][1] &
                                                                                  simdata$h[[mindexRunIndex]][[1]] <
                                                                                  groupBoundariesOrdered[[currentGroup]][2])
                                                     if (length(currentScientists) <= 0) {
                                                       warning(paste('no scientist in group', currentGroup, sep = ' '))
                                                     }
                                                   } else {
                                                     currentScientists <- which(simdata$h[[mindexRunIndex]][[1]] >=
                                                                                  groupBoundariesOrdered[[currentGroup]][1] &
                                                                                  simdata$h[[mindexRunIndex]][[1]] <=
                                                                                  groupBoundariesOrdered[[currentGroup]][2])
                                                     if (length(currentScientists) <= 0) {
                                                       warning(paste('no scientist in group', currentGroup, sep = ' '))
                                                     }
                                                   }
                                                   return(currentScientists)
                                                 })
                                               }
                                             } else {
                                               groupsScientists <- list(1:length(simdata$h[[1]][[1]]))
                                             }

                                             mindexPeriod <- NULL
                                             runMeans <- foreach::foreach(mindexPeriod = simdata$mindex[[mindexRunIndex]], .combine = 'cbind') %do% {

                                               # get mean for each group
                                               runPeriodGroupMeans <- vapply(1:groups, FUN = function(currentGroup) {
                                                 mean(mindexPeriod[groupsScientists[[currentGroup]]])
                                               }, FUN.VALUE = double(1))
                                               return(t(runPeriodGroupMeans))

                                             }

                                             return(runMeans)

                                           }

    mindexMeansRuns <- mindexMeansRuns / length(simdata$mindex)
    vals <- t(mindexMeansRuns)
    period <-rep(1:nPeriods, each = groups)
    hInitGroup <- as.integer(round(rep(1:groups, nPeriods)))
    indexType <- 'mindex'

    if (plot_hindex || plot_halpha || plot_toppapers) {
      plotData <- rbind(plotData, data.frame(vals = vals, period = period,
                                             hInitGroup = hInitGroup, indexType = indexType))
    } else {
      plotData <- data.frame(vals = vals, period = period,
                             hInitGroup = hInitGroup, indexType = indexType)
    }

    if (plot_group_diffs) {
      diffs <- vals[c(FALSE, TRUE)] - vals[c(TRUE, FALSE)]
      plotData <- rbind(plotData,
                        data.frame(vals = diffs, period = 1:nPeriods,
                                   hInitGroup = groups + 1, indexType = indexType))
    }

  }

  # initialize ggplot
  sim_plot <- ggplot2::ggplot(plotData,
                              ggplot2::aes(x = period, y = vals,
                                           group = interaction(hInitGroup, indexType),
                                           color = factor(hInitGroup),
                                           linetype = indexType)) +
    ggplot2::geom_line() +
    ggplot2::xlab('Period') +
    ggplot2::theme(axis.title.y = ggplot2::element_blank())

  if (groups > 1) {

    if (subgroups) {
      labels <- paste('Subgroup', 1:groups)
    } else {
      labels <- vector(mode = 'character', length = groups)
      for (currentGroup in 1:groups) {
        if (exclude_group_boundaries) {
          labels[currentGroup] <- eval(bquote(expression(
            h[init] %in% group("(", list(
              .(groupBoundariesOrdered[[currentGroup]][1]),
              .(groupBoundariesOrdered[[currentGroup]][2])
            ), ")")
          )))
        } else {
          labels[currentGroup] <- eval(bquote(expression(
            h[init] %in% group("[", list(
              .(groupBoundariesOrdered[[currentGroup]][1]),
              .(groupBoundariesOrdered[[currentGroup]][2])
            ), ")")
          )))
        }
      }
      if (typeof(group_boundaries) %in% c('double', 'integer')) {
        labels[1] <- eval(bquote(expression(
          h[init] < .(groupBoundariesOrdered[[1]][2])
        )))
        if (exclude_group_boundaries) {
          labels[groups] <- eval(bquote(expression(
            h[init] > .(groupBoundariesOrdered[[groups]][1])
          )))
        } else {
          labels[groups] <- eval(bquote(expression(
            h[init] >= .(groupBoundariesOrdered[[groups]][1])
          )))
        }
      }
    }

    if (plot_group_diffs) {
      labels <- c(labels, 'Difference')
    }

    sim_plot <- sim_plot +
      ggplot2::scale_color_discrete(name = 'Initial h-index',
                                    labels = labels)
  } else {
    sim_plot <- sim_plot +
      ggplot2::scale_color_discrete(guide = FALSE)
  }

  if (plot_hindex + plot_halpha + plot_toppapers == 1) {
    sim_plot <- sim_plot +
      ggplot2::scale_linetype_discrete(guide = FALSE)
  } else {
    sim_plot <- sim_plot +
      ggplot2::scale_linetype_discrete(name = 'Index')
  }

  return(sim_plot)

}

Try the hindex package in your browser

Any scripts or data that you put into this service are public.

hindex documentation built on Feb. 23, 2020, 1:07 a.m.