R/pdynmc_exploratory.R
In pdynmc: Moment Condition Based Estimation of Linear Dynamic Panel Data Models

Documented in data.info pDensTime.plot strucUPD.plot

#' Show Basic Structure of Panel Dataset.
#'
#' \code{data.info} shows basic structure of a balanced/unbalanced
#'    panel dataset contained in a `data.frame`.
#'
#' @param object An object of class `data.frame`.
#' @param i.name Column name of cross-section identifier.
#' @param t.name Column name of time-series identifier.
#' @param ... further arguments.
#'
#' @return Returns information if panel dataset contained
#'    in an object of class `data.frame` is a balanced or
#'    unbalanced panel dataset.
#'
#' @author Markus Fritsch, Joachim Schnurbus
#' @export
#' @importFrom stats var
#'
#' @seealso
#'
#' \code{\link{pdynmc}} for fitting a linear dynamic panel data model.
#'
#' @examples
#' ## Load data
#' data(ABdata, package = "pdynmc")
#' dat <- ABdata
#' dat[,c(4:7)] <- log(dat[,c(4:7)])
#' dat <- dat[c(1:140), ]
#'
#' ## Code example
#' data.info(dat, i.name = "firm", t.name = "year")
#'
#' data.info(dat[dat$year %in% 1979:1981, ], i.name = "firm", t.name = "year")
#'
#'
data.info	<- function(object, i.name = NULL, t.name = NULL, ...){
  if (!is.data.frame(object))
    stop("Function applied to non `data.frame`.")
  if (is.null(i.name) || is.null(t.name)) {
    stop("Cross-section dimension and longitudinal dimension need to be specified.")
  }


  i.set		<- sort(unique(object[, i.name]))
  t.set		<- sort(unique(object[, t.name]))

  cs.obs.per.period		<- tapply(X = object[, i.name], INDEX = object[, t.name], FUN = length)

  balanced	<- var(cs.obs.per.period) == 0	# or < 2*10^(-14)

  if(balanced){
    cat(
      paste("Balanced panel data set with ", nrow(object), " rows: ", sep = ""),
      "\n",
      paste(length(i.set), " cross-sectional units, and ", length(t.set), " time periods (", paste(t.set, collapse = ", "), ").", sep = ""),
      "\n"
    )
  } else {
    cat(
      paste("Unbalanced panel data set with ", nrow(object), " rows and the following time period frequencies:", sep = ""),
      "\n"
    )
    cs.obs.per.period
  }
}


















#' Plot on Structure of Unbalanced Panel Dataset.
#'
#' \code{strucUPD.plot} Plot on cross-section and longtudinal
#'    structure of an object of class `data.frame` containing
#'    an unbalanced panel dataset.
#'
#' @param object An object of class `data.frame`.
#' @param i.name Column name of cross-section identifier.
#' @param t.name Column name of time-series identifier.
#' @param col.range A vector of at least two colors used to
#'    visualize the structure of the unbalanced panel data
#'    set (defaults to 'gold' and 'darkblue'); must be a
#'    valid argument to \link{col2rgb}.
#' @param plot.name A vector indicating the title of the plot
#'    (defaults to 'Unbalanced panel structure').
#' @param ... further arguments.
#'
#' @return Returns a plot for an unbalanced panel dataset
#'    contained in an object of class `data.frame` that
#'    visualizes the structure of the data. Cross-section
#'    dimension is plotted on the ordinate, longitudinal
#'    dimension on the abscissa. Each cross-sectional
#'    observation is represented by a bar. Breaks in the
#'    bars represent missing longitudinal observations.
#'
#' @author Markus Fritsch, Joachim Schnurbus
#' @export
#' @importFrom graphics axis
#' @importFrom graphics box
#' @importFrom graphics par
#' @importFrom graphics plot
#' @importFrom graphics rect
#' @importFrom grDevices colorRampPalette
#' @importFrom stats var
#'
#' @seealso
#'
#' \code{\link{pdynmc}} for fitting a linear dynamic panel data model.
#'
#' @examples
#' ## Load data
#' data(ABdata, package = "pdynmc")
#' dat <- ABdata
#' dat[,c(4:7)] <- log(dat[,c(4:7)])
#'
#' ## Code example
#' strucUPD.plot(dat, i.name = "firm", t.name = "year")
#'
#'
#'
strucUPD.plot	<- function(
  object, i.name = NULL,	t.name = NULL,
  col.range = c("gold", "darkblue"), plot.name = "Unbalanced panel structure", ...
){
  if (!is.data.frame(object))
    stop("Function 'strucPD.plot' applied to non `data.frame`.")
  if (is.null(i.name) || is.null(t.name)) {
    stop("Cross-section dimension and longitudinal dimension need to be specified.")
  }

  i.set		<- sort(unique(object[, i.name]))
  i.set.ind		<- 1:length(i.set)
  t.set		<- sort(unique(object[, t.name]))

  cs.obs.per.period		<- tapply(X = object[, i.name], INDEX = object[, t.name], FUN = length)
  periods.per.cs.obs		<- tapply(X = object[, t.name], INDEX = object[, i.name], FUN = length)
  balanced	<- var(cs.obs.per.period) == 0	# or < 2*10^(-14)

  if (balanced)
    stop("Plot is only suitable for unbalanced panel data.")
  par.mar.def <- graphics::par()$mar
  par.xpd.def <- graphics::par()$xpd
  graphics::par(mar = c(5.1, 4.1, 4.1, 6.1), xpd = TRUE)
  plot(x = c(min(t.set) - 0.5, max(t.set) + 0.5), y = c(min(i.set.ind) -
                                                          0.5, max(i.set.ind) + 0.5), type = "n", xlab = t.name,
       ylab = i.name, main = plot.name, xaxs = "i", yaxs = "i",
       xaxt = "n", ...)
  graphics::axis(side = 1, at = seq(from = min(t.set) - 0.5,
                                    to = max(t.set) + 0.5, by = 1), labels = FALSE)
  graphics::axis(side = 1, at = t.set, labels = t.set,
                 tick = FALSE)
  col.set <- (grDevices::colorRampPalette(col.range))(length(if (sum(periods.per.cs.obs ==
                                                                     0) > 0) {
    table(periods.per.cs.obs)[-1]
  } else {
    table(periods.per.cs.obs)
  }))
  for (i in i.set.ind) {
    t.i <- as.numeric(periods.per.cs.obs[i])
    if (t.i > 0) {

      t.i.set <- object[object[, i.name] == i.set[i], t.name]

      graphics::rect(xleft = t.i.set - 0.5, ybottom = i -
                       0.5, xright = t.i.set + 0.5, ytop = i + 0.5,
                     col = col.set[which(names(table(periods.per.cs.obs)) ==
                                           t.i)], border = col.set[which(names(table(periods.per.cs.obs)) ==
                                                                           t.i)])
    }
  }
  legend(title = expression(T[i]), x.intersp = 0.2, x = max(t.set) +
           0.5, y = max(i.set.ind), legend = rev(names(if (sum(periods.per.cs.obs ==
                                                               0) > 0) {
             table(periods.per.cs.obs)[-1]
           } else {
             table(periods.per.cs.obs)
           })), fill = rev(col.set), border = rev(col.set), bg = "white",
         bty = "n")
  graphics::box()
  graphics::par(mar = par.mar.def, xpd = par.xpd.def)
}


















#' Plot Empirical Density of a Column of a Panel Dataset over Time.
#'
#' \code{pDensTime.plot} Plot the empirical density
#'    of a column of an object of class `data.frame`
#'    containing a panel dataset across time
#'    periods/aggregates of time periods.
#'
#' @param object An object of class `data.frame`.
#' @param i.name Column name of cross-section identifier.
#' @param t.name Column name of time-series identifier.
#' @param var.name Column name of the variable that is
#'    plotted (see `Details').
#' @param aggregate.t Argument of data type `numeric'.
#'    If argument is specified, the corresponding number
#'    of time periods is merged (approximately); (defaults
#'    to 'NULL').
#' @param plot.quantiles Argument of data type `logical',
#'    indicating whether the 5\%- and 95\%-quantiles and
#'    the quartiles should be plotted (as specified by
#'    `col.set[2]'; defaults to 'TRUE').
#' @param plot.mean_ci Argument of data type `logical',
#'    indicating whether the mean and the approximate
#'    confidence intervals (mean plus/minus 2 standard
#'    deviations) should be plotted (as specified by
#'    `col.set[3]'; defaults to 'TRUE').
#' @param plot.extrema Argument of data type `logical',
#'    indicating whether the minimal and maximal observed
#'    value (per time period/group) should be plotted
#'    (as specified by `col.set[4]'; defaults to 'TRUE').
#' @param col.set Vector of length 4 with entries of
#'    data type `character' used to visualize the entities
#'    of `pDensTime.plot' (see `Details'); must be a
#'    valid argument to `col2rgb'; defaults to
#'    `c("gray", "navy", "darkorange1", "red")'.
#' @param ... further arguments.
#'
#' @return Returns a plot that visualizes the empirical
#'    density for a column of a panel dataset
#'    contained in an object of class `data.frame`. The
#'    variable of interest is plotted on the ordinate,
#'    the longitudinal dimension on the abscissa. For each
#'    time period or aggregate of time periods, one
#'    empirical density is computed and plotted.
#'    Corresponding summary statistics on empirical
#'    quantiles and the sample size per longitudinal
#'    dimension are included in the plot.
#'
#' @author Markus Fritsch, Joachim Schnurbus
#' @export
#' @importFrom graphics abline
#' @importFrom graphics axis
#' @importFrom graphics box
#' @importFrom graphics lines
#' @importFrom graphics plot
#' @importFrom graphics polygon
#' @importFrom graphics text
#' @importFrom stats density
#' @importFrom stats median
#' @importFrom stats quantile
#' @importFrom stats sd
#'
#' @seealso
#'
#' \code{\link{pdynmc}} for fitting a linear dynamic panel data model.
#'
#' @examples
#' ## Load data
#' data(ABdata, package = "pdynmc")
#' dat <- ABdata
#' dat[,c(4:7)] <- log(dat[,c(4:7)])
#'
#' ## Minimal set of arguments
#' pDensTime.plot(object = ABdata, var.name = "emp", i.name = "firm", t.name = "year")
#'
#' ## All arguments explicitly stated
#' pDensTime.plot(object	= ABdata, var.name	= "emp", i.name	= "firm", t.name	= "year",
#'  aggregate.t	= NULL,	plot.quantiles	= TRUE, plot.mean_ci	= TRUE, plot.extrema	= TRUE,
#'  col.set		= c("gray", "navy", "darkorange1", "red"))
#'
#' ## Aggregation over time periods (3 time periods per group)
#' pDensTime.plot(object = ABdata, var.name = "emp", i.name = "firm", t.name = "year",
#'  aggregate.t = 3)
#'
#' ## Employ alternative colouring scheme
#' pDensTime.plot(object = ABdata, var.name = "emp", i.name = "firm", t.name = "year",
#'  col.set		= c("pink", "blue", "purple", "black"))
#'
#' ## Plot only density, mean, and asymptotic confidence interval
#' pDensTime.plot(object = ABdata, var.name = "emp", i.name = "firm", t.name = "year",
#'  plot.quantiles	= FALSE, plot.extrema	= FALSE)
#'
pDensTime.plot	<- function(
    object,
    var.name,
    i.name,
    t.name,
    aggregate.t	= NULL,	# if numeric, the corresponding number of time periods is merged (approximately)
    plot.quantiles	= TRUE,
    plot.mean_ci	= TRUE,
    plot.extrema	= TRUE,
    col.set		= c("gray", "navy", "darkorange1", "red"),
    ...
){

  if (!is.data.frame(object)) {
    stop("Function 'pDensTime.plot' applied to non `data.frame`.")
  }
  if (is.null(i.name) || is.null(t.name)) {
    stop("Cross-section dimension and longitudinal dimension need to be specified.")
  }

  t.lab.txt	<- sort(unique(object[, t.name]))
  if(!length(unique(diff(t.lab.txt))) == 1){
    stop("Longitudinal dimension not equidistant.")
  }

  t.lab.dst	<- unique(diff(t.lab.txt))
  t.set	    <- sort(unique(object[, t.name]))
  t.set.sc	<- sort(unique(object[, t.name]))/t.lab.dst

  i.set	<- sort(unique(object[, i.name]))
  da		<- object[, var.name]
  obj.tmp			<- object
  obj.tmp[, t.name]	<- obj.tmp[, t.name]/t.lab.dst




  if(is.numeric(aggregate.t)){
    ###
    ###	aggregated version (to approximately aggregate.t number of time periods per group)
    ###

    t.cat			<- floor((t.set.sc - min(t.set.sc))/aggregate.t) + 1
    t.cat.values	<- floor((obj.tmp[, t.name] - min(obj.tmp[, t.name]))/aggregate.t) + 1
    if(t.cat[length(t.cat)] != t.cat[length(t.cat) - 1]){
      t.cat.values[t.cat.values == t.cat[length(t.cat)]] <- t.cat[length(t.cat) - 1]
      t.cat[length(t.cat)] <- t.cat[length(t.cat) - 1]
    }
    t.cat.set		<- unique(t.cat)

    t.cat.names		<- rep(NA, length(t.cat.set))
    for(i in 1:length(t.cat.names)){
      t.cat.temp	<- t.cat.set[i]
      t.cat.names[i]	<- paste(range(t.set[t.cat == t.cat.temp]), collapse = "-")
    }



    dat.t.dens	<- list()
    for(el in t.cat.set){
      dat.t.dens[[paste("t.", el, sep = "")]]	<- density(da[obj.tmp[, t.name] %in% t.set.sc[t.cat == el]])
    }
    maxy		<- function(da){max(da$y)}
    dens.max	<- max(sapply(X = dat.t.dens, FUN = maxy))





    plot(
      x		= t.cat.set,
      y   = obj.tmp[,var.name][t.cat.set],
      xlim	= range(t.cat.set) + c(-0.5, 1),
      ylim	= range(obj.tmp[,var.name]) + c(-0.5, 1),
      type	= "n",
      xaxs	= "i",
      yaxs	= "i",
      xaxt	= "n",
      ...
    )
    axis(side = 1, at = t.cat.set, labels = t.cat.names)

    abline(v = t.cat.set, col = col.set[1])
    for(el in t.cat.set){
      polygon(
        x	= el + (dat.t.dens[[paste("t.", el, sep = "")]]$y/dens.max),
        y	= dat.t.dens[[paste("t.", el, sep = "")]]$x,
        col = col.set[1], border = col.set[1]
      )
    }

    if(plot.quantiles){
      lines(x = t.cat.set, y = tapply(X = da, INDEX = t.cat.values, FUN = median), col = col.set[2], type = "b", pch = 19)
      lines(x = t.cat.set, y = tapply(X = da, INDEX = t.cat.values, FUN = quantile, probs = 0.25), col = col.set[2], type = "b", pch = 19, lty = 2)
      lines(x = t.cat.set, y = tapply(X = da, INDEX = t.cat.values, FUN = quantile, probs = 0.75), col = col.set[2], type = "b", pch = 19, lty = 2)
      lines(x = t.cat.set, y = tapply(X = da, INDEX = t.cat.values, FUN = quantile, probs = 0.05), col = col.set[2], type = "b", pch = 19, lty = 3)
      lines(x = t.cat.set, y = tapply(X = da, INDEX = t.cat.values, FUN = quantile, probs = 0.95), col = col.set[2], type = "b", pch = 19, lty = 3)
      quant.temp	<- quantile(da[t.cat.values == min(t.cat.set)], probs = c(0.05, 0.25, 0.5, 0.75, 0.95))
      text(x = min(t.cat.set) - 0.2, y = quant.temp, labels = paste("q", c("05", "25", "50", "75", "95"), sep = ""), col = rep(col.set[2], 5))
    }

    if(plot.mean_ci){
      mp2s	<- function(da){mean(da) + 2*sd(da)}
      mm2s	<- function(da){mean(da) - 2*sd(da)}
      lines(x = t.cat.set, y = tapply(X = da, INDEX = t.cat.values, FUN = mean), col = col.set[3], type = "b", pch = 19)
      lines(x = t.cat.set, y = tapply(X = da, INDEX = t.cat.values, FUN = mp2s), col = col.set[3], type = "b", pch = 19, lty = 3)
      lines(x = t.cat.set, y = tapply(X = da, INDEX = t.cat.values, FUN = mm2s), col = col.set[3], type = "b", pch = 19, lty = 3)
      text(x = max(t.cat.set) + 0.35, y = mean(da[t.cat.values == max(t.cat.set)]), labels = "Mean", col = col.set[3])
      text(x = max(t.cat.set) + 0.4, y = mean(da[t.cat.values == max(t.cat.set)]) - 2*sd(da[t.cat.values == max(t.cat.set)]), labels = "M-2SD", col = col.set[3])
      text(x = max(t.cat.set) + 0.4, y = mean(da[t.cat.values == max(t.cat.set)]) + 2*sd(da[t.cat.values == max(t.cat.set)]), labels = "M+2SD", col = col.set[3])
    }

    if(plot.extrema){
      lines(x = t.cat.set, y = tapply(X = da, INDEX = t.cat.values, FUN = quantile, probs = 0), col = col.set[4], type = "l", pch = 19, lty = 1)
      lines(x = t.cat.set, y = tapply(X = da, INDEX = t.cat.values, FUN = quantile, probs = 1), col = col.set[4], type = "l", pch = 19, lty = 1)
      quant.temp	<- quantile(da[t.cat.values == min(t.cat.set)], probs = 0:1)
      text(x = min(t.cat.set) - 0.2, y = quant.temp, labels = c("Min", "Max"), col = rep(col.set[4], 2))
    }

    axis(side = 3, at = t.cat.set, labels = paste("N=", tapply(X = da, INDEX = t.cat.values, FUN = length), sep = ""))
    box()

  } else {		# i.e., aggregate.t is not a number
    ###
    ###	disaggregated version, i.e., one density per period
    ###


    dat.t.dens	<- list()
    for(el in t.set.sc){
      dat.t.dens[[paste("t.", el, sep = "")]]	<- density(da[obj.tmp[, t.name] == el])
    }
    maxy		<- function(da){max(da$y)}
    dens.max	<- max(sapply(X = dat.t.dens, FUN = maxy))



    plot(
      x	= t.set.sc,
      y   = obj.tmp[,var.name][t.set.sc],
      xlim	= range(t.set.sc) + c(-0.5, 1),
      ylim	= range(obj.tmp[,var.name]) + c(-0.5, 1),
      type	= "n",
      xaxs	= "i",
      yaxs	= "i",
      xaxt	= "n",
      ...
    )
    axis(side = 1, at = t.set.sc, labels = t.lab.txt)

    abline(v = t.set.sc, col = col.set[1])
    for(el in t.set.sc){
      polygon(
        x	= el + (dat.t.dens[[paste("t.", el, sep = "")]]$y/dens.max),
        y	= dat.t.dens[[paste("t.", el, sep = "")]]$x,
        col = col.set[1], border = col.set[1]
      )
    }

    if(plot.quantiles){
      lines(x = t.set.sc, y = tapply(X = da, INDEX = obj.tmp[, t.name], FUN = median), col = col.set[2], type = "b", pch = 19)
      lines(x = t.set.sc, y = tapply(X = da, INDEX = obj.tmp[, t.name], FUN = quantile, probs = 0.25), col = col.set[2], type = "b", pch = 19, lty = 2)
      lines(x = t.set.sc, y = tapply(X = da, INDEX = obj.tmp[, t.name], FUN = quantile, probs = 0.75), col = col.set[2], type = "b", pch = 19, lty = 2)
      lines(x = t.set.sc, y = tapply(X = da, INDEX = obj.tmp[, t.name], FUN = quantile, probs = 0.05), col = col.set[2], type = "b", pch = 19, lty = 3)
      lines(x = t.set.sc, y = tapply(X = da, INDEX = obj.tmp[, t.name], FUN = quantile, probs = 0.95), col = col.set[2], type = "b", pch = 19, lty = 3)
      quant.temp	<- quantile(da[obj.tmp[, t.name] == min(t.set)], probs = c(0.05, 0.25, 0.5, 0.75, 0.95))
      text(x = min(t.set.sc) - 0.2, y = quant.temp, labels = paste("q", c("05", "25", "50", "75", "95"), sep = ""), col = rep(col.set[2], 5))
    }

    if(plot.mean_ci){
      mp2s	<- function(da){mean(da) + 2*sd(da)}
      mm2s	<- function(da){mean(da) - 2*sd(da)}
      lines(x = t.set.sc, y = tapply(X = da, INDEX = obj.tmp[, t.name], FUN = mean), col = col.set[3], type = "b", pch = 19)
      lines(x = t.set.sc, y = tapply(X = da, INDEX = obj.tmp[, t.name], FUN = mp2s), col = col.set[3], type = "b", pch = 19, lty = 3)
      lines(x = t.set.sc, y = tapply(X = da, INDEX = obj.tmp[, t.name], FUN = mm2s), col = col.set[3], type = "b", pch = 19, lty = 3)
      text(x = max(t.set.sc) + 0.35, y = mean(da[obj.tmp[, t.name] == max(t.set)]), labels = "Mean", col = col.set[3])
      text(x = max(t.set.sc) + 0.4, y = mean(da[obj.tmp[, t.name] == max(t.set)]) - 2*sd(da[obj.tmp[, t.name] == max(t.set)]), labels = "M-2SD", col = col.set[3])
      text(x = max(t.set.sc) + 0.4, y = mean(da[obj.tmp[, t.name] == max(t.set)]) + 2*sd(da[obj.tmp[, t.name] == max(t.set)]), labels = "M+2SD", col = col.set[3])
    }

    if(plot.extrema){
      lines(x = t.set.sc, y = tapply(X = da, INDEX = obj.tmp[, t.name], FUN = quantile, probs = 0), col = col.set[4], type = "l", pch = 19, lty = 1)
      lines(x = t.set.sc, y = tapply(X = da, INDEX = obj.tmp[, t.name], FUN = quantile, probs = 1), col = col.set[4], type = "l", pch = 19, lty = 1)
      quant.temp	<- quantile(da[obj.tmp[, t.name] == min(t.set)], probs = 0:1)
      text(x = min(t.set.sc) - 0.2, y = quant.temp, labels = c("Min", "Max"), col = rep(col.set[4], 2))
    }

    axis(side = 3, at = t.set.sc, labels = paste("N=", tapply(X = da, INDEX = obj.tmp[, t.name], FUN = length), sep = ""))
    box()

  }

}