R/QuickUDS.R

#'Prepare democracy data before replicating the UDS model
#'
#'This function is designed to take the democracy data included in this package
#'and put it in a form suitable for use with the \code{\link{mirt}} package to
#'replicate the UDS model. It takes a data frame and tries to determine, from
#'the colum names, which variables contain democracy scores.
#'
#'If the column names contain the strings \code{arat}, \code{blm},
#'\code{bollen},\code{wgi}, \code{hadenius}, \code{munck}, \code{pacl},
#'\code{peps}, \code{polyarchy_inclusion_dimension},
#'\code{polyarchy_contestation_dimension}, \code{polity}, \code{prc},
#'\code{v2x}, \code{vanhanen_pmm}, or \code{vanhanen_democratization}, the
#'function performs the following transformations by default:
#'
#'\code{arat}: Following Pemstein, Meserve, and Melton's replication code
#'(Pemstein, Meserve, and Melton 2013), the function cuts Arat (1991)'s 0-109
#'democracy score into 7 intervals with the following cutoffs: 50, 60, 70, 80,
#'90, and 100. The resulting score is ordinal from 1 to 8.
#'
#'\code{bollen}: Following Pemstein, Meserve, and Melton's replication code
#'(Pemstein, Meserve, and Melton 2013), the function cuts Bollen's (2001)'s
#'0-100 democracy score into 10 intervals with the following cutoffs:
#'10,20,30,40,50,60,70,80, and 90. The resulting score is ordinal from 1 to 10.
#'
#'\code{wgi}: If the World Governance Indicator's index of voice and
#'acocuntability is included in the file, the function cuts it into 20
#'categories. The resulting score is ordinal from 1 to 20.
#'
#'\code{hadenius}: Following Pemstein, Meserve, and Melton's replication code
#'(Pemstein, Meserve, and Melton 2013), the function cuts Hadenius (1992)'s 0-10
#'democracy score into 8 intervals with the following cutoffs: 1, 2,3,4, 7, 8,
#'and 9. The resulting score is ordinal from 1 to 8.
#'
#'\code{munck}: Following Pemstein, Meserve, and Melton's replication code
#'(Pemstein, Meserve, and Melton 2013), the function cuts Munck's (2009)'s 0-1
#'democracy score into 4 intervals with the following cutoffs: 0.5,0.5,0.75, and
#'0.99. The resulting score is ordinal from 1 to 4.
#'
#'\code{peps}: If any of the variants of the Participation-Enhanced Polity Score
#'(Moon et al 2006) is included in the file, the function rounds its value
#'(eliminates the decimal) and then transforms it into an ordinal measure from 1
#'to 21.
#'
#'\code{polity}: Following Pemstein, Meserve, and Melton's replication code
#'(Pemstein, Meserve, and Melton 2013), the function takes the polity scores and
#'puts NA for any values below -10, and then transforms it into an ordinal
#'measure from 1 to 21.
#'
#'\code{polyarchy_inclusion_dimension}, \code{polyarchy_contestation_dimension}:
#'If any of the polyarchy inclusion or contestation dimensions from Coppedge,
#'Alvarez and Maldonado (2008)  are included, it cuts them into into 20
#'categories. The resulting score is ordinal from 1 to 20.
#'
#'\code{v2x}: If any of the v2x_ continuous indexes of democracy from the V-Dem
#'dataset (Coppedge et al 2015) are included in the file, the function cuts them
#'into 20 categories. The resulting score is ordinal from 1 to 20.
#'
#'\code{vanhanen_democratization} or \code{vanhanen_pmm}: Following Pemstein,
#'Meserve, and Melton's replication code (Pemstein, Meserve, and Melton 2013),
#'the function cuts Vanhanen's (2012)'s index of democratization into 8
#'intervals with the following cutoffs: 5,10,15,20,25,30, and 35. The resulting
#'score is ordinal from 1 to 8.
#'
#'The function also recognizes the following column names (or partial column
#'names - it also recognizes, e.g., pmm_blm) as measures of democracy:
#'\code{anckar} (from Anckar and Fredriksson 2018), \code{blm} (from Bowman,
#'Lehoucq, and Mahoney 2005), \code{bmr} (from Boix, Miller, and Rosato 2012),
#'\code{doorenspleet} (from Doorenspleet 2000), \code{e_v2x} (the "ordinal"
#'indexes from the V-dem project, Coppedge at al 2015), \code{freedomhouse} or
#'\code{fh} (from Freedom House - freedom scale must be reversed so that "more
#'freedom" is higher), \code{gwf} (from Geddes, Wright, and Frantz 2014 - the
#'dichotomous democracy indicator only), \code{kailitz} (from Kailitz 2013 -
#'democracy/non-democracy indicator), \code{lied} or \code{lexical_index} (from
#'Skaaning, Gerring, and Bartusevicius 2015), \code{mainwaring} (from Mainwaring
#'and Perez Linan 2008), \code{magaloni} (from Magaloni, Min, Chu 2013 -
#'democracy/non-democracy indicator), \code{pacl} (from Cheibub, Gandhi, and
#'Vreeland 2010), \code{pitf} (from Goldstone et al 2010 or Taylor and Ulfelder
#'2015), \code{polyarchy} (from Coppedge and Reinicke 1991), \code{prc} (from
#'Gasiorowski 1996 or Reich 2002), \code{PIPE} (from Przeworski 2010),
#'\code{reign} (from Bell 2016), \code{svmdi} (from Grundler and Krieger 2018,
#'2016), \code{svolik} (from Svolik 2012, democracy/dictatorship indicator
#'only), \code{ulfelder} (from Ulfelder 2012), \code{utip} (from Hsu 2008), and
#'\code{wth} or \code{wahman_teorell_hadenius} (from Wahman, Teorell, and
#'Hadenius 2013). In each of these cases the function transforms the values of
#'these scores by running \code{as.numeric(unclass(factor(x)))}, which
#'transforms them into ordinal variables from 1 to the number of categories.
#'
#'For details of these scores, see the documentation for
#'\code{\link{democracy}}.
#'
#'It is also possible to change these defaults.
#'
#'@section Note: Warning! The function does not perform any sanity checks. It
#'  will try to transform anything that has the right name. You should always
#'  check the results make sense.
#'
#'@param data A dataset of democracy scores. For the function to do anything,
#'  the column names must contain at least one of the following strings:
#'  \code{anckar}, \code{arat}, \code{blm}, \code{bmr}, \code{bollen}, \code{doorenspleet},
#'  \code{wgi}, \code{gwf}, \code{hadenius}, \code{kailitz}, \code{lied},
#'  \code{munck}, \code{pacl}, \code{peps}, \code{polyarchy}, \code{polity},
#'  \code{prc}, \code{PIPE}, \code{svmdi}, \code{svolik}, \code{ulfelder}, \code{utip},
#'  \code{v2x}, \code{vanhanen_democratization}, \code{vanhanen_pmm}, or
#'  \code{wth}. For details of these variables, see the documentation for
#'  \code{\link{democracy}}.
#'@param .funs A names list of functions to modify the columns. It defaults to
#'  the following:
#'
#'  \code{funs(arat = cut(., breaks = c(0, 50, 60, 70, 80, 90, 100, 109), labels
#'  = 1:7, include.lowest = TRUE, right = FALSE),
#'
#'  hadenius = cut(., breaks =  c(0, 1, 2, 3, 4, 7, 8, 9, 10), labels = 1:8,
#'  include.lowest = TRUE, right = FALSE),
#'
#'  bollen = cut(., breaks = c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100),
#'  labels = 1:10, include.lowest = TRUE, right = FALSE),
#'
#'  vanhanen = cut(., breaks = c(0, 5, 10, 15, 20, 25, 30, 35, 50), labels =
#'  1:8, include.lowest = TRUE, right = FALSE),
#'
#'  munck = cut(., breaks = c(0, 0.5, 0.75, 0.99, 1), labels = 1:4,
#'  include.lowest = TRUE, right = FALSE),
#'
#'  polyarchy_dimensions = cut(., breaks = 20, include.lowest = TRUE, right =
#'  FALSE, ordered_result = TRUE),
#'
#'  polity = ifelse(. < -10, NA, .), v2x = cut(., breaks = 20, include.lowest =
#'  TRUE, right = FALSE, ordered_result = TRUE),
#'
#'  v2x_* = cut(., breaks = 20, include.lowest = TRUE, right = FALSE,
#'  ordered_result = TRUE),
#'
#'  svmdi = cut(., breaks = 20, include.lowest = TRUE, right = FALSE,
#'  ordered_result = TRUE),
#'
#'  eiu = cut(., breaks = 20, include.lowest = TRUE, right = FALSE,
#'  ordered_result = TRUE),
#'
#'  wgi = cut(., breaks = 20, include.lowest = TRUE, right = FALSE,
#'  ordered_result = TRUE),
#'
#'  peps = round(.),
#'
#'  other = as.numeric(unclass(factor(.))))}
#'
#'@return A data frame with the transformed scores, if any.
#'@import dplyr
#'@export
#'
#' @examples
#' summary(democracy)
#' summary(prepare_data(democracy))
#'@references
#'
#' Anckar, Carsten and C. Fredriksson. 2018. "Classifying political regimes
#' 1800-2016: a typology and a new dataset".  European Political Science. DOI:
#' 10.1057/s41304-018-0149-8. Data and codebook available at
#' \url{https://doi.org/10.1057/s41304-018-0149-8}.
#'
#'Arat, Zehra F. 1991. Democracy and human rights in developing countries.
#'Boulder: Lynne Rienner Publishers.
#'
#'Bell, Curtis. 2016. The Rulers, Elections, and Irregular Governance Dataset
#'(REIGN).\url{http://oefresearch.org/datasets/reign}
#'
#'Boix, Carles, Michael Miller, and Sebastian Rosato. 2012. A Complete Data Set
#'of Political Regimes, 1800-2007. Comparative Political Studies 46 (12):
#'1523-1554. Original data available at
#'\url{https://sites.google.com/site/mkmtwo/democracy-v2.0.dta?attredirects=0}
#'
#'Bollen, Kenneth A. 2001. "Cross-National Indicators of Liberal Democracy,
#'1950-1990." 2nd ICPSR version. Chapel Hill, NC: University of North Carolina,
#'1998. Ann Arbor, MI: Inter-university Consortium for Political and Social
#'Research, 2001. Original data available at
#'\url{http://webapp.icpsr.umich.edu/cocoon/ICPSR-STUDY/02532.xml}.
#'
#'Bowman, Kirk, Fabrice Lehoucq, and James Mahoney. 2005. Measuring Political
#'Democracy: Case Expertise, Data Adequacy, and Central America. Comparative
#'Political Studies 38 (8): 939-970.
#'\url{http://cps.sagepub.com/content/38/8/939}. Data available at
#'\url{http://www.blmdemocracy.gatech.edu/}.
#'
#'Cheibub, Jose Antonio, Jennifer Gandhi, and James Raymond Vreeland. 2010.
#'"Democracy and Dictatorship Revisited." Public Choice. 143(1):67-101. Original
#'data available at
#'\url{https://sites.google.com/site/joseantoniocheibub/datasets/democracy-and-dictatorship-revisited}.
#'
#'Coppedge, Michael, John Gerring, Staffan I. Lindberg, Svend-Erik Skaaning, and
#'Jan Teorell, with David Altman, Michael Bernhard, M. Steven Fish, Adam Glynn,
#'Allen Hicken, Carl Henrik Knutsen, Kelly McMann, Pamela Paxton, Daniel
#'Pemstein, Jeffrey Staton, Brigitte Zimmerman, Frida Andersson, Valeriya
#'Mechkova, Farhad Miri. 2015. V-Dem Codebook v5. Varieties of Democracy (V-Dem)
#'Project. Original data available at \url{https://v-dem.net/en/data/}.
#'
#'Coppedge, Michael and Wolfgang H. Reinicke. 1991. Measuring Polyarchy. In On
#'Measuring Democracy: Its Consequences and Concomitants, ed. Alex Inkeles. New
#'Brunswuck, NJ: Transaction pp. 47-68.
#'
#'Coppedge, A. Alvarez and C. Maldonado. 2008. "Two Persistent Dimensions of
#'Democracy: Contestation and Inclusiveness". The Journal of Politics 70.03, pp.
#'632-647. DOI: 10.1017/S0022381608080663.
#'
#'Doorenspleet, Renske. 2000. Reassessing the Three Waves of Democratization.
#'World Politics 52 (03): 384-406. DOI: 10.1017/S0043887100016580.
#'\url{http://dx.doi.org/10.1017/S0043887100016580}.
#'
#'Kaufmann, D. and A. Kraay. 2016. Worldwide Governance Indicators. 2016.
#'\url{http://www.govindicators.org}.
#'
#'Freedom House. 2015. "Freedom in the World." Original data available at
#'\url{http://www.freedomhouse.org}.
#'
#'Gasiorowski, Mark J. 1996. "An Overview of the Political Regime Change
#'Dataset." Comparative Political Studies 29(4):469-483.
#'
#'Geddes, Barbara, Joseph Wright, and Erica Frantz. 2014. Autocratic Breakdown
#'and Regime Transitions: A New Data Set. Perspectives on Politics 12 (1):
#'313-331. Original data available at \url{http://dictators.la.psu.edu/}.
#'
#'Goldstone, Jack, Robert Bates, David Epstein, Ted Gurr, Michael Lustik, Monty
#'Marshall, Jay Ulfelder, and Mark Woodward. 2010. A Global Model for
#'Forecasting Political Instability. American Journal of Political Science 54
#'(1): 190-208. DOI:10.1111/j.1540-5907.2009.00426.x
#'
#' Grundler, K. and T. Krieger. 2018. "Machine Learning Indices, Political
#' Institutions, and Economic Development". Report. CESifo Group Munich, 2018.
#' \url{https://www.cesifo-group.de/DocDL/cesifo1_wp6930.pdf}.
#'
#' Grundler, K. and T. Krieger. 2016. "Democracy and growth: Evidence from a
#' machine learning indicator". European Journal of Political Economy 45, pp.
#' 85-107. DOI: \url{https://doi.org/10.1016/j.ejpoleco.2016.05.005}.
#'
#'Hadenius, Axel. 1992. Democracy and Development. Cambridge: Cambridge
#'University Press.
#'
#'Hadenius, Axel & Jan Teorell. 2007. "Pathways from Authoritarianism", Journal
#'of Democracy 18(1): 143-156.
#'
#'Hsu, Sara "The Effect of Political Regimes on Inequality, 1963-2002," UTIP
#'Working Paper No. 53 (2008), http://utip.gov.utexas.edu/papers/utip_53.pdf.
#'Data available for download at http://utip.gov.utexas.edu/data/.
#'
#'Kailitz, Steffen. 2013. Classifying political regimes revisited: legitimation
#'and durability. Democratization 20 (1): 39-60. Original data available at
#'\url{http://dx.doi.org/10.1080/13510347.2013.738861}.
#'
#'Mainwaring, Scott, Daniel Brinks, and Anibal Perez Linan. 2008. "Political
#'Regimes in Latin America, 1900-2007." Original data available from
#'\url{http://kellogg.nd.edu/scottmainwaring/Political_Regimes.pdf}.
#'
#'Magaloni, Beatriz, Jonathan Chu, and Eric Min. 2013. Autocracies of the World,
#'1950-2012 (Version 1.0). Dataset, Stanford University. Original data and
#'codebook available at
#'\url{http://cddrl.fsi.stanford.edu/research/autocracies_of_the_world_dataset/}
#'
#'Marshall, Monty G., Ted Robert Gurr, and Keith Jaggers. 2012. "Polity IV:
#'Political Regime Characteristics and Transitions, 1800-2012." Updated to 2015.
#'Original data available from
#'\url{http://www.systemicpeace.org/polity/polity4.htm}.
#'
#'Moon, Bruce E., Jennifer Harvey Birdsall, Sylvia Ceisluk, Lauren M. Garlett,
#'Joshua J. Hermias, Elizabeth Mendenhall, Patrick D. Schmid, and Wai Hong Wong
#'(2006) "Voting Counts: Participation in the Measurement of Democracy" Studies
#'in Comparative International Development 42, 2 (Summer, 2006). The complete
#'dataset is available here:
#'\url{http://www.lehigh.edu/~bm05/democracy/Obtain_data.htm}.
#'
#'Munck, Gerardo L. 2009. Measuring Democracy: A Bridge Between Scholarship and
#'Politics. Baltimore: Johns Hopkins University Press.
#'
#'Pemstein, Daniel, Stephen Meserve, and James Melton. 2010. Democratic
#'Compromise: A Latent Variable Analysis of Ten Measures of Regime Type.
#'Political Analysis 18 (4): 426-449.
#'
#'Pemstein, Daniel, Stephen A. Meserve, and James Melton. 2013. "Replication
#'data for: Democratic Compromise: A Latent Variable Analysis of Ten Measures of
#'Regime Type." In: Harvard Dataverse. \url{http://hdl.handle.net/1902.1/PMM}
#'
#'Przeworski, Adam et al. 2013. Political Institutions and Political Events
#'(PIPE) Data Set. Department of Politics, New York University.
#'\url{https://sites.google.com/a/nyu.edu/adam-przeworski/home/data}
#'
#'Reich, G. 2002. Categorizing Political Regimes: New Data for Old Problems.
#'Democratization 9 (4): 1-24.
#'\url{http://www.tandfonline.com/doi/pdf/10.1080/714000289}.
#'
#'Skaaning, Svend-Erik, John Gerring, and Henrikas Bartusevicius. 2015. A
#'Lexical Index of Electoral Democracy. Comparative Political Studies 48 (12):
#'1491-1525. Original data available from
#'\url{http://thedata.harvard.edu/dvn/dv/skaaning}.
#'
#'Svolik, Milan. 2012. The Politics of Authoritarian Rule. Cambridge and New
#'York: Cambridge University Press. Original data available from
#'\url{http://campuspress.yale.edu/svolik/the-politics-of-authoritarian-rule/}.
#'
#'Taylor, Sean J. and Ulfelder, Jay, A Measurement Error Model of Dichotomous
#'Democracy Status (May 20, 2015). Available at SSRN:
#'\url{http://ssrn.com/abstract=2726962} or
#'\url{http://dx.doi.org/10.2139/ssrn.2726962}
#'
#'Ulfelder, Jay. 2012. "Democracy/Autocracy Data Set." In: Harvard Dataverse.
#'\url{http://hdl.handle.net/1902.1/18836}.
#'
#'Vanhanen, Tatu. 2012. "FSD1289 Measures of Democracy 1810-2012." Original data
#'available from
#'\url{http://www.fsd.uta.fi/english/data/catalogue/FSD1289/meF1289e.html}
#'
#'Wahman, Michael, Jan Teorell, and Axel Hadenius. 2013. Authoritarian regime
#'types revisited: updated data in comparative perspective. Contemporary
#'Politics 19 (1): 19-34.
prepare_data <- function(data,
                         .funs) {

  other_vars <- c("anckar", "blm", "bmr", "doorenspleet", "fh|freedomhouse", "gwf",
                  "lied|lexical_index", "mainwaring",
                  "magaloni", "pacl", "pitf", "polyarchy",
                  "prc", "PIPE|przeworski", "svolik", "svmdi_2016",
                  "ulfelder", "utip", "kailitz", "e_v2x",
                  "wth|wahman_teorell", "reign")

  . <- NULL

  if(missing(.funs)) {
    .funs <- funs(
      arat = cut(., breaks = c(0, 50, 60, 70, 80, 90, 100, 109),
            labels = 1:7, include.lowest = TRUE, right = FALSE),
      hadenius = cut(., breaks =  c(0, 1, 2, 3, 4, 7, 8, 9, 10),
            labels = 1:8, include.lowest = TRUE, right = FALSE),
      bollen = cut(., breaks = c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100),
            labels = 1:10, include.lowest = TRUE, right = FALSE),
      vanhanen = cut(., breaks = c(0, 5, 10, 15, 20, 25, 30, 35, 50),
            labels = 1:8, include.lowest = TRUE, right = FALSE),
      munck = cut(., breaks = c(0, 0.5, 0.75, 0.99, 1),
            labels = 1:4, include.lowest = TRUE, right = FALSE),
      polyarchy_dimensions = cut(., breaks = 20, include.lowest = TRUE,
                right = FALSE, ordered_result = TRUE),
      polity = ifelse(. < -10, NA, .),
      v2x = cut(., breaks = 20, include.lowest = TRUE,
                right = FALSE, ordered_result = TRUE),
      svmdi = cut(., breaks = 20, include.lowest = TRUE,
                right = FALSE, ordered_result = TRUE),
      wgi = cut(., breaks = 20, include.lowest = TRUE,
                right = FALSE, ordered_result = TRUE),
      eiu = cut(., breaks = 20, include.lowest = TRUE,
                right = FALSE, ordered_result = TRUE),
      peps = round(.),
      other = as.numeric(unclass(factor(.))))
  }

  other_pattern <- paste(c(other_vars,
                           "arat",
                           "hadenius",
                           "bollen",
                           "vanhanen",
                           "munck",
                           "polity",
                           "v2x",
                           "svmdi",
                           "wgi",
                           "eiu",
                           "peps"),collapse="|")

  data <- data %>%
    mutate_at(vars(matches("arat")), .funs[['arat']]) %>%
    mutate_at(vars(matches("hadenius")), .funs[['hadenius']]) %>%
    mutate_at(vars(matches("bollen")), .funs[['bollen']]) %>%
    mutate_at(vars(matches("pmm_vanhanen|vanhanen_democratization")),
              .funs[['vanhanen']]) %>%
    mutate_at(vars(matches("munck")), .funs[['munck']]) %>%
    mutate_at(vars(matches("polyarchy_(inclusion|contestation)_dimension")),
              .funs[['polyarchy_dimensions']]) %>%
    mutate_at(vars(matches("polity")), .funs[['polity']]) %>%
    mutate_at(vars(starts_with("v2x")), .funs[['v2x']]) %>%
    mutate_at(vars(starts_with("csvmdi"), starts_with("svmdi_2016")), .funs[['svmdi']]) %>%
    mutate_at(vars(matches("wgi")), .funs[['wgi']]) %>%
    mutate_at(vars(matches("eiu")), .funs[['eiu']]) %>%
    mutate_at(vars(matches("peps")), .funs[['peps']]) %>%
    mutate_at(vars(matches(other_pattern)), .funs[['other']])

  data

}

#' Prepares selected indexes in the democracy dataset for use in a UD model
#'
#' @param ... a set of measures of democracy in the \code{\link{democracy}}
#'   dataset (either bare column names, or strings). Can be a [dplyr::select]
#'   expression (but [dplyr] must be loaded for this to work).
#' @param identifiers A set of identifiers to return with the data. Defaults to
#'   \code{c("extended_country_name", "year", "GWn", "cown", "polity_ccode", "in_GW_system")}.
#'   Must exist in the [democracy] dataset.
#'
#' @return A dataset ready for use with \code{\link{democracy_model}}
#' @export
#'
#' @import dplyr
#'
#' @examples
#' prepare_democracy(dplyr::matches("pmm"))
prepare_democracy <- function(...,
                              identifiers = c("extended_country_name", "year",
                                              "GWn", "cown", "polity_ccode", "in_GW_system")) {

  cols <- check_vars(..., identifiers = identifiers)

  . <- NULL

  df <- QuickUDS::democracy %>%
    select(cols$identifiers, cols$measures) %>%
    mutate("id" = 1:nrow(QuickUDS::democracy)) %>%
    prepare_data() %>%
    tidyr::gather("variable", "value", !!!cols$measures) %>%
    filter_at("value", any_vars(!is.na(.))) %>%
    tidyr::spread("variable", "value") %>%
    arrange_at("id") %>%
    select(-matches("^id$"))


  class(df) <- c("democracydata", class(df))

  df
}

check_vars <- function(..., identifiers) {
  vars <- quos(...)

  non_selectable <- c("extended_country_name", "year",
                      "GWn", "cown", "polity_ccode", "in_GW_system",
                      "uds_2010_mean","uds_2010_median",
                      "uds_2011_mean", "uds_2011_median",
                      "uds_2014_mean", "uds_2014_median")


  if(any(names(QuickUDS::democracy %>%
               select(!!!vars)) %in%
         non_selectable)) {
    non_selectable <- names(QuickUDS::democracy %>%
                              select(!!!vars))[ names(QuickUDS::democracy %>%
                                                        select(!!!vars)) %in%
                                                  non_selectable]
    warning(sprintf("Cannot select variables %s as model variables, only as identifiers.
                    Automatically adding them as identifiers and excluding them from model variables.",
                    paste(non_selectable, collapse = ", ")))

    identifiers <- unique(c(identifiers, non_selectable))
    measures <-  names(QuickUDS::democracy %>%
                         select(!!!vars))
    measures <- measures[ !measures %in% identifiers ]
  } else {
    measures <- names(QuickUDS::democracy %>%
                        select(!!!vars))
  }

  list(identifiers = identifiers,
       measures = measures)

}

#' Probability that a country-year is more democratic than another
#'
#' @param data A UD dataset with a country_name, year, latent variable mean and
#'   latent variable standard deviation columns at least. Little sanity checking
#'   is performed - careful!
#' @param country1 The first country to compare.
#' @param country2 The second country to compare. Can be the same as
#'   \code{country1}
#' @param years Either a single year, or a length 2 vector of years.
#' @param mean_col The name of the column that contains the mean of the latent
#'   variable (defaults to \code{z1})
#' @param sd_col The name of the column that contains the standard error of the
#'   latent variable (defaults to \code{se_z1})
#' @param country_col The name of the column that contains the country name
#'   (defaults to \code{extended_country_name})
#' @param year_col The name of the column that contains the years (defaults to
#'   \code{year})
#'
#' @return The probability that the first country-year in the comparison is more
#'   democratic than the second.
#'
#' @export
#'
#' @importFrom stats pnorm
#'
#' @examples
#' # Probability that the USA in 2000 was more democratic than Brazil in 2000,
#' # according to 2010 release of UDS
#' prob_more(uds_2010, "United States of America","Brazil",
#'           2000, mean_col="mean", sd_col="sd")
#' # Probability that Brazil in 1980 was more democratic than the USA in 1980,
#' # according to 2010 release of UDS
#' prob_more(uds_2010, "Brazil","United States of America",
#'           1980, mean_col="mean", sd_col="sd")
#' # Probability that the USA in 2000 was more democratic than the USA in 1950,
#' # according to 2010 release of UDS
#' prob_more(uds_2010, "United States of America","United States of America",
#'            years = c(2000,1950), mean_col="mean", sd_col="sd")
prob_more <- function(data, country1, country2, years, mean_col = "z1",
                      sd_col = "se_z1",
                      country_col = "extended_country_name",
                      year_col = "year") {

  if(length(years) == 1) {
    years <- c(years, years)
  }
  mu <- data[[mean_col]][ data[[country_col]] == country1 &
                            data[[year_col]] == years[1]] -
    data[[mean_col]][data[[country_col]] == country2  &
                       data[[year_col]] == years[2] ]

  sigma <- sqrt((data[[sd_col]][data[[country_col]] == country1
                                & data[[year_col]] == years[1]])^2 +
                  (data[[sd_col]][data[[country_col]] == country2 &
                                    data[[year_col]] == years[2] ])^2)

  prob <- 1-pnorm(-mu/sigma)
  prob
}

#' Extract cutpoints from a UD model in a tidy format.
#'
#' This function takes a model of the democracy scores and extracts the
#' discrimination parameters, score cutpoints, and standard errors for all the
#' variables involved, putting these into a tidy data frame.
#'
#' @param model A \code{\link{mirt}} \code{\link{SingleGroupClass-class}} model
#'   of the democracy scores.
#' @param type A string specifying the cutpoint type. Can be (an abbreviation
#'   of) "score" (for score cutpoints) or "discrimination" (for discrimination
#'   parameters). Default is "score."
#'
#' @return A [tibble] with either score cutpoints for each variable used to
#'   construct the latent scores in terms of the latent variable (the default), or
#'   discrimination parameters for each variable used to construct the index.
#'   For the score cutpoints (\code{type = 'score'}), the columns
#'   \code{estimate}, \code{pct975}, and \code{pct025} report the IRT
#'   parametrization of the model estimates, a normalized measure in the same
#'   scale as the latent variable.
#' @export
#'
#' @import dplyr
#' @import mirt
#'
#' @examples
#' \donttest{
#' # Replicate the official UDS 2011 release and calculate its cutpoints
#' replication_2011_model <- democracy_model(dplyr::matches("pmm"), verbose = FALSE)
#' cutpoints(replication_2011_model)}
cutpoints <- function(model, type = "score") {
  stopifnot(class(model) == "SingleGroupClass")

  type <- match.arg(type, c("score", "discrimination"))

  # A hack to get around the "no visible binding for global variable" note

  par <- CI_2.5 <- CI_97.5 <- variable <- se <- NULL

  estimate <- pct025 <- pct975 <- coef_type <- coef <- NULL

  coefs <- as.data.frame(coef(model, as.data.frame = TRUE))

  coefs <- coefs %>%
    mutate(variable = rownames(coefs),
           coef_type = stringr::str_extract(variable,"a[0-9]+$|d[0-9]+$"),
           variable = stringr::str_replace(variable,"\\.a[0-9]+$|\\.d[0-9]+$",""))

  coefs <- coefs %>%
    group_by(variable) %>%
    mutate(estimate = -(par)/(par[1]),
           pct025 = -(CI_2.5)/(CI_2.5[1]),
           pct975 = -(CI_97.5)/(CI_97.5[1]),
           se = abs(pct975 - estimate)/1.96) %>%
    filter(!is.na(coef_type))

  num_obs <- model@Data$data %>%
    as_tibble() %>%
    summarise_all(~sum(!is.na(.))) %>%
    tidyr::gather("variable", "num_obs")

  coefs <- coefs %>%
    left_join(num_obs,
              by = "variable")

  if(type == "score") {
    coefs <- coefs %>%
      filter(!grepl("^a",coef_type)) %>%
      select(variable,
             estimate,
             pct025,
             pct975,
             se,
             num_obs)
  } else {
    coefs <- coefs %>%
      filter(grepl("^a",coef_type)) %>%
      mutate(estimate = par,
             pct025 = CI_2.5,
             pct975 = CI_97.5) %>%
      select(variable,
             estimate,
             pct025,
             pct975,
             num_obs)
  }

  coefs %>% ungroup()


}

#' Extract rater info from a UD model in a tidy format.
#'
#' @param model A \code{\link{mirt}} \code{\link{SingleGroupClass-class}} model
#'   of the democracy scores.
#'
#' @return A data frame with rater information for each democracy index over the
#'   range of the latent variable \code{theta}.
#' @export
#'
#' @import dplyr
#'
#' @examples
#' \donttest{
#' replication_2011_model <- democracy_model(dplyr::matches("pmm"), verbose = FALSE)
#' raterinfo(replication_2011_model)}
raterinfo <- function(model) {
    raters <- dimnames(model@Data$data)[[2]]
    Theta <- model@Model$Theta
    rater.info <- data.frame()

    for (i in raters) {
        rater.info <- suppressWarnings(bind_rows(rater.info,
                                data.frame(rater = i,
                                           theta = as.numeric(Theta),
                                           info = iteminfo(extract.item(model, i),
                                                           Theta = Theta))))
    }
    rater.info %>%
      as_tibble()
}

#' Produce a UD model from democracy data
#'
#' This function is a simple wrapper for [mirt::mirt] that automates the process
#' of selecting the relevant columns in the [democracy] dataset, preparing the
#' data for use with [mirt::mirt], and packaging the result in a form useful for
#' [democracy_scores]. More fine-grained control can be achieved by using
#' \code{\link{mirt}} directly; see the Vignette on replicating and extending
#' the UD scores.
#'
#' @param ... [democracy] variables to use for the model. Can be bare column
#'   names or strings, or a [dplyr::select] expression.
#' @param identifiers Identifier columns. Can be any combination of columns in
#'   the [democracy] dataset. Defaults to \code{c("extended_country_name",
#'   "year", "GWn", "cown", "polity_ccode", "uds_2010_mean", "uds_2011_mean",
#'   "uds_2014_mean")}.
#' @param verbose Passed to [mirt::mirt]; whether to print a running commentary.
#'   Default is `TRUE`.
#' @param technical Passed to [mirt::mirt]. Defaults to \code{list(NCYCLES =
#'   2500)} to ensure that extende models converge.
#'
#' @return a \code{\link{SingleGroupClass-class}} model of latent democracy
#'   scores suitable for use by \code{\link{democracy_scores}}.
#' @import dplyr
#' @import mirt
#' @export
#'
#' @examples
#' \donttest{
#' replication_2011_model <- democracy_model(dplyr::matches("pmm"), verbose = FALSE)
#' replication_2011_model
#' summary(replication_2011_model)
#' }
democracy_model <- function(..., identifiers = c("extended_country_name",
                                            "year",
                                            "GWn",
                                            "cown",
                                            "polity_ccode",
                                            "uds_2010_mean",
                                            "uds_2011_mean",
                                            "uds_2014_mean",
                                            "in_GW_system"),
                            verbose = TRUE,
                            technical = list(NCYCLES = 2500)) {

  cols <- check_vars(...,
                     identifiers = identifiers)

  data <- prepare_democracy(cols$measures,
                            identifiers = cols$identifiers)


  model <- mirt::mirt(data %>% select(cols$measures),
                      model = 1,
                      itemtype = "graded",
                      SE = TRUE,
                      verbose = verbose,
                      technical = technical)

  model@Data$identifiers <- data %>%
    select(cols$identifiers)

  model

}

#' Extract UD scores from a UD model
#'
#' This function is a simple wrapper for \code{fscores(model, full.scores =
#' TRUE, full.scores.SE = TRUE, ...)} that returns scores in a tidy data frame
#' instead of a matrix. More fine-grained control can be achieved by using
#' \code{\link{fscores}} directly.
#'
#' @param ... [democracy] variables to use for the model. Can be bare column
#'   names or strings, or a [dplyr::select] expression.
#' @param model a \code{\link{SingleGroupClass-class}} model produced by
#'   \code{\link{democracy_model}}. If missing, calculates a model using the
#'   selected columns.
#' @param adjust_to_dichotomous Whether to calculate an adjusted score where the
#'   midpoint represents the average cutpoint for dichotomous scores. See the
#'   vignette for more details. Default is `TRUE`.
#' @param as_prob Whether to output scores as 0-1 probability scales. See the
#'   vignette for more details. Default is `TRUE`.
#'
#' @return A data frame with latent variable democracy scores (the equivalent of
#'   the UDS posterior means) for all country-years in the data, with standard
#'   errors and 95% confidence intervals. The following quantities are output by
#'   default:
#'
#' @template standard-variables
#' @section UDS measures for comparison purposes: \describe{
#'
#'   \item{uds_2010_mean, uds_2011_mean, uds_2014_mean}{The UDS means for the
#'   2010, 2011, and 2014 releases, for comparison purposes}.
#'
#'   }
#'
#' @template latent-variables
#' @export
#'
#' @import dplyr
#' @import mirt
#' @importFrom rlang .data
#' @importFrom stats pnorm
#'
#' @examples
#' \donttest{
#' # Replicate the official UDS scores (2011 release)
#' democracy_scores(dplyr::matches("pmm"), verbose = FALSE)}
democracy_scores <- function(..., model,
                             adjust_to_dichotomous = TRUE,
                             as_prob = TRUE) {

  identifiers <- c("extended_country_name",
                  "year",
                  "GWn",
                  "cown",
                  "polity_ccode",
                  "in_GW_system",
                  "uds_2010_mean",
                  "uds_2011_mean",
                  "uds_2014_mean")

  . <- NULL

  if(missing(model)) {
    model <- democracy_model(..., identifiers = identifiers)
  }

  stopifnot("SingleGroupClass" %in% class(model))

  scores <- mirt::fscores(model, full.scores = TRUE, full.scores.SE = TRUE) %>%
    as_tibble() %>%
    rename_at(c("F1", "SE_F1"), quos(c("z1", "se_z1"))) %>%
    mutate_at("z1",
              funs(z1_pct975 = . + 1.96 * .data$se_z1,
                   z1_pct025 = . - 1.96 * .data$se_z1))

  scores <- model@Data$identifiers %>%
    bind_cols(scores)

  if(adjust_to_dichotomous) {
    avg_dichotomous <- cutpoints(model) %>%
      group_by_at("variable") %>%
      filter(n() == 1) %>%
      pull("estimate") %>%
      mean()

    scores <- scores %>%
      mutate_at(vars(starts_with("z1")),
                funs(adj = . - avg_dichotomous))
  }

  if(as_prob) {
    scores <- scores %>%
      mutate_at(vars(starts_with("z1")),
                funs(as_prob = pnorm(.)))
  }

  scores %>% ungroup()
}
xmarquez/QuickUDS documentation built on May 4, 2019, 1:24 p.m.