R/data.R

#' Abortion Opinion Data
#'
#' Opinions about abortion classified by gender and SES
#'
#' `Support_Abortion` is a natural response variable.
#'
#' The combinations of `Sex` and `Status` represent four independent
#' samples, having fixed `Sex`-`Status` marginal totals.  There were
#' 500 females and 600 males. Within the female group, 250 of low status and
#' 250 of high status were sampled. Similarly for the males, with 300 in each
#' of the low and high status sub-groups.
#'
#' This is an example of a product-multinomial sampling scheme. The
#' `Sex:Status` association must be included in any loglinear model where
#' the goal is to determine how attitude toward abortion depends on the others.
#'
#' Alternatively, a logit model for abortion support may provide a simpler
#' analysis.
#'
#' @name Abortion
#' @docType data
#' @format A 3-dimensional array resulting from cross-tabulating 3 variables
#' for 1100 observations. The variable names and their levels are:
#'
#' \tabular{rll}{
#'   No \tab Name                    \tab Levels \cr
#'   1  \tab `Sex`              \tab `"Female", "Male"`\cr
#'   2  \tab `Status`           \tab `"Lo", "Hi"`\cr
#'   3  \tab `Support_Abortion` \tab `"Yes", "No"`\cr
#' }
#'
#' @source
#'
#' Christensen, R. (1990).  *Log-Linear Models*, New York, NY: Springer-Verlag, p. 92, Example 3.5.2.
#'
#' Christensen, R. (1997).  *Log-Linear Models and Logistic Regression*,
#' New York, NY: Springer, p. 100, Example 3.5.2.
#' @keywords datasets
#' @examples
#'
#' data(Abortion)
#'
#'
#' ftable(Abortion)
#' mosaic(Abortion, shade=TRUE)
#'
#' # stratified by Sex
#' fourfold(aperm(Abortion, 3:1))
#' # stratified by Status
#' fourfold(aperm(Abortion, c(3,1,2)))
#'
#'
NULL





#' Traffic Accident Victims in France in 1958
#'
#' Bertin (1983) used these data to illustrate the cross-classification of data
#' by numerous variables, each of which could have various types and could be
#' assigned to various visual attributes.
#'
#' For modeling and visualization purposes, the data can be treated as a 4-way
#' table using loglinear models and mosaic displays, or as a frequency-weighted
#' data frame using a binomial response for `result` (`"Died"` vs.
#' `"Injured"`) and plots of predicted probabilities.
#'
#' `age` is an ordered factor, but arguably, `mode` should be treated
#' as ordered, with levels `Pedestrian` < `Bicycle` <
#' `Motorcycle` < `4-Wheeled` as Bertin does.  This affects the
#' parameterization in models, so we don't do this directly in the data frame.
#'
#' @name Accident
#' @docType data
#' @format A data frame in frequency form (comprising a 5 x 2 x 4 x 2 table)
#' with 80 observations on the following 5 variables.
#' \describe{
#'   \item{`age`}{an ordered factor with levels `0-9` < `10-19` < `20-29`
#'         < `30-49` < `50+`}
#'   \item{`result`}{a factor with levels `Died` `Injured`}
#'   \item{`mode`}{mode of transportation, a factor with levels `4-Wheeled` `Bicycle`
#'         `Motorcycle` `Pedestrian`}
#'   \item{`gender`}{a factor with levels `Female` `Male`}
#'   \item{`Freq`}{a numeric vector}
#' }
#'
#' @references Bertin, J. (1983), *Semiology of Graphics*, University of
#' Wisconsin Press.
#' @source Bertin (1983), p. 30; original data from the Ministere des Travaux
#' Publics
#' @keywords datasets
#' @examples
#'
#' # examples
#' data(Accident)
#' head(Accident)
#'
#' # for graphs, reorder mode
#' Accident$mode <- ordered(Accident$mode,
#'    levels=levels(Accident$mode)[c(4,2,3,1)])
#'
#' # Bertin's table
#' accident_tab <- xtabs(Freq ~ gender + mode + age + result, data=Accident)
#' structable(mode + gender ~ age + result, data=accident_tab)
#'
#' ## Loglinear models
#' ## ----------------
#'
#' # mutual independence
#' acc.mod0 <- glm(Freq ~ age + result + mode + gender,
#'                 data=Accident,
#'                 family=poisson)
#' LRstats(acc.mod0)
#'
#' mosaic(acc.mod0, ~mode + age + gender + result)
#'
#' # result as a response
#' acc.mod1 <- glm(Freq ~ age*mode*gender + result,
#'                 data=Accident,
#'                 family=poisson)
#' LRstats(acc.mod1)
#'
#' mosaic(acc.mod1, ~mode + age + gender + result,
#'     labeling_args = list(abbreviate = c(gender=1, result=4)))
#'
#' # allow two-way association of result with each explanatory variable
#' acc.mod2 <- glm(Freq ~ age*mode*gender + result*(age+mode+gender),
#'                 data=Accident,
#'                 family=poisson)
#' LRstats(acc.mod2)
#' mosaic(acc.mod2, ~mode + age + gender + result,
#'     labeling_args = list(abbreviate = c(gender=1, result=4)))
#'
#' acc.mods <- glmlist(acc.mod0, acc.mod1, acc.mod2)
#' LRstats(acc.mods)
#'
#' ## Binomial (logistic regression) models for result
#' ## ------------------------------------------------
#' library(car)  # for Anova()
#' acc.bin1 <- glm(result=='Died' ~ age + mode + gender,
#'     weights=Freq, data=Accident, family=binomial)
#' Anova(acc.bin1)
#'
#' acc.bin2 <- glm(result=='Died' ~ (age + mode + gender)^2,
#'     weights=Freq, data=Accident, family=binomial)
#' Anova(acc.bin2)
#'
#' acc.bin3 <- glm(result=='Died' ~ (age + mode + gender)^3,
#'     weights=Freq, data=Accident, family=binomial)
#' Anova(acc.bin3)
#'
#' # compare models
#' anova(acc.bin1, acc.bin2, acc.bin3, test="Chisq")
#'
#' # visualize probability of death with effect plots
#' \dontrun{
#' library(effects)
#' plot(allEffects(acc.bin1), ylab='Pr (Died)')
#'
#' plot(allEffects(acc.bin2), ylab='Pr (Died)')
#' }
#'
#'
#' #
NULL





#' Air Crash Data
#'
#' Data on all fatal commercial airplane crashes from 1993--2015. Excludes
#' small planes (less than 6 passengers) and non-commercial (cargo, military,
#' private) aircraft.
#'
#' `Phase` of the flight was cleaned by combining related variants,
#' spelling, etc.
#'
#' @name AirCrash
#' @docType data
#' @format A data frame with 439 observations on the following 5 variables.
#' \describe{
#'   \item{`Phase`}{phase of the flight, a factor with levels `en route` `landing`
#'         `standing` `take-off`
#' `unknown`}
#'   \item{`Cause`}{a factor with levels `criminal` `human error` `mechanical`
#'         `unknown` `weather`}
#'   \item{`date`}{date of crash, a Date}
#'   \item{`Fatalities`}{number of fatalities, a numeric vector}
#'   \item{`Year`}{year, a numeric vector}
#' }
#' @references Rick Wicklin,
#' <http://blogs.sas.com/content/iml/2015/03/30/visualizing-airline-crashes.html>
#' @source Originally from David McCandless,
#' <https://informationisbeautiful.net/visualizations/plane-truth-every-single-commercial-plane-crash-visualized/>,
#' with the data at
#' <https://docs.google.com/spreadsheets/d/1OvDq4_BtbR6nSnnHnjD5hVC3HQ-ulZPGbo0RDGbzM3Q/edit?usp=drive_web>,
#' downloaded April 14, 2015.
#' @keywords datasets
#' @examples
#'
#' data(AirCrash)
#' aircrash.tab <- xtabs(~Phase + Cause, data=AirCrash)
#' mosaic(aircrash.tab, shade=TRUE)
#'
#' # fix label overlap
#' mosaic(aircrash.tab, shade=TRUE,
#'        labeling_args=list(rot_labels=c(30, 30, 30, 30)))
#'
#' # reorder by Phase
#' phase.ord <- rev(c(3,4,1,2,5))
#' mosaic(aircrash.tab[phase.ord,], shade=TRUE,
#'        labeling_args=list(rot_labels=c(30, 30, 30, 30)),
#'        offset_varnames=0.5)
#'
#' # reorder by frequency
#' phase.ord <- order(rowSums(aircrash.tab), decreasing=TRUE)
#' cause.ord <- order(colSums(aircrash.tab), decreasing=TRUE)
#' mosaic(aircrash.tab[phase.ord,cause.ord], shade=TRUE,
#'        labeling_args=list(rot_labels=c(30, 30, 30, 30)))
#'
#'
#' library(ca)
#' aircrash.ca <- ca(aircrash.tab)
#' plot(aircrash.ca)
#'
NULL





#' Alligator Food Choice
#'
#' The Alligator data, from Agresti (2002), comes from a study of the primary
#' food choices of alligators in four Florida lakes. Researchers classified the
#' stomach contents of 219 captured alligators into five categories: Fish (the
#' most common primary food choice), Invertebrate (snails, insects, crayfish,
#' etc.), Reptile (turtles, alligators), Bird, and Other (amphibians, plants,
#' household pets, stones, and other debris).
#'
#' The table contains a fair number of 0 counts.
#'
#' `food` is the response variable.  `fish` is the most frequent
#' choice, and often taken as a baseline category in multinomial response
#' models.
#'
#' @name Alligator
#' @docType data
#' @format A frequency data frame with 80 observations on the following 5
#' variables.
#' \describe{
#'   \item{`lake`}{a factor with levels `George` `Hancock` `Oklawaha` `Trafford`}
#'   \item{`sex`}{a factor with levels `female` `male`}
#'   \item{`size`}{alligator size, a factor with levels `large` (>2.3m) `small` (<=2.3m)}
#'   \item{`food`}{primary food choice, a factor with levels `bird` `fish`
#'         `invert` `other` `reptile`}
#'   \item{`count`}{cell frequency, a numeric vector} }
#' @source Agresti, A. (2002). *Categorical Data Analysis*, New York:
#' Wiley, 2nd Ed., Table 7.1
#' @keywords datasets
#' @examples
#'
#' data(Alligator)
#'
#' # change from frequency data.frame to table
#' allitable <- xtabs(count ~ lake + sex + size + food, data=Alligator)
#' # Agresti's Table 7.1
#' structable(food ~ lake + sex + size, allitable)
#'
#'
#' plot(allitable, shade=TRUE)
#'
#' # mutual independence model
#' mosaic(~ food + lake + size, allitable, shade=TRUE)
#'
#' # food jointly independent of lake and size
#' mosaic(~ food + lake + size, allitable, shade=TRUE,
#'        expected = ~lake:size + food)
#'
#' if (require(nnet)) {
#' 	# multinomial logit model
#' 	mod1 <- multinom(food ~ lake + size + sex, data=Alligator, weights=count)
#' }
#'
#'
NULL





#' Effect of Exposure to Asbestos
#'
#' A two-way contingency table formed from the cross-classification of the
#' number of years of occupational exposure to asbestos and the diagnosed
#' severity of asbestosis of 1117 New York workers. Asbestosis is a chronic
#' lung disease that results in the lung tissue being scarred due to contact
#' with the fibers which can lead to severe breathing difficulties.
#'
#' `exposure` and `grade` should be regarded as ordered factors. Beh
#' and Lombardo (2022) use this data to illustrate a polynomial biplot for
#' ordered categories.
#'
#' The data summarized here was studied by Beh and Smith (2011) and comes from
#' the original data collected and published by Selikoff (1981) who examined
#' the link between asbestos exposure and asbestosis severity in 1963.
#'
#' @name Asbestos
#' @docType data
#' @format The format is:
#' \preformatted{
#'  num [1:5, 1:4] 310 212 21 25 7 36 158 35 102 35 ...
#'  - attr(*, "dimnames")=List of 2
#'   ..$ exposure: chr [1:5] "0-9" "10-19" "20-29" "30-39" ...
#'   ..$ grade   : chr [1:4] "None" "Grade 1" "Grade 2" "Grade 3"
#'   }
#' @references Beh, E. J., and D. R. Smith (2011b), Real World Occupational
#' Epidemiology, Part 2: A Visual Interpretation of Statistical Significance,
#' *Archives of Environmental & Occupational Health*, **66**, 245-248.
#'
#' Selikoff, I. J. (1981), Household Risks With Inorganic Fibers,
#' *Bulletin of the New York Academy of Medicine*, **57**, 947-961.
#' @source Beh, E. J. & Lombardo, R. (2022). Features of the Polynomial Biplot
#' for Ordered Contingency Tables, *Journal of Computational and Graphical
#' Statistics*, 31:2, 403-412, DOI: 10.1080/10618600.2021.1990773, Table 1.
#' @keywords datasets
#' @examples
#'
#' data(Asbestos)
#' # mosaic plot
#' vcd::mosaic(Asbestos, shade=TRUE, legend=FALSE)
#'
#' # do the correspondence analysis
#' library(ca)
#' Asbestos.ca <- ca(Asbestos)
#'
#' plot(Asbestos.ca, lines=TRUE)
#'
#'
NULL





#' Bartlett Data on Plum Root Cuttings
#'
#' In an experiment to investigate the effect of cutting length (two levels)
#' and planting time (two levels) on the survival of plum root cuttings, 240
#' cuttings were planted for each of the 2 x 2 combinations of these factors,
#' and their survival was later recorded.
#'
#' Bartlett (1935) used these data to illustrate a method for testing for no
#' three-way interaction in a contingency table.
#'
#'
#' @name Bartlett
#' @docType data
#' @format A 3-dimensional array resulting from cross-tabulating 3 variables
#' for 960 observations. The variable names and their levels are:
#'
#' \tabular{rll}{
#'  dim \tab Name          \tab Levels                \cr
#'    1  \tab `Alive`  \tab `"Alive", "Dead"`\cr
#'    2  \tab `Time`   \tab `"Now", "Spring"`\cr
#'    3  \tab `Length` \tab `"Long", "Short"`\cr
#' }
#'
#'
#' @references
#' Bartlett, M. S. (1935).  Contingency Table Interactions. *Journal of the Royal Statistical Society*, Supplement,
#' 1935, 2, 248-252.
#'
#' @source
#'
#' Hand, D. and Daly, F. and Lunn, A. D. and McConway, K. J. and Ostrowski, E. (1994). *A Handbook of Small Data Sets*.
#' London: Chapman & Hall, p. 15, # 19.
#' @keywords datasets
#' @examples
#'
#' data(Bartlett)
#'
#' # measures of association
#' assocstats(Bartlett)
#' oddsratio(Bartlett)
#'
#' # Test models
#'
#' ## Independence
#' MASS::loglm(formula = ~Alive + Time + Length, data = Bartlett)
#'
#' ## No three-way association
#' MASS::loglm(formula = ~(Alive + Time + Length)^2, data = Bartlett)
#'
#' # Use woolf_test() for a formal test of homogeneity of odds ratios
#' vcd::woolf_test(Bartlett)
#'
#'
#' # Plots
#' fourfold(Bartlett, mfrow=c(1,2))
#'
#' mosaic(Bartlett, shade=TRUE)
#' pairs(Bartlett, gp=shading_Friendly)
#'
NULL





#' Burt (1950) Data on Hair, Eyes, Head and Stature
#'
#' Cyril Burt (1950) gave these data, on a sample of 100 people from Liverpool,
#' to illustrate the application of a method of factor analysis (later called
#' multiple correspondence analysis) applied to categorical data.
#'
#' He presented these data initially in the form that has come to be called a
#' "Burt table", giving the univariate and bivariate frequencies for an n-way
#' frequency table.
#'
#' Burt says: "In all, 217 individuals were examined, about two-thirds of them
#' males. But, partly to simplify the calculations and partly because the later
#' observations were rather more trustworthy, I shall here restrict my analysis
#' to the data obtained from the last hundred males in the series."
#'
#' `Head` and `Stature` reflect a binary coding where people are
#' classified according to whether they are below or above the average for the
#' population.
#'
#' @name Burt
#' @docType data
#' @format A frequency data frame (representing a 3 x 3 x 2 x 2 frequency
#' table) with 36 cells on the following 5 variables.
#' \describe{
#'   \item{`Hair`}{hair color, a factor with levels `Fair` `Red` `Dark`}
#'   \item{`Eyes`}{eye color, a factor with levels `Light` `Mixed` `Dark`}
#'   \item{`Head`}{head shape, a factor with levels `Narrow` `Wide`}
#'   \item{`Stature`}{height, a factor with levels `Tall` `Short`}
#'   \item{`Freq`}{a numeric vector}
#' }
#'
#' @source Burt, C. (1950). The factorial analysis of qualitative data,
#' *British Journal of Statistical Psychology*, **3**(3), 166-185.
#' Table IX.
#' @keywords datasets
#' @examples
#'
#' data(Burt)
#' mosaic(Freq ~ Hair + Eyes + Head + Stature, data=Burt, shade=TRUE)
#'
#' #or
#' burt.tab <- xtabs(Freq ~ Hair + Eyes + Head + Stature, data=Burt)
#' mosaic(burt.tab, shade=TRUE)
#'
NULL





#' Risk Factors for Infection in Caesarian Births
#'
#' Data on infection from births by Caesarian section, classified by
#' `Risk` (two levels), whether `Antibiotics` were used (two levels)
#' and whether the Caesarian section was `Planned` or not.  The outcome is
#' `Infection` (three levels).
#'
#' @details `Infection` is regarded as the response variable here.  There are quite
#' a few 0 cells here, particularly when `Risk` is absent and the
#' Caesarian section was unplanned. Should these be treated as structural or
#' sampling zeros?
#'
#' @name Caesar
#' @docType data
#' @format A 4-dimensional array resulting from cross-tabulating 4 variables
#' for 251 observations. The variable names and their levels are:
#'
#'   \tabular{rll}{
#'   dim \tab Name \tab Levels \cr
#'     1 \tab `Infection`\tab `"Type 1", "Type 2", "None"`\cr
#'     2 \tab `Risk`\tab `"Yes", "No"` (presence of risk factors)\cr
#'     3 \tab `Antibiotics`\tab `"Yes", "No"` (were antibiotics given?)\cr
#'     4 \tab `Planned`\tab `"Yes", "No"` (was the C section planned?)\cr
#'}
#'
#'
#' @seealso \code{\link[Fahrmeir]{caesar}} for the same data recorded as a
#' frequency data frame with other variables.
#' @source
#'
#' % \cite{Fahrmeir:94}
#' Fahrmeir, L. & Tutz, G. (1994). *Multivariate
#' Statistical Modelling Based on Generalized Linear Models*. New York: Springer
#' Verlag, Table 1.1.
#' @keywords datasets
#' @examples
#'
#' data(Caesar)
#' #display table;  note that there are quite a few 0 cells
#' structable(Caesar)
#' require(MASS)
#'
#' # baseline model, Infection as response
#' Caesar.mod0 <- loglm(~Infection + (Risk*Antibiotics*Planned),
#'                      data=Caesar)
#'
#' # NB: Pearson chisq cannot be computed due to the 0 cells
#' Caesar.mod0
#'
#' mosaic(Caesar.mod0, main="Baseline model")
#'
#' # Illustrate handling structural zeros
#' zeros <- 0+ (Caesar >0)
#' zeros[1,,1,1] <- 1
#' structable(zeros)
#'
#' # fit model excluding possible structural zeros
#' Caesar.mod0s <- loglm(~Infection + (Risk*Antibiotics*Planned),
#'                       data=Caesar,
#' 	                    start=zeros)
#' Caesar.mod0s
#'
#' anova(Caesar.mod0, Caesar.mod0s, test="Chisq")
#'
#' mosaic (Caesar.mod0s)
#'
#' # what terms to add?
#' add1(Caesar.mod0, ~.^2, test="Chisq")
#'
#' # add Association of Infection:Antibiotics
#' Caesar.mod1 <- update(Caesar.mod0, ~ . + Infection:Antibiotics)
#' anova(Caesar.mod0, Caesar.mod1, test="Chisq")
#'
#' mosaic(Caesar.mod1,
#'        gp=shading_Friendly,
#'        main="Adding Infection:Antibiotics")
#'
#'
NULL





#' Survival of Breast Cancer Patients
#'
#' Three-year survival of 474 breast cancer patients according to nuclear grade
#' and diagnostic center.
#'
#'
#' @name Cancer
#' @docType data
#' @format A 3-dimensional array resulting from cross-tabulating 3 variables
#' for 474 observations. The variable names and their levels are:
#'
#' \tabular{rll}{
#'   dim \tab Name \tab Levels \cr
#'     1\tab `Survival`\tab  `"Died", "Surv"`\cr
#'     2\tab `Grade`\tab `"Malignant", "Benign"`\cr
#'     3\tab `Center`\tab `"Boston", "Glamorgan"`\cr
#'     }
#' @source
#'
#' Lindsey, J. K. (1995).
#' *Analysis of Frequency and Count Data*. Oxford, UK: Oxford University Press. p.
#' 38, Table 2.5.
#'
#' Whittaker, J. (1990). *Graphical Models in Applied Multivariate Statistics*. New
#' York: John Wiley and Sons, p. 220.
#' @keywords datasets
#' @examples
#'
#' data(Cancer)
#'
#' MASS::loglm(~Survival + Grade + Center, data = Cancer)
#'
#' vcd::mosaic(Cancer, shade=TRUE)
#'
NULL





#' Advertising Behavior by Male Cormorants
#'
#' Male double-crested cormorants use advertising behavior to attract females
#' for breeding. In this study by Meagan McRae (2015), cormorants were observed
#' two or three times a week at six stations in a tree-nesting colony for an
#' entire season, April 10, 2014-July 10, 2014. The number of advertising birds
#' was counted and these observations were classified by characteristics of the
#' trees and nests.
#'
#' The goal is to determine how this behavior varies temporally over the season
#' and spatially, as well as with characteristics of nesting sites.
#'
#' Observations were made on only 2 days in weeks 3 and 4, but 3 days in all
#' other weeks. One should use log(days) as an offset, so that the response
#' measures rate.
#'
#' `Cormorants$days <- ifelse(Cormorants$week \%in\% 3:4, 2, 3)`
#'
#' @name Cormorants
#' @docType data
#' @format A data frame with 343 observations on the following 8 variables.
#'
#' \describe{
#'   \item{`category`}{Time of season, divided into 3 categories based on breeding chronology, an ordered factor with levels `Pre` < `Incubation` < `Chicks Present`}
#'   \item{`week`}{Week of the season}
#'   \item{`station`}{Station of observations on two different peninsulas in a park, a factor with levels `B1` `B2` `C1` `C2` `C3` `C4`}
#'   \item{`nest`}{Type of nest, an ordered factor with levels `no` < `partial` < `full`}
#'   \item{`height`}{Relative height of bird in the tree, an ordered factor with levels `low` < `mid` < `high`}
#'   \item{`density`}{Number of other nests in the tree, an ordered factor with levels `zero` < `few` < `moderate` < `high`}
#'   \item{`tree_health`}{Health of the tree the bird is advertising in, a factor with levels `dead` `healthy`}
#'   \item{`count`}{Number of birds advertising, a numeric vector}
#' }
#'
#' @source McRae, M. (2015). Spatial, Habitat and Frequency Changes in
#' Double-crested Cormorant Advertising Display in a Tree-nesting Colony.
#' Unpublished MA project, Environmental Studies, York University.
#' @keywords datasets
#' @examples
#'
#' data(Cormorants)
#' str(Cormorants)
#'
#' if (require("ggplot2")) {
#'   print(ggplot(Cormorants, aes(count)) +
#'     geom_histogram(binwidth=0.5) +
#' 	  labs(x="Number of birds advertising"))
#'
#' # Quick look at the data, on the log scale, for plots of `count ~ week`,
#' #   stratified by something else.
#'
#'   print(ggplot(Cormorants, aes(week, count, color=height)) +
#'     geom_jitter() +
#' 	  stat_smooth(method="loess", size=2) +
#' 	  scale_y_log10(breaks=c(1,2,5,10)) +
#' 	  geom_vline(xintercept=c(4.5, 9.5)))
#' }
#'
#' # ### models using week
#' fit1 <-glm(count ~ week + station + nest + height + density + tree_health,
#'            data=Cormorants,
#'            family =  poisson)
#'
#' if (requireNamespace("car"))
#'   car::Anova(fit1)
#'
#' # plot fitted effects
#' if (requireNamespace("effects"))
#'   plot(effects::allEffects(fit1))
#'
#'
NULL





#' London Cycling Deaths
#'
#' A data frame containing the number of deaths of cyclists in London from 2005
#' through 2012 in each fortnightly period.  Aberdein & Spiegelhalter (2013)
#' discuss these data in relation to the observation that six cyclists died in
#' London between Nov. 5 and Nov. 13, 2013.
#'
#'
#' @name CyclingDeaths
#' @docType data
#' @format A data frame with 208 observations on the following 2 variables.
#' \describe{
#'   \item{`date`}{a Date}
#'   \item{`deaths`}{number of deaths, a numeric vector}
#'   }
#' @references Aberdein, Jody and Spiegelhalter, David (2013). Have London's
#' roads become more dangerous for cyclists? *Significance*, 10(6),
#' 46--48.
#' @source
#' <https://www.data.gov.uk/dataset/cb7ae6f0-4be6-4935-9277-47e5ce24a11f/road-accidents-safety-data>,
#' STATS 19 data, 2005-2012, using the files `Casualty0512.csv` and
#' `Accidents0512.csv`
#' @keywords datasets
#' @examples
#'
#' data(CyclingDeaths)
#'
#' plot(deaths ~ date, data=CyclingDeaths,
#'   type="h",
#' 	lwd=3,
#' 	ylab="Number of deaths",
#' 	axes=FALSE)
#' axis(1, at=seq(as.Date('2005-01-01'),
#'                by='years',
#'                length.out=9),
#'      labels=2005:2013)
#' axis(2, at=0:3)
#'
#' # make a one-way frequency table
#' CyclingDeaths.tab <- table(CyclingDeaths$deaths)
#'
#' gf <- goodfit(CyclingDeaths.tab)
#' gf
#' summary(gf)
#'
#' rootogram(gf, xlab="Number of Deaths")
#' distplot(CyclingDeaths.tab)
#'
#' # prob of 6 or more deaths in one fortnight
#' lambda <- gf$par$lambda
#' ppois(5, lambda, lower.tail=FALSE)
#'
NULL





#' Dayton Student Survey on Substance Use
#'
#' This data, from Agresti (2002), Table 9.1, gives the result of a 1992 survey
#' in Dayton, Ohio, of 2276 high school seniors on whether they had ever used
#' alcohol, cigarettes and marijuana.
#'
#' Agresti uses the letters G (`sex`), R (`race`), A
#' (`alcohol`), C (`cigarette`), M (`marijuana`) to refer to the
#' table variables, and this usage is followed in the examples below.
#'
#' Background variables include `sex` and `race` of the respondent
#' (GR), typically treated as explanatory, so that any model for the full table
#' should include the term `sex:race`. Models for the reduced table,
#' collapsed over `sex` and `race` are not entirely unreasonable, but
#' don't permit the estimation of the effects of these variables on the
#' responses.
#'
#' The full 5-way table contains a number of cells with counts of 0 or 1, as
#' well as many cells with large counts, and even the ACM table collapsed over
#' GR has some small cell counts.  Consequently, residuals for these models in
#' mosaic displays are best represented as standardized (adjusted) residuals.
#'
#' @name DaytonSurvey
#' @docType data
#' @format A frequency data frame with 32 observations on the following 6
#' variables.
#' \describe{
#'   \item{`cigarette`}{a factor with levels `Yes` `No`}
#'   \item{`alcohol`}{a factor with levels `Yes` `No`}
#'   \item{`marijuana`}{a factor with levels `Yes` `No`}
#'   \item{`sex`}{a factor with levels `female` `male`}
#'   \item{`race`}{a factor with levels `white` `other`}
#'   \item{`Freq`}{a numeric vector}
#' }
#'
#' @references Thompson, L. (2009). *R (and S-PLUS) Manual to Accompany
#' Agresti's Categorical Data*,
#' http://www.stat.ufl.edu/~aa/cda/Thompson_manual.pdf
#' @source Agresti, A. (2002). *Categorical Data Analysis*, 2nd Ed., New
#' York: Wiley-Interscience, Table 9.1, p. 362.
#' @keywords datasets
#' @examples
#'
#' data(DaytonSurvey)
#'
#' # mutual independence
#' mod.0  <- glm(Freq ~ ., data=DaytonSurvey, family=poisson)
#'
#' # mutual independence + GR
#' mod.GR <- glm(Freq ~ . + sex*race, data=DaytonSurvey, family=poisson)
#' anova(mod.GR, test = "Chisq")
#'
#' # all two-way terms
#' mod.all2way <- glm(Freq ~ .^2, data=DaytonSurvey, family=poisson)
#' anova(mod.all2way, test = "Chisq")
#'
#' # compare models
#' LRstats(mod.0, mod.GR, mod.all2way)
#'
#' # collapse over sex and race
#' Dayton.ACM <- aggregate(Freq ~ cigarette+alcohol+marijuana,
#'                         data=DaytonSurvey,
#'                         FUN=sum)
#' Dayton.ACM
#'
NULL





#' Dependencies of R Packages
#'
#' This one-way table gives the type-token distribution of the number of
#' dependencies declared in 4983 packages listed on CRAN on January 17, 2014.
#'
#'
#' @name Depends
#' @docType data
#' @format The format is a one-way frequency table of counts of packages with
#' 0, 1, 2, ... dependencies.
#'
#' \preformatted{
#'  'table' int [1:15(1d)] 986 1347 993 685 375 298 155 65 32 19 ...
#'  - attr(*, "dimnames")=List of 1
#'  ..$ Depends: chr [1:15] "0" "1" "2" "3" ...
#' }
#' @source Using code from
#' <https://blog.revolutionanalytics.com/2013/12/a-look-at-the-distribution-of-r-package-dependencies.html>
#' @keywords datasets
#' @examples
#'
#' data(Depends)
#' plot(Depends,
#'      xlab="Number of Dependencies",
#'      ylab="Number of R Packages",
#'      lwd=8)
#'
#' # what type of distribution?
#' # Ord_plot can't classify this!
#' Ord_plot(Depends)
#'
#' \dontrun{
#' # The code below, from Joseph Rickert, downloads and tabulates the data
#' p <- as.data.frame(available.packages(),stringsAsFactors=FALSE)
#' names(p)
#'
#' pkgs <- data.frame(p[,c(1,4)])                  # Pick out Package names and Depends
#' row.names(pkgs) <- NULL                         # Get rid of row names
#' pkgs <- pkgs[complete.cases(pkgs[,2]),]         # Remove NAs
#'
#' pkgs$Depends2 <-strsplit(pkgs$Depends,",")      # split list of Depends
#' pkgs$numDepends <- as.numeric(lapply(pkgs$Depends2,length)) # Count number of dependencies in list
#' zeros <- c(rep(0,dim(p)[1] - dim(pkgs)[1]))     # Account for packages with no dependencies
#' Deps <- as.vector(c(zeros,pkgs$numDepends))     # Set up to tabulate
#' Depends <- table(Deps)
#'
#' }
#'
NULL





#' Detergent Preference Data
#'
#' Cross-classification of a sample of 1008 consumers according to (a) the
#' softness of the laundry water used, (b) previous use of detergent Brand M,
#' (c) the temperature of laundry water used and (d) expressed preference for
#' Brand X or Brand M in a blind trial.
#'
#'
#' @name Detergent
#' @docType data
#' @format A 4-dimensional array resulting from cross-tabulating 4 variables
#' for 1008 observations. The variable names and their levels are:
#'
#'   \tabular{rll}{
#' dim \tab Name \tab Levels \cr
#' 1\tab `Temperature`\tab `"High", "Low"`\cr
#' 2\tab `M_User`\tab `"Yes", "No"`\cr
#' 3\tab `Preference`\tab `"Brand X", "Brand M"`\cr
#' 4\tab `Water_softness`\tab `"Soft", "Medium", "Hard"`\cr
#' }
#'
#' @references
#' Ries, P. N. & Smith, H. (1963). The use of
#' chi-square for preference testing in multidimensional problems.
#' *Chemical Engineering Progress*, 59, 39-43.
#' @source
#'
#' Fienberg, S. E. (1980). *The Analysis of
#' Cross-Classified Categorical Data* Cambridge, MA: MIT Press, p. 71.
#' @keywords datasets
#' @examples
#'
#' data(Detergent)
#'
#' # basic mosaic plot
#' mosaic(Detergent, shade=TRUE)
#'
#' require(MASS)
#' (det.mod0 <- loglm(~ Preference + Temperature + M_User + Water_softness,
#'                    data=Detergent))
#' # examine addition of two-way terms
#' add1(det.mod0, ~ .^2, test="Chisq")
#'
#' # model for Preference as a response
#' (det.mod1 <- loglm(~ Preference + (Temperature * M_User * Water_softness),
#'                    data=Detergent))
#' mosaic(det.mod0)
#'
#'
#'
NULL





#' Survival in the Donner Party
#'
#' This data frame contains information on the members of the Donner Party, a
#' group of people who attempted to migrate to California in 1846. They were
#' trapped by an early blizzard on the eastern side of the Sierra Nevada
#' mountains, and before they could be rescued, nearly half of the party had
#' died.
#'
#' What factors affected who lived and who died?
#'
#' This data frame uses the person's name as row labels. `family` reflects
#' a recoding of the last names of individuals to reduce the number of factor
#' levels. The main families in the Donner party were: Donner, Graves, Breen
#' and Reed. The families of Murphy, Foster and Pike are grouped as
#' `'MurFosPik'`, those of Fosdick and Wolfinger are coded as
#' `'FosdWolf'`, and all others as `'Other'`.
#'
#' `survived` is the response variable. What kind of models should be used
#' here?
#'
#' @name Donner
#' @docType data
#' @format A data frame with 90 observations on the following 5 variables.
#'
#' \describe{
#'   \item{`family`}{family name, a factor with 10 levels }
#'   \item{`age`}{age of person, a numeric vector}
#'   \item{`sex`}{a factor with levels `Female` `Male`}
#'   \item{`survived`}{a numeric vector, 0 or 1}
#'   \item{`death`}{date of death for those who died before rescue, a POSIXct}
#' }
#'
#' @seealso `donner` in \pkg{alr3}, \code{\link[Sleuth2]{case2001}} in
#' \pkg{Sleuth2} (adults only) provide similar data sets.
#'
#' @references Ramsey, F.L. and Schafer, D.W. (2002).  *The Statistical
#' Sleuth: A Course in Methods of Data Analysis*, (2nd ed), Duxbury.
#'
#' Friendly, M. and Meyer, D. (2016).  *Discrete Data Analysis with R:
#' Visualization and Modeling Techniques for Categorical and Count Data*.  Boca
#' Raton, FL: Chapman & Hall/CRC. <http://ddar.datavis.ca>.
#' @source D. K. Grayson, 1990, "Donner party deaths: A demographic
#' assessment", *J. Anthropological Research*, **46**, 223-242.
#'
#' Johnson, K. (1996). *Unfortunate Emigrants: Narratives of the Donner
#' Party*.  Logan, UT: Utah State University Press.  Additions, and dates of
#' death from <http://user.xmission.com/~octa/DonnerParty/Roster.htm>.
#' @keywords datasets
#' @examples
#'
#' # conditional density plots
#' op <- par(mfrow=c(1,2), cex.lab=1.5)
#' cdplot(factor(survived) ~ age,
#'        subset=sex=='Male',
#'        data=Donner,
#'        main="Donner party: Males",
#'        ylevels=2:1,
#'        ylab="Survived",
#'        yaxlabels=c("yes", "no"))
#' with(Donner, rug(jitter(age[sex=="Male"]),
#'                  col="white", quiet=TRUE))
#'
#' cdplot(factor(survived) ~ age,
#'        subset=sex=='Female',
#'        data=Donner,
#'        main="Donner party: Females",
#'        ylevels=2:1,
#'        ylab="Survived",
#'        yaxlabels=c("yes", "no"))
#' with(Donner, rug(jitter(age[sex=="Female"]),
#'                  col="white", quiet=TRUE))
#' par(op)
#'
#'
#' # fit some models
#' (mod1 <- glm(survived ~ age + sex, data=Donner, family=binomial))
#' (mod2 <- glm(survived ~ age * sex, data=Donner, family=binomial))
#' anova(mod2, test="Chisq")
#'
#' (mod3 <- glm(survived ~ poly(age,2) * sex, data=Donner, family=binomial))
#' anova(mod3, test="Chisq")
#' LRstats(glmlist(mod1, mod2, mod3))
#'
#' # plot fitted probabilities from mod2 and mod3
#' # idea from: http://www.ling.upenn.edu/~joseff/rstudy/summer2010_ggplot2_intro.html
#' library(ggplot2)
#'
#' # separate linear fits on age for M/F
#' ggplot(Donner, aes(age, survived, color = sex)) +
#'   geom_point(position = position_jitter(height = 0.02, width = 0)) +
#'   stat_smooth(method = "glm",
#'               method.args = list(family = binomial),
#'               formula = y ~ x,
#'               alpha = 0.2,
#'               size=2,
#'               aes(fill = sex))
#'
#' # separate quadratics
#' ggplot(Donner, aes(age, survived, color = sex)) +
#'   geom_point(position = position_jitter(height = 0.02, width = 0)) +
#'   stat_smooth(method = "glm",
#'               method.args = list(family = binomial),
#'               formula = y ~ poly(x,2),
#'               alpha = 0.2,
#'               size=2,
#'               aes(fill = sex))
#'
#'
#'
NULL





#' USA 1970 Draft Lottery Data
#'
#' This data set gives the results of the 1970 US draft lottery, in the form of
#' a data frame.
#'
#' The draft lottery was used to determine the order in which eligible men
#' would be called to the Selective Service draft. The days of the year
#' (including February 29) were represented by the numbers 1 through 366
#' written on slips of paper. The slips were placed in separate plastic
#' capsules that were mixed in a shoebox and then dumped into a deep glass jar.
#' Capsules were drawn from the jar one at a time.
#'
#' The first number drawn was 258 (September 14), so all registrants with that
#' birthday were assigned lottery number `Rank` 1. The second number drawn
#' corresponded to April 24, and so forth.  All men of draft age (born 1944 to
#' 1950) who shared a birthdate would be called to serve at once. The first 195
#' birthdates drawn were later called to serve in the order they were drawn;
#' the last of these was September 24.
#'
#' @name Draft1970
#' @docType data
#' @format A data frame with 366 observations on the following 3 variables.
#' \describe{
#'   \item{`Day`}{day of the year, 1:366}
#'   \item{`Rank`}{draft priority rank of people born on that day}
#'   \item{`Month`}{an ordered factor with levels `Jan` < `Feb` \dots < `Dec`}
#' }
#'
#' @seealso \code{\link{Draft1970table}}
#' @references Fienberg, S. E. (1971), "Randomization and Social Affairs: The
#' 1970 Draft Lottery," *Science*, 171, 255-261.
#'
#' <https://en.wikipedia.org/wiki/Draft_lottery_(1969)>
#' @source Starr, N. (1997). Nonrandom Risk: The 1970 Draft Lottery,
#' *Journal of Statistics Education*, v.5, n.2
#' <https://jse.amstat.org/v5n2/datasets.starr.html>
#' @keywords datasets
#' @examples
#'
#' data(Draft1970)
#'
#' # scatterplot
#' plot(Rank ~ Day, data=Draft1970)
#' with(Draft1970, lines(lowess(Day, Rank), col="red", lwd=2))
#' abline(lm(Rank ~ Day, data=Draft1970), col="blue")
#'
#' # boxplots
#' plot(Rank ~ Month, data=Draft1970, col="bisque")
#'
#' lm(Rank ~ Month, data=Draft1970)
#' anova(lm(Rank ~ Month, data=Draft1970))
#'
#' # make the table version
#' Draft1970$Risk <- cut(Draft1970$Rank, breaks=3, labels=c("High", "Med", "Low"))
#' with(Draft1970, table(Month, Risk))
#'
NULL





#' USA 1970 Draft Lottery Table
#'
#' This data set gives the results of the 1970 US draft lottery, in the form of
#' a frequency table. The rows are months of the year, Jan--Dec and columns
#' give the number of days in that month which fall into each of three draft
#' risk categories High, Medium, and Low, corresponding to the chances of being
#' called to serve in the US army.
#'
#' The lottery numbers are divided into three categories of risk of being
#' called for the draft -- High, Medium, and Low -- each representing roughly
#' one third of the days in a year.  Those birthdays having the highest risk
#' have lottery numbers 1-122, medium risk have numbers 123-244, and the lowest
#' risk category contains lottery numbers 245-366.
#'
#' @name Draft1970table
#' @docType data
#' @format The format is:
#'
#' \preformatted{
#' 'table' int [1:12, 1:3] 9 7 5 8 9 11 12 13 10 9 ...
#' - attr(*, "dimnames")=List of 2
#' ..$ Month: chr [1:12] "Jan" "Feb" "Mar" "Apr" ...
#' ..$ Risk : chr [1:3] "High" "Med" "Low"
#' }
#'
#' @seealso \code{\link{Draft1970}}
#' @references Fienberg, S. E. (1971), "Randomization and Social Affairs: The
#' 1970 Draft Lottery," *Science*, 171, 255-261.
#'
#' Starr, N. (1997). Nonrandom Risk: The 1970 Draft Lottery, *Journal of
#' Statistics Education*, v.5, n.2
#' <https://jse.amstat.org/v5n2/datasets.starr.html>
#' @source This data is available in several forms, but the table version was
#' obtained from
#'
#' <https://sas.uwaterloo.ca/~rwoldfor/software/eikosograms/data/draft-70>
#' @keywords datasets
#' @examples
#'
#' data(Draft1970table)
#' chisq.test(Draft1970table)
#'
#' # plot.table -> graphics:::mosaicplot
#' plot(Draft1970table, shade=TRUE)
#' mosaic(Draft1970table, gp=shading_Friendly)
#'
#' # correspondence analysis
#' if(require(ca)) {
#'   ca(Draft1970table)
#'   plot(ca(Draft1970table))
#' }
#'
#' # convert to a frequency data frame with ordered factors
#' Draft1970df <- as.data.frame(Draft1970table)
#'
#' Draft1970df <- within(Draft1970df, {
#'   Month <- ordered(Month)
#'   Risk <- ordered(Risk, levels=rev(levels(Risk)))
#'   })
#' str(Draft1970df)
#'
#' # similar model, as a Poisson GLM
#' indep <- glm(Freq ~ Month + Risk, family = poisson, data = Draft1970df)
#'
#' mosaic(indep, residuals_type="rstandard", gp=shading_Friendly)
#'
#' # numeric scores for tests of ordinal factors
#' Cscore <- as.numeric(Draft1970df$Risk)
#' Rscore <- as.numeric(Draft1970df$Month)
#'
#' # linear x linear association between Month and Risk
#' linlin <- glm(Freq ~ Month + Risk + Rscore:Cscore, family = poisson, data = Draft1970df)
#'
#' # compare models
#' anova(indep, linlin, test="Chisq")
#' mosaic(linlin, residuals_type="rstandard", gp=shading_Friendly)
#'
#'
#'
NULL





#' Sources of Knowledge of Cancer
#'
#' Observational data on a sample of 1729 individuals, cross-classified in a
#' 2^5 table according to their sources of information (read newspapers, listen
#' to the radio, do 'solid' reading, attend lectures) and whether they have
#' good or poor knowledge regarding cancer.  Knowledge of cancer is often
#' treated as the response.
#'
#'
#' @name Dyke
#' @docType data
#' @format A 5-dimensional array resulting from cross-tabulating 5 variables
#' for 1729 observations. The variable names and their levels are:
#'
#' \tabular{rll}{
#'  dim \tab Name \tab Levels \cr
#'   1\tab `Knowledge`\tab `"Good", "Poor"`\cr
#'   2\tab `Reading`\tab `"No", "Yes"`\cr
#'   3\tab `Radio`\tab `"No", "Yes"`\cr
#'   4\tab `Lectures`\tab `"No", "Yes"`\cr
#'   5\tab `Newspaper`\tab `"No", "Yes"`\cr
#' }
#'
#' @references
#' Dyke, G. V. and Patterson, H. D. (1952). Analysis of factorial
#' arrangements when the data are proportions. *Biometrics*, 8, 1-12.
#'
#' Lindsey, J. K. (1993).  *Models for Repeated Measurements* Oxford, UK:
#' Oxford University Press, p. 57.
#'
#' @source Fienberg, S. E. (1980). *The Analysis of Cross-Classified
#' Categorical Data* Cambridge, MA: MIT Press, p. 85, Table 5-6.
#' @keywords datasets
#' @examples
#'
#' data(Dyke)
#'
#' # independence model
#' mosaic(Dyke, shade=TRUE)
#'
#' # null model, Knowledge as response, independent of others
#' require(MASS)
#' dyke.mod0 <- loglm(~ Knowledge + (Reading * Radio * Lectures * Newspaper), data=Dyke)
#' dyke.mod0
#' mosaic(dyke.mod0)
#'
#' # view as doubledecker plot
#' Dyke <- Dyke[2:1,,,,]    # make Good the highlighted value of Knowledge
#' doubledecker(Knowledge ~ ., data=Dyke)
#'
#' # better version, with some options
#' doubledecker(Knowledge ~ Lectures + Reading + Newspaper + Radio,
#'   data=Dyke,
#' 	margins = c(1,6, length(dim(Dyke)) + 1, 1),
#' 	fill_boxes=list(rep(c("white", gray(.90)),4))
#' 	)
#'
#' # separate (conditional) plots for those who attend lectures and those who do not
#' doubledecker(Knowledge ~ Reading + Newspaper + Radio,
#'   data=Dyke[,,,1,],
#' 	main="Do not attend lectures",
#' 	margins = c(1,6, length(dim(Dyke)) + 1, 1),
#' 	fill_boxes=list(rep(c("white", gray(.90)),3))
#' 	)
#' doubledecker(Knowledge ~ Reading + Newspaper + Radio,
#'   data=Dyke[,,,2,],
#' 	main="Attend lectures",
#' 	margins = c(1,6, length(dim(Dyke)) + 1, 1),
#' 	fill_boxes=list(rep(c("white", gray(.90)),3))
#' 	)
#'
#'
#' drop1(dyke.mod0, test="Chisq")
#'
#'
NULL





#' Carcinogenic Effects of a Fungicide
#'
#' Data from Gart (1971) on the carcinogenic effects of a certain fungicide in
#' two strains of mice. Of interest is how the association between `group`
#' (Control, Treated) and `outcome` (Tumor, No Tumor) varies with
#' `sex` and `strain` of the mice.
#'
#' Breslow (1976) used this data to illustrate the application of linear models
#' to log odds ratios.
#'
#' All tables have some small cells, so a continuity correction is recommended.
#'
#' @name Fungicide
#' @docType data
#' @format The data comprise a set of four 2 x 2 tables classifying 403 mice,
#' either Control or Treated and whether or not a tumor was later observed.
#' The four groups represent the combinations of sex and strain of mice.
#'
#' The format is:
#' \preformatted{
#' num [1:2, 1:2, 1:2, 1:2] 5 4 74 12 3 2 84 14 10 4 ...
#' - attr(*, "dimnames")=List of 4
#' ..$ group  : chr [1:2] "Control" "Treated"
#' ..$ outcome: chr [1:2] "Tumor" "NoTumor"
#' ..$ sex    : chr [1:2] "M" "F"
#' ..$ strain : chr [1:2] "1" "2"
#' }
#'
#' @references Breslow, N. (1976), Regression analysis of the log odds ratio: A
#' method for retrospective studies, *Biometrics*, 32(3), 409-416.
#'
#' @source Gart, J. J. (1971). The comparison of proportions: a review of
#' significance tests, confidence intervals and adjustments for stratification.
#' *International Statistical Review*, 39, 148-169.
#'
#' @keywords datasets
#' @examples
#'
#' data(Fungicide)
#' # loddsratio was moved to vcd; requires vcd_1.3-3+
#' \dontrun{
#' fung.lor <- loddsratio(Fungicide, correct=TRUE)
#' fung.lor
#' confint(fung.lor)
#' }
#'
#' # visualize odds ratios in fourfold plots
#' cotabplot(Fungicide, panel=cotab_fourfold)
#' #  -- fourfold() requires vcd >= 1.2-10
#' fourfold(Fungicide, p_adjust_method="none")
#'
#'
#'
NULL





#' Geissler's Data on the Human Sex Ratio
#'
#' Geissler (1889) published data on the distributions of boys and girls in
#' families in Saxony, collected for the period 1876-1885. The `Geissler`
#' data tabulates the family composition of 991,958 families by the number of
#' boys and girls listed in the table supplied by Edwards (1958, Table 1).
#'
#' The data on family composition was available because, on the birth of a
#' child, the parents had to state the sex of all their children on the birth
#' certificate. These family records are not necessarily independent, because a
#' given family may have had several children during this 10 year period,
#' included as multiple records.
#'
#' @name Geissler
#' @docType data
#' @format A data frame with 90 observations on the following 4 variables.  The
#' rows represent the non-NA entries in Edwards' table.
#' \describe{
#'   \item{`boys`}{number of boys in the family, `0:12`}
#'   \item{`girls`}{number of girls in the family, `0:12`}
#'   \item{`size`}{family size: `boys+girls`}
#'   \item{`Freq`}{number of families with this sex composition}
#' }
#'
#' @seealso \code{\link[vcd]{Saxony}}, containing the data for families of size
#' 12.
#' @references
#' Friendly, M. and Meyer, D. (2016).  *Discrete Data Analysis
#' with R: Visualization and Modeling Techniques for Categorical and Count
#' Data*.  Boca Raton, FL: Chapman & Hall/CRC. <http://ddar.datavis.ca>.
#'
#' Geissler, A. (1889). *Beitrage zur Frage des Geschlechtsverhaltnisses
#' der Geborenen*. Z. K. Sachsischen Statistischen Bureaus, 35, n.p.
#'
#' Lindsey, J. K. & Altham, P. M. E. (1998).  Analysis of the human sex ratio
#' by using overdispersion models. *Journal of the Royal Statistical
#' Society: Series C (Applied Statistics)*, 47, 149-157.
#'
#' @source
#' Edwards, A. W. F. (1958). An Analysis Of Geissler's Data On The
#' Human Sex Ratio. *Annals of Human Genetics*, 23, 6-15.
#' @keywords datasets
#' @examples
#'
#' data(Geissler)
#' str(Geissler)
#'
#' # reproduce Saxony data, families of size 12
#' Saxony12 <- subset(Geissler, size==12, select=c(boys, Freq))
#' rownames(Saxony12)<-NULL
#'
#' # make a 1-way table
#' xtabs(Freq~boys, Saxony12)
#'
#' # extract data for other family sizes
#' Saxony11 <- subset(Geissler, size==11, select=c(boys, Freq))
#' rownames(Saxony11)<-NULL
#'
#' Saxony10 <- subset(Geissler, size==10, select=c(boys, Freq))
#' rownames(Saxony10)<-NULL
#'
#'
NULL





#' Clothing and Intelligence Rating of Children
#'
#' Schoolboys were classified according to their clothing and to their teacher's
#' rating of "dullness" (lack of intelligence), in a 5 x 7 table originally
#' from Gilby (1911). Anscombe (1981) presents a slightly collapsed 4 x 6
#' table, used here, where the last two categories of clothing were pooled as
#' were the first two categories of dullness due to small counts.
#'
#' Both `Dullness` and `Clothing` are ordered categories, so models
#' and methods that examine their association in terms of ordinal categories
#' are profitable.
#'
#'
#' @name Gilby
#' @docType data
#' @format A 2-dimensional array resulting from cross-tabulating 2 variables
#' for 1725 observations. The variable names and their levels are:
#'
#' \tabular{rll}{
#'   No \tab Name \tab Levels \cr
#'   1\tab `Dullness`\tab `"Ment. defective", "Slow", "Slow Intell", "Fairly Intell", "Capable", "V.Able"`\cr
#'   2\tab `Clothing`\tab `"V.Well clad", "Well clad", "Passable", "Insufficient"`\cr
#' }
#'
#' @references
#' Gilby, W. H. (1911).
#' On the significance of the teacher's appreciation of general
#' intelligence.  *Biometrika*, 8, 93-108 (esp. p. 94).  (Quoted by Kendall (1943,..., 1953) Table 13.1, p 320.)
#'
#' @source Anscombe, F. J. (1981). *Computing in Statistical Science Through APL*. New York: Springer-Verlag, p. 302
#' @keywords datasets
#' @examples
#'
#' data(Gilby)
#'
#' # CMH tests treating row/column variables as ordinal
#' CMHtest(Gilby)
#'
#' mosaic(Gilby, shade=TRUE)
#'
#' # correspondence analysis to see relations among categories
#' if(require(ca)){
#' 	ca(Gilby)
#' 	plot(ca(Gilby), lines=TRUE)
#'
#' }
#'
#'
#'
NULL





#' British Social Mobility from Glass (1954)
#'
#' Glass (1954) gave this 5 x 5 table on the occupations of 3500 British fathers
#' and their sons.
#'
#' The occupational categories in order of status are: (1) Professional & High
#' Administrative (2) Managerial, Executive & High Supervisory (3) Low
#' Inspectional & Supervisory (4) Routine Nonmanual & Skilled Manual (5) Semi-
#' & Unskilled Manual
#'
#' However, to make the point that factors are ordered alphabetically by
#' default, Friendly & Meyer (2016) introduce this data set in the form given
#' here.
#'
#' @name Glass
#' @docType data
#' @format A frequency data frame with 25 observations on the following 3
#' variables representing a 5 x 5 table with 3500 cases.
#' \describe{
#'   \item{`father`}{a factor with levels `Managerial` `Professional` `Skilled` `Supervisory` `Unskilled`}
#'   \item{`son`}{a factor with levels `Managerial` `Professional` `Skilled` `Supervisory` `Unskilled`}
#'   \item{`Freq`}{a numeric vector}
#' }
#'
#' @references
#' Bishop, Y. M. M. and Fienberg, S. E. and Holland, P. W. (1975).
#' *Discrete Multivariate Analysis: Theory and Practice*, MIT Press.
#'
#' Friendly, M. and Meyer, D. (2016).  *Discrete Data Analysis with R:
#' Visualization and Modeling Techniques for Categorical and Count Data*.  Boca
#' Raton, FL: Chapman & Hall/CRC. <http://ddar.datavis.ca>.
#'
#' @source Glass, D. V. (1954), *Social Mobility in Britain*. The Free
#' Press.
#' @keywords datasets
#' @examples
#'
#' data(Glass)
#' glass.tab <- xtabs(Freq ~ father + son, data=Glass)
#'
#' largs <- list(set_varnames=list(father="Father's Occupation",
#'                                 son="Son's Occupation"),
#'               abbreviate=10)
#' gargs <- list(interpolate=c(1,2,4,8))
#'
#' mosaic(glass.tab,
#'   shade=TRUE,
#'   labeling_args=largs,
#'   gp_args=gargs,
#'   main="Alphabetic order",
#'   legend=FALSE,
#'   rot_labels=c(20,90,0,70))
#'
#' # reorder by status
#' ord <- c(2, 1, 4, 3, 5)
#' mosaic(glass.tab[ord, ord],
#'   shade=TRUE,
#'   labeling_args=largs,
#'   gp_args=gargs,
#'   main="Effect order",
#'   legend=FALSE,
#'   rot_labels=c(20,90,0,70))
#'
#'
NULL





#' General Social Survey-- Sex and Party affiliation
#'
#' Data from the General Social Survey, 1991, on the relation between sex and
#' party affiliation.
#'
#'
#' @name GSS
#' @docType data
#' @format A data frame in frequency form with 6 observations on the following
#' 3 variables.
#'
#' \describe{
#'   \item{`sex`}{a factor with levels `female` `male`}
#'   \item{`party`}{a factor with levels `dem` `indep` `rep`}
#'   \item{`count`}{a numeric vector}
#' }
#'
#' @source Agresti, A. *Categorical Data Analysis*, 2nd E., John Wiley &
#' Sons, 2002, Table 3.11, p. 106.
#' @keywords datasets
#' @examples
#'
#' data(GSS)
#' str(GSS)
#'
#' # use xtabs to show the table in a compact form
#' (GSStab <- xtabs(count ~ sex + party, data=GSS))
#'
#' # fit the independence model
#' (mod.glm <- glm(count ~ sex + party, family = poisson, data = GSS))
#'
#' # display all the residuals in a mosaic plot
#' mosaic(mod.glm,
#'   formula = ~ sex + party,
#'   labeling = labeling_residuals,
#'   suppress=0)
#'
NULL





#' Hair Color and Eye Color in Caithness and Aberdeen
#'
#' A three-way frequency table crossing eye color and hair color in two places,
#' Caithness and Aberdeen, Scotland. These data were of interest to Fisher
#' (1940) and others because there are mixtures of people of Nordic, Celtic and
#' Anglo-Saxon origin.
#'
#' One or both tables have been widely analyzed in conjunction with RC and
#' canonical correlation models for categorical data, e.g., Becker and Clogg
#' (1989).
#'
#' @details
#' The hair and eye colors are ordered as in the original source, suggesting
#' that they form ordered categories.
#'
#' @name HairEyePlace
#' @docType data
#' @format
#' The format is:
#' \preformatted{
#'   num [1:4, 1:5, 1:2] 326 688 343 98 38 116 84 48 241 584 ...
#' - attr(*, "dimnames")=List of 3
#' ..$ Eye  : chr [1:4] "Blue" "Light" "Medium" "Dark"
#' ..$ Hair : chr [1:5] "Fair" "Red" "Medium" "Dark" ...
#' ..$ Place: chr [1:2] "Caithness" "Aberdeen"
#' }
#'
#' @references Becker, M. P., and Clogg, C. C. (1989).  Analysis of Sets of
#' Two-Way Contingency Tables Using Association Models.  *Journal of the
#' American Statistical Association*, 84(405), 142-151.
#'
#' Fisher, R.A. (1940) The precision of discriminant functions.  *Annals
#' of Eugenics*, 10, 422-429.
#' @source This data was taken from the `colors` data in \pkg{logmult}.
#' @keywords datasets
#' @examples
#'
#' data(HairEyePlace)
#'
#' # separate mosaics
#' mosaic(HairEyePlace[,,1], shade=TRUE, main="Caithness")
#' mosaic(HairEyePlace[,,2], shade=TRUE, main="Aberdeen")
#'
#' # condition on Place
#' mosaic(~Hair + Eye |Place, data=HairEyePlace, shade=TRUE, legend=FALSE)
#'
#' cotabplot(~Hair+Eye|Place, data=HairEyePlace, shade=TRUE, legend=FALSE)
#'
NULL





#' Hauser (1979) Data on Social Mobility
#'
#' Hauser (1979) presented this two-way frequency table, cross-classifying
#' occupational categories of sons and fathers in the United States.
#'
#' It is a good example for exploring a variety of models for square tables:
#' quasi-independence, quasi-symmetry, row/column effects, uniform association,
#' etc., using the facilities of the \pkg{gnm} package.
#'
#' Hauser's data was first presented in 1979, and then published in 1980. The
#' name of the dataset reflects the earliest use.
#'
#' It reflects the "frequencies in a classification of son's first full-time
#' civilian occupation by father's (or other family head's) occupation at son's
#' sixteenth birthday among American men who were aged 20 to 64 in 1973 and
#' were not currently enrolled in school".
#'
#' As noted in Hauser's Table 1, "Counts are based on observations weighted to
#' estimate population counts and compensate for departures of the sampling
#' design from simple random sampling. Broad occupation groups are upper
#' nonmanual: professional and kindred workers, managers and officials, and
#' non-retail sales workers; lower nonmanual: proprietors, clerical and kindred
#' workers, and retail sales workers; upper manual: craftsmen, foremen, and
#' kindred workers; lower manual: service workers, operatives and kindred
#' workers, and laborers (except farm); farm: farmers and farm managers, farm
#' laborers, and foremen. density of mobility or immobility in the cells to
#' which they refer."
#'
#' The table levels for `Son` and `Father` have been arranged in
#' order of decreasing status as is common for mobility tables.
#'
#' @name Hauser79
#' @docType data
#' @format A frequency data frame with 25 observations on the following 3
#' variables, representing the cross-classification of 19912 individuals by
#' father's occupation and son's first occupation.
#' \describe{
#'   \item{`Son`}{a factor with levels `UpNM` `LoNM` `UpM` `LoM` `Farm`}
#'   \item{`Father`}{a factor with levels `UpNM` `LoNM` `UpM` `LoM` `Farm`}
#'   \item{`Freq`}{a numeric vector}
#' }
#'
#' @references
#' Powers, D.A. and Xie, Y. (2008). *Statistical Methods for
#' Categorical Data Analysis*, Bingley, UK: Emerald.
#' @source
#' R.M. Hauser (1979), Some exploratory methods for modeling mobility
#' tables and other cross-classified data.  In: K.F. Schuessler (Ed.),
#' *Sociological Methodology*, 1980, Jossey-Bass, San Francisco, pp.
#' 413-458. Table 1.
#' @keywords datasets
#' @examples
#'
#' data(Hauser79)
#' str(Hauser79)
#'
#' # display table
#' structable(~Father+Son, data=Hauser79)
#'
#' #Examples from Powers & Xie, Table 4.15
#' # independence model
#' mosaic(Freq ~ Father + Son, data=Hauser79, shade=TRUE)
#'
#' hauser.indep <- gnm(Freq ~ Father + Son,
#'   data=Hauser79,
#'   family=poisson)
#'
#' mosaic(hauser.indep, ~Father+Son,
#'        main="Independence model",
#'        gp=shading_Friendly)
#'
#' # Quasi-independence
#' hauser.quasi <-  update(hauser.indep,
#'                         ~ . + Diag(Father,Son))
#' mosaic(hauser.quasi, ~Father+Son,
#'        main="Quasi-independence model",
#'        gp=shading_Friendly)
#'
#' # Quasi-symmetry
#' hauser.qsymm <-  update(hauser.indep,
#'                         ~ . + Diag(Father,Son) + Symm(Father,Son))
#'
#' mosaic(hauser.qsymm, ~Father+Son,
#'        main="Quasi-symmetry model",
#'        gp=shading_Friendly)
#'
#'
#' # numeric scores for row/column effects
#' Sscore <- as.numeric(Hauser79$Son)
#' Fscore <- as.numeric(Hauser79$Father)
#'
#' # row effects model
#' hauser.roweff <- update(hauser.indep, ~ . + Father*Sscore)
#' LRstats(hauser.roweff)
#'
#' # uniform association
#' hauser.UA <- update(hauser.indep, ~ . + Fscore*Sscore)
#' LRstats(hauser.UA)
#'
#' # uniform association, omitting diagonals
#' hauser.UAdiag <- update(hauser.indep, ~ . + Fscore*Sscore + Diag(Father,Son))
#' LRstats(hauser.UAdiag)
#'
#' # Levels for Hauser 5-level model
#' levels <- matrix(c(
#'   2,  4,  5,  5,  5,
#'   3,  4,  5,  5,  5,
#'   5,  5,  5,  5,  5,
#'   5,  5,  5,  4,  4,
#'   5,  5,  5,  4,  1
#'   ), 5, 5, byrow=TRUE)
#'
#' hauser.topo <- update(hauser.indep,
#'                       ~ . + Topo(Father, Son, spec=levels))
#'
#' mosaic(hauser.topo, ~Father+Son,
#'        main="Topological model", gp=shading_Friendly)
#'
#' # RC model
#' hauser.RC <- update(hauser.indep, ~ . + Mult(Father, Son), verbose=FALSE)
#' mosaic(hauser.RC, ~Father+Son, main="RC model", gp=shading_Friendly)
#' LRstats(hauser.RC)
#'
#' # crossings models
#' hauser.CR <- update(hauser.indep, ~ . + Crossings(Father,Son))
#' mosaic(hauser.CR, ~Father+Son, main="Crossings model", gp=shading_Friendly)
#' LRstats(hauser.CR)
#'
#' hauser.CRdiag <- update(hauser.indep, ~ . + Crossings(Father,Son) + Diag(Father,Son))
#' LRstats(hauser.CRdiag)
#'
#'
#' # compare model fit statistics
#' modlist <- glmlist(hauser.indep, hauser.roweff, hauser.UA, hauser.UAdiag,
#'                    hauser.quasi, hauser.qsymm,  hauser.topo,
#'                    hauser.RC, hauser.CR, hauser.CRdiag)
#' sumry <- LRstats(modlist)
#' sumry[order(sumry$AIC, decreasing=TRUE),]
#' # or, more simply
#' LRstats(modlist, sortby="AIC")
#'
#' mods <- substring(rownames(sumry),8)
#' with(sumry,
#' 	{plot(Df, AIC, cex=1.3, pch=19, xlab='Degrees of freedom', ylab='AIC')
#' 	text(Df, AIC, mods, adj=c(0.5,-.5), col='red', xpd=TRUE)
#' 	})
#'
#'
#'
NULL





#' Sex, Occupation and Heart Disease
#'
#' Classification of individuals by gender, occupational category and
#' occurrence of heart disease
#'
#'
#' @name Heart
#' @docType data
#' @format A 3-dimensional array resulting from cross-tabulating 3 variables
#' for 21522 observations. The variable names and their levels are:
#'
#' \tabular{rll}{
#'   No \tab Name \tab Levels \cr
#'   1\tab `Disease`\tab `"Disease", "None"`\cr
#'   2\tab `Gender`\tab `"Male", "Female"`\cr
#'   3\tab `Occup`\tab `"Unempl", "WhiteCol", "BlueCol"`\cr
#' }
#'
#' @source
#'
#' % \cite{Karger, 1980}
#' Karger, (1980).
#' @keywords datasets
#' @examples
#'
#' data(Heart)
#' str(Heart)
#'
#' # Display the frequencies for occupational categories.
#' # Each row is a 2 x 2 table
#' vcd::structable(Disease + Gender ~ Occup, data=Heart)
#'
#' # display as fourfold plots
#' vcd::cotabplot(~ Disease + Gender | Occup, data=Heart, panel = cotab_fourfold)
#'
NULL





#' Labour Force Participation of Married Women 1967-1971
#'
#' 1583 married women were surveyed over the years 1967-1971, recording whether
#' or not they were employed in the labor force.
#'
#' The data, originally from Heckman & Willis (1977), provide an example of
#' modeling longitudinal categorical data, e.g., with Markov chain models for
#' dependence over time.
#'
#' Lindsey (1993) fits an initial set of logistic regression models examining
#' the dependence of employment in 1971 (`e1971`) on successive subsets of
#' the previous years, `e1970`, `e1969`, \dots{} `e1967`.
#'
#' Alternatively, one can examine Markov chain models of first-order
#' (dependence on previous year), second-order (dependence on previous two
#' years), etc.
#'
#' @name Heckman
#' @docType data
#' @format A 5-dimensional \eqn{2^5} array resulting from cross-tabulating 5
#' binary variables for 1583 observations. The variable names and their levels
#' are:
#'
#' \tabular{rll}{
#'   No \tab Name \tab Levels \cr
#'   1\tab `e1971`\tab `"71Yes", "No"`\cr
#'   2\tab `e1970`\tab `"70Yes", "No"`\cr
#'   3\tab `e1969`\tab `"69Yes", "No"`\cr
#'   4\tab `e1968`\tab `"68Yes", "No"`\cr
#'   5\tab `e1967`\tab `"67Yes", "No"`\cr
#' }
#'
#' @references
#' % \cite{HeckmanWillis:77}
#' Heckman, J.J. & Willis, R.J. (1977).
#' "A beta-logistic model for the analysis of sequential labor force
#' participation by married women."  *Journal of Political Economy*, 85:
#' 27-58
#' @source
#'
#' Lindsey, J. K. (1993).  *Models for
#' Repeated Measurements* Oxford, UK: Oxford University Press, p. 185.
#' @keywords datasets
#' @examples
#'
#' data(Heckman)
#'
#' # independence model
#' mosaic(Heckman, shade=TRUE)
#' # same, as a loglm()
#' require(MASS)
#' (heckman.mod0 <- loglm(~ e1971+e1970+e1969+e1968+e1967, data=Heckman))
#' mosaic(heckman.mod0, main="Independence model")
#'
#' # first-order markov chain: bad fit
#' (heckman.mod1 <- loglm(~ e1971*e1970 + e1970*e1969 +e1969*e1968 + e1968*e1967, data=Heckman))
#' mosaic(heckman.mod1, main="1st order markov chain model")
#'
#' # second-order markov chain: bad fit
#' (heckman.mod2 <- loglm(~ e1971*e1970*e1969 + e1970*e1969*e1968 +e1969*e1968*e1967, data=Heckman))
#' mosaic(heckman.mod2, main="2nd order markov chain model")
#'
#' # third-order markov chain: fits OK
#' (heckman.mod3 <- loglm(~ e1971*e1970*e1969*e1968 + e1970*e1969*e1968*e1967, data=Heckman))
#' mosaic(heckman.mod3, main="3rd order markov chain model")
#'
#'
NULL





#' Hospital Visits Data
#'
#' Length of stay in hospital for 132 schizophrenic patients, classified by
#' visiting patterns, originally from Wing (1962).
#'
#' Both table variables can be considered ordinal. The variable `visit`
#' refers to visiting patterns recorded in hospital.  The category labels are
#' abbreviations of those given by Goodman (1983); e.g., `"Regular"` is
#' short for \dQuote{received visitors regularly or patient went home}. The
#' variable `stay` refers to length of stay in hospital, in year groups.
#'
#' @name HospVisits
#' @docType data
#' @format A 3 by 3 frequency table, with format:
#' \preformatted{
#' table [1:3, 1:3] 43 6 9 16 11 18 3 10 16
#' - attr(*, "dimnames")=List of 2
#' ..$ visit: chr [1:3] "Regular" "Infrequent" "Never"
#' ..$ stay : chr [1:3] "2-9" "10-19" "20+"
#' }
#'
#' @seealso \code{\link[ca]{ca}}
#' @references
#' Wing, J. K. (1962). Institutionalism in Mental Hospitals,
#' *British Journal of Social and Clinical Psychology*, 1 (1), 38-51.
#'
#' @source
#' Goodman, L. A. (1983) The analysis of dependence in
#' cross-classifications having ordered categories, using log-linear models for
#' frequencies and log-linear models for odds.  *Biometrics*, 39, 149-160.
#'
#' @keywords datasets
#' @examples
#'
#' data(HospVisits)
#' mosaic(HospVisits, gp=shading_Friendly)
#'
#' if(require(ca)){
#'   ca(HospVisits)
#'   # surprisingly 1D !
#'   plot(ca(HospVisits))
#'   }
#'
NULL





#' Household Tasks Performed by Husbands and Wives
#'
#' A 13 x 4 table of frequencies of household tasks performed by couples,
#' either by the `Husband`, `Wife`, `Alternating` or `Jointly`.
#'
#'
#' @name HouseTasks
#' @docType data
#' @format The format is:
#' \preformatted{
#'   'table' int [1:13, 1:4] 36 11 24 51 13 1 1 14 20 46 ...
#' - attr(*, "dimnames")=List of 2
#' ..$ Task: chr [1:13] "Breakfast" "Dinner" "Dishes" "Driving" ...
#' ..$ Who : chr [1:4] "Alternating" "Husband" "Jointly" "Wife"
#' }
#'
#' @source This data set was taken from \code{\link[factoextra]{housetasks}}, a
#' 13 x 4 data.frame. In this table version, the rows and columns were sorted
#' alphabetically (and a typo was corrected).
#' @keywords datasets
#' @examples
#'
#' data(HouseTasks)
#' str(HouseTasks)
#'
#' chisq.test(HouseTasks)
#'
#' # mosaic plot, illustrating some tweaks to handle overlapping labels
#' require(vcd)
#' mosaic(HouseTasks, shade = TRUE,
#'        labeling = labeling_border(rot_labels = c(45,0, 0, 0),
#'                                   offset_label =c(.5,5,0, 0),
#'                                   varnames = c(FALSE, TRUE),
#'                                   just_labels=c("center","right"),
#'                                   tl_varnames = FALSE),
#'        legend = FALSE)
#'
#' # use seriation package to permute rows & cols using correspondence analysis
#' if(require(seriation)) {
#' order <- seriate(HouseTasks, method = "CA")
#' # the permuted row and column labels
#' rownames(HouseTasks)[order[[1]]]
#' colnames(HouseTasks)[order[[2]]]
#'
#' # do the permutation
#' HT_perm <- permute(HouseTasks, order, margin=1)
#'
#' mosaic(HT_perm, shade = TRUE,
#'        labeling = labeling_border(rot_labels = c(45,0, 0, 0),
#'                                   offset_label =c(.5,5,0, 0),
#'                                   varnames = c(FALSE, TRUE),
#'                                   just_labels=c("center","right"),
#'                                   tl_varnames = FALSE),
#'        legend = FALSE)
#' }
#'
NULL





#' Minnesota High School Graduates
#'
#' Minnesota high school graduates of June 1930 were classified with respect to
#' (a) `Rank` by thirds in their graduating class, (b) post-high school
#' `Status` in April 1939 (4 levels), (c) `Sex`, (d) father's
#' `Occupation`al status (7 levels, from 1=High to 7=Low).
#'
#' The data were first presented by Hoyt et al. (1959) and have been analyzed
#' by Fienberg (1980), Plackett (1974) and others.
#'
#' Post high-school `Status` is natural to consider as the response.
#' `Rank` and father's `Occupation` are ordinal variables.
#'
#' @name Hoyt
#' @docType data
#' @format A 4-dimensional array resulting from cross-tabulating 4 variables
#' for 13968 observations. The variable names and their levels are:
#'
#' \tabular{rll}{
#'   No \tab Name \tab Levels \cr
#'   1\tab `Status`\tab `"College", "School", "Job", "Other"`\cr
#'   2\tab `Rank`\tab `"Low", "Middle", "High"`\cr
#'   3\tab `Occupation`\tab `"1", "2", "3", "4", "5", "6", "7"`\cr
#'   4\tab `Sex`\tab `"Male", "Female"`\cr
#' }
#'
#' @seealso \code{\link[MASS]{minn38}} provides the same data as a data frame.
#' @references
#' Hoyt, C. J., Krishnaiah, P. R. and Torrance, E. P. (1959)
#' Analysis of complex contingency tables, *Journal of Experimental
#' Education* 27, 187-194.
#' @source
#'
#' Fienberg, S. E. (1980). *The Analysis of Cross-Classified Categorical
#' Data*. Cambridge, MA: MIT Press, p. 91-92.
#'
#' Plackett, R. L. (1974). *The Analysis of Categorical Data*. London: Griffin.
#' @keywords datasets
#' @examples
#'
#' data(Hoyt)
#'
#' # display the table
#' structable(Status + Sex ~ Rank + Occupation, data=Hoyt)
#'
#' # mosaic for independence model
#' plot(Hoyt, shade=TRUE)
#'
#' # examine all pairwise mosaics
#' pairs(Hoyt, shade=TRUE)
#'
#' # collapse Status to College vs. Non-College
#' Hoyt1 <- collapse.table(Hoyt, Status=c("College", rep("Non-College",3)))
#' plot(Hoyt1, shade=TRUE)
#'
#' #################################################
#' # fitting models with loglm, plotting with mosaic
#' #################################################
#'
#' # fit baseline log-linear model for Status as response
#' require(MASS)
#' hoyt.mod0 <- loglm(~ Status + (Sex*Rank*Occupation),
#'   data=Hoyt1)
#' hoyt.mod0
#'
#' mosaic(hoyt.mod0,
#'   gp=shading_Friendly,
#'   main="Baseline model: Status + (Sex*Rank*Occ)")
#'
#' # add one-way association of Status with factors
#' hoyt.mod1 <- loglm(~ Status * (Sex + Rank + Occupation) + (Sex*Rank*Occupation),
#'   data=Hoyt1)
#' hoyt.mod1
#'
#' mosaic(hoyt.mod1,
#'   gp=shading_Friendly,
#'   main="Status * (Sex + Rank + Occ)")
#'
#' # can we drop any terms?
#' drop1(hoyt.mod1, test="Chisq")
#'
#' # assess model fit
#' anova(hoyt.mod0, hoyt.mod1)
#'
#' # what terms to add?
#' add1(hoyt.mod1, ~.^2, test="Chisq")
#'
#' # add interaction of Sex:Occupation on Status
#' hoyt.mod2 <- update(hoyt.mod1, ~ . + Status:Sex:Occupation)
#'
#' mosaic(hoyt.mod2,
#'   gp=shading_Friendly,
#'   main="Adding Status:Sex:Occupation")
#'
#' # compare model fits
#' anova(hoyt.mod0, hoyt.mod1, hoyt.mod2)
#'
#' # Alternatively, try stepwise analysis, heading toward the saturated model
#' steps <- step(hoyt.mod0,
#'   direction="forward",
#'   scope=~Status*Sex*Rank*Occupation)
#'
#' # display anova
#' steps$anova
#'
#'
NULL





#' ICU data set
#'
#' The ICU data set consists of a sample of 200 subjects who were part of a
#' much larger study on survival of patients following admission to an adult
#' intensive care unit (ICU), derived from Hosmer, Lemeshow and Sturdivant
#' (2013) and Friendly (2000).
#'
#' The major goal of this study was to develop a logistic regression model to
#' predict the probability of survival to hospital discharge of these patients
#' and to study the risk factors associated with ICU mortality. The clinical
#' details of the study are described in Lemeshow, Teres, Avrunin, and Pastides
#' (1988).
#'
#' This data set is often used to illustrate model selection methods for
#' logistic regression.
#'
#' Patient ID numbers are the rownames of the data frame.
#'
#' Note that the last two variables, `white` and `uncons`, are recodings of
#' `race` and `coma`, respectively, to binary variables.
#'
#' @name ICU
#' @docType data
#' @format A data frame with 200 observations on the following 22 variables.
#' \describe{
#'   \item{`died`}{Died before discharge?, a factor with levels `No` `Yes`}
#'   \item{`age`}{Patient age, a numeric vector}
#'   \item{`sex`}{Patient sex, a factor with levels `Female` `Male`}
#'   \item{`race`}{Patient race, a factor with levels `Black` `Other` `White`.  Also represented here as `white`.}
#'   \item{`service`}{Service at ICU Admission, a factor with levels `Medical` `Surgical`}
#'   \item{`cancer`}{Cancer part of present problem?, a factor with levels `No` `Yes`}
#'   \item{`renal`}{History of chronic renal failure?, a factor with levels `No` `Yes`}
#'   \item{`infect`}{Infection probable at ICU admission?, a factor with levels `No` `Yes`}
#'   \item{`cpr`}{Patient received CPR prior to ICU admission?, a factor with levels `No` `Yes`}
#'   \item{`systolic`}{Systolic blood pressure at admission (mm Hg), a numeric vector}
#'   \item{`hrtrate`}{Heart rate at ICU Admission (beats/min), a numeric vector}
#'   \item{`previcu`}{Previous admission to an ICU within 6 Months?, a factor with levels `No` `Yes`}
#'   \item{`admit`}{Type of admission, a factor with levels `Elective` `Emergency`}
#'   \item{`fracture`}{Admission with a long bone, multiple, neck, single area, or hip fracture?,
#'         a factor with levels `No` `Yes`}
#'   \item{`po2`}{PO2 from initial blood gases, a factor with levels `>60` `<=60`}
#'   \item{`ph`}{pH from initial blood gases, a factor with levels `>=7.25` `<7.25`}
#'   \item{`pco`}{PCO2 from initial blood gases, a factor with levels `<=45` `>45`}
#'   \item{`bic`}{Bicarbonate (HCO3) level from initial blood gases, a factor with levels `>=18` `<18`}
#'   \item{`creatin`}{Creatinine, from initial blood gases, a factor with levels `<=2` `>2`}
#'   \item{`coma`}{Level of unconsciousness at admission to ICU, a factor with levels `None` `Stupor` `Coma`}
#'   \item{`white`}{a recoding of `race`,  a factor with levels `White` `Non-white`}
#'   \item{`uncons`}{a recoding of `coma`, a factor with levels `No` `Yes`}
#' }
#'
#' @references
#'
#' Lemeshow, S., Teres, D., Avrunin, J. S., Pastides, H. (1988). Predicting the
#' Outcome of Intensive Care Unit Patients. *Journal of the American
#' Statistical Association*, 83, 348-356.
#' @source M. Friendly (2000), *Visualizing Categorical Data*, Appendix
#' B.4. SAS Institute, Cary, NC.
#'
#' Hosmer, D. W. Jr., Lemeshow, S. and Sturdivant, R. X. (2013) *Applied
#' Logistic Regression*, NY: Wiley, Third Edition.
#' @keywords datasets
#' @examples
#'
#' data(ICU)
#' # remove redundant variables (race, coma)
#' ICU1 <- ICU[,-c(4,20)]
#'
#' # fit full model
#' icu.full <- glm(died ~ ., data=ICU1, family=binomial)
#' summary(icu.full)
#'
#' # simpler model (found from a "best" subsets procedure)
#' icu.mod1 <- glm(died ~ age + sex + cancer + systolic + admit + uncons,
#'   data=ICU1,
#'   family=binomial)
#' summary(icu.mod1)
#'
#' # even simpler model
#' icu.mod2 <- glm(died ~ age + cancer  + admit + uncons,
#'   data=ICU1,
#'   family=binomial)
#' summary(icu.mod2)
#'
#' anova(icu.mod2, icu.mod1, icu.full, test="Chisq")
#'
#' ## Reproduce Fig 6.12 from VCD
#'
#' icu.fit <- data.frame(ICU, prob=predict(icu.mod2, type="response"))
#'
#' # combine categorical risk factors to a single string
#' risks <- ICU[, c("cancer", "admit", "uncons")]
#' risks[,1] <- ifelse(risks[,1]=="Yes", "Cancer", "")
#' risks[,2] <- ifelse(risks[,2]=="Emergency", "Emerg", "")
#' risks[,3] <- ifelse(risks[,3]=="Yes", "Uncons", "")
#' risks <- apply(risks, 1, paste, collapse="")
#' risks[risks==""] <- "(none)"
#' icu.fit$risks <- risks
#'
#' library(ggplot2)
#' ggplot(icu.fit, aes(x=age, y=prob, color=risks)) +
#' 	geom_point(size=2) +
#' 	geom_line(size=1.25, alpha=0.5) +
#' 	theme_bw() + ylab("Probability of death")
#'
#'
NULL





#' Cross-classification of job satisfaction by income
#'
#' This data set is a contingency table of job satisfaction by income for a
#' small sample of black males from the 1996 General Social Survey, as used by
#' Agresti (2002) for an example.
#'
#' Both `income` and `satisfaction` are ordinal variables, and are so
#' ordered in the table.  Measures of association, visualizations, and models
#' should take ordinality into account.
#'
#' @name JobSat
#' @docType data
#' @format A 4 x 4 contingency table of `income` by `satisfaction`,
#' with the following structure:
#' \preformatted{
#'   table [1:4, 1:4] 1 2 1 0 3 3 6 1 10 10 ...
#'   - attr(*, "dimnames")=List of 2
#'   ..$ income      : chr [1:4] "< 15k" "15-25k" "25-40k" "> 40k"
#'   ..$ satisfaction: chr [1:4] "VeryD" "LittleD" "ModerateS" "VeryS"
#' }
#'
#' @source Agresti, A. (2002). *Categorical Data Analysis*, John Wiley & Sons,
#' Table 2.8, p. 57.
#' @keywords datasets
#' @examples
#'
#' data(JobSat)
#' assocstats(JobSat)
#' GKgamma(JobSat)
#'
NULL





# Loglinear Model Utilities
#
#
# @aliases loglin-utilities conditional joint loglin2formula loglin2string
#          markov mutual saturated
# @param nf number of factors for which to generate the model
# @param table a contingency table used only for factor names in the model,
# typically the output from \code{\link[base]{table}} and possibly permuted
# with \code{aperm}
# @param factors names of factors used in the model formula when \code{table}
# is not specified
# @param with For \code{joint} and \code{conditional} models, \code{with}
# gives the indices of the factors against which all others are considered
# jointly or conditionally independent
# @param order For \code{markov}, this gives the order of the Markov chain
# model for the factors.  An \code{order=1} Markov chain allows associations
# among sequential pairs of factors, e.g., \code{[A,B], [B,C], [C,D]} \dots{}.
# An \code{order=2} Markov chain allows associations among sequential triples.
# @param x For the \code{loglin2*} functions, a list of terms in a loglinear
# model, such as returned by \code{conditional}, \code{joint}, \dots{}
# @param env For \code{loglin2formula}, environment in which to evaluate the
# formula
# @param brackets For \code{loglin2string}, characters to use to surround
# model terms.  Either a single character string containing two characters
# (e.g., \code{'[]'}) or a character vector of length two.
# @param sep For \code{loglin2string}, the separator character string used for
# factor names within a given model term
# @param collapse For \code{loglin2string}, the character string used between
# terms in the model string
# @param abbrev For \code{loglin2string}, whether and how to abbreviate the
# terms in the string representation. This has not yet been implemented.
# @return For the main model specification functions, \code{conditional},
# \code{joint}, \code{markov}, \dots{}, the result is a list of vectors
# (terms), where the elements in each vector are the names of the factors. The
# elements of the list are given names \code{term1, term2, \dots{}}.
# @author Michael Friendly
# @seealso \code{\link[stats]{loglin}}, \code{\link[MASS]{loglm}}
# @references These functions were inspired by the original SAS implementation
# of mosaic displays, described in the \emph{User's Guide},
# \url{http://www.datavis.ca/mosaics/mosaics.pdf}
# @keywords models
# EXAMPLES NOT COPIED
# @examples
#
# joint(3, table=HairEyeColor)
# # as a formula or string
# loglin2formula(joint(3, table=HairEyeColor))
# loglin2string(joint(3, table=HairEyeColor))
#
# joint(2, HairEyeColor)  # marginal model for [Hair] [Eye]
#
# # other possibilities
# joint(4, factors=letters, with=1)
# joint(5, factors=LETTERS)
# joint(5, factors=LETTERS, with=4:5)
#
# conditional(4)
# conditional(4, with=3:4)
#
# # use in mosaic displays or other strucplots
# mosaic(HairEyeColor, expected=joint(3))
# mosaic(HairEyeColor, expected=conditional(3))
#
# # use with MASS::loglm
# cond3 <- loglin2formula(conditional(3, table=HairEyeColor))
# cond3 <- loglin2formula(conditional(3))  # same, with factors 1,2,3
# require(MASS)
# loglm(cond3, data=HairEyeColor)
#
# saturated(3, HairEyeColor)
# loglin2formula(saturated(3, HairEyeColor))
# loglin2string(saturated(3, HairEyeColor))
# loglin2string(saturated(3, HairEyeColor), brackets='{}', sep=', ')
#
#
#NULL





# The Logarithmic Series Distribution
#
# The logarithmic series distribution is a long-tailed distribution introduced
# by Fisher et al. (1943) in connection with data on the abundance of
# individuals classified by species.
#
# These functions provide the density, distribution function, quantile
# function and random generation for the logarithmic series distribution with
# parameter \code{prob}.
#
# The logarithmic series distribution with \code{prob} = \eqn{p} has density
# \deqn{ p ( x ) = \alpha p^x / x } for \eqn{x = 1, 2, \dots}, where
# \eqn{\alpha= -1 / \log(1 - p)} and \eqn{0 < p <1}.  Note that counts
# \code{x==0} cannot occur.
#
# @aliases Logseries dlogseries plogseries qlogseries rlogseries
# @param x,q vector of quantiles representing the number of events.
# @param prob parameter for the distribution, \code{0 < prob < 1}
# @param log,log.p logical; if TRUE, probabilities \code{p} are given as
# \code{log(p)}
# @param lower.tail logical; if TRUE (default), probabilities are \eqn{P[X \le
# x]}{P[X <= x]}, otherwise, \eqn{P[X > x]}{P[X > x]}.
# @param p vector of probabilities
# @param max.value maximum value returned by \code{qlogseries}
# @param n number of observations for \code{rlogseries}
# @return \code{dlogseries} gives the density, \code{plogseries} gives the
# distribution function, \code{qlogseries} gives the quantile function, and
# \code{rlogseries} generates random deviates.
#
# %% ~Describe the value returned %% If it is a LIST, use %% \item{comp1
# }{Description of 'comp1'} %% \item{comp2 }{Description of 'comp2'} %% ...
# @author Michael Friendly, using original code modified from the
# \code{gamlss.dist} package by Mikis Stasinopoulos.
# @seealso \code{\link[stats]{Distributions}}
# @references \url{https://en.wikipedia.org/wiki/Logarithmic_distribution}
#
# Fisher, R. A. and Corbet, A. S. and Williams, C. B. (1943). The relation
# between the number of species and the number of individuals \emph{Journal of
# Animal Ecology}, 12, 42-58.
# @keywords distribution
# @examples
#
# XL <-expand.grid(x=1:5, p=c(0.33, 0.66, 0.99))
# lgs.df <- data.frame(XL, prob=dlogseries(XL[,"x"], XL[,"p"]))
# lgs.df$p = factor(lgs.df$p)
# str(lgs.df)
#
# require(lattice)
# mycol <- palette()[2:4]
# xyplot( prob ~ x, data=lgs.df, groups=p,
# 	xlab=list('Number of events (k)', cex=1.25),
# 	ylab=list('Probability',  cex=1.25),
# 	type='b', pch=15:17, lwd=2, cex=1.25, col=mycol,
# 	key = list(
# 					title = 'p',
# 					points = list(pch=15:17, col=mycol, cex=1.25),
# 					lines = list(lwd=2, col=mycol),
# 					text = list(levels(lgs.df$p)),
# 					x=0.9, y=0.98, corner=c(x=1, y=1)
# 					)
# 	)
#
#
# # random numbers
# hist(rlogseries(200, prob=.4), xlab='x')
# hist(rlogseries(200, prob=.8), xlab='x')
#
#
# NULL





#' Mammogram Ratings
#'
#' Kundel & Polansky (2003) give (possibly contrived) data on a set of 110
#' mammograms rated by two readers.
#'
#'
#' @name Mammograms
#' @docType data
#' @format
#' A frequency table in matrix form.  The format is:
#' \preformatted{
#'   num [1:4, 1:4] 34 6 2 0 10 8 5 1 2 8 ...
#' - attr(*, "dimnames")=List of 2
#' ..$ Reader2: chr [1:4] "Absent" "Minimal" "Moderate" "Severe"
#' ..$ Reader1: chr [1:4] "Absent" "Minimal" "Moderate" "Severe"
#' }
#'
#' @source
#' Kundel, H. L. & Polansky, M. (2003), "Measurement of Observer
#' Agreement", *Radiology*, **228**, 303-308, Table A1
#' @keywords datasets
#' @examples
#'
#' data(Mammograms)
#' B <- agreementplot(Mammograms, main="Mammogram ratings")
#' # agreement measures
#' B
#' Kappa(Mammograms)
#'
#' ## other displays
#' mosaic(Mammograms, shade=TRUE)
#'
#' sieve(Mammograms, pop = FALSE, shade = TRUE)
#' labeling_cells(text = Mammograms,
#'   gp_text = gpar(fontface = 2, cex=1.75))(as.table(Mammograms))
#'
NULL





#' Mental Impairment and Parents SES
#'
#' A 6 x 4 contingency table representing the cross-classification of mental
#' health status (`mental`) of 1660 young New York residents by their
#' parents' socioeconomic status (`ses`).
#'
#' @details
#' Both `ses` and `mental` can be treated as ordered factors or
#' integer scores.  For `ses`, 1="High" and 6="Low".
#'
#' @name Mental
#' @docType data
#' @format A data frame frequency table with 24 observations on the following 3
#' variables.
#' \describe{
#'   \item{`ses`}{an ordered factor with levels `1` < `2` < `3` < `4` < `5` < `6`}
#'   \item{`mental`}{an ordered factor with levels `Well` < `Mild` < `Moderate` < `Impaired`}
#'   \item{`Freq`}{cell frequency: a numeric vector}
#' }
#'
#' @references
#' Friendly, M. *Visualizing Categorical Data*, Cary, NC: SAS
#' Institute, 2000, Appendix B.7.
#' @source
#' Haberman, S. J.  *The Analysis of Qualitative Data: New
#' Developments*, Academic Press, 1979, Vol. II, p. 375.
#'
#' Srole, L.; Langner, T. S.; Michael, S. T.; Kirkpatrick, P.; Opler, M. K. &
#' Rennie, T. A. C.  *Mental Health in the Metropolis: The Midtown
#' Manhattan Study*, NYU Press, 1978, p. 289
#'
#' @keywords datasets
#' @examples
#'
#' data(Mental)
#' str(Mental)
#' (Mental.tab <- xtabs(Freq ~ ses + mental, data=Mental))
#'
#' # mosaic and sieve plots
#' mosaic(Mental.tab, gp=shading_Friendly)
#' sieve(Mental.tab, gp=shading_Friendly)
#'
#' if(require(ca)){
#'   plot(ca(Mental.tab), main="Mental impairment & SES", lines=TRUE)
#' }
#'
#'
NULL





#' Mice Depletion Data
#'
#' Data from Kastenbaum and Lamphiear (1959). The table gives the number of
#' depletions (deaths) in 657 litters of mice, classified by litter size and
#' treatment.  This data set has become a classic in the analysis of
#' contingency tables, yet unfortunately little information on the details of
#' the experiment has been published.
#'
#'
#' @name Mice
#' @docType data
#' @format A frequency data frame with 30 observations on the following 4
#' variables, representing a 5 x 2 x 3 contingency table.
#' \describe{
#'   \item{`litter`}{litter size, a numeric vector}
#'   \item{`treatment`}{treatment, a factor with levels `A` `B`}
#'   \item{`deaths`}{number of depletions, a factor with levels `0` `1` `2+`}
#'   \item{`Freq`}{cell frequency, a numeric vector}
#' }
#'
#' @references
#' Kastenbaum, M. A. & Lamphiear, D. E. (1959) Calculation of
#' chi-square to test the no three-factor interaction hypothesis.
#' *Biometrics*, 15, 107-115.
#' @source
#' Goodman, L. A. (1983) The analysis of dependence in
#' cross-classifications having ordered categories, using log-linear models for
#' frequencies and log-linear models for odds. *Biometrics*, 39, 149-160.
#' @keywords datasets
#' @examples
#'
#' data(Mice)
#' # make a table
#' ftable(mice.tab <- xtabs(Freq ~ litter + treatment + deaths, data=Mice))
#'
#' #library(vcd)
#' vcd::mosaic(mice.tab, shade=TRUE)
#'
#'
NULL





#' Social Mobility data
#'
#' Data on social mobility, recording the occupational category of fathers and
#' their sons.
#'
#'
#' @name Mobility
#' @docType data
#' @format A 2-dimensional array resulting from cross-tabulating 2 variables
#' for 19912 observations. The variable names and their levels are:
#'
#' \tabular{rll}{
#'   No \tab Name \tab Levels \cr
#'   1\tab `Son's_Occupation`\tab `"UpNonMan", "LoNonMan", "UpManual", "LoManual", "Farm"`\cr
#'     2\tab `Father's_Occupation`\tab `"UpNonMan", "LoNonMan", "UpManual", "LoManual", "Farm"`\cr
#' }
#'
#' @seealso \code{\link{Glass}}, \code{\link{Hauser79}},
#'      \code{\link{Yamaguchi87}} for other examples of mobility data.
#' @source
#'
#' Falguerolles, A. de and Mathieu, J. R. (1988).  *Proceedings of
#' COMPSTAT 88*, Copenhagen, Denmark, Springer-Verlag.
#'
#' % \cite{FeathermanHauser:78}
#'
#' Featherman, D. L. and Hauser, R. M. Occupations and social mobility in the
#' United States.  *Sociological Microjournal*, 12, Fiche 62. Copenhagen:
#' Sociological Institute.
#' @keywords datasets
#' @examples
#'
#' data(Mobility)
#' Mobility
#'
#' # independence model
#' MASS::loglm(~Father_Occupation + Son_Occupation, data = Mobility)
#'
#' vcd::mosaic(Mobility, shade=TRUE, legend = FALSE)
#'
#'
#'
NULL





#' Publications of PhD Candidates
#'
#' A data set giving the number of publications by doctoral candidates in
#' biochemistry in relation to various predictors, originally from Long (1997).
#'
#' There is a large number of zero counts. Is there evidence for a separate
#' group of non-publishers?
#'
#' In this version of the data set, `phdprestige` had been rounded to the
#' nearest integer. A Stata version with the continuous values was subsequently
#' found at <https://www.stata-press.com/data/lf2/couart2.dta>
#'
#' @name PhdPubs
#' @docType data
#' @format A data frame with 915 observations on the following 6 variables.
#' \describe{
#'   \item{`articles`}{number of articles published in the final three years of PhD studies}
#'   \item{`female`}{dummy variable for gender, coded `1` for female}
#'   \item{`married`}{dummy variable for marital status, coded `1` for married}
#'   \item{`kid5`}{number of young children, age 5 and under}
#'   \item{`phdprestige`}{prestige of the PhD department.  The higher the number the more prestigious the program.}
#'   \item{`mentor`}{number of publications by the mentor in the preceding three years}
#' }
#'
#' @source
#' Long, J. S. (1997). *Regression Models for Categorical and
#' Limited Dependent Variables*, Sage.
#'
#' Long, J. S. & Freese, J. (2006). *Regression Models for Categorical
#' Dependent Variables Using Stata*, 2nd Ed., Stata Press.
#' @keywords datasets
#' @examples
#'
#' data(PhdPubs)
#' # very uninformative
#' hist(PhdPubs$articles,
#'      breaks=0:19, col="pink", xlim=c(0,20),
#'      xlab="Number of Articles")
#'
#' library(vcd)
#' rootogram(goodfit(PhdPubs$articles), xlab="Number of Articles")
#'
#' # compare with negative binomial
#' rootogram(goodfit(PhdPubs$articles, type="nbinomial"),
#' 	xlab="Number of Articles", main="Negative binomial")
#'
#'
#'
NULL





#' Shakespeare's Word Type Frequencies
#'
#' This data set, from Efron and Thisted (1976), gives the number of distinct
#' word types (`Freq`) that appeared exactly once, twice, etc.
#' up to 100 times (`count`) in the complete works of Shakespeare.  In
#' these works, Shakespeare used 31,534 distinct words (types), comprising
#' 884,647 words in total.
#'
#' Efron & Thisted used this data to ask the question, "How many words did
#' Shakespeare know?"  Put another way, suppose another new corpus of works
#' by Shakespeare were discovered, also with 884,647 words. How many new word
#' types would appear? The answer to the main question involves contemplating
#' an infinite number of such new corpora.
#'
#' In addition to the words that appear `1:100` times, there are 846 words
#' that appear more than 100 times, not listed in this data set.
#'
#' @name ShakeWords
#' @docType data
#' @format A data frame with 100 observations on the following 2 variables.
#' \describe{
#'    \item{`count`}{the number of times a word type appeared in Shakespeare's written works}
#'    \item{`Freq`}{the number of different words (types) appearing with this count.}
#'   }
#'
#' @source
#' Bradley Efron and Ronald Thisted (1976). Estimating the Number of
#' Unseen Species: How Many Words Did Shakespeare Know? *Biometrika*, Vol.
#' 63, No. 3, pp. 435-447,
#' %<http://www.jstor.org/stable/2335721>
#' @keywords datasets
#' @examples
#'
#' data(ShakeWords)
#' str(ShakeWords)
#'
#' plot(sqrt(Freq) ~ count, data=ShakeWords)
#'
NULL





#' Passengers on the Titanic
#'
#' Data on passengers on the RMS Titanic, excluding the Crew and some
#' individual identifier variables.
#'
#' There are a number of related versions of the Titanic data, in various
#' formats. This version was derived from `ptitanic` in the
#' \pkg{rpart.plot} package, modifying it to remove the `Class 'labelled'`
#' attributes for some variables (inherited from Frank Harrell's
#' `titanic3` version) which caused problems with some applications,
#' notably `ggplot2`.
#'
#' Other versions:
#'
#' \code{\link[datasets]{Titanic}} is the 4-way frequency table of all 2201
#' people aboard the Titanic, including passengers and crew.
#'
#' @name Titanicp
#' @docType data
#' @format A data frame with 1309 observations on the following 6 variables.
#' \describe{
#'   \item{`pclass`}{a factor with levels `1st` `2nd` `3rd`}
#'   \item{`survived`}{a factor with levels `died` `survived`}
#'   \item{`sex`}{a factor with levels `female` `male`}
#'   \item{`age`}{passenger age in years (or fractions of a year, for children), a numeric vector; age is missing for 263 of the passengers}
#'   \item{`sibsp`}{number of siblings or spouses aboard, integer: `0:8`}
#'   \item{`parch`}{number of parents or children aboard, integer: `0:6`}
#' }
#'
#' @source
#'
#' The original R source for this dataset was compiled by Frank Harrell and
#' Robert Dawson:
#' <https://hbiostat.org/data/repo/titanic.txt>,
#' described in more detail in
#' <https://hbiostat.org/data/repo/titanic>
#'
#' For this version of the Titanic data, passenger details were deleted,
#' survived was cast as a factor, and the name changed to `Titanicp` to
#' minimize confusion with other versions.
#' @keywords datasets
#' @examples
#'
#' data(Titanicp)
#' ## maybe str(Titanicp) ; plot(Titanicp) ...
#'
NULL





#' Toxaemia Symptoms in Pregnancy
#'
#' Brown et al. (1983) gave these data on two signs of toxaemia, an abnormal
#' condition during pregnancy characterized by high blood pressure
#' (hypertension) and high levels of protein in the urine.  If untreated, both
#' the mother and baby are at risk of complications or death.
#'
#' The data frame `Toxaemia` represents 13384 expectant mothers in
#' Bradford, England in their first pregnancy, who were also classified
#' according to social class and the number of cigarettes smoked per day.
#'
#'
#' @name Toxaemia
#' @docType data
#' @format A data frame in frequency form representing a 5 x 3 x 2 x 2
#' contingency table, with 60 observations on the following 5 variables.
#' \describe{
#'   \item{`class`}{Social class of mother, a factor with levels `1` `2` `3` `4` `5`}
#'   \item{`smoke`}{Cigarettes smoked per day during pregnancy, a factor with levels `0` `1-19` `20+`}
#'   \item{`hyper`}{Hypertension level, a factor with levels `Low` `High`}
#'   \item{`urea`}{Protein urea level, a factor with levels `Low` `High`}
#'   \item{`Freq`}{frequency in each cell, a numeric vector}
#' }
#'
#' @references Friendly, M.  (2000), *Visualizing Categorical Data*, SAS
#' Institute, Cary, NC, Example 7.15.
#'
#' Friendly, M. and Meyer, D. (2016).  *Discrete Data Analysis with R:
#' Visualization and Modeling Techniques for Categorical and Count Data*.  Boca
#' Raton, FL: Chapman & Hall/CRC. <http://ddar.datavis.ca>. Example 10.10.
#' @source
#' Brown, P. J., Stone, J. and Ord-Smith, C. (1983), Toxaemic signs
#' during pregnancy. *JRSS, Series C, Applied Statistics*, 32, 69-72
#' @keywords datasets
#' @examples
#'
#' data(Toxaemia)
#'
#' tox.tab <- xtabs(Freq ~ class + smoke + hyper + urea, Toxaemia)
#' ftable(tox.tab, row.vars=1)
#'
#'
#' # symptoms by smoking
#' mosaic(~smoke + hyper + urea, data=tox.tab, shade=TRUE)
#'
#' # symptoms by social class
#' mosaic(~class + hyper + urea, data=tox.tab, shade=TRUE)
#'
#' # predictors
#' mosaic(~smoke + class, data=tox.tab, shade=TRUE)
#'
#' # responses
#' mosaic(~hyper + urea, data=tox.tab, shade=TRUE)
#'
#' # log odds ratios for urea and hypertension, by class and smoke
#' \dontrun{
#' LOR <-loddsratio(aperm(tox.tab))
#' LOR
#' }
#'
#'
NULL





#' TV Viewing Data
#'
#' This data set `TV` comprises a 5 x 11 x 3 contingency table based on
#' audience viewing data from Nielsen Media Research for the week starting
#' November 6, 1995.
#'
#' The original data, `tv.dat`, contains two additional networks: "Fox"
#' and "Other", with small frequencies. These levels were removed in the
#' current version. There is also a fourth factor, State transition
#' (turn the television Off, Switch channels, or Persist in viewing the current
#' channel). The `TV` data here includes only the Persist observations.
#'
#' @name TV
#' @docType data
#' @format A 5 x 11 x 3 array of cell frequencies with the following structure:
#' \preformatted{
#'   int [1:5, 1:11, 1:3] 146 244 233 174 294 151 181 161 183 281 ...
#'   - attr(*, "dimnames")=List of 3
#'   ..$ Day    : chr [1:5] "Monday" "Tuesday" "Wednesday" "Thursday" ...
#'   ..$ Time   : chr [1:11] "8:00" "8:15" "8:30" "8:45" ...
#'   ..$ Network: chr [1:3] "ABC" "CBS" "NBC"
#' }
#'
#' @references
#' Friendly, M. and Meyer, D. (2016).  *Discrete Data Analysis
#' with R: Visualization and Modeling Techniques for Categorical and Count
#' Data*.  Boca Raton, FL: Chapman & Hall/CRC. <http://ddar.datavis.ca>.
#'
#' Emerson, John W. Mosaic Displays in S-PLUS: A General Implementation and a
#' Case Study. *Statistical Graphics and Computing Newsletter*, 1998,
#' 9(1), 17--23, <http://www.stat.yale.edu/~jay/R/mosaic/v91.pdf>
#'
#' Hartigan, J. A. & Kleiner, B. A Mosaic of Television Ratings. *The
#' American Statistician*, 1984, 38, 32-35.
#'
#' @source The original data, `tv.dat`, came from the initial
#' implementation of mosaic displays in R by Jay Emerson (1998). Similar data
#' had been used by Hartigan and Kleiner (1984) as an illustration.
#' @keywords datasets
#' @examples
#'
#' data(TV)
#' structable(TV)
#' doubledecker(TV)
#'
#' # reduce number of levels of Time
#' TV.df <- as.data.frame.table(TV)
#' levels(TV.df$Time) <- rep(c("8:00-8:59", "9:00-9:59", "10:00-10:44"),
#'                           c(4, 4, 3))
#'
#' TV2 <- xtabs(Freq ~ Day + Time + Network, TV.df)
#'
#' # re-label for mosaic display
#' levels(TV.df$Time) <- c("8", "9", "10")
#' # fit model of joint independence, showing association of Network with Day*Time
#' mosaic(~ Day + Network + Time,
#'   data = TV.df,
#'   expected = ~ Day:Time + Network,
#'   legend = FALSE)
#'
#'
#' # with doubledecker arrangement
#' mosaic(~ Day + Network + Time,
#'   data = TV.df,
#'   expected = ~ Day:Time + Network,
#'   split = c(TRUE, TRUE, FALSE),
#'   spacing = spacing_highlighting,
#'   legend = FALSE)
#'
NULL



#' Student Opinion about the Vietnam War
#'
#' A survey of student opinion on the Vietnam War was taken at the University
#' of North Carolina at Chapel Hill in May 1967 and published in the student
#' newspaper. Students were asked to fill in ballot papers stating which policy
#' out of A,B,C or D they supported. Responses were cross-classified by
#' gender/year.
#'
#' The response categories were:
#' \describe{
#'   \item{`A`}{Defeat North Vietnam by widespread bombing and land invasion}
#'   \item{`B`}{Maintain the present policy}
#'   \item{`C`}{De-escalate military activity, stop bombing and begin negotiations}
#'   \item{`D`}{Withdraw military forces immediately}
#' }
#'
#'
#' For some analyses, it is useful to treat `year` as numeric, and
#' possibly assign grad students a value `year=7`.
#'
#' @name Vietnam
#' @docType data
#' @format A frequency data frame with 40 observations representing a 2 x 5 x 4
#' contingency table on the following 4 variables.
#' \describe{
#'   \item{`sex`}{a factor with levels `Female` `Male`}
#'   \item{`year`}{year of study, an ordered factor with levels
#'     `Freshmen`, `Sophomore`, `Junior`, `Senior`, `Grad student`}
#'   \item{`response`}{a factor with levels `A` `B` `C` `D`}
#'   \item{`Freq`}{cell frequency, a numeric vector}
#' }
#'
#' @references Friendly, M.  (2000), *Visualizing Categorical Data*, SAS
#' Institute, Cary, NC, Example 7.9.
#' @source Aitkin, M. et al., 1989, *Statistical Modelling in GLIM*
#' @keywords datasets
#' @examples
#'
#' data(Vietnam)
#' ## maybe str(Vietnam) ; plot(Vietnam) ...
#'
NULL





#' Race and Politics in the 1980 Presidential Vote
#'
#' Data from the 1982 General Social Survey on votes in the 1980 U.S.
#' presidential election in relation to race and political conservatism.
#'
#' The data contains a number of sampling zeros in the frequencies of NonWhites
#' voting for Ronald Reagan.
#'
#' @name Vote1980
#' @docType data
#' @format A frequency data frame representing a 2 x 7 x 2 table, with 28
#' observations on the following 4 variables.
#' \describe{
#'   \item{`race`}{a factor with levels `NonWhite` `White`}
#'   \item{`conservatism`}{a factor with levels `1` `2` `3` `4` `5` `6` `7`,
#'        `1`=most liberal, `7`=most conservative}
#'   \item{`votefor`}{a factor with levels `Carter` `Reagan`; `Carter` represents Jimmy Carter or other.}
#'   \item{`Freq`}{a numeric vector}
#' }
#'
#' @references
#' Agresti, A. (1990) *Categorical Data Analysis*, Table 4.12.
#' New York: Wiley-Interscience.
#'
#' Friendly, M. (2000) *Visualizing Categorical Data*, Example 7.5. Cary,
#' NC: SAS Institute.
#' @source
#' Clogg, C. & Shockey, J. W. (1988). Multivariate Analysis of Discrete Data.
#' In Nesselroade, J. R. & Cattell, R. B. (eds.), *Handbook of
#' Multivariate Experimental Psychology*, New York: Plenum Press.
#' @keywords datasets
#' @examples
#'
#' data(Vote1980)
#' fourfold(xtabs(Freq ~ race + votefor + conservatism,
#'   data=Vote1980),
#'   mfrow=c(2,4))
#'
#'
NULL





#' Worker Satisfaction Data
#'
#' Blue collar workers' job satisfaction from a large-scale investigation in
#' Denmark in 1968 (Andersen, 1991).
#'
#'
#' @name WorkerSat
#' @docType data
#' @format A frequency data frame with 8 observations on the following 4
#' variables, representing the 2 x 2 x 2 classification of 715 cases.
#' \describe{
#'   \item{`Manage`}{Quality of management, an ordered factor with levels `bad` < `good`}
#'   \item{`Super`}{Supervisor satisfaction, an ordered factor with levels `low` < `high`}
#'   \item{`Worker`}{Worker job satisfaction, an ordered factor with levels `low` < `high`}
#'   \item{`Freq`}{a numeric vector}
#' }
#' @references Andersen, E. B. (1991) *Statistical Analysis of Categorical Data*,
#' 2nd Ed., Springer-Verlag.
#' @source
#' Originally from <https://online.stat.psu.edu/stat504/lesson/10/>
#' @keywords datasets
#' @examples
#'
#' data(WorkerSat)
#'
#' worker.tab <- xtabs(Freq ~ Worker + Super + Manage, data=WorkerSat)
#' fourfold(worker.tab)
#' mosaic(worker.tab, shade=TRUE)
#'
#'
NULL





#' Occupational Mobility in Three Countries
#'
#' Yamaguchi (1987) presented this three-way frequency table, cross-classifying
#' occupational categories of sons and fathers in the United States, United
#' Kingdom and Japan.  This data set has become a classic for models comparing
#' two-way mobility tables across layers corresponding to countries, groups or
#' time (e.g., Goodman and Hout, 1998; Xie, 1992).
#'
#' The US data were derived from the 1973 OCG-II survey; those for the UK from
#' the 1972 Oxford Social Mobility Survey; those for Japan came from the 1975
#' Social Stratification and Mobility survey. They pertain to men aged 20-64.
#'
#' Five status categories -- upper and lower nonmanuals (`UpNM`,
#' `LoNM`), upper and lower manuals (`UpM`, `LoM`), and farm
#' (`Farm`) -- are used for both fathers' occupations and sons' occupations.
#'
#' Upper nonmanuals are professionals, managers, and officials; lower
#' nonmanuals are proprietors, sales workers, and clerical workers; upper
#' manuals are skilled workers; lower manuals are semi-skilled and unskilled
#' nonfarm workers; and farm workers are farmers and farm laborers.
#'
#' Some of the models from Xie (1992), Table 1, are fit in
#' `demo(yamaguchi-xie)`.
#'
#' @name Yamaguchi87
#' @docType data
#' @format A frequency data frame with 75 observations on the following 4
#' variables. The total sample size is 28887.
#'
#' \describe{
#'   \item{`Son`}{a factor with levels `UpNM` `LoNM` `UpM` `LoM` `Farm`}
#'   \item{`Father`}{a factor with levels `UpNM` `LoNM` `UpM` `LoM` `Farm`}
#'   \item{`Country`}{a factor with levels `US` `UK` `Japan`}
#'   \item{`Freq`}{a numeric vector}
#' }
#'
#' @references Goodman, L. A. and Hout, M. (1998). Statistical Methods and
#' Graphical Displays for Analyzing How the Association Between Two Qualitative
#' Variables Differs Among Countries, Among Groups, Or Over Time: A Modified
#' Regression-Type Approach. *Sociological Methodology*, 28 (1), 175-230.
#'
#' Xie, Yu (1992). The log-multiplicative layer effect model for comparing
#' mobility tables. *American Sociological Review*, 57 (June), 380-395.
#' @source Yamaguchi, K. (1987).  Models for comparing mobility tables: toward
#' parsimony and substance, *American Sociological Review*, vol. 52
#' (Aug.), 482-494, Table 1
#' @keywords datasets
#' @examples
#'
#' data(Yamaguchi87)
#' # reproduce Table 1
#' structable(~ Father + Son + Country, Yamaguchi87)
#' # create table form
#' Yama.tab <- xtabs(Freq ~ Son + Father + Country, data=Yamaguchi87)
#'
#' # define mosaic labeling_args for convenient reuse in 3-way displays
#' largs <- list(rot_labels=c(right=0), offset_varnames = c(right = 0.6),
#'               offset_labels = c(right = 0.2),
#'               set_varnames = c(Son="Son's status", Father="Father's status")
#'              )
#'
#' ###################################
#' # Fit some models & display mosaics
#'
#' # Mutual independence
#' yama.indep <- glm(Freq ~ Son + Father + Country,
#'   data=Yamaguchi87,
#'   family=poisson)
#' anova(yama.indep)
#'
#' mosaic(yama.indep, ~Son+Father, main="[S][F] ignoring country")
#'
#' mosaic(yama.indep, ~Country + Son + Father, condvars="Country",
#'        labeling_args=largs,
#'        main='[S][F][C] Mutual independence')
#'
#' # no association between S and F given country ('perfect mobility')
#' # asserts same associations for all countries
#' yama.noRC <- glm(Freq ~ (Son + Father) * Country,
#'   data=Yamaguchi87,
#'   family=poisson)
#' anova(yama.noRC)
#'
#' mosaic(yama.noRC, ~Country + Son + Father, condvars="Country",
#'        labeling_args=largs,
#'        main="[SC][FC] No [SF] (perfect mobility)")
#'
#' # ignore diagonal cells
#' yama.quasi <- update(yama.noRC, ~ . + Diag(Son,Father):Country)
#' anova(yama.quasi)
#'
#' mosaic(yama.quasi, ~Son + Father, main="Quasi [S][F]")
#'
#' ## see also:
#' # demo(yamaguchi-xie)
#' ##
#'
NULL

Try the vcdExtra package in your browser

Any scripts or data that you put into this service are public.

vcdExtra documentation built on Dec. 11, 2025, 9:06 a.m.