# R/datasummary.R
#
# Defines functions: datasummary
#
# Documented in: datasummary

#' Data Summary
#'
#' Summarises the data frame returned by \code{totaldata()} as a table of
#' counts, a plot, or a set of hypothesis tests.
#'
#' @param type A single string selecting the kind of output. The possible
#'     values are \code{count}, \code{timeline}, \code{proportion},
#'     \code{comparison} and \code{differencetest}.
#'     \code{count} gives the \code{summary()} of the \code{year},
#'     \code{major}, \code{honors}, \code{degree} and \code{ethnicity}
#'     columns (per-level frequencies when those columns are factors).
#'     \code{timeline} creates a line graph of the number of Jewish graduates
#'     in each degree/merit category per year.
#'     \code{proportion} creates the same graph with each count divided by
#'     the number of Jewish graduates in that year.
#'     \code{comparison} does the same thing as \code{proportion}, but also
#'     includes the proportions for non-Jewish graduates so comparisons can
#'     be made.
#'     \code{differencetest} runs, for every degree/merit category, a
#'     two-sided two-sample \code{t.test()} on the yearly proportions of
#'     Jewish versus non-Jewish graduates (H0: difference in means is 0).
#'
#' @return Depends on \code{type}: a one-column matrix for \code{count}; a
#'     \code{ggplot} object for \code{timeline}, \code{proportion} and
#'     \code{comparison}; a list of \code{htest} objects, named by degree
#'     category, for \code{differencetest}. Any other value raises a warning
#'     and returns the warning message invisibly.
#' @examples
#' \dontrun{
#' datasummary("count")
#' datasummary("comparison")
#' datasummary("differencetest")
#' }
#'
#' @import ggplot2
#' @export

datasummary <- function(type) {

  ## Only fall back to sourcing the script when totaldata() is not already
  ## available (e.g. from the package namespace); the hard-coded home path
  ## exists on the original author's machine only.
  if (!exists("totaldata", mode = "function")) {
    source("~/names/R/totaldata.R")
  }
  dat <- totaldata()

  ## Long-format counts: one row per (jewstatus, year, degree) combination
  jew <- data.frame(table(dat$Jewstatus, dat$year, dat$degree))
  colnames(jew) <- c("jewstatus", "year", "degree", "freq")

  if (type == "count") {
    ## One-column matrix stacking the summaries of the descriptive columns
    cbind(c(summary(dat$year),
            summary(dat$major),
            summary(dat$honors),
            summary(dat$degree),
            summary(dat$ethnicity)))

  } else if (type == "timeline") {
    ## Raw yearly counts of Jewish graduates per degree category
    .merit_plot(
      jew[jew$jewstatus == "jewish", ],
      ggplot2::aes(x = year, y = freq, group = degree, col = degree),
      y_label = "Number of Jews",
      title = "Merit Levels of Graduating Jews\n"
    )

  } else if (type == "proportion") {
    ## Yearly counts divided by the number of Jewish graduates that year
    .merit_plot(
      .status_proportions(jew, dat, "jewish"),
      ggplot2::aes(x = year, y = proportion, group = degree, col = degree),
      y_label = "Proportion of Jews",
      title = "Proportion of Merit Levels of Jews\n"
    )

  } else if (type == "comparison") {
    ## Both groups in one data set; point shape tells the groups apart
    propdat <- rbind(.status_proportions(jew, dat, "jewish"),
                     .status_proportions(jew, dat, "not jewish"))
    .merit_plot(
      propdat,
      ggplot2::aes(x = year, y = proportion,
                   group = interaction(degree, jewstatus),
                   col = degree, shape = jewstatus),
      y_label = "Proportion of Jews",
      title = "Proportion of Merit Levels of Jews Vs. Non-Jewish\n"
    )

  } else if (type == "differencetest") {
    jewonly <- .status_proportions(jew, dat, "jewish")
    nonjewish <- .status_proportions(jew, dat, "not jewish")

    ## One test per degree category actually present in the data. All
    ## results are returned; a bare t.test() call in the middle of a
    ## function body is neither printed nor kept.
    degrees <- unique(as.character(jew$degree))
    tests <- lapply(degrees, function(deg) {
      result <- stats::t.test(jewonly$proportion[jewonly$degree == deg],
                              nonjewish$proportion[nonjewish$degree == deg])
      result$data.name <- paste0(
        "yearly proportions of jewish and not jewish graduates (", deg, ")"
      )
      result
    })
    names(tests) <- degrees
    tests

  } else {
    warning("The specified type does not exist. Enter ?datasummary or help(datasummary) to view all the summary options available.")
  }
}

## Rows of `jew` for one `jewstatus` value, with a `proportion` column added:
## each count divided by that group's total number of people in the same year.
## Totals are matched by year, so the result does not depend on row order or
## on how many degree categories the data contains.
.status_proportions <- function(jew, dat, status) {
  yearly <- data.frame(table(dat$year, dat$Jewstatus))
  totals <- yearly[yearly$Var2 == status, ]
  rows <- jew[jew$jewstatus == status, ]
  rows$proportion <- rows$freq / totals$Freq[match(rows$year, totals$Var1)]
  rows
}

## Shared point-and-line chart used by the plotting types. `alpha` is set on
## the layers because extra arguments given to ggplot() itself are not used.
.merit_plot <- function(data, mapping, y_label, title) {
  ggplot2::ggplot(data = data, mapping = mapping) +
    ggplot2::geom_point(size = 3, alpha = 0.7) +
    ggplot2::geom_line(alpha = 0.7) +
    ggplot2::ylab(y_label) +
    ggplot2::xlab("Year") +
    ggplot2::ggtitle(title) +
    ggplot2::scale_color_discrete(name = "Legend") +
    ggplot2::theme(
      plot.title = ggplot2::element_text(size = 16, face = "bold",
                                         color = "purple")
    )
}
# jian13579/names documentation built on May 19, 2019, 9:29 a.m.