#' Read one year of Williams College geographic distribution data
#'
#' Loads a text file from the "extdata" directory of the wgd package and
#' turns it into a two-column table of state names and student counts.
#' Two raw layouts are supported: the 2001-2010 files (state names and
#' counts on separate lines, told apart by string length) and the 2011-2016
#' files (one "State count" entry per line).
#'
#' Rows mentioning "Armed" or "Islands" are excluded. Twelve states and
#' territories that the raw files may leave out are added with a count
#' of "0" when they are not listed.
#'
#' @param dataset File name, one of "williams2001.txt" through
#'   "williams2016.txt".
#' @return A data frame with character columns `state` and `number`, sorted
#'   by state name. State names have spaces removed (e.g. "NewYork").
readEphData <- function(dataset) {
  early_files <- paste0("williams", 2001:2010, ".txt")
  late_files <- paste0("williams", 2011:2016, ".txt")

  if (dataset %in% early_files) {
    raw <- readLines(system.file("extdata", dataset, package = "wgd"), warn = FALSE)
    # Strip spaces, then every punctuation character, then drop empty entries.
    cleaned <- gsub("[[:punct:]]", "", gsub(" ", "", raw))
    cleaned <- cleaned[nchar(cleaned) > 0]
    # Entries longer than three characters are state names; the rest are counts.
    state <- cleaned[nchar(cleaned) > 3]
    number <- cleaned[nchar(cleaned) <= 3]
  } else if (dataset %in% late_files) {
    raw <- readLines(system.file("extdata", dataset, package = "wgd"), warn = FALSE)
    raw <- raw[nchar(raw) > 1] # drop empty / one-character lines
    squeezed <- gsub(" ", "", raw)
    state <- gsub("[0-9]", "", squeezed) # what is left after removing digits
    number <- gsub("[a-zA-Z]", "", squeezed) # what is left after removing letters
  } else {
    # Previously this fell through to an opaque "object 'total' not found".
    stop("Unknown dataset '", dataset,
         "': expected williams2001.txt through williams2016.txt",
         call. = FALSE)
  }

  # n x 2 character matrix; the number of rows varies from file to file.
  total <- cbind(state, number)

  # Exclude Armed Forces / Virgin Islands style entries. Matching on the
  # state column only and keeping drop = FALSE preserves the matrix shape
  # however many rows remain.
  excluded <- grepl("Armed|Islands", total[, 1])
  total <- total[!excluded, , drop = FALSE]

  # States that may have zero students and are then absent from the raw file.
  # A name counts as present with or without a trailing tab.
  often_absent <- c("Nebraska", "PuertoRico", "Wyoming", "WestVirginia",
                    "Guam", "Kansas", "Mississippi", "NorthDakota",
                    "Nevada", "Montana", "Oklahoma", "SouthDakota")
  listed <- often_absent %in% total[, 1] |
    paste0(often_absent, "\t") %in% total[, 1]
  absent <- often_absent[!listed]
  filler <- matrix(c(absent, rep("0", length(absent))), ncol = 2)

  full_data <- rbind(total, filler)
  # Keep only rows whose state name is longer than one character.
  full_data <- full_data[nchar(full_data[, 1]) > 1, , drop = FALSE]
  full_data <- full_data[order(full_data[, 1]), , drop = FALSE]
  as.data.frame(full_data, stringsAsFactors = FALSE)
}
#' Plot enrollment over time for one state, or compare notable states
#'
#' Reads all sixteen yearly files through readEphData(), lines the counts up
#' by year and draws either a bar plot for a single state or a line plot
#' comparing California, Florida, Hawaii, Maine and Massachusetts.
#'
#' @param state A state name exactly as it appears in the data (no spaces,
#'   e.g. "DistrictofColumbia"), or "Notable" for the comparison plot.
#' @return Called for its plot. The single-state branch returns the value of
#'   points(); the "Notable" branch returns the value of legend().
plot_state <- function(state) {
  years <- as.character(2001:2016)
  files <- paste0("williams", 2001:2016, ".txt")

  # Yearly tables side by side: odd columns are state names, even are counts.
  williams_data <- as.data.frame(do.call(cbind, lapply(files, readEphData)),
                                 stringsAsFactors = FALSE)
  counts <- as.data.frame(lapply(williams_data[, seq(2, 32, by = 2)], as.numeric))
  dat <- cbind(williams_data[, 31], counts) # column 31: 2016 state names
  names(dat) <- c("State", years)

  if (state %in% dat[, 1]) {
    enrollment <- as.numeric(dat[dat$State == state, 2:ncol(dat)])
    names(enrollment) <- years
    bar_mid <- barplot(enrollment, main = state, bg = "grey", col = "purple",
                       ylab = "Number of Students")
    # White trace drawn at half the bar height.
    lines(x = bar_mid, y = enrollment / 2, col = "white")
    return(points(x = bar_mid, y = enrollment / 2, col = "white"))
  }

  if (state != "Notable") {
    stop("Input any valid state or Notable. Example: plot_state('DistrictofColumbia')")
  }

  is_notable <- dat$State %in% c("California", "Florida", "Hawaii",
                                 "Massachusetts", "Maine")
  # Fixed: the column range used ncol(data), but no local `data` exists at
  # this point, so the call failed. The table being subset is `dat`.
  notable <- as.matrix(dat[is_notable, 2:ncol(dat)])
  # Legend labels come from the same rows as the lines, so they always match.
  notable_names <- as.character(dat$State[is_notable])

  plot(notable[1, ], ylim = c(0, 450), main = "Notable States", xlab = "Year",
       ylab = "Number of Students", axes = FALSE, bty = "L")
  axis(side = 1, at = c(1, 6, 11, 16),
       labels = c("2001", "2006", "2011", "2016"), tck = -.05)
  axis(side = 2)

  colors <- c("red", "black", "purple", "green", "blue")
  symbols <- c(15, 16, 17, 18, 16)
  for (i in seq_len(nrow(notable))) {
    lines(notable[i, ], type = "o", pch = symbols[i], lty = 2, col = colors[i])
  }

  # The legend sits above the plot region, which needs clipping turned off;
  # restore the caller's setting when the function exits.
  old_par <- par(xpd = TRUE)
  on.exit(par(old_par), add = TRUE)
  legend(x = 10, y = 525, notable_names, bty = "n",
         fill = colors[seq_along(notable_names)], cex = 0.6)
}
#' Plot enrollment over time for one region, or compare all regions
#'
#' Sums the yearly student counts of the states in each of six regions and
#' draws either a bar plot for one region or a line plot of all six.
#' Alaska, Hawaii, Guam and PuertoRico are not assigned to any region.
#'
#' @param region One of "West", "MountainWest", "South", "Midwest",
#'   "Northeast", "MidAtlantic", or "All".
#' @return Called for its plot. A single-region call returns the value of
#'   points(); "All" returns the value of legend().
plot_region <- function(region) {
  # State names must match the data exactly: no spaces, standard spelling.
  # Fixed entries that could never match: "Tennesse", "Deleware", "New York",
  # "New Hampshire", "Rhode Island".
  region_states <- list(
    West = c("California", "Oregon", "Washington", "Idaho", "Nevada",
             "Arizona"),
    MountainWest = c("NewMexico", "Colorado", "Wyoming", "Montana",
                     "NorthDakota", "SouthDakota", "Utah"),
    South = c("Oklahoma", "Texas", "Arkansas", "Mississippi", "Alabama",
              "Louisiana", "Florida", "Georgia", "SouthCarolina",
              "NorthCarolina", "Tennessee"),
    Midwest = c("Nebraska", "Kansas", "Iowa", "Missouri", "Ohio", "Kentucky",
                "Indiana", "Illinois", "Wisconsin", "Michigan", "Minnesota"),
    Northeast = c("NewYork", "Massachusetts", "Vermont", "NewHampshire",
                  "Maine", "RhodeIsland", "Connecticut"),
    MidAtlantic = c("Virginia", "Delaware", "DistrictofColumbia", "NewJersey",
                    "WestVirginia", "Pennsylvania", "Maryland")
  )

  # Reject a bad argument before reading sixteen files.
  if (!(region %in% c(names(region_states), "All"))) {
    stop("Input any valid region or All. Example: plot_region('MountainWest')")
  }

  years <- as.character(2001:2016)
  files <- paste0("williams", 2001:2016, ".txt")

  # Yearly tables side by side: odd columns are state names, even are counts.
  williams_data <- as.data.frame(do.call(cbind, lapply(files, readEphData)),
                                 stringsAsFactors = FALSE)
  counts <- as.data.frame(lapply(williams_data[, seq(2, 32, by = 2)], as.numeric))
  dat <- cbind(williams_data[, 31], counts) # column 31: 2016 state names
  names(dat) <- c("State", years)

  # One named vector of yearly totals per region.
  totals <- lapply(region_states, function(members) {
    colSums(dat[dat$State %in% members, -1, drop = FALSE])
  })

  if (region != "All") {
    region_total <- totals[[region]]
    bar_mid <- barplot(region_total, main = region,
                       ylab = "Number of Students", col = "purple")
    # White trace drawn at half the bar height.
    lines(x = bar_mid, y = region_total / 2, col = "white")
    return(points(x = bar_mid, y = region_total / 2, col = "white"))
  }

  data <- do.call(rbind, totals) # one row per region, in list order
  plot(data[1, ], ylim = c(0, 700), main = "All Regions", xlab = "Year",
       ylab = "Number of Students", axes = FALSE, bty = "L")
  axis(side = 1, at = c(1, 6, 11, 16),
       labels = c("2001", "2006", "2011", "2016"), tck = -.05)
  axis(side = 2)

  colors <- c("red", "black", "purple", "green", "blue", "orange")
  symbols <- c(15, 16, 17, 18, 16, 16)
  for (i in seq_len(nrow(data))) {
    lines(data[i, ], type = "o", pch = symbols[i], lty = 2, col = colors[i])
  }

  # The legend sits above the plot region, which needs clipping turned off;
  # restore the caller's setting when the function exits.
  old_par <- par(xpd = TRUE)
  on.exit(par(old_par), add = TRUE)
  legend(x = 13.5, y = 850, rownames(data), bty = "n", fill = colors, cex = 0.5)
}
#' Summarise total enrollment across all states and years
#'
#' Reads all sixteen yearly files through readEphData() and combines them
#' into one table with a State column followed by one count column per year.
#'
#' @param type One of "Summary" (return the combined table), "US" (bar plot
#'   of the yearly column totals) or "SD" (standard deviation of the yearly
#'   totals for 2001-2010 and for 2011-2016).
#' @return For "Summary", the combined data frame. For "SD", a one-row data
#'   frame named "SD" with columns "2001-2010" and "2011-2016". For "US",
#'   the value of points(), after drawing the plot.
total_dist <- function(type) {
  year_files <- c("williams2001.txt", "williams2002.txt", "williams2003.txt",
                  "williams2004.txt", "williams2005.txt", "williams2006.txt",
                  "williams2007.txt", "williams2008.txt", "williams2009.txt",
                  "williams2010.txt", "williams2011.txt", "williams2012.txt",
                  "williams2013.txt", "williams2014.txt", "williams2015.txt",
                  "williams2016.txt")
  year_labels <- c("2001", "2002", "2003", "2004", "2005", "2006", "2007",
                   "2008", "2009", "2010", "2011", "2012", "2013", "2014",
                   "2015", "2016")

  # Yearly tables side by side: odd columns are state names, even are counts.
  per_year <- lapply(year_files, readEphData)
  williams_data <- as.data.frame(do.call(cbind, per_year), stringsAsFactors = FALSE)

  state_names <- williams_data[, 31] # state column of the 2016 table
  dat <- as.data.frame(lapply(williams_data[, seq(2, 32, by = 2)], as.numeric))
  dat <- cbind(state_names, dat)
  names(dat) <- c("State", year_labels)

  if (type == "Summary") {
    return(dat)
  }
  if (type != "US" && type != "SD") {
    stop("Input Summary, US, or SD")
  }

  # Column totals per year; includes Guam and Puerto Rico.
  USenrollment <- colSums(dat[, -1])

  if (type == "SD") {
    first_block <- sd(USenrollment[1:10])
    second_block <- sd(USenrollment[11:16])
    enrollment_var <- as.data.frame(cbind(first_block, second_block))
    names(enrollment_var) <- c("2001-2010", "2011-2016")
    row.names(enrollment_var) <- "SD"
    return(enrollment_var)
  }

  bar_mid <- barplot(USenrollment, main = "Total United States Enrollment",
                     col = "purple", ylab = "Number of Students")
  # White trace drawn at half the bar height.
  lines(x = bar_mid, y = USenrollment / 2, col = "white")
  points(x = bar_mid, y = USenrollment / 2, col = "white")
}
# Read every yearly file and place the resulting tables side by side:
# odd columns hold state names, even columns hold student counts.
williams_data <- as.data.frame(
  do.call(cbind, lapply(
    c("williams2001.txt", "williams2002.txt", "williams2003.txt",
      "williams2004.txt", "williams2005.txt", "williams2006.txt",
      "williams2007.txt", "williams2008.txt", "williams2009.txt",
      "williams2010.txt", "williams2011.txt", "williams2012.txt",
      "williams2013.txt", "williams2014.txt", "williams2015.txt",
      "williams2016.txt"),
    readEphData
  )),
  stringsAsFactors = FALSE
)
# State names from the 2016 table (column 31) followed by every count column.
frame <- williams_data[, c(31, seq(from = 2, to = 32, by = 2))]
# Convert the count columns to numeric and put the state names back in front.
dat <- cbind(williams_data[, 31], as.data.frame(lapply(frame[, -1], as.numeric)))
names(dat) <- c("State", "2001", "2002", "2003", "2004", "2005", "2006",
                "2007", "2008", "2009", "2010", "2011", "2012", "2013",
                "2014", "2015", "2016")

Introduction

The geographic distribution of the student body, or more simply, where students are from, is a dynamic that heavily influences the social, ideological, and academic environment of the college. An understanding of how this structure changes over time is important to the diversity of the college and should be watched closely. The package wgd uses string manipulation on tables extracted from PDFs and organizes the information in the form of a usable data frame.

The package wgd also creates a variety of statistics and graphics that illustrate changes in the geographic distribution on a country, region, and state level. Users can create bar plots showing how the number of students from a certain state or region changes over time. These introductory graphics show geographic distribution trends in a clear and concise fashion. The data can also be used to perform more advanced analysis. For example, external data sets could be used to determine what variables are associated with changes in the Williams College geographic distribution.

Data

The raw data used for this analysis is in the form of PDFs posted to the website of the Office of the Registrar of Williams College. The data of interest are stored in tables inside the PDFs, and are available for the years 2001 through 2016. In 2011, the Office of the Registrar began recording the geographic distribution data in a different way than in previous years. Thus, two different methods were employed to read the data into a usable format, one for the years 2001 to 2010, and one for the years 2011 to 2016.

For 2001 to 2010, the data was converted into text files using PDFelement, an online PDF to text editor. The result is a character vector containing two distinct sections. The first x lines reference the names of the states and the next x lines reference the number of students from each state. There are many unneeded spaces and periods in the raw text file. To clean this, spaces and non-alphanumeric characters are removed.

rm_spaces <- gsub(" ", "", frame) 
rm_stuff <- gsub("[[:punct:]]", "", rm_spaces)

Two new character vectors are created, one referring to the name of the state and one referring to the number of students from that state. Binding them together creates the format we want.

names <- rm_one[nchar(rm_one) > 3] 
numbers <- rm_one[nchar(rm_one) <= 3]
full <- cbind(names, numbers)

For 2011 to 2016, the data can simply be copied and pasted from the PDF into a text file, and then manually edited to contain a new line for each state reference. Unneeded spaces are removed, and vectors of the names of the states and the number of students from each state are created and bound together.

rm_spaces <- gsub(" ", "", data) 
states <- gsub("[0-9]", "", rm_spaces) 
numbers <- gsub("[a-zA-Z]", "", rm_spaces) 
total <- cbind(states, numbers)

These operations yield a two column matrix, one column representing the names of the states and one column representing the number of students from each state. However, this 2 column matrix has a variable number of rows depending on the text file that is being manipulated. In the instance that a state has zero students from it, the raw data does not contain that state in the table. Since certain states sometimes have zero students at Williams, matrices will often contain a different number of rows and have missing states. In order to report data on all states between the years of 2001 and 2016, the relevant data must be added in. A solution is to create a 1 x 2 matrix with the relevant information and bind it to the data. Several if statements check for this condition and create the missing data. Do not mind the matrix created for the else condition, as it is a placeholder and will be deleted.

if(identical(total[total[, 1] %in% c("Nebraska", "Nebraska\t"), 1], character(0)))
{ a1 <- matrix(c("Nebraska", "0"), 1, 2) } else { a1 <- matrix(1:2, 1, 2) }

Next, the original data and the new matrices are bound together. The placeholder matrices are removed and the matrix is ordered alphabetically by state name. The matrix is coerced into a data frame that has 53 rows: the 50 states, District of Columbia, Guam, and Puerto Rico.

readEphData("williams2012.txt")[1:4, ]

Use readEphData

The information in the locally stored text files can be loaded and parsed using the readEphData function. The function creates a dataframe containing the geographic distribution of students for the year of the user coded text file. Subsequent functions combine the data and format it into a data frame. The function has one argument and can be used as follows.

readEphData("dataset")

Use plot_state

The function plot_state is used to generate graphics that show the geographic distribution over time for a user-coded state. The function subsets the data for a particular state and then displays the information in a bar plot. The function has one argument and is used as follows.

plot_state("State")

The function can display graphics for any of the 50 states, District of Columbia, Guam, and Puerto Rico. The user should code the state without spaces and with capital letters where normal. For example:

plot_state("Virginia")
plot_state("Florida")

Use plot_region

The function plot_region is used to generate graphics that show the geographic distribution over time for a user-coded region. The function subsets the data for a particular region and then displays the information in a plot. The function has one argument and is used as follows.

plot_region("Region")

The function can display graphics for any of the six regions that this package defines. The regions and states they contain are as follows:

The user should code the region without spaces and with capital letters where normal. For example:

plot_region("West")
plot_region("Northeast")

Use total_dist

The function total_dist uses the geographic distribution data from the readEphData function to generate graphics and summary statistics for user coded options. The function has the argument type which dictates the type of output generated. The possible options are Summary, US, and SD. Summary generates a data frame containing the geographic distribution for the years 2001 to 2016. US generates a bar plot displaying how total United States enrollment has changed at Williams from 2001 to 2016. SD generates a data frame containing the standard deviation in total United States enrollment for the year blocks 2001 to 2010 and 2011 to 2016. The function can be used as follows

total_dist("Summary")[1:4, ]

Discussion

Before delving deeper into the data by state or region, let’s look at the total number of students from the 50 states, District of Columbia, Guam, and Puerto Rico between 2001 and 2016.

total_dist("US")

A quick glance at the bar plot shows that the total number has not changed much in the past 15 years. What is surprising is how flat the distribution has been over the past six years versus the ten years before that. Quick calculations of the standard deviation for each time period yield an interesting result.

total_dist("SD")

The standard deviation of total United States enrollment for the past six years is almost five times smaller than it was for the preceding ten years. Did admissions become "smarter" about yield calculations sometime near 2011? What other factors could potentially explain this?

Output from the plot_state function shows marked upward and downward trends for different states. California, with only 156 students in 2001, has become a Williams powerhouse with 267 students in 2016. Florida has also seen huge gains, rising from 28 in 2001 to 83 in 2016. Some states have opposite trends. District of Columbia had 26 students at Williams in 2001 and 2002, but only 12 in 2015 and 2016. Massachusetts has also seen decreases, from 341 to 280 from 2001 to 2016.

One especially interesting state is Mississippi. The southern state had an average of 2.4 students at Williams between 2001 and 2010. Not a single student from Mississippi has matriculated since then.

plot_state("Mississippi")

Output from the plot_region function shows marked upward and downward trends for different regions.

plot_region("All")

A quick glance shows decreases in the Northeast and Mid Atlantic Regions, and increases in the West and South Regions. The most notable aspect of the graphic is that the Northeast and West regions have been converging over the past 15 years.

Conclusion

The student geographic distribution data, on both a state and region level, show marked downward and upward trends. This is an important result that has no doubt influenced the social, ideological, and academic environment of the college. A rough statement summarizing the data would be as follows: Between the years of 2001 and 2016, Williams has become increasingly represented by students from the West and South and decreasingly represented by students from the Northeast and Mid Atlantic.

What is more important than this result are the reasons why these trends have occurred. Why has the number of students from Florida increased almost by a factor of three? Why hasn't a single student from Mississippi matriculated to Williams since 2010 after an average of 2.4 matriculated the previous ten years? It is unclear whether these trends are due to internal bias (i.e. changing admissions goals/quotas for students from certain states or areas of the country) or the quality of the students from different areas of the country.

As follows is a quick conceptual model outlining potential reasons why the geographic distribution might change. This model may guide further research in this topic.

Internal Bias - Williams desires to admit students from all 50 states for diversity purposes - Williams desires more/less students from x state or region for y reason

Quality of Students - Williams has become more well known in areas like the West and South over the past 15 years, leading to more and better qualified applications. Thus, more students have matriculated from those areas - The educational quality in state x has changed over time, resulting in more or less students at Williams from state x

Other Reason - A Williams College admissions counselor in charge of state or region x is replaced by a counselor who is better/worse at convincing quality students to apply to Williams, and thus more/less students from state or region x matriculate to Williams

The actual underpinnings for the geographic distribution of the student body are almost certainly a combination of the three described above, and many others. Isolating and compiling data for many of these reasons would be a difficult task.

An interesting data set to compare with the results from the wgd package would be data on the admission rates by state. If the admission rate for a certain state or region remains constant while the number of students from that state increases or decreases, the change is likely attributable to the number of students applying from that area. If the admission rate for a certain state or region trends up or down, then the change is likely due either to the quality of students or to internal bias in Williams admissions. Unfortunately, it is unlikely that Williams would release this data.

Another interesting data set for comparison would be geographic distribution data from a school similar to Williams. Under the premise that students at Williams from the West and South have increased over the past 15 years due to increased awareness or recognition of Williams, it would be interesting to see if a similar trend has occurred at schools with recognition and awareness that has presumably not changed, or at least not as much. Harvard, for example, has always been well known and attractive to students all across the country.

Regardless, the package wgd includes functions that allow easy reading and analysis of geographic distribution data published by Williams College. Depending on the format of the data, this method can be generalized to other institutions that publish data in a similar format. wgd provides a good starting point for analyzing the geographic distribution of the Williams College student body. As mentioned above, outside data sets would need to be pulled in order to analyze the possible underpinnings for the results.



jmoss13148/wgd documentation built on May 19, 2019, 1:54 p.m.