# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
all.logical <- function(var) {
# Checks whether a vector contains only logical constants, perhaps as chars
#
# Args:
# var: a vector of any type
#
# Returns:
# A logical scalar indicating whether the input can be treated as logical
unique.vals <- setdiff(unique(var), NA)
# Non-bolean values will be turned in to NA, stacked at the front
vals.as.logical <- sort(as.logical(unique.vals), na.last = FALSE)
return(identical(vals.as.logical, c(FALSE, TRUE)))
}
trim <- function (x) gsub("^\\s+|\\s+$", "", x)
rename <- function(df, old, new, pos = NULL) {
# Renames R.* variable names to NLS versions with metadata built in
df[, new] <- df[, old]
df[, old] <- NULL;
if (!is.null(pos)) { # This is a swap
current.pos <- which(names(df) == new)
val.at.new.pos <- df[, pos]
name.at.new.pos <- names(df)[pos]
df[, pos] <- df[, new]
names(df)[pos] <- new
df[, current.pos] <- val.at.new.pos
names(df)[current.pos] <- name.at.new.pos
}
return(df)
}
EncodeFactorAsNumeric <- function(var, new.levels, data, verbose = TRUE) {
# TODO: Needs documentation and a quick test
if (class(data[, var]) != "factor") {
stop("This function converts factors only")
}
old.levels <- levels(data[, var])
levels(data[, var]) <- new.levels
data[, var] <- as.numeric(as.character(data[, var]))
if (verbose) {
cat("mapping of", var, "\n")
for (i in 1:length(old.levels)) {
cat("Old level:", old.levels[i], ", New level:", new.levels[i], "\n")
}
}
return(data)
}
GetChunkListFromContents <- function(contents,
separator = paste(rep("-", 80), collapse = "")) {
# Returns a list of distinct text chunks from the NLS codebook
chunks <- list()
chunk <- c()
for (i in 1:length(contents)) {
if(contents[i] != separator){
chunk <- c(chunk, contents[i])
} else {
chunks[[length(chunks) + 1]] <- chunk
chunk <- c()
}
}
return(chunks)
}
CreateNLSdata <- function(codebook, csv.extract,
na.strings = c("-1", "-2", "-3", "-4", "-5")) {
# Converts raw NLS Investigator output to an NLSdata object
#
# Args:
# codebook: the .cdb file from the NLS Investigator export
# csv.extract: the .csv file from the NLS Investigator export
# na.strings: which of the missing value situations to represent as NA
# Returns:
# an NLSdata object containing a data frame and metadata dictionary
write("\n", file = codebook, append = TRUE)
contents <- readLines(codebook)
chunks <- GetChunkListFromContents(contents)
my.df <- read.csv(csv.extract, na.strings = na.strings)
metadata <- list()
i = 0
for (chunk in chunks) {
i = i + 1
cat(" Working on Chunk ", i, "\n")
r.id <- sub("([A-Z][0-9\\.]+).*", "\\1", chunk[1])
r.id <- sub("\\.", "", r.id)
name <- sub(".*\\[(.*)\\].*", "\\1", chunk[1])
year <- sub(".*Survey Year: (\\d{4}|XRND)", "\\1", chunk[1])
summary.index <- 2 + min(grep("[A-Z]", chunk[3:length(chunk)]))
summary <- trim(chunk[summary.index])
# Figuring out values from summary histograms
number.lines <- grep("^\\s*\\d", chunk)
dash.index <- grep("^\\s+-", chunk)
distr.lines <- ""
if ((length(number.lines) > 0) & (length(dash.index) > 0)) {
distr.lines <- chunk[min(number.lines):(max(dash.index) - 1)]
}
distr.lines <- gsub(":", "", distr.lines)
char.map <- sub("^\\s+\\d+\\s+(\\d+)\\s(.*)$", "\\1\t\\2", distr.lines)
char.map <- char.map[grepl("\t", char.map)]
cont.evidence <- (length(grep("\tTO\\s", char.map)) > 0) |
(length(char.map) == 0)
new.name <- gsub("-", "_", paste(name, year, sep = "."))
if (cont.evidence | length(distr.lines) == 0) {
my.df <- rename(my.df, r.id, new.name)
} else {
mapping <- data.frame()
label.map.list <- strsplit(char.map, "\t")
for (k in 1:length(label.map.list)) {
mapping <- rbind(mapping,
data.frame(var1 = as.numeric(label.map.list[[k]][1]),
var2 = label.map.list[[k]][2]))
}
names(mapping) <- c(r.id, new.name)
print(names(mapping))
my.df <- merge(my.df, mapping, all.x = TRUE)
my.df[, r.id] <- NULL
if (all.logical(my.df[, new.name])) {
my.df[, new.name] <- as.logical(my.df[, new.name])
}
}
metadata[[new.name]] <- list(name = new.name, summary = summary,
year = year, r.id = r.id, chunk = chunk)
cat("After chunk ", i, ",", nrow(my.df), "rows remain, ",
ncol(my.df), "cols remain, is numeric:", cont.evidence, "\n")
}
obj <- list(metadata = metadata, data = my.df)
class(obj) <- "NLSdata"
return(obj)
}
summary.NLSdata <- function(object, ...) {
# Prints summary information for NLSdata object
#
# Args:
# object: An NLSdata object
# ...: Not used currently
vars <- names(object$data)
for (var in vars) {
cat("------------------------------------------------------\n")
cat(" ", var, ", measured in", object$metadata[[var]]$year, "\n")
cat(" ", object$metadata[[var]]$summary, "\n")
cat("------------------------------------------------------\n")
print(summary(object$data[, var]))
}
}
KeywordSearch <- function(term, nlsdata) {
#TODO: documentation
summary.vec <- c()
n <- length(nlsdata$metadata)
for (i in 1:n) {
#summary.vec <- c(summary.vec, nlsdata$metadata[[i]]$summary)
text.to.search <- paste(nlsdata$metadata[[i]]$summary,
paste(nlsdata$metadata[[i]]$chunk, collapse = " "))
summary.vec <- c(summary.vec, text.to.search)
}
found <- grep(term, summary.vec, ignore.case = TRUE)
for (i in found) {
cat("*FOUND* : ", nlsdata$metadata[[i]]$name, ", index:", i, "\n")
cat(" SUMMARY: ", nlsdata$metadata[[i]]$summary, "\n\n")
}
}
CreateTimeSeriesDf <- function(obj, variable.base) {
# Converts a repeated element into a long format data frame
#
# Args:
# obj: an NLSdata object
# variable.base: character string common prefix for the repeated element
#
# Returns:
# A data frame in long format
var.vec <- sort(grep(variable.base, names(obj$data), value = TRUE))
year.vec <- as.numeric(sub(".*(\\d{4}.*)", "\\1", var.vec))
var.type <- class(obj$data[, var.vec[1]])
melted.df <- melt(obj$data[, c("PUBID.1997", var.vec)], id = "PUBID.1997")
if (var.type == "logical") {
melted.df[, "value"] <- as.logical(melted.df[, "value"])
}
melted.df$year <- as.numeric(sub(".*(\\d{4}.*)", "\\1", melted.df$variable))
melted.df$variable <- NULL
names(melted.df) <- c("PUBID.1997", variable.base, "year")
return(melted.df)
}
RosterToLongDf <- function(data, roster.base, id = "PUBID.1997") {
# Creates a long data frame from a wide roster variable set
#
# Args:
# data: a data frame
# roster.base: the stable character prefix of the roster variable
# id: character name representing the unit identifier
#
# Returns:
# A data frame in long format with the roster base as the outcome name
roster.vars <- grep(paste0(roster.base, "\\.\\d{2}\\.\\d{4}"),
names(data), value = TRUE)
if (length(roster.vars) == 0){
stop("No rosters. Ensure that roster variables in form [base].dd.dddd")
}
roster.subset <- data[, c(id, roster.vars)]
roster.long <- melt(roster.subset, id.var = id)
roster.long <- roster.long[!is.na(roster.long$value), ]
roster.long$year <- sub(paste0(roster.base, "\\.\\d{2}\\.(\\d{4})"), "\\1",
roster.long$variable)
roster.long[, roster.base] <- roster.long$value
roster.long$variable <- NULL
roster.long$value <- NULL
roster.long <- roster.long[order(roster.long[, id], roster.long$year),
c(id, "year", roster.base)]
return(roster.long)
}
ThrowAwayDataForBalance <- function(data, var.name, id = "PUBID.1997") {
# Achieve Balance over Time Period and Respondent by Crude Subsetting
#
# Args:
# data: a data frame
# var.name: variable to achieve balance with respect to
# id: id variable, also to achieve balance with respect to
#
# Returns: a balanced data data frame
n.df <- aggregate(data[, var.name], by = list(data[[id]]),
FUN = function(x) sum(!is.na(x)))
new.df <- data[data$PUBID.1997 %in% n.df[n.df$x == max(n.df$x), "Group.1"], ]
return(new.df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.