#' Set up iFixit Answers data for model fitting or predictions
#'
#' Used in the fit_model function and to set up the data to be predicted on. This function subsets the
#' data to all questions in English and creates all necessary variables.
#'
#' @param data Answers data frame.
#' @param forpredicting Set to true if this function will be used to set up the variables in the prediction
#' data set, and will not try to set up the time_until_answer variable.
#' Default is set to false. Default of this function is used in the fit_model function for setting up the
#' data set to build the model on. The time_until_answer variable will be set up in this case.
#'
#' @importFrom magrittr "%>%"
#' @importFrom stringr str_detect str_to_lower str_length str_count str_replace_all str_locate
#' @importFrom rebus "%R%" START SPC QUESTION END or1 or
#' @importFrom dplyr filter select
#'
#' @return Returns a data frame to be used in model fitting or predicting.
#'
#' @details Variables created:
#'
#' \itemize{
#' \item new_category: reorganizes category variable (e.g. pulled out Apple products)
#' \item weekday: if the question was posted over the weekend or weekday
#' \item text_length, device_length
#' \item title_questionmark: whether or not the title ends with a "?"
#' \item title_beginwh: whether or not the title begins with "Wh"
#' \item text_all_lower: whether or not the text is in all lower case
#' \item text_contain_punct: whether or not the text contains any end punctuation marks
#' \item update: whether or not the asker updated their question
#' \item prior_effort: whether or not the asker included words in the text that indicated that
#' they made prior effort/did research before asking the question
#' \item newline_ratio: ratio of newlines to the length of the question's text
#' \item avg_tag_length: the average length of all of a question's tags
#' \item avg_tag_score: the score or frequency of a tag is defined as the proportion of times that tag appears
#' in all of the data. avg_tag_score is defined as the average score/frequency of all of a question's tags
#' \item contain_answered: whether or not the question's title contains words considered to be frequent answered terms
#' \item contain_unanswered: whether or not the question's title contains words considered to be frequent unanswered terms
#' }
#'
#' @note If warnings about empty documents are output, they're from the function get_au_terms. This function
#' uses the function get_freq_terms, which turns the input into a document term matrix with weighting = weightTfIdf
#'
#' @examples
#' # setting up the data to build the model on
#' dir <- file.path(getwd(),"data")
#' out <- read.csv(file.path(dir, "answers_data.csv")) # data set without any variables set up
#'
#' model <- fit_model(out) # fit_model calls variable_setup() within
#'
#' # setting up variables in the prediction data
#' newdata <- oshitar::variable_setup(newdata, forpredicting = TRUE)
#'
#' @export
variable_setup <- function(data, forpredicting = FALSE){
x <- data %>%
tibble::as.tibble() %>%
filter(langid == "en")
#----Create time_until_answer---------------------------------
if (forpredicting == FALSE) { # only used on the data to build the model on
x$time_until_answer <- (x$first_answer_date - x$post_date) / 3600
empty <- is.na(x$time_until_answer)
x$time_until_answer[empty] <- (x$download_date[empty] - x$post_date[empty]) / 3600
}
#----Convert factors to characters----------------------------
x <- dplyr::mutate_if(x, is.factor, as.character)
x$new_user <- as.character(x$new_user)
#----Recoding NAs---------------------------------------------
# coding NAs as "Other"
x$category[is.na(x$category)] <- "Other"
x$subcategory[is.na(x$subcategory)] <- "Other"
#----Creates new_category-------------------------------------
x$new_category <- x$category
#----Apple Products---------------------
apple <- str_detect(str_to_lower(x$device),
pattern = or("apple","ipod","ipad", "imac", "mac" %R% SPC,
"macbook", "iphone", "safari", "apple", "ihome"))
x$new_category[apple == TRUE | x$subcategory == "iPhone" | x$category == "Mac"] <- "Apple Product"
#----Phones-----------------------------
phonetf <- str_detect(str_to_lower(x$device), pattern = or("blu", "blackberry", "android",
"alcatel", "htc", "huawei", "nokia",
"micromax", "moto", "kyocera", "zte",
"samsung" %R% SPC %R% "galaxy", "xiaomi"))
x$new_category[x$new_category == "Phone" | phonetf == TRUE] <- "Android/Other Phone"
#----PC---------------------------------
pc <- str_detect(str_to_lower(x$device),
pattern = or("acer", "lenovo", "inspiron", "asus", "laptop", "chromebook", "pavilion",
"satellite", "nextbook"))
x$new_category[pc == TRUE] <- "PC"
#----Home-------------------------------
kenmore <- str_detect(str_to_lower(x$device),
pattern = or("whirlpool", "kenmore", "dyson", "frigidaire", "ge" %R% SPC))
x$new_category[x$new_category == "Appliance" | x$new_category == "Household" | kenmore == TRUE] <- "Home"
#----Vehicle----------------------------
cars <- str_detect(str_to_lower(x$device), pattern = or("chevrolet", "toyota", "jeep", "camry", "ford", "honda", "golfcart", "chevy",
"pontiac", "mercury", "audi", "dodge", "nissan", "acura", "cadillac", "silverado",
"mazda", "gas scooter"))
x$new_category[x$new_category == "Car and Truck" | x$new_category == "Vehicle" | cars == TRUE] <- "Vehicle"
#----Tablet-----------------------------
tablet <- str_detect(str_to_lower(x$device), pattern = "tablet")
x$new_category[tablet == TRUE] <- "Tablet"
#----Camera-----------------------------
camera <- str_detect(str_to_lower(x$device), pattern = or("canon", "nikon", "coolpix", "polaroid", "fujifilm", "camera"))
x$new_category[camera == TRUE] <- "Camera"
#---------------------------------------
# grouping smaller categories with other
x$new_category <- forcats::fct_lump(as.factor(x$new_category), prop = 0.02)
x$new_category <- as.character(x$new_category)
#----weekday/weekend------------------------------------------
datetime <- as.POSIXct(x$post_date,origin="1970-01-01")
weekday <- factor(weekdays(datetime), levels = c("Monday", "Tuesday", "Wednesday",
"Thursday", "Friday", "Saturday", "Sunday"))
x$day <- NA
x$day[stringr::str_detect(weekday,
pattern = rebus::or("Monday", "Tuesday", "Wednesday",
"Thursday", "Friday")) == TRUE] <- "Weekday"
x$day[is.na(x$day)] <- "Weekend"
#----text length----------------------------------------------
x$text_length <- str_length(x$text)
#----device_length--------------------------------------------
x$device_length <- str_length(x$device)
#----if the title ends with a "?"-----------------------------
x$title_questionmark <- str_detect(x$title, pattern = QUESTION %R% END)
#----if text is in all lower case-----------------------------
cleaned <- str_replace_all(x$text, " ", "")
cleaned <- str_replace_all(cleaned, "[[:punct:]]|[[:digit:]]", "")
x$text_all_lower <- str_detect(cleaned, pattern = "^[[:lower:]]+$")
#----if text contains any end punct---------------------------
x$text_contain_punct <- str_detect(x$text, pattern = "[.|?|!]")
#----if user updated question---------------------------------
x$update <- str_detect(x$text, pattern = "===")
#----if user showed any prior effort--------------------------
x$prior_effort <- str_detect(str_to_lower(x$text),
pattern = or("tried", "searched", "researched", "tested",
"replaced", "used", "checked", "investigated",
"considered", "measured", "attempted", "inspected", "fitted"))
#----newline ratio to length of text--------------------------
x$newline_ratio <- str_count(x$text, pattern = "\n")/str_length(x$text)
#----average tag length---------------------------------------
taglist <- oshitar::tag_frequency(x$tags)
split_tags <- taglist[[1]]
tag_freq <- taglist[[2]]
x$avg_tag_length <- 0
for (j in which(x$n_tags != 0)) {
x$avg_tag_length[j] <- sum(str_length(as.vector(split_tags[j,]))) / sum(as.vector(split_tags[j,]) != "")
}
#----freqency of tags-----------------------------------------
score1 <- oshitar::assign_tag_freq(split_tags[,1], tag_freq)
score2 <- oshitar::assign_tag_freq(split_tags[,2], tag_freq)
score3 <- oshitar::assign_tag_freq(split_tags[,3], tag_freq)
score4 <- oshitar::assign_tag_freq(split_tags[,4], tag_freq)
x$avg_tag_score <- 0
hastag <- which(x$n_tags != 0)
x$avg_tag_score[hastag] <- (score1[hastag] + score2[hastag] + score3[hastag] + score4[hastag]) / as.numeric(x$n_tags[hastag])
#----frequent terms in answered/unanswered questions' titles--
remove <- c(unique(as.character(x$category)), unique(as.character(x$subcategory)),
unique(as.character(x$new_category)), "macbook", "imac", "ipod")
au_terms <- oshitar::get_au_terms(x, "title", stopwords = c("can", "will", "cant", "wont",
"works", "get", "help", "need",
"fix", "doesnt", "dont"), remove = remove)
p <- 0.01
r <- 1
answeredTerms <- au_terms %>%
filter(prop_in_answered > p) %>% filter(ratio > r)
unansweredTerms <- au_terms %>%
filter(prop_in_unanswered > p) %>% filter(ratio < r)
x$contain_answered <- str_detect(str_to_lower(as.character(x$title)), pattern = or1(answeredTerms$word))
x$contain_unanswered <- str_detect(str_to_lower(as.character(x$title)), pattern = or1(unansweredTerms$word))
return(x)
}
#-------------------------------------------------------------------------------
#' Fit cox regression model
#'
#' This function sets up variables in the input data using the variable_setup function,
#' and then fits and returns the model.
#'
#' @param data The data to use in fitting the model.
#' @param summary If true, this function will print a summary of the model (e.g. statistics, coefficients).
#' Default is set to false.
#'
#' @importFrom rms cph strat pol rcs
#'
#' @return Returns the cox regression model with 19 independent predictors. See the documentation on the
#' variable_setup function for details on what each predictor represents.
#'
#' @details
#' \itemize{
#' \item this model is stratified on ampm
#' \item contains a quadratic term on text_length
#' \item restricted cubic splines on device_length (5 knots), avg_tag_length (4 knots), and newline_ratio (4 knots)
#' }
#'
#' @examples
#' dir <- file.path(getwd(),"data")
#' out <- read.csv(file.path(dir, "answers_data.csv")) # data set without any variables set up
#' model <- fit_model(out)
#' model # calling model by itself will also print out a summary
#'
#' @export
fit_model <- function(data, summary = FALSE) {
data <- oshitar::variable_setup(data, forpredicting = FALSE)
fit <- rms::cph(Surv(time_until_answer, answered) ~ new_category + new_user +
contain_unanswered + contain_answered + title_questionmark +
text_contain_punct + text_all_lower + update + prior_effort +
day + sqrt(avg_tag_score) + rcs(log10(text_length), 5) +
rcs(log10(avg_tag_length + 1), 4) + rcs(log10(device_length + 1), 5) +
rcs(sqrt(newline_ratio), 3),
data = data, x = TRUE, y = TRUE, surv = TRUE)
if (summary == TRUE) {
print(fit)
}
return(fit)
}
#-------------------------------------------------------------------------------
#' Predict failure probabilities for questions on Answers
#'
#' This function uses a fitted cox regression model to predict failure probabilities with the
#' survest function from the rms package. Failure is defined as 1 - survival probability, and indicates the
#' probability that an event does happen before a certain time. In this case, the failure probability at time t
#' for a question is the probability that a question receives an answer before time t.
#'
#' @param model The cox regression model to use for predictions (output from the fit_model function). This function
#' only works with cph fits, not coxph fits.
#' @param newdata Optional, new data from which to get predictions for. If this is omitted, this function
#' will output predictions for all subjects linear predictor and strata values at the default/user-specified times.
#' @param times Vector of times at which to predict on. If omitted, this function will return predictions at
#' 0.5, 3, 10, 24, 100, 1000 hours.
#'
#' @return Returns a data frame of predicted failure probabilities. The columns are the times predicted on, the
#' rows correspond to each question in the data.
#'
#' @examples
#' # importing data
#' dir <- file.path(getwd(),"data")
#' out <- read.csv(file.path(dir, "answers_data.csv")) # data set without any variables set up
#'
#' # fitting model
#' model <- fit_model(out)
#'
#' # setting up variables in the prediction data
#' data_for_predicting <- variable_setup(newdata, forpredicting = TRUE)
#'
#' predictions <- predict_failure(model, newdata = data_for_predicting)
#'
#'
#' @export
predict_failure <- function(model, newdata = NULL, times = c(0.5, 3, 10, 24, 100, 1000)) {
if (!is.null(newdata)) {
pr <- data.frame(1 - rms::survest(model, as.data.frame(newdata), times = times, conf.int = FALSE)$surv)
} else {
pr <- data.frame(1 - rms::survest(model, times = times, conf.int = FALSE)$surv)
}
colnames(pr) <- times
return(pr)
}
#-------------------------------------------------------------------------------
#' Plot failure probability predictions for a question
#'
#' This function uses a fitted cox regression model to predict and plot failure probabilities
#' for a single question. Predictions are made at each minute, starting at 0 hours,
#' and ending at the maximum time of the data the cox model was fit on. This function then
#' plots those predictions against the corresponding time with ggplot.
#'
#' @param model The cox regression model to use in predicting failure (output from the fit_model function).
#' This function only works with cph fits.
#' @param question The question to predict failure probabilities on (one row of Answers data set).
#'
#' @importFrom dplyr "%>%" filter
#' @importFrom ggplot2 aes geom_point scale_x_continuous scale_y_continuous margin
#' expand_limits theme element_text geom_segment labs
#'
#' @return Returns a ggplot of the failure experience for the input question.
#' The plot indicates when the question is predicted to reach it's maximum
#' failure probabiity with red reference lines.
#'
#' @examples
#' # importing data
#' dir <- file.path(getwd(),"data")
#' out <- read.csv(file.path(dir, "answers_data.csv")) # data set without any variables set up
#'
#' # fitting model
#' model <- fit_model(out)
#'
#' # setting up data to predict on
#' newdata <- variable_setup(newdata)
#' question <- newdata[1,]
#'
#' plot_failure(model, question)
#'
#' @export
plot_failure <- function(model, question) {
predict <- rms::survest(model, newdata = as.data.frame(question),
times = seq(0, model$maxtime, by = 1), conf.int = FALSE)
df <- data.frame(time = predict$time, pr = 1 - predict$surv) # creating a data frame of times & predictions
df <- df %>% filter(!is.na(pr)) # removing any missing predictions
max <- max(df$pr) # maximum probability and time at which it occurs
hr <- which(df$pr == max(df$pr))[1]
ggplot(df, aes(x = time, y = pr)) +
geom_point(size = 0.2) +
scale_x_continuous("Hours since the question was posted", expand = c(0.01, 0),
breaks = seq(0, nrow(df), by = 150)) +
scale_y_continuous("Predicted Failure Probability", expand = c(0.01, 0),
breaks = seq(0, 1, by = 0.1)) +
expand_limits(y = 1, x = 0) +
theme(axis.title.x = element_text(margin = margin(t = 10))) +
geom_segment(aes(x = 0, xend = hr, y = max, yend = max),
col = "red", size = 0.25) +
geom_segment(aes(x = hr, xend = hr, y = 0, yend = max),
col = "red", size = 0.25) +
labs(title = paste("Predicted failure probabilities for question", question$id),
subtitle = paste("The red lines indicate the highest failure probability the question reaches.",
"\n", "Maximum failure probability: ", round(max,2), " at ", hr, " hours", sep = ""))
}
#-------------------------------------------------------------------------------
#' Tag Frequency
#'
#' Used in exploratory_setup() and variable_setup() to create the tag-based variables.
#' This function takes in vector of tags, and separates that vector into a matrix where each column
#' represents an individual tag. Unique tags are pulled from the matrix, and the number of times
#' each unique tags occur in the entire matrix is summed to get the frequency.
#'
#' @param tags vector of tags
#'
#' @importFrom dplyr "%>%" arrange desc
#'
#' @return Returns a list. In the first position is a matrix, where each row represents
#' a question and each column represents the question's tags. A question that has no tags has empty strings in each
#' column. In the second position is a data frame containing each unique tag found within
#' the data, along with the percent/proportion of times it occurs in the data.
#'
#' @examples
#' taglist <- tag_frequency(data$tags)
#' matrix_of_tags <- taglist[[1]]
#' tag_frequency_df <- taglist[[2]]
#'
#' @export
tag_frequency <- function(tags) {
split_tags <- stringr::str_split(tags, ", ", simplify = TRUE)
split_tags[is.na(split_tags)] <- ""
unique_tags <- unique(as.vector(split_tags[split_tags != ""]))
tag_freq <- data.frame(tag = unique_tags,
percent = purrr::map_dbl(unique_tags, ~mean(rowSums(split_tags == .) > 0))) %>%
arrange(desc(percent))
return(list(split_tags, tag_freq))
}
#-------------------------------------------------------------------------------
#' Assign tag frequency
#'
#' Used in exploratory_setup() and variable_setup() for creating the tag-based variables.
#' This function takes each tag in the input vector, and matches it to the corresponding frequency
#' in the input data frame. Arguments are from the output of the tag_frequency function.
#'
#' @param tag vector of tags
#' @param tagdf data frame of tags and tag frequencies (output from tag_frequency function)
#'
#' @return Returns a numeric vector corresponding to the frequency of each tag in the input vector. Questions with
#' no tags are assigned a frequency of 0.
#'
#' @export
assign_tag_freq <- function(tag, tagdf) {
score <- rep(0, length(tag))
for (j in which(tag!= "")) {
score[j] <- tagdf$percent[which(tagdf$tag == tag[j])]
}
return(score)
}
#-------------------------------------------------------------------------------
#' Get most frequent terms in the titles of answered/unanswered questions
#'
#' This function is used in variable setup functions for the cox regression model. It separates the full data
#' into data frames of answered and unanswered questions. It then uses the get_freq_terms function from this
#' package to get data frames of the most commonly used words in the user-specified text variable of
#' answered and unanswered questions (the fitted model used question titles). The resulting data frames are then
#' joined by word.
#'
#' @param data The full data set
#' @param variable The variable to get the frequent terms from. In the model, question titles were used.
#' Argument should be input as a string.
#' @param stopwords Optional, add stopwords to remove. Argument should be input in the form of a string or character vector.
#' For the model, "can", "will", "cant", "wont", "works", "get", "help", "need", "fix", "doesnt", "dont" were removed.
#' @param remove Optional, add words to remove from the resulting data frame. Argument should be input in the form of
#' a string or character vector. For the model, words that matched with any of the category, subcategory, or new_category
#' levels were removed.
#'
#' @importFrom dplyr "%>%" filter
#'
#' @return Returns a data frame of words from the input text variable, along with the frequency each word
#' occurs in all of the data, as well as in answered and unanswered questions, and a ratio calculated as: frequency
#' in answered divided by frequency in unanswered. The resulting data frame is used in exploratory_setup and
#' variable_setup functions for the contain_answered and contain_unanswered variables.
#'
#' @examples
#' words <- c("can", "will", "cant", "wont", "works", "get", "help", "need", "fix", "doesnt", "dont")
#' devices <- c("iphone", "macbook", "imac", "ipad")
#' get_au_terms(data = x, variable = "title", stopwords = words, remove = devices)
#'
#' @export
get_au_terms <- function(data, variable, stopwords = NULL, remove = NULL) {
answered <- data %>% dplyr::filter(answered == 1) %>% select(variable) # subsets data into answered/unanswered questions
unanswered <- data %>% dplyr::filter(answered == 0) %>% select(variable)
if (is.null(stopwords)) { # gets data frame of frequent terms for answered and unanswered Qs
a_terms <- oshitar::get_freq_terms(as.character(answered[[variable]]))
u_terms <- oshitar::get_freq_terms(as.character(unanswered[[variable]]))
} else {
a_terms <- oshitar::get_freq_terms(as.character(answered[[variable]]), stopwords)
u_terms <- oshitar::get_freq_terms(as.character(unanswered[[variable]]), stopwords)
}
colnames(a_terms)[2:3] <- c("frequency_a", "prop_in_answered")
colnames(u_terms)[2:3] <- c("frequency_u", "prop_in_unanswered")
a_terms <- dplyr::mutate_if(a_terms, is.factor, as.character)
u_terms <- dplyr::mutate_if(u_terms, is.factor, as.character)
combined <- dplyr::full_join(a_terms, u_terms, by = "word")
combined$ratio <- combined$prop_in_answered / combined$prop_in_unanswered
if (!is.null(remove)) {
terms <- glue::collapse(stringr::str_to_lower(remove), sep = " ")
combined$delete <- purrr::map_dbl(combined$word, ~stringr::str_detect(terms, pattern = SPC %R% . %R% SPC))
combined <- combined %>% filter(delete == 0)
}
return(combined)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.