# program elements --------------------------------------------------------
.decode_program_element <-
function(x = "0603502N") {
dod_program <-
x %>% substr(1,2)
rd_category <-
x %>% substr(3,4)
if (nchar(x) >= 5) {
equip_activity <-
x %>% substr(5,5)
} else {
equip_activity <- NA
}
if (nchar(x) >= 7) {
serial_no <-
x %>% substr(6,7)
} else {
serial_no <- NA
}
if (nchar(x) >= 8) {
service <- x %>% substr(8, nchar(x))
} else {
service <- NA
}
tibble(
codeProgramElement = x,
idDODProgram = dod_program,
idRDCategory = rd_category,
idEquipmentCategory = equip_activity,
idSerial = serial_no,
codeService = service
)
}
### http://acqnotes.com/acqnote/acquisitions/program-element-pe
# utils -------------------------------------------------------------------
.fix_dtic_dates <-
function(data) {
date_names <- data %>% select(matches("date")) %>% names()
if (length(date_names) == 0) {
return(data)
}
data %>%
mutate_at(date_names,
list(function(x){
glue("{x}-01") %>% as.character() %>% ymd()
}))
}
.fix_dtic_amount <-
function(data) {
amount_names <- data %>% select(matches("amount")) %>% names()
if (length(date_names) == 0) {
return(data)
}
data %>%
mutate_at(amount_names,
list(function(x){
parse_number(x) * 1000000
}))
}
.munge_dtic_names <-
function(data) {
dict_names <- .dictionary_dtic_names()
grant_names <-
names(data)
actual_names <-
grant_names %>%
map_chr(function(name) {
df_row <-
dict_names %>% filter(nameDTIC == name)
if (nrow(df_row) == 0) {
glue::glue("Missing {name}") %>% message()
return(name)
}
df_row$nameActual
})
data %>%
set_names(actual_names)
}
.dictionary_dtic_names <-
function() {
tibble(
nameDTIC = c(
"recordNumber",
"title",
"pdfUrl",
"xmlUrl",
"snippet",
"orgName",
"budgetActivity",
"programElementNum",
"programElementTitle",
"appropriationNum",
"cache",
"fiscalYear",
"orgCode",
"fileName",
"cacheUrl",
"cyear",
"pyear",
"docTypeIndexValue",
"orgUrlPart",
"AppropriationNumber",
"AppropriationTitle",
"BudgetActivityNumber",
"BudgetActivityTitle",
"BudgetCycle",
"BudgetSubActivityNumber",
"BudgetSubActivityTitle",
"BudgetYear",
"Code",
"Description",
"Justification",
"LineItemNumber",
"LineItemTitle",
"P1LineNumber",
"ServiceAgencyName",
"SubmissionDate",
"urlXML",
"p40AItemScheduleSetting",
"ProgramElementNumber",
"AppropriationCode",
"AppropriationName",
"ProgramElementMissionDescription",
"ProgramElementNote",
"ProgramElementTitle",
"R1LineNumber",
"codeblistProgramElementNumber",
"documentassemblyoptionsp40AItemScheduleSetting",
"changesummarySummaryExplanation",
"amountCost",
"amountYearCurrentTotal",
"amountYearFiveTotal",
"amountYearFourTotal",
"amountYearOneBase",
"amountYearOneTotal",
"amountYearPriorTotal",
"amountYearsPriorAllTotal",
"amountYearThreeTotal",
"amountYearTwoTotal",
"otherrelatedlistProgramElementNumber",
"R1LineItemNumber",
"CostType",
"documentassemblyoptionssuppressP40As",
"amountYearOneOCO"
),
nameActual = c(
"numberRecord",
"nameBudgetLineItem",
"urlPDF",
"urlXML",
"slugSnippet",
"nameOrganization",
"nameBudgetActivity",
"codeProgramElement",
"nameProgramElement",
"slugAppropriation",
"slugCache",
"yearBudget",
"slugOrganization",
"nameFile",
"urlCache",
"amountItem",
"amountItemPriorYear",
"nameDODBudgetGroup",
"slugOrganizationFile",
"codeAppropriation",
"nameAppropriation",
"slugBudgetActivity",
"nameBudgetActivity",
"slugBudgetCycle",
"slugBudgetSubActivity",
"nameBudgetSubActivity",
"yearBudget",
"codeBudget",
"descriptionAppropriation",
"descriptionJustification",
"codeBudgetLineItem",
"nameBudgetLineItem",
"slugP1LineNumber",
"nameAgency",
"dateSubmission",
"urlXML",
"slugp40AItemScheduleSetting",
"codeProgramElement",
"codeAppropriation",
"nameAppropriation",
"descriptionProgramElement",
"noteProgramElement",
"titleProgramElement",
"numberLineR1",
"codeProgramElement",
"typeP40AItemScheduleSetting",
"descriptionChanges",
"amountCost",
"amountYearCurrentTotal",
"amountYearFiveTotal",
"amountYearFourTotal",
"amountYearOneBase",
"amountYearOneTotal",
"amountYearPriorTotal",
"amountYearsPriorAllTotal",
"amountYearThreeTotal",
"amountYearTwoTotal",
"codeProgramElementRelated",
"numberLineR1",
"typeCost",
"removeSuppressP40",
"amountYearOneOCO"
)
)
}
# communities_of_interest -------------------------------------------------
### https://defenseinnovationmarketplace.dtic.mil/communities-of-interest/
.dtic_coi <-
function() {
page <- read_html("https://defenseinnovationmarketplace.dtic.mil/communities-of-interest/")
communities <- page %>% html_nodes("h5") %>% html_text() %>% str_squish()
descriptions <-
page %>% html_nodes(".textwidget") %>% html_text() %>% str_squish()
descriptions <- descriptions[2:length(descriptions)] %>%
discard(function(x) {
x %in% c(
"",
"Enter Search Term(s): //< ![CDATA[ var usasearch_config = { siteHandle:\"chieftechnologist\" }; var script = document.createElement(\"script\"); script.type = \"text/javascript\"; script.src = \"//search.usa.gov/javascripts/remote.loader.js\"; document.getElementsByTagName(\"head\")[0].appendChild(script); //]]>"
)
})
descriptions <-
descriptions %>% str_remove_all(glue("^{communities}") %>% str_c(collapse = "|")) %>%
str_remove_all("^COUNTER-IED|^COUNTER-WMD|^ENGINEERED RESILIENT SYSTEMS") %>%
str_remove_all("ALUMNI COI|S&T|EMS") %>%
str_replace_all("\\)|\\(", "") %>%
str_squish()
descriptions <-
case_when(descriptions == "Webpage is under construction." ~ NA_character_,
TRUE ~ descriptions)
data <-
tibble(nameCommunityOfInterest = communities,
descriptionCommunityOfInterest = descriptions)
links <- page %>%
html_nodes("a") %>%
html_attr("href")
links <-
links[links %>% str_detect("https://defenseinnovationmarketplace.dtic.mil/communities-of-interest/")] %>%
unique() %>%
discard(list(function(x){
is.na(x)
}))
data <-
data %>%
mutate(isAlumni = nameCommunityOfInterest %>% str_detect("ALUMNI")) %>%
separate(
nameCommunityOfInterest,
into = c("nameCommunityOfInterest",
"acronymCommunityOfInterest"),
sep = "\\(",
extra = "merge",
fill = "right"
) %>%
select(-acronymCommunityOfInterest) %>%
select(isAlumni, everything()) %>%
mutate(hasDescription = !is.na(descriptionCommunityOfInterest))
df_links <- tibble(urlCOI = links) %>%
mutate(
slug = urlCOI %>% str_remove_all(
"https://defenseinnovationmarketplace.dtic.mil/communities-of-interest/"
)
) %>%
filter(slug != "") %>%
mutate(slug = slug %>% str_remove_all("/$"))
data <-
data %>%
mutate_if(is.character, str_squish)
data <- data %>%
mutate(
slug = case_when(
nameCommunityOfInterest == "ADVANCED ELECTRONICS" ~ "advanced-electronics",
nameCommunityOfInterest == "AIR PLATFORMS" ~ "air-platforms",
nameCommunityOfInterest == "AUTONOMY" ~ "autonomy",
nameCommunityOfInterest == "BIOMEDICAL ASBREM" ~ "biomedical-asbrem",
nameCommunityOfInterest == "C4I" ~ "c4i_coi",
nameCommunityOfInterest == "COUNTER-IED" ~ "counter-improvised-explosive-devices-c-ied",
nameCommunityOfInterest == "COUNTER-WMD" ~ "counter-weapons-of-mass-destruction-c-wmd",
nameCommunityOfInterest == "CYBER" ~ "cyber",
nameCommunityOfInterest == "ELECTRONIC WARFARE" ~ "electronic-warfare",
nameCommunityOfInterest == "ENERGY AND POWER TECHNOLOGIES" ~ "energy-and-power-ep-technologies",
nameCommunityOfInterest == "ENGINEERED RESILIENT SYSTEMS" ~ "engineered-resilient-systems",
nameCommunityOfInterest == "GROUND AND SEA PLATFORMS" ~ "ground-and-sea-platforms-gsp",
nameCommunityOfInterest == "HUMAN SYSTEMS" ~ "human-systems",
nameCommunityOfInterest == "MATERIALS AND MANUFACTURING PROCESSES" ~ "materials-manufacturing-processes-mmp",
nameCommunityOfInterest == "SENSORS" ~ "sensors",
nameCommunityOfInterest == "SPACE" ~ "space"
)
)
data <- data %>%
left_join(df_links, by = "slug") %>%
select(-slug)
data
}
.parse_coi_page <-
function(url = "https://defenseinnovationmarketplace.dtic.mil/communities-of-interest/air-platforms/") {
page <- read_html(url)
text <- page %>% html_nodes('.sow-accordion-title, p, #content li') %>% html_text() %>% str_squish()
text <- text[!text %>% str_detect("CDATA")]
data <- tibble(urlCOI = url, textCOI = text)
has_links <- page %>% html_nodes("#content a") %>% length() > 0
data <-
data %>%
filter(textCOI != "")
if (has_links) {
nameFile <-
page %>% html_nodes("#content a") %>% html_text()
urlFile <-
page %>% html_nodes("#content a") %>% html_attr("href") %>%
str_c("https://defenseinnovationmarketplace.dtic.mil",.)
data <-
data %>%
filter(!textCOI %in% c(nameFile, "Communities of Interest")) %>%
filter(!textCOI %>% str_detect("PDF"))
data <- data %>%
filter(!textCOI %in% c(nameFile, "Communities of Interest")) %>%
filter(!textCOI %>% str_detect("PDF")) %>%
group_by(urlCOI) %>%
summarise(textCOI = textCOI %>% str_c(collapse = "\n")) %>%
ungroup()
df_urls <- tibble(nameFile, urlFile)
data <- data %>%
mutate(dataResearch = list(df_urls))
} else {
data <-
data %>%
filter(!textCOI %in% c(nameFile, "Communities of Interest")) %>%
filter(!textCOI %>% str_detect("PDF")) %>%
group_by(urlCOI) %>%
summarise(textCOI = textCOI %>% str_c(collapse = "\n")) %>%
ungroup()
}
data
}
#' Department of Defense Communities of Interest
#'
#' Returns DTIC hosted information about the Defense Innovation Marketplace
#'
#' @param return_message if \code{TRUE} returns information about link parsing
#'
#' @return
#' @export
#'
#' @examples
dtic_communities_of_interest <-
function(snake_names = T,
return_message = T) {
options(warn = - 1)
data <- .dtic_coi()
urls <-
data %>% filter(!is.na(urlCOI)) %>%
pull(urlCOI)
df_text <- urls %>%
map_dfr(function(x) {
if (return_message) {
glue("Parsing {x}") %>% message()
}
.parse_coi_page(url = x)
})
df_text <-
df_text %>%
mutate(hasResearchLinks = dataResearch %>% map_dbl(length) > 0)
data <-
data %>%
left_join(df_text, by = "urlCOI") %>%
mutate(hasResearchLinks = hasResearchLinks %>% coalesce(F))
data <-
data %>%
munge_data(snake_names = snake_names)
data
}
#' DTIC Community of Interest Taxonomy
#'
#' Acquires and OCRs information about
#' the DTIC Communities of interest
#'
#' @return tibble
#' \url{https://defenseinnovationmarketplace.dtic.mil/communities-of-interest/}
#' @export
#' @family dtic
#'
#' @examples
#' dtic_communities_of_interest_taxonomy()
dtic_communities_of_interest_taxonomy <-
function(snake_names = T) {
urlDTIC <- "https://defenseinnovationmarketplace.dtic.mil/wp-content/uploads/2018/02/COI_Tier1_Taxonomy_7March2016.pdf"
df_metadata <- pdftools::pdf_info(pdf = urlDTIC) %>% flatten_df()
df_metadata <- df_metadata %>%
mutate_if(is.character,
list(function(x) {
case_when(x == "" ~ NA_character_,
TRUE ~ x)
})) %>%
.remove_na()
text <-
pdftools::pdf_text(pdf = urlDTIC) %>%
str_split("\n") %>%
flatten_chr() %>%
str_squish() %>%
discard(function(x) {
x == ""
}) %>%
discard(function(x) {
x %>% str_detect("Distribution Statement")
})
data <-
tibble(text) %>%
mutate(row = 1:n())
df_parents <- data %>%
filter(text %>% str_detect("^[0-9][.]|^[0-9][0-9][.]")) %>%
rename(coi = text) %>%
separate(
coi,
into = c("numberCommunityOfInterest", "nameCommunityOfInterest"),
sep = "\\. "
) %>%
mutate(
numberCommunityOfInterest = as.numeric(numberCommunityOfInterest),
nameCommunityOfInterest =
case_when(
nameCommunityOfInterest == "Energy and Power (E&P) Technologies" ~ "Energy and Power Technologies (E&P)",
TRUE ~ nameCommunityOfInterest
)
)
data <-
data %>%
left_join(df_parents, by = "row") %>%
fill(numberCommunityOfInterest,
nameCommunityOfInterest) %>%
filter(!text %>% str_detect("^[0-9][.]|^[0-9][0-9][.]")) %>%
filter(!is.na(numberCommunityOfInterest)) %>%
separate(
nameCommunityOfInterest,
into = c("nameCommunityOfInterest", "acronymnCommunityOfInterest"),
sep = "\\(",
extra = 'merge',
fill = 'right'
) %>%
mutate_if(is.character, list(function(x) {
x %>% str_squish() %>% str_remove_all("\\)") %>% str_to_upper()
}))
data <- data %>%
group_by(numberCommunityOfInterest,
nameCommunityOfInterest,
acronymnCommunityOfInterest) %>%
summarise(textCOI = text %>% str_c(collapse = "\n")) %>%
ungroup()
data <-
data %>%
select(one_of(names(df_parents %>% select(-row))), everything()) %>%
mutate(urlDTIC)
df_metadata <-
df_metadata %>%
mutate(urlDTIC)
data <-
data %>%
left_join(df_metadata, by = "urlDTIC") %>%
select(one_of(names(df_metadata)), everything()) %>%
munge_data(snake_names = snake_names)
data
}
# entities ----------------------------------------------------------------
.parse_name_entity <-
function(text) {
parts <-
text %>% str_split("\\(|\\)|\\*|\\ ") %>% flatten_chr() %>% discard(function(x) {
x == ""
})
data <-
tibble(word = parts) %>%
mutate(idWord = 1:n())
entity <-
data %>%
dplyr::slice(1:(nrow(data) - 2)) %>%
pull(word) %>%
str_c(collapse = " ")
loc <- (nrow(data) - 1):nrow(data)
location <- data %>%
dplyr::slice(loc) %>%
pull(word) %>%
str_c(collapse = ", ")
tibble(nameEntity = entity, locationEntity = location)
}
#' DTIC Entities
#'
#' List of DTIC contributing
#' entities and their organizational
#' structure.
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
#' dtic_entites()
dtic_entites <-
function() {
data <-
read_delim(
"https://discover.dtic.mil/wp-content/uploads/2019/09/CS/corporate.txt",
delim = "\\: ",
col_names = F
)
data <-
data %>% dplyr::slice(5:nrow(data)) %>%
filter(X1 != "------ ") %>%
separate(
X1,
"\\: ",
into = c("X1", "X2", "X3", "X4", "X5"),
fill = "right",
extra = "merge"
) %>%
mutate_all(str_squish) %>%
setNames(
c(
"idSource",
"nameEntity",
"nameParentMaster",
"nameParents",
"nameSuborganizations"
)
) %>%
mutate_all(list(function(x) {
ifelse(x == "", NA_character_, x)
})) %>%
mutate(idSource = as.numeric(idSource))
df_suborginzations <-
data %>%
select(idSource, nameSuborganizations) %>%
separate_rows(nameSuborganizations, sep = "\\|") %>%
mutate(suborganizationsEntity = nameSuborganizations %>% str_squish()) %>%
mutate_all(list(function(x) {
ifelse(x == "", NA_character_, x)
})) %>%
filter(!is.na(nameSuborganizations)) %>%
group_by(idSource) %>%
summarise(
countSubsidiaries = n(),
nameSuborganizations = nameSuborganizations %>% str_c(collapse = "|")
) %>%
ungroup()
data <-
data %>%
select(-nameSuborganizations) %>%
left_join(df_suborginzations, by = "idSource") %>%
mutate(countSubsidiaries = ifelse(is.na(countSubsidiaries), 0 , countSubsidiaries))
rm(df_suborginzations)
df_parents <-
data %>%
filter(!is.na(nameParents)) %>%
select(idSource, nameParents) %>%
separate_rows(nameParents, sep = "\\|") %>%
mutate_all(list(function(x) {
ifelse(x == "", NA_character_, x)
})) %>%
filter(!is.na(nameParents)) %>%
group_by(idSource) %>%
summarise(countParents = n(),
nameParents = nameParents %>% str_c(collapse = "|")) %>%
ungroup() %>%
mutate_if(is.numeric, list(function(x) {
ifelse(is.na(x), 0, x)
}))
data <- data %>%
select(-nameParents) %>%
left_join(df_parents, by = "idSource") %>%
select(one_of(names(df_parents)), everything()) %>%
select(idSource, nameEntity, everything())
data
}
#' DTIC Keyword Thesaurus
#'
#' Returns all the terms
#' in the DTIC thesaurus and their
#' related parents, military uses cases
#' and closely related words
#'
#' @return \code{tibble()}
#' @export
#'
#' @examples
#' dtic_thesaurus()
dtic_thesaurus <-
memoise::memoise(function() {
values <-
"https://discover.dtic.mil/wp-content/uploads/thesaurus/thesaurus.txt" %>% read_lines()
values <-
values %>% discard(function(x) {
x == ""
})
data <-
tibble(values) %>%
mutate(idRow = 1:n()) %>%
mutate(isSubField = values %>% str_detect(" ")) %>%
select(idRow, everything())
df_parents <-
data %>% filter(!isSubField) %>%
rename(parent = idRow) %>%
select(-isSubField) %>%
rename(termParent = values)
data <-
data %>%
filter(isSubField) %>%
mutate(parent = idRow - 1) %>%
rename(typeTerm = values) %>%
mutate(typeTerm = str_trim(typeTerm)) %>%
separate(
typeTerm,
into = c("slugDescriptor", "nameTerm"),
fill = "right",
extra = "merge",
sep = "\\ "
) %>%
left_join(df_parents, by = "parent") %>%
fill(termParent) %>%
select(termParent, slugDescriptor, nameTerm)
data <- data %>%
mutate(
nameDescriptor = case_when(
slugDescriptor == "UF" ~ "relatedUsedFor",
slugDescriptor == "NT" ~ "relatedTermNear",
slugDescriptor == "BT" ~ "relatedBroadTerm",
slugDescriptor == "RT" ~ "relatedTerm",
)
) %>%
select(termParent, nameDescriptor, slugDescriptor, everything())
data
})
# search_engine -----------------------------------------------------------
.generate_dtic_search <- function(term = 'China', quote = T) {
t <- term
if (quote) {
term <- glue('"{term}"')
}
slug_term <- term %>% URLencode()
url <-
glue("https://discover.dtic.mil/results/?q={slug_term}") %>% as.character()
tibble(termSearch = t, urlDTICSearch = url)
}
#' DTIC Library search tibble
#'
#' Creates URLs for items
#' in DTIC's library
#'
#' @param terms vector of terms
#' @param quote if \code{TRUE} quotes the terms
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
#' dtic_search_tbl(terms = c("FAIL SAFE", "CATHETERIZATION", "RETINOIC ACIDS"), quote = T)
dtic_search_tbl <-
function(terms = NULL, quote = T) {
if (length(terms) == 0) {
stop("Enter terms to search DTIC")
}
terms %>%
map_dfr(function(term) {
.generate_dtic_search(term = term, quote = quote)
})
}
# r2_pe -------------------------------------------------------------------
# https://apps.dtic.mil/dodinvestment/#/
#' DTIC Program Elements
#'
#' Returns information about DTIC programe elements
#'
#' @return
#' @export
#'
#' @examples
#' dictionary_dtic_program_elements()
dictionary_dtic_program_elements <-
function() {
data <-
"https://apps.dtic.mil/dodinvestment/data/penandlin.json" %>%
fromJSON()
data %>%
as_tibble() %>%
setNames(c("codeProgramElement", "isCustom"))
}
# rdte --------------------------------------------------------------------
.parse_dtic_rdte_dict <-
memoise::memoise(function(url = "https://apps.dtic.mil/dodinvestment/api/service/search/advancedSearch?budgetApn=&budgetFYear=2020&budgetNum=&budgetType=All&numberOfResults=10000&searchMethod=3&searchText=&sortBy=fy&sortOrder=desc&start=0") {
text <- read_lines(url)
text <- text[[2]]
data <- text %>% fromJSON(simplifyDataFrame = T)
rm(text)
gc()
data <-
data$documents %>%
as_tibble()
data <-
data %>% .munge_dtic_names()
df_urls <-
data %>%
select(numberRecord, matches("url")) %>%
mutate(numberRecord = as.integer(numberRecord))
data <-
data %>%
select(-matches("url")) %>%
.munge_data() %>%
.remove_na() %>%
mutate(numberRecord = as.integer(numberRecord)) %>%
left_join(df_urls, by = "numberRecord")
amount_cols <- data %>% select(matches("amount")) %>% names()
if (length(amount_cols) > 0) {
data <- data %>%
mutate_at(amount_cols,
list(function(x) {
x %>% as.integer() * 1000000
}))
}
data <- data %>%
mutate(urlDTICBudgetAPI = url)
data
})
.generate_dtic_budget_urls <-
function(years = 2000:2020,
results = 10000) {
urls <-
glue(
"https://apps.dtic.mil/dodinvestment/api/service/search/advancedSearch?budgetApn=&budgetFYear={years}&budgetNum=&budgetType=All&numberOfResults={results}&searchMethod=3&searchText=&sortBy=fy&sortOrder=desc&start=0"
) %>%
as.character()
data <-
tibble(yearBudget = years, urlDTICBudgetAPI = urls)
data
}
#' RDTE Budgets
#'
#' Returns data about DTIC warehoused
#' RDTE budgets
#'
#'
#'
#' @param years vector of years - default 2000:2020
#' @param results number of results - default 10,000
#'
#' @return
#' @export
#'
#' @examples
#' \dontrun{
#' dtic_rdte_budgets()
#' }
dtic_rdte_budgets <-
function(years = 2000:2020,
results = 10000) {
df_urls <- .generate_dtic_budget_urls()
urls <- df_urls$urlDTICBudgetAPI
.parse_dtic_rdte_dict_safe <-
possibly(.parse_dtic_rdte_dict, tibble())
data <-
urls %>%
future_map_dfr(function(url) {
url %>% message()
.parse_dtic_rdte_dict_safe(url = url)
})
data <- data %>%
mutate(numberRecord = as.integer(numberRecord)) %>%
mutate(idRow = 1:n())
df_urls <-
data %>%
select(idRow, matches("url"))
data <-
data %>%
select(-matches("url")) %>%
.munge_data() %>%
mutate(numberRecord = as.integer(numberRecord)) %>%
left_join(df_urls, by = "idRow") %>%
select(-idRow)
data <- data %>%
mutate(urlXML = case_when(is.na(urlXML) ~ urlCache,
TRUE ~ urlXML))
data
}
.prune_dtic_xml <-
function(data, threshold = 2,
character_fields = c(
"name",
"title",
"budgetcycle",
"location",
"ContractMethod",
"ContractType",
"SpecsAvailableNow",
"FundingVehicle",
"costtocomplete",
"summaryexplanation",
"appropriationcode",
"description",
"projectnumber",
"idcode",
"modelaffected",
"modificationtype",
"remarks",
"modificationnumber",
"budgetcycle",
"programelementnote",
"programelementnumber",
".text",
"ProjectNumber",
"performancemetrics",
"articles",
"ProjectNote",
".note"
)) {
if (!data %>% hasName("parent")) {
data <-
data %>%
separate(
item,
into = c("parent", "item"),
sep = "\\.",
extra = "merge",
fill = "right"
) %>%
mutate(idRow = 1:n())
}
data_item_count <-
data %>% count(item, sort = T)
data_item_count <-
data %>% count(item, sort = T) %>%
mutate(countLevels = item %>% str_count("\\."))
max_levels <- data_item_count$countLevels %>% max() + 1
new_cols <- glue("X{1:max_levels}") %>% as.character()
data_items_long <-
data_item_count %>%
separate(
col = item,
into = new_cols,
sep = "\\.",
fill = "right",
extra = "merge",
remove = F
) %>%
mutate(idItem = 1:n()) %>%
select(-countLevels) %>%
gather(table, field, -c(item, idItem, n), na.rm = T) %>%
arrange(idItem)
data_names <-
data_items_long %>%
group_by(idItem) %>%
mutate(numberItem = 1:n()) %>%
do(tail(., threshold)) %>%
mutate(field = case_when(numberItem == min(numberItem) ~ str_to_lower(field),
TRUE ~ field)) %>%
ungroup() %>%
group_by(item) %>%
summarise(nameItem = str_c(field, collapse = "")) %>%
mutate(slugItem = str_to_lower(item))
char_slugs <-
character_fields %>% str_to_lower() %>% str_c(collapse = "|")
data_names <-
data_names %>%
mutate(
typeItem = case_when(
slugItem %>% str_detect("date") ~ "date",
slugItem %>% str_detect(char_slugs) ~ "character",
TRUE ~ "numeric"
)
)
data_names <- data_names %>%
mutate(
nameItem = nameItem %>% str_replace_all("five", "Five") %>%
str_replace_all("four", "Four") %>%
str_replace_all("three", "Three") %>%
str_replace_all("two", "Two") %>%
str_replace_all("one", "One") %>%
str_replace_all("^budgetyear", "countBudgetYear") %>%
str_replace_all("^in", "countIn") %>%
str_replace_all("^out", "countOut"),
nameItem = case_when(
nameItem == "totalcostTotal" ~ "amountCostTotal",
nameItem %>% str_detect("totalcost") ~ nameItem %>% str_replace_all("totalcost", "amountCost"),
nameItem %>% str_detect("deliverydate") ~ nameItem %>% str_replace_all("deliverydate", "dateDelivery"),
nameItem %>% str_detect("contractdate") ~ nameItem %>% str_replace_all("contractdate", "dateContract"),
nameItem == "categoryName" ~ "nameCategory",
nameItem == "compOnentName" ~ "nameComponent",
nameItem == "itemName" ~ "nameCategoryItem",
nameItem == "supportName" ~ "nameSupport",
nameItem == "manufacturerName" ~ "nameManufacturer",
nameItem == "costelementName" ~ "nameCostElement",
nameItem == "logisticsName" ~ "nameLogistics",
nameItem == "nonorganicinstallationImplementationMethodName" ~ "nameInstallationMethod",
nameItem == "kitName" ~ "nameKit",
nameItem == "hardwareName" ~ "nameHardware",
nameItem == "softwareName" ~ "nameSoftware",
nameItem == "name" ~ "nameItem",
nameItem == "itemIdCode" ~ "codeItem",
nameItem %in% c("modificationitemTitle", "modificationtitle") ~ "titleModification",
nameItem %in% c("modelsaffectedlistModelAffected") ~ "nameModelsAffected",
nameItem %in% c("modificationnumber") ~ "codeModification",
nameItem %in% c("modificationtype") ~ "typeModification",
nameItem == "idcode" ~ "codeItem",
nameItem == "description" ~ "descriptionItem",
nameItem == "remarks" ~ "remarksItem",
nameItem == "totalobligationauthorityTotal" ~ "amountObligationAuthorityTotal",
nameItem %>% str_detect("totalobligationauthority") ~ nameItem %>% str_replace_all("totalobligationauthority", "amountObligationAuthority"),
nameItem %>% str_detect("unitcost") ~ nameItem %>% str_replace_all("unitcost", "amountCostPerUnit"),
nameItem %>% str_detect("netprocurementp1") ~ nameItem %>% str_replace_all("netprocurementp1", "amountProcurementNetP1"),
nameItem %>% str_detect("^quantity") ~ nameItem %>% str_replace_all("^quantity", "count"),
nameItem == "manufacturerLocation" ~ "locationManufacturer",
nameItem == "manufacturerAdminLeadTimeAfterOct1InMonths" ~ "countAdminLeadTimePostOct1Months",
nameItem == "manufacturerProductionLeadTimeAfterOct1InMonths" ~ "countProductionLeadTimePostOct1Months",
TRUE ~ nameItem
)
) %>%
mutate(
periodBudget =
case_when(
typeItem != "character" &
nameItem %>% str_detect("CurrentYear") ~ "YearCurrent",
typeItem != "character" &
nameItem %>% str_detect("AllPriorYears") ~ "YearsPriorAll",
typeItem != "character" &
nameItem %>% str_detect("BudgetYearOne") ~ "YearOne",
typeItem != "character" &
nameItem %>% str_detect("BudgetYearTwo") ~ "YearTwo",
typeItem != "character" &
nameItem %>% str_detect("BudgetYearThree") ~ "YearThree",
typeItem != "character" &
nameItem %>% str_detect("BudgetYearFour") ~ "YearFour",
typeItem != "character" &
nameItem %>% str_detect("BudgetYearFive") ~ "YearFive",
typeItem != "character" &
nameItem %>% str_detect("PriorYear") ~ "YearPrior",
typeItem != "character" &
nameItem %>% str_detect("ToComplete") ~ "ToComplete",
typeItem != "character" &
nameItem %>% str_detect("Total$") ~ "TotalBudget",
)
)
period_slugs <-
c(
"CurrentYear",
"AllPriorYears",
"BudgetYearOne",
"PriorYear",
"BudgetYearFive",
"BudgetYearFour",
"BudgetYearThree",
"BudgetYearTwo",
"ToComplete",
"Total"
) %>% str_c(collapse = "|")
data_names <-
data_names %>%
mutate(nameItem = nameItem %>% str_remove_all(period_slugs)) %>%
mutate(
typeBudget = case_when(
!is.na(periodBudget) &
nameItem %>% str_detect("OCO") ~ "OCO",
!is.na(periodBudget) &
nameItem %>% str_detect("Base") ~ "Base",
!is.na(periodBudget) &
!nameItem %>% str_detect("OCO|Base") ~ "Total"
),
nameItem = nameItem %>% str_remove_all("OCO|Base")
)
data <- data %>%
left_join(data_names, by = "item") %>%
select(parent, item, nameItem, everything())
data
}
.parse_dtic_rdte_xml <-
function(url = "https://apps.dtic.mil/procurement/Y2020/AirForce/U_P40_000071_BSA-1_BA-7_APP-3010F_PB_2020.xml",
table_names = c("programelement", "lineitem", "codeblist"),
return_message = T
) {
doc <- read_xml(url)
if (return_message) {
cat(url, fill = T)
}
data <-
doc %>%
as_list() %>%
unlist() %>%
enframe(name = "item")
data <-
data %>%
mutate(levelField = item %>% str_count("\\.")) %>%
mutate(isNested = levelField > 3)
table_name_slugs <-
str_to_lower(table_names) %>% str_c(collapse = "|")
df_base <-
data %>%
filter(!isNested) %>%
.prune_dtic_xml(threshold = 2)
df_base <-
df_base %>%
mutate(nameItem = nameItem %>% str_remove_all(table_name_slugs)) %>%
mutate(nameItem = case_when(
!is.na(periodBudget) &
!is.na(typeBudget) ~ str_c(nameItem, periodBudget, typeBudget, sep = ""),
TRUE ~ nameItem
)) %>%
mutate(nameItem = nameItem %>% str_replace_all("^funding", "amount")) %>%
select(nameItem, value) %>%
distinct() %>%
group_by(nameItem) %>%
summarise(value = value %>% str_c(collapse = " | ")) %>%
ungroup() %>%
spread(nameItem, value) %>%
.munge_dtic_names() %>%
.fix_dtic_dates() %>%
.fix_dtic_amount() %>%
.remove_na() %>%
select(matches("name"), everything())
df_base <-
df_base %>%
.munge_data() %>%
mutate(urlXML = url)
df_nested <-
data %>% filter(isNested)
df_nested <-
df_nested %>%
separate(
item,
into = c("nameTable", "item"),
sep = "\\.",
extra = "merge",
fill = "right"
) %>%
select(-nameTable)
df_nested <-
df_nested %>%
separate(
item,
into = c("nameTable", "item"),
sep = "\\.",
extra = "merge",
fill = "right"
) %>%
select(-nameTable) %>%
separate(
item,
into = c("nameTable", "item"),
sep = "\\.",
extra = "merge",
fill = "right"
)
tables <-
df_nested %>%
distinct(nameTable) %>%
pull()
data_nested <-
tables %>%
map(function(table){
table %>% message()
df <-
df_nested %>% filter(nameTable == table)
if (table == "ItemExhibitList") {
df <-
df %>%
separate(
item,
into = c("parent", "item"),
sep = "\\.",
extra = "merge",
fill = "right"
) %>%
select(parent, item, value) %>%
mutate(idRow = 1:n())
df <- df %>%
.prune_dtic_xml(threshold = 2)
df_items <-
df %>%
filter(nameItem %in% c("nameItem", "titleModification")) %>%
select(idRow) %>%
mutate(numberItem = 1:n())
df <- df %>%
left_join(df_items, by = "idRow") %>%
select(numberItem, everything()) %>%
fill(numberItem) %>%
mutate(urlXML = url)
item_tables <-
df$parent %>% unique()
df <-
item_tables %>%
map(function(item_table){
data_name <- glue("data{item_table}") %>% as.character()
df_table <-
df %>%
filter(parent == item_table) %>%
select(numberItem, nameItem, typeItem, periodBudget, typeBudget, value)
df_base_char <-
df_table %>% filter(typeItem == "character") %>%
select(numberItem, nameItem, value) %>%
group_by(numberItem, nameItem) %>%
summarise(value = unique(value) %>% str_c(collapse = " | ")) %>%
ungroup() %>%
spread(nameItem, value)
if (df_table %>% filter(typeItem == "date") %>% nrow() > 0) {
df_dates <-
df_table %>% filter(typeItem == "date") %>%
mutate(value = value %>% str_c("-01")) %>%
select(-typeItem) %>%
mutate_if(is.character,
list(function(x) {
ifelse(is.na(x), "", x)
})) %>%
unite(item, nameItem, periodBudget, typeBudget, sep = "") %>%
mutate(value = value %>% ymd()) %>%
group_by(numberItem, item) %>%
summarise(value = value %>% str_c(collapse = " | ")) %>%
spread(item,value) %>%
ungroup()
df_base_char <-
df_base_char %>%
left_join(df_dates, by = "numberItem")
}
df_numeric <-
df_table %>% filter(typeItem == "numeric") %>%
select(-typeItem) %>%
mutate(value = parse_number(value)) %>%
distinct() %>%
mutate(value = case_when(
!nameItem %in% c("amountCostPerUnit", "count") ~ value * 1000000,
TRUE ~ value
)) %>%
mutate_if(is.character,
list(function(x) {
ifelse(is.na(x), "", x)
})) %>%
unite(item,
nameItem,
periodBudget,
typeBudget,
sep = "",
remove = F) %>%
group_by(numberItem, item) %>%
mutate(numberSubGroup = 1:n()) %>%
select(numberItem, numberSubGroup, everything()) %>%
ungroup() %>%
group_by(numberItem) %>%
nest() %>%
rename(dataValues = data) %>%
ungroup()
df_row <-
df_base_char %>%
left_join(df_numeric, by = "numberItem") %>%
mutate(hasData = dataValues %>% map_dbl(length) > 0) %>%
.munge_data() %>%
mutate(urlXML = url)
df_row %>%
group_by(urlXML) %>%
nest() %>%
ungroup() %>%
rename(!!sym(data_name) := data)
}) %>%
reduce(left_join)
return(df)
}
if (table == "SecondaryDistribution") {
df <-
df %>%
mutate(item = item %>% str_remove_all("ComponentList.Component.|Cost."))
df <- df %>%
separate(item,
into = c("parent", "item"),
sep = "\\.",
fill = "left") %>%
select(parent, item, value) %>%
mutate(idRow = 1:n())
df_items <-
df %>% filter(item == "Name") %>%
select(idRow) %>%
mutate(numberItem = 1:n())
df <- df %>%
left_join(df_items, by = "idRow") %>%
select(numberItem, everything()) %>%
fill(numberItem)
df <- df %>%
mutate(
item = case_when(
item == "Name" ~ "nameItem",
parent == "Quantity" ~ glue("count{item}") %>% as.character(),
parent == "TotalQuantity" ~ glue("countTotal{item}") %>% as.character(),
TRUE ~ glue("amount{item}") %>% as.character()
)
) %>%
select(numberItem, item, value) %>%
spread(item, value) %>%
select(numberItem, nameItem, everything()) %>%
.munge_data()
amt_cols <- df %>% select(matches("amount")) %>% names()
if (length(amt_cols) > 0) {
df <- df %>%
mutate_at(amt_cols,
list(function(x){
x %>% as.numeric() * 1000000 %>% formattable::currency(digits = 0)
}))
}
df <-
df %>%
mutate(urlXML = url) %>%
group_by(urlXML) %>%
nest() %>%
ungroup() %>%
rename(dataSecondaryDistribution = data)
return(df)
}
if (table == "ModsOutYearDelta") {
df <-
df %>%
separate(
item,
into = c("parent", "item"),
sep = "\\.",
extra = "merge",
fill = "right"
)
df <-
df %>%
mutate(parent = parent %>% str_c("amountTotal", .),
value = as.numeric(value) * 1000000) %>%
select(parent, item, amount = value) %>%
mutate(isTotal = item == "Total") %>%
.munge_data() %>%
mutate(numberYearBudget = case_when(
item %>% str_detect("One") ~ 1L,
item %>% str_detect("Two") ~ 2L,
item %>% str_detect("Three") ~ 3L,
item %>% str_detect("Four") ~ 4L,
item %>% str_detect("Five") ~ 5L,
TRUE ~ NA_integer_
)) %>%
mutate(urlXML = url) %>%
group_by(urlXML) %>%
nest() %>%
ungroup() %>%
rename(dataModsOutDelta = data)
return(df)
}
if (table == "ChangeSummary") {
df <-
df %>%
separate(
item,
into = c("parent", "item"),
sep = "\\.",
extra = "merge",
fill = "right"
) %>%
select(parent, item, value) %>%
mutate(idRow = 1:n())
df <- .prune_dtic_xml(data = df)
df <-
df %>%
filter(!parent %>% str_detect("AdjustmentDetails")) %>%
mutate(parent = str_c("amount", parent)) %>%
unite(item, parent, item, sep = "") %>%
select(item, value) %>%
mutate(value = parse_number(value)) %>%
filter(!is.na(value)) %>%
spread(item, value) %>%
mutate(urlXML = url) %>%
group_by(urlXML) %>%
nest() %>%
ungroup() %>%
rename(dataChangeSummary = data)
return(df)
}
if (table == "ProjectList") {
df <-
df %>%
separate(
item,
into = c("parent", "item"),
sep = "\\.",
extra = "merge",
fill = "right"
) %>%
select(parent, item, value) %>%
mutate(idRow = 1:n())
df <- .prune_dtic_xml(data = df)
df <- df %>%
mutate(nameItem = case_when(
!is.na(periodBudget) &
!is.na(typeBudget) ~ str_c(nameItem, periodBudget, typeBudget, sep = ""),
TRUE ~ nameItem
))
df_items <-
df %>%
filter(nameItem %in% c("projecttitle")) %>%
select(idRow) %>%
mutate(numberItem = 1:n())
df <- df %>%
left_join(df_items, by = "idRow") %>%
select(numberItem, everything()) %>%
fill(numberItem) %>%
mutate(urlXML = url)
item_tables <-
df$parent %>% unique()
df <-
item_tables %>%
map(function(item_table){
data_name <- glue("data{item_table}") %>% as.character()
df_table <-
df %>%
filter(parent == item_table) %>%
select(numberItem, nameItem, typeItem, periodBudget, typeBudget, value)
df_base_char <- df_table %>% filter(typeItem == "character") %>%
select(numberItem, nameItem, value) %>%
group_by(numberItem, nameItem) %>%
summarise(value = unique(value) %>% str_c(collapse = " | ")) %>%
ungroup() %>%
spread(nameItem, value)
if (df_table %>% filter(typeItem == "date") %>% nrow() > 0) {
df_dates <-
df_table %>% filter(typeItem == "date") %>%
distinct() %>%
mutate(value = value %>% str_c("-01")) %>%
select(-typeItem) %>%
mutate_if(is.character,
list(function(x) {
ifelse(is.na(x), "", x)
})) %>%
unite(item, nameItem, periodBudget, typeBudget, sep = "") %>%
mutate(value = value %>% ymd()) %>%
group_by(numberItem, item) %>%
dplyr::slice(1) %>%
ungroup() %>%
spread(item,value)
df_base_char <-
df_base_char %>%
left_join(df_dates, by = "numberItem")
}
df_numeric <-
df_table %>% filter(typeItem == "numeric") %>%
select(-typeItem) %>%
mutate(value = parse_number(value)) %>%
distinct() %>%
mutate(value = case_when(
!nameItem %in% c("amountCostPerUnit", "count") ~ value * 1000000,
TRUE ~ value
)) %>%
mutate_if(is.character,
list(function(x) {
ifelse(is.na(x), "", x)
})) %>%
unite(item,
nameItem,
periodBudget,
typeBudget,
sep = "",
remove = F) %>%
group_by(numberItem, item) %>%
mutate(numberSubGroup = 1:n()) %>%
select(numberItem, numberSubGroup, everything()) %>%
ungroup() %>%
group_by(numberItem) %>%
nest() %>%
rename(dataValues = data) %>%
ungroup()
df_row <-
df_base_char %>%
left_join(df_numeric, by = "numberItem") %>%
mutate(hasData = dataValues %>% map_dbl(length) > 0) %>%
.munge_data() %>%
mutate(urlXML = url)
df_row %>%
filter(!is.na(numberItem)) %>%
group_by(urlXML) %>%
nest() %>%
ungroup() %>%
rename(!!sym(data_name) := data)
}) %>%
reduce(left_join)
return(df)
}
if (table == "ResourceSummary") {
df <-
df %>%
separate(item, into = c("parent", "item"), sep = "\\.",
fill = "right")
df <-
df %>%
mutate(parent = parent %>% str_c("amount", .),
value = as.numeric(value) * 1000000) %>%
select(parent, item, amount = value) %>%
.munge_data() %>%
mutate(
numberYearBudget = case_when(
item %>% str_detect("One") ~ 1L,
item %>% str_detect("Two") ~ 2L,
item %>% str_detect("Three") ~ 3L,
item %>% str_detect("Four") ~ 4L,
item %>% str_detect("Five") ~ 5L,
TRUE ~ NA_integer_
),
typeBudget = case_when(
item %>% str_detect("OCO") ~ "OCO",
item %>% str_detect("Base") ~ "Base",
TRUE ~ "Total"
)
) %>%
mutate(urlXML = url) %>%
group_by(urlXML) %>%
nest() %>%
ungroup() %>%
rename(dataResourceSummary = data)
return(df)
}
data_name <- glue("data{table}") %>% as.character()
df <-
df %>%
mutate(urlXML = url) %>%
group_by(urlXML) %>%
nest() %>%
ungroup() %>%
rename(!!sym(data_name) := data)
df
}) %>%
reduce(left_join)
data <-
df_base %>%
left_join(data_nested, by = "urlXML")
data
}
#' Parse DTIC DOD Investment URLs
#'
#' Parses vector of XML URLs from DTIC RDTE
#' budget justicifcations from \url{https://apps.dtic.mil/dodinvestment/#/}
#'
#' @param urls vector of urls
#' @param table_names list of table names to clean out defaults to \itemize{
#' \item programelement
#' \item codeblist
#' \item lineitem
#' }
#' @param return_message if \code{TRUE} prints am essage
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
#' parse_dtic_rdte_xml_urls(urls = "https://apps.dtic.mil/procurement/Y2020/AirForce/U_P40_000071_BSA-1_BA-7_APP-3010F_PB_2020.xml")
parse_dtic_investment_xml_urls <-
function(urls = "https://apps.dtic.mil/procurement/Y2020/AirForce/U_P40_000071_BSA-1_BA-7_APP-3010F_PB_2020.xml",
table_names = c("programelement", "lineitem", "codeblist"),
return_message = T) {
.parse_dtic_rdte_xml_safe <- possibly(.parse_dtic_rdte_xml, tibble())
all_data <-
urls %>%
future_map_dfr(function(url){
.parse_dtic_rdte_xml_safe(url = url, table_names = table_names,
return_message = return_message)
})
all_data <-
all_data %>%
select(-matches("remove"))
data_cols <- all_data %>%
select(matches("data")) %>%
names()
df_data <-
all_data %>%
transmute_at(data_cols,
list(function(x) {
x %>% map_dbl(length) > 0
}))
names(df_data) <- names(df_data) %>% str_replace_all("data", "has")
all_data <- all_data %>%
bind_cols(df_data)
all_data <-
all_data %>%
select(dateSubmission, nameAgency, everything())
all_data
}
# research ----------------------------------------------------------------
.parse_dod_data <-
function(url = "https://dodgrantawards.dtic.mil/grants/api/service/search/advancedSearchExport?searchMethod=EITHER&searchDateMethod=STARTING&ascDescOrder=ASC") {
data <-
url %>%
fread(showProgress = FALSE) %>%
as_tibble()
}
.munge_dod_data <-
function(data) {
data <-
data %>%
.munge_fpds_names()
data <-
data %>%
mutate_at(c("datetimeCreated",
"datetimeModified"),
list(mdy_hms)) %>%
mutate_at(c("dateAwardStart", "dateAwardEnd"),
list(mdy))
data <-
data %>%
mutate(hasFNComma = nameFirstPrincipalInvestigator %>% str_detect("\\,")) %>%
mutate(
nameFirstClean = case_when(
hasFNComma ~ nameLastPrincipalInvestigator,
TRUE ~ nameFirstPrincipalInvestigator
),
nameLastClean = case_when(
hasFNComma ~ nameFirstPrincipalInvestigator %>% str_remove_all("\\,"),
TRUE ~ nameLastPrincipalInvestigator
)
) %>%
unite(
namePrincipalInvestigator,
nameFirstClean,
nameLastClean,
sep = " ",
remove = F
) %>%
select(-c(
nameFirstPrincipalInvestigator,
nameLastPrincipalInvestigator,
hasFNComma
)) %>%
rename(
nameFirstPrincipalInvestigator = nameFirstClean,
nameLastPrincipalInvestigator = nameLastClean
) %>%
select(one_of(names(data)), everything()) %>%
unite(
namePrincipalInvestigator,
nameFirstPrincipalInvestigator,
nameLastPrincipalInvestigator,
sep = " ",
remove = F
)
data <-
data %>%
select(one_of(
c(
"idAward",
"nameAward",
"descriptionAward",
"nameAgency",
"amountAward",
"namePrincipalInvestigator",
"nameOffice",
"yearFiscal",
"nameOrganization",
"namePrincipalInvestigator",
"nameFirstPrincipalInvestigator",
"nameLastPrincipalInvestigator",
"typePrincipalInvestigator",
"dateAwardStart",
"dateAwardEnd",
"datetimeCreated",
"datetimeModified"
)
), everything())
data <- data %>%
mutate(idOffice = idAward %>% substr(1, 6)) %>%
select(idAward:nameAgency, idOffice, nameOffice, everything())
data <- data %>%
mutate_if(is.character,
list(function(x) {
x %>% stringi::stri_trans_general("Latin-ASCII") %>% str_trim()
}))
ids <-
data %>% distinct(idOffice) %>% pull() %>% str_c(collapse = "|")
data <-
data %>%
mutate(nameOffice = nameOffice %>% str_remove_all(ids) %>% str_trim()) %>%
.munge_data(clean_address = F) %>%
mutate(
nameOffice = case_when(
nameOffice == "AFRL/RQK" ~ "AFRL RQK",
nameOffice == "- ACC-APG-RTP DIVISION" ~ "Army Contracting Command – Aberdeen Proving Ground" %>% str_to_upper(),
TRUE ~ nameOffice
)
)
data <-
data %>%
mutate(
namePrincipalInvestigator =
namePrincipalInvestigator %>% str_replace_all("\\^DR. |PROF.|PROFESSOR|ASST.", "") %>%
str_replace_all(", JR.", " JR") %>%
str_replace_all(", III", " III") %>%
str_replace_all(", PH.D|, PH. D", " PHD")
)
has_more_commas <- data %>%
filter(namePrincipalInvestigator %>% str_detect("\\,")) %>% nrow() > 0
if (has_more_commas) {
data <- data %>%
filter(namePrincipalInvestigator %>% str_detect("\\,")) %>%
separate(
namePrincipalInvestigator,
into = c(
"nameLastPrincipalInvestigator",
"nameFirstPrincipalInvestigator"
),
fill = "right",
extra = "merge",
sep = "\\,"
) %>%
mutate_if(is.character, str_trim) %>%
unite(
namePrincipalInvestigator,
nameFirstPrincipalInvestigator,
nameLastPrincipalInvestigator,
sep = " ",
remove = F
) %>%
bind_rows(data %>% filter(!namePrincipalInvestigator %>% str_detect("\\,"))) %>%
arrange(dateAwardStart) %>%
select(one_of(names(data)), everything()) %>%
.munge_data(clean_address = F)
}
data <-
data %>%
mutate(yearFiscal = year(dateAwardStart)) %>%
.munge_organizations()
data <-
data %>%
mutate(
namePrincipalInvestigator = case_when(
namePrincipalInvestigator %in% c("PROF. RALPH", "RALPH ETIENNE- CUMMINS") ~ "RALPH ETIENNE-CUMMINGS",
namePrincipalInvestigator %in% c("ABDALLA DARWISH", "ABDALLA DAWISH") ~ "ABDALLA DARWISH",
namePrincipalInvestigator %in% c("ALEX K Y JEN") ~ "ALEX JEN",
namePrincipalInvestigator %>% str_detect("ALI JADBABAIE") ~ "ALI JADBABAIE",
namePrincipalInvestigator %>% str_detect("BENITO GONZLEZ") ~ "BENITO GONZALEZ",
namePrincipalInvestigator %>% str_detect("CLAY GLOSTER") ~ "CLAY GLOSTER",
namePrincipalInvestigator %>% str_detect("DAVID BOURELL") ~ "DAVID BOURELL",
namePrincipalInvestigator %>% str_detect("DR AVESTA SADAN|DR AVESTA SASAN") ~ "AVESTA SASAN",
TRUE ~ namePrincipalInvestigator
)
)
data <- data %>%
refine_columns(entity_columns = "namePrincipalInvestigator") %>%
select(-namePrincipalInvestigator) %>%
rename(namePrincipalInvestigator = namePrincipalInvestigatorClean) %>%
select(one_of(names(data))) %>%
mutate(namePrincipalInvestigator = namePrincipalInvestigator %>% str_replace_all("^DR |^SSOR ", ""))
df_last_names <-
data %>%
distinct(namePrincipalInvestigator) %>%
tbl_last_name(name_column = "namePrincipalInvestigator")
df_last_classified_names <-
df_last_names %>%
filter(!is.na(nameLast)) %>%
pull(nameLast) %>%
unique() %>%
entities::classify_last_names()
df_last_names <-
df_last_names %>%
left_join(
df_last_classified_names %>%
rename(typeWRUPrincipalInvestigatorPrediction = typeWRUPrediction),
by = "nameLast"
)
data <- data %>%
left_join(
df_last_names %>%
distinct(
namePrincipalInvestigator,
typeWRUPrincipalInvestigatorPrediction
),
by = "namePrincipalInvestigator"
)
data <- data %>%
mutate(
typeWRUPrincipalInvestigatorPrediction = ifelse(
is.na(typeWRUPrincipalInvestigatorPrediction),
"Other",
typeWRUPrincipalInvestigatorPrediction
)
)
data <- data %>%
mutate(
nameOffice = case_when(
nameOffice %>% str_detect("10TH CONTRACTING SQUADRON, USAF ACADEMY, CO") ~ "10TH CONTRACTING SQUADRON - USAF ACADEMY CO",
nameOffice %>% str_detect("ARMY CONTRACTING COMMANDABERDEEN PROVING GROUND") ~ "ARMY CONTRACTING COMMAND - ABERDEEN PROVING GROUND",
nameOffice %>% str_detect(
"MEDICAL RESEARCH & MATERIEL COMMAND|MEDICAL RESEARCH & MATERIAL COMMAND"
) ~ "MEDICAL RESEARCH & MATERIEL COMMAND",
nameOffice %>% str_detect("-W6QK ACC-APG NATICK CONTRACTING DIVISION") ~ "ARMY CONTRACTING COMMAND - NATICK CONTRACTING DIVISION",
nameOffice %>% str_detect("ACC - WARREN") ~ "ARMY CONTRACTING COMMAND - WARREN",
nameOffice %>% str_detect(
"AFOSR ASIAN OFFICE OF AEROSPACE RESEARCH AND DEVELOPMENT TOKYO JPN|AFOSR ASIAN OFFICE OF AEROSPACE RESEARCH AND DEVELOPMENT, TOKYO, JPN"
) ~ "Air Force Office of Scientific Research - ASIAN OFFICE OF AEROSPACE RESEARCH IN TOKYO",
nameOffice %>% str_detect(
"AFRL MUNITIONS DIRECTORATE EGLIN AFB FL|AFRL MUNITIONS DIRECTORATE, EGLIN AFB FL"
) ~ "AIR FORCE RESEARCH LABS - EGLIN Air Force Base",
nameOffice %>% str_detect("AFRL ROME RESEARCH SITE ROME NY|AFRL ROME RESEARCH SITE, ROME NY") ~ "AIR FORCE RESEARCH LABS - Rome",
nameOffice %>% str_detect("AFRL RQK|AFRL/PZL|AFRL/PZLDB|AFRL/RQKMC") ~ "AIR FORCE RESEARCH LABS - Patterson Air Force Base",
nameOffice %>% str_detect("AFRL RVK|DET 8 AFRL PKD") ~ "AIR FORCE RESEARCH LABS - Kirtland Air Force Base",
nameOffice %>% str_detect(
"AIR FORCE OFFICE OF SCIENTIFIC RESEARCH ARLINGTON VA|AIR FORCE OFFICE OF SCIENTIFIC RESEARCH, ARLINGTON, VA"
) ~ "AIR FORCE OFFICE OF SCIENTIFIC RESEARCH - ARLINGTON VA",
nameOffice %>% str_detect("ARMY CONTRACTING COMMAND - ACC-NJ") ~ "ARMY CONTRACTING COMMAND - Picatinny NJ",
nameOffice %>% str_detect("ARMY CONTRACTING COMMAND -RTP DIVISION") ~ "ARMY CONTRACTING COMMAND - RESEARCH TRIANGLE PARK",
nameOffice %>% str_detect("ARMY CONTRACTING COMMAND-REDSTONE") ~ "ARMY CONTRACTING COMMAND - REDSTONE",
nameOffice %>% str_detect("DARPA CONTRACTS MANAGEMENT OFFICE") ~ "DARPA - CONTRACTS MANAGEMENT OFFICE",
nameOffice %>% str_detect("DEPARTMENT OF DEFENSE EDUCATION ACTIVITY") ~ "DEPARTMENT OF DEFENSE - EDUCATION ACTIVITY",
nameOffice %>% str_detect("NAVAL INFORMATION WARFARE CENTER ATLANTIC") ~ "NAVAL INFORMATION WARFARE CENTER - ATLANTIC",
nameOffice %>% str_detect("NAVAL INFORMATION WARFARE CENTER PACIFIC|SSC PACIFIC") ~ "NAVAL INFORMATION WARFARE CENTER - PACIFIC",
nameOffice %>% str_detect("NAVAL SURFACE WARFARE CENTER CRANE") ~ "NAVAL SURFACE WARFARE CENTER - CRANE INDIANA",
nameOffice %>% str_detect("NSWC IHEODTD INDIAN HEAD") ~ "NAVAL SURFACE WARFARE CENTER - INDIAN HEAD",
nameOffice %>% str_detect("NAVAL UNDERSEA WARFARE CENTER DIVISION KEYPORT") ~ "NAVAL UNDERSEA WARFARE CENTER DIVISION - KEYPORT WASHINGTON",
nameOffice %>% str_detect("NAVSUP FLC SAN DIEGO") ~ "NAVSUP Fleet Logistics Center - San Diego",
nameOffice %>% str_detect("NAVSUP FLC SIGONELLA NAPLES|NAVSUP FLC SIGONELLA, NAPLES") ~ "NAVSUP Fleet Logistics Center - Naples",
nameOffice %>% str_detect(
"OFFICE OF NAVAL RESEARCH LONDON ENGLAND|OFFICE OF NAVAL RESEARCH, LONDON, ENGLAND"
) ~ "OFFICE OF NAVAL RESEARCH - LONDON ENGLAND",
nameOffice %>% str_detect("U.S. ARMY CORPS OF ENGINEERS - ALBUQUERQUE DISTRICT") ~ "US ARMY CORPS OF ENGINEERS - ALBUQUERQUE",
nameOffice %>% str_detect("US ARMY CORPS OF ENGINEERS ALASKA DISTRICT") ~ "US ARMY CORPS OF ENGINEERS - ALASKA",
nameOffice %>% str_detect("US ARMY CORPS OF ENGINEERS OMAHA") ~ "US ARMY CORPS OF ENGINEERS - OMAHA",
nameOffice %>% str_detect("USACE- ERDC") ~ "US ARMY CORPS OF ENGINEERS - VICKSBURG",
TRUE ~ nameOffice
),
nameOffice = nameOffice %>% str_to_upper()
) %>%
rename(nameOfficeFull = nameOffice)
data <- data %>% mutate(
nameOfficeFull = case_when(
idOffice == "FA7000" ~ "10TH CONTRACTING SQUADRON - USAF ACADEMY CO",
idOffice == "N00421" ~ "NAVAL AIR WARFARE CENTER",
idOffice %in% c("N00014", "N62909") ~ "OFFICE OF NAVAL RESEARCH",
idOffice == "N00014" ~ "OFFICE OF NAVAL RESEARCH - LONDON ENGLAND",
TRUE ~ nameOfficeFull
)
) %>%
separate(
nameOfficeFull,
into = c("nameOffice", "nameOfficeDetail"),
remove = F,
fill = "right",
extra = "merge",
sep = "\\ - "
)
data <- data %>%
mutate(typeWRUPrincipalInvestigatorPrediction = typeWRUPrincipalInvestigatorPrediction %>% str_to_upper())
data
}
.dod_grants <-
memoise::memoise(function() {
data <-
fread(
"https://asbcllc.com/r_packages/govtrackR/data/dod_grants.tsv.gz",
showProgress = FALSE
) %>%
as_tibble() %>%
.munge_data(clean_address = F)
data
})
#' Department of Defense Grants
#'
#' Returns historic Department
#' of Defense scientific grant data.
#'
#' @return
#' @export
#'
#' @examples
dod_grants <-
function(snake_names = F) {
data <- .dod_grants()
data <-
data %>%
mutate(nameAgencyParent = "DEPARTMENT OF DEFENSE")
data <- data %>% munge_data(snake_names = snake_names)
data
}
# pubdefense --------------------------------------------------------------
.parse_pubdefense_url <-
function(url = "https://publicaccess.dtic.mil/padf_public/api/service/search/simpleSearch?ascDescOrder=ASC&newSearch=0&numberOfResults=1000000&orderProp=&performSimpleSearch=&searchText=%22china%22") {
data <- fromJSON(url)
data
}
# dtic_investment ---------------------------------------------------------
# https://apps.dtic.mil/dodinvestment/#/browse
# categories --------------------------------------------------------------
.dtic_categories <- function() {
page <-
read_html("https://discover.dtic.mil/thesaurus/subject-categories/")
nodes <- page %>% html_nodes(".fl-col-group-nested a")
items <- nodes %>% html_text() %>% str_squish() %>% str_to_upper()
urls <-
nodes %>% html_attr("href")
data <-
tibble(nameCategory = items, urlCategoryDTIC = urls) %>%
mutate(
idCategory = 1:n(),
idCategory = case_when(
nchar(idCategory) == 1 ~ glue("0{idCategory}") %>% as.character(),
TRUE ~ as.character(idCategory)
)
) %>%
select(idCategory, everything())
data
}
.parse_dtic_category_url <-
memoise::memoise(function(url = "https://discover.dtic.mil/thesaurus/subject-categories/12mathematical/") {
page <- read_html(url)
data <-
page %>% html_table(fill = F) %>%
.[[1]] %>%
as_tibble() %>%
setNames(c("idSubject", "nameSubject", "descriptionSubject")) %>%
mutate(urlCategoryDTIC = url) %>%
.munge_data()
data <- data %>%
mutate(idSubject = case_when(
nchar(idSubject) == 1 ~ glue("0{idSubject}") %>% as.character(),
TRUE ~ as.character(idSubject)
))
df_subjects <- data %>%
select(idSubject, descriptionSubject) %>%
separate_rows(descriptionSubject, sep = "\\;") %>%
mutate_all(list(function(x){
x %>% str_remove_all("\\.") %>% str_squish()
})) %>%
nest(dataSubjects = c(descriptionSubject)) %>%
mutate(countSubjects = dataSubjects %>% map_dbl(nrow)) %>%
select(idSubject, countSubjects, dataSubjects)
data <-
data %>%
left_join(df_subjects, by = "idSubject")
data
})
#' DTIC Thesaurus Subject Categories
#'
#' DTIC has identified 25 broad subject fields and 251 groups to categorize t
#' areas of scientific and technical interest.
#'
#' These fields and groups provide the structure for the subject grouping of technical reports in DTIC's collection and are used to define the areas of need-to-know in distributing these reports.
#'
#' Through this site, you will find the subject coverage for each subject category as well as cross-references to related fields and groups.
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
#' \dontrun{
#' dtic_subject_categories()
#' }
dtic_subject_categories <-
function() {
data <-
.dtic_categories()
.parse_dtic_category_url_safe <-
possibly(.parse_dtic_category_url, tibble())
df_subjects <-
data$urlCategoryDTIC %>%
future_map_dfr(function(url){
.parse_dtic_category_url_safe(url = url)
})
data <- data %>%
left_join(df_subjects, by = "urlCategoryDTIC") %>%
select(idCategory, nameCategory, idSubject, nameSubject, everything())
data <-
data %>%
mutate(codeSubject = glue("{idCategory}/{idSubject}") %>% as.character()) %>%
select(codeSubject, everything(),
descriptionSubject)
data
}
# dtic_budgets ------------------------------------------------------------
# https://budget.dtic.mil/
.dtic_active_budget <-
function(url = "https://budget.dtic.mil/") {
page <- read_html(url)
1:9 %>%
map_dfr(function(x) {
css <-
glue("table:nth-child(4) tr+ tr td:nth-child({x})") %>% as.character()
page %>% html_nodes(css)
})
}
.dictionary_dtic_budget_names <-
function() {
tibble(
idColumn = 1:9,
nameFile = c(
"Summary Request",
"Research Development, Test & Evaluation",
"Research Development, Test & Evaluation",
"Procurement",
"Procurement",
"Operations & Maintence",
"Operations & Maintence",
"Personnel",
"Personnel"
),
nameColumn = c(
"urlSummaryPDF",
"urlRDTEPDF",
"urlRDTEXLS",
"urlProcurementPDF",
"urlProcurementXLS",
"urlOMPDF",
"urlOMXLS",
"urlPersonnelPDF",
"urlPersonnelXLS"
),
typeFile = c("PDF",
"PDF",
"XLS",
"PDF",
"XLS",
"PDF",
"XLS",
"PDF",
"XLS"),
)
}
.parse_dtic_active_budget_page <-
function(file_loc = "Downloads/index.html") {
page <-
read_html(file_loc)
df_names <- .dictionary_dtic_budget_names()
data <-
1:9 %>%
map_dfr(function(x) {
css <-
glue("table+ table tr+ tr td:nth-child({x}) a") %>% as.character()
nodes <-
page %>% html_nodes(css)
text <- nodes %>% html_text()
d <- tibble(
idColumn = x,
text = nodes %>% html_text(),
url = nodes %>% html_attr("href")
) %>%
filter(!text %>% str_detect("Access")) %>%
mutate(idRowColumn = 1:n())
d %>%
mutate(
slug = url %>% str_remove_all("https://budget.dtic.mil/|") %>%
str_remove_all("pdfs|spreadsheets")
) %>%
select(idColumn, idRowColumn, text, slug, url)
}) %>%
filter(text != "") %>%
left_join(df_names, by = "idColumn") %>%
select(idColumn, names(df_names), everything()) %>%
arrange(idRowColumn) %>%
mutate(idRow = 1:n()) %>%
select(idRow, idRowColumn, everything())
df_base <-
data %>%
filter(nameFile == "Summary Request") %>%
select(idRow, nameColumn, text, slug, url) %>%
separate(
slug,
into = c("base", "yearBudget"),
fill = "right",
extra = "merge",
sep = "/"
) %>%
mutate(yearBudget = yearBudget %>% substr(1, 6) %>% parse_number(),
idRow = 1 + idRow) %>%
select(idRow,
yearBudget,
nameBudget = text,
urlSummaryPDF = url) %>%
separate(
nameBudget,
into = c("nameBudget", "detailsBudget"),
fill = "right",
extra = "merge",
sep = "\\("
) %>%
mutate(detailsBudget = detailsBudget %>% str_remove_all("\\)")) %>%
mutate(nameBudget = nameBudget %>% str_remove_all("FY[0-9][0-9][0-9][0-9]|[0-9][0-9]MB")) %>%
mutate_if(is.character,
str_squish) %>%
.munge_data()
df_other <-
data %>%
filter(nameFile != "Summary Request")
data <-
df_other %>%
select(idRow, nameFile, nameColumn, typeFile, text, url) %>%
mutate(sizeFileKB = case_when(
text %>% str_detect("KB") ~ parse_number(text),
TRUE ~ parse_number(text) * 1000
)) %>%
select(-text) %>%
left_join(df_base, by = "idRow") %>%
select(idRow,
yearBudget,
nameBudget,
detailsBudget,
urlSummaryPDF,
everything()) %>%
fill(yearBudget) %>%
fill(detailsBudget) %>%
fill(nameBudget) %>%
fill(urlSummaryPDF) %>%
select(-idRow)
data
}
.dtic_historic_budgets <-
function(url = "https://budget.dtic.mil/previous_reports.html") {
page <- read_html(url)
df_names <- .dictionary_dtic_budget_names()
data <-
1:9 %>%
map_dfr(function(x) {
css <-
glue("tr+ tr td:nth-child({x}) a") %>% as.character()
nodes <-
page %>% html_nodes(css)
text <- nodes %>% html_text()
tibble(
idColumn = x,
text = nodes %>% html_text(),
slug = nodes %>% html_attr("href"),
url = str_c("https://budget.dtic.mil/", slug, sep = "")
) %>%
filter(text != "") %>%
mutate(idRowColumn = 1:n())
}) %>%
left_join(df_names, by = "idColumn") %>%
select(idColumn, names(df_names), everything()) %>%
arrange(idRowColumn) %>%
mutate(idRow = 1:n()) %>%
select(idRow, idRowColumn, everything())
df_base <-
data %>%
filter(nameFile == "Summary Request") %>%
select(idRow, nameColumn, text, slug, url) %>%
separate(
slug,
into = c("base", "yearBudget"),
fill = "right",
sep = "/"
) %>%
mutate(yearBudget = yearBudget %>% substr(1, 6) %>% parse_number(),
idRow = 1 + idRow) %>%
select(idRow,
yearBudget,
nameBudget = text,
urlSummaryPDF = url) %>%
separate(
nameBudget,
into = c("nameBudget", "detailsBudget"),
fill = "right",
extra = "merge",
sep = "\\("
) %>%
mutate(detailsBudget = detailsBudget %>% str_remove_all("\\)")) %>%
mutate(nameBudget = nameBudget %>% str_remove_all("FY[0-9][0-9][0-9][0-9]|[0-9][0-9]MB")) %>%
mutate_if(is.character,
str_squish) %>%
.munge_data()
df_other <-
data %>%
filter(nameFile != "Summary Request")
data <-
df_other %>%
select(idRow, nameFile, nameColumn, typeFile, text, url) %>%
mutate(sizeFileKB = case_when(
text %>% str_detect("KB") ~ parse_number(text),
TRUE ~ parse_number(text) * 1000
)) %>%
select(-text) %>%
left_join(df_base, by = "idRow") %>%
select(idRow,
yearBudget,
nameBudget,
detailsBudget,
urlSummaryPDF,
everything()) %>%
fill(yearBudget) %>%
fill(detailsBudget) %>%
fill(nameBudget) %>%
fill(urlSummaryPDF) %>%
select(-idRow) %>%
mutate(urlHistoricBudgets = url)
data
}
.dictionary_dtic_budget_urls <-
function() {
fread("https://asbcllc.com/r_packages/govtrackR/data/dtic_budget_urls.csv.tz",showProgress = FALSE) %>%
as_tibble()
}
#' DTIC Budget URL Dictionary
#'
#' This returns URL data for DTIC
#' budgets from 2007 to 2020
#'
#' @return
#' @export
#'
#' @examples
#' dictionary_dtic_budget_urls()
dictionary_dtic_budget_urls <-
function() {
.tt <- memoise::memoise(.dictionary_dtic_budget_urls)
data <- .tt()
data <-
data %>%
left_join(
data %>% distinct(yearBudget) %>% filter(yearBudget == max(yearBudget)) %>%
mutate(isBudgetCurrentYear = T),
by = "yearBudget"
) %>%
select(yearBudget, isBudgetCurrentYear, everything()) %>%
mutate(isBudgetCurrentYear = case_when(isBudgetCurrentYear %>% is.na() ~ F,
T ~ isBudgetCurrentYear))
data
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.