# Strip contract identifiers that appear inside `nameContract`.
#
# Every distinct, non-missing, non-"0" value found in any column whose name
# matches "idContract" is removed from `nameContract`; the result is then
# whitespace-squished and leftover " ," fragments are dropped.
#
# data: a data frame. It is returned untouched unless it has both a
#   `nameContract` and an `idContract` column.
# Returns `data` with `nameContract` cleaned.
.fix_contract_name <-
  function(data) {
    if (!(data %>% hasName("nameContract") && data %>% hasName("idContract"))) {
      return(data)
    }
    contract_ids <-
      data %>%
      select(matches("idContract")) %>%
      gather(column, contract) %>%
      distinct(contract) %>%
      filter(!is.na(contract)) %>%
      filter(contract != "0") %>%
      pull(contract) %>%
      unique() %>%
      as.character()
    # An empty id would become an empty regex alternative that matches
    # everywhere and shadows the alternatives listed after it.
    contract_ids <- contract_ids[nchar(contract_ids) > 0]
    # Longest ids first: regex alternation takes the first alternative that
    # matches, so a short id must not pre-empt a longer id it is a prefix of.
    contract_ids <-
      contract_ids[order(nchar(contract_ids), decreasing = TRUE)]
    strip_ids <- function(x) {
      # With no usable ids the joined pattern would be "", which
      # str_remove_all() does not support, so the removal step is skipped.
      if (length(contract_ids) > 0) {
        id_pattern <-
          contract_ids %>%
          # Escape every non-word character so ids are matched literally.
          str_replace_all("(\\W)", "\\\\\\1") %>%
          str_c(collapse = "|")
        x <- x %>% str_remove_all(id_pattern)
      }
      x %>% str_squish() %>% str_remove_all("\\ ,")
    }
    data %>%
      mutate(nameContract = strip_ids(nameContract))
  }
# Strip contract identifiers that appear inside `descriptionObligation`.
#
# Every distinct, non-missing, non-"0" value found in any column whose name
# matches "idContract" is removed from `descriptionObligation`; the result is
# then whitespace-squished and leftover " ," fragments are dropped.
#
# data: a data frame. It is returned untouched unless it has both an
#   `idContract` and a `descriptionObligation` column.
# Returns `data` with `descriptionObligation` cleaned.
.fix_contract_description <-
  function(data) {
    if (!(data %>% hasName("idContract") &&
      data %>% hasName("descriptionObligation"))) {
      return(data)
    }
    contract_ids <-
      data %>%
      select(matches("idContract")) %>%
      gather(column, contract) %>%
      distinct(contract) %>%
      filter(!is.na(contract)) %>%
      filter(contract != "0") %>%
      pull(contract) %>%
      unique() %>%
      as.character()
    # An empty id would become an empty regex alternative that matches
    # everywhere and shadows the alternatives listed after it.
    contract_ids <- contract_ids[nchar(contract_ids) > 0]
    # Longest ids first: regex alternation takes the first alternative that
    # matches, so a short id must not pre-empt a longer id it is a prefix of.
    contract_ids <-
      contract_ids[order(nchar(contract_ids), decreasing = TRUE)]
    strip_ids <- function(x) {
      # With no usable ids the joined pattern would be "", which
      # str_remove_all() does not support, so the removal step is skipped.
      if (length(contract_ids) > 0) {
        id_pattern <-
          contract_ids %>%
          # Escape every non-word character so ids are matched literally.
          str_replace_all("(\\W)", "\\\\\\1") %>%
          str_c(collapse = "|")
        x <- x %>% str_remove_all(id_pattern)
      }
      x %>% str_squish() %>% str_remove_all("\\ ,")
    }
    data %>%
      mutate(descriptionObligation = strip_ids(descriptionObligation))
  }
#' Select Key Columns
#'
#' Keeps every logical and numeric column plus any column whose name matches
#' one of `key_character_cols`, preserving the original column order, and then
#' drops any column whose name matches one of `remove_columns`.
#'
#' @param data A data frame.
#' @param key_character_cols Character vector of regular expressions. The
#'   entries are joined with `|` and matched against the column names of
#'   `data`; matching columns are kept.
#' @param remove_columns Character vector of regular expressions. The entries
#'   are joined with `|` and columns whose names match are dropped after the
#'   keep step. Nothing is dropped when this has length zero.
#'
#' @return `data` restricted to the selected columns, in their original order.
#' @export
#'
#' @examples
#' df <- data.frame(idContract = "A1", amountContract = 10, other = "x")
#' select_key_columns(df)
select_key_columns <-
  function(data,
           key_character_cols = c(
             "^is[A-Z]",
             "^has[A-Z]",
             "^amount[A-Z]",
             "^count[A-Z]",
             "^pct[A-Z]",
             "percent[A-Z]",
             "^id[A-Z]",
             "^code[A-Z]",
             "typeOrganization",
             "cage",
             "city",
             "Vendor",
             "ProductService",
             "NAICS",
             "description",
             "nameContract",
             "Department",
             "Agency",
             "Office",
             "date[A-Z]",
             "datetime[A-Z]",
             "email",
             "transaction"
           ),
           remove_columns = c(
             "slugOffice",
             "slugDepartment",
             "slugAgency",
             "namesVendor",
             "^namesVendor",
             "^fax[A-Z]"
           )) {
    original_order <- names(data)

    # Columns kept because of their type.
    logical_names <- names(select_if(data, is.logical))
    numeric_names <- names(select_if(data, is.numeric))
    typed_names <- c(logical_names, numeric_names)
    if (length(typed_names) == 0) {
      typed_names <- NULL
    }

    # Columns kept because their name matches one of the key patterns.
    key_pattern <- str_c(key_character_cols, collapse = "|")
    pattern_names <- names(select(data, matches(key_pattern)))

    # Union of both sets, re-expressed in the original column order.
    kept_names <- names(select(data, one_of(typed_names, pattern_names)))
    ordered_names <- original_order[original_order %in% kept_names]
    data <- select(data, one_of(ordered_names))

    if (length(remove_columns) > 0) {
      drop_pattern <- str_c(remove_columns, collapse = "|")
      data <- select(data, -matches(drop_pattern))
    }
    data
  }
#' Clean and standardize a data frame (lite version)
#'
#' Applies name-driven cleaning to the columns of `data`: character columns
#' are converted to numbers, dates, date-times or logicals according to their
#' name prefix, text is case-normalized and stripped of quotes, blanks become
#' `NA`, rows are de-duplicated, and (unless `no_extra` is `TRUE`) the result
#' is passed through the package's enrichment helpers.
#'
#' @param data A data frame.
#' @param parse_dates If `TRUE`, character columns whose names start with
#'   `datetime` are parsed with [lubridate::ymd_hms()], and character columns
#'   matching `^date[A-Z]|^date_` are parsed with [lubridate::ymd()], falling
#'   back to [lubridate::mdy()] when the first such column cannot be read as
#'   year-month-day in any row.
#' @param clean_address If `TRUE`, `data` is passed through
#'   `build_address(return_message = FALSE)`.
#' @param unformat If `FALSE`, amount, percent and count-like columns are
#'   wrapped in display formats (currency, percent, comma). If `TRUE`, every
#'   numeric column is coerced with `as.numeric()` at the end.
#' @param no_extra If `TRUE`, return right after the core cleaning and row
#'   de-duplication, skipping the DUNS/vendor `NA` handling and all
#'   enrichment helpers.
#' @param exclude_bloat Forwarded to `resolve_listed_duns()`.
#' @param snake_names If `TRUE`, column names are converted with
#'   `janitor::clean_names(case = "snake")`.
#'
#' @return The cleaned data frame.
#' @export
munge_lite <-
  function(data,
           parse_dates = TRUE,
           clean_address = FALSE,
           unformat = TRUE,
           no_extra = FALSE,
           exclude_bloat = TRUE,
           snake_names = FALSE) {
    # Snapshot of column name -> class, taken before any conversion. All the
    # `*_cols` vectors below are derived from this snapshot.
    col_classes <-
      data %>% map_chr(function(x) {
        x %>% class() %>% str_c(collapse = "|")
      })
    tbl_class <-
      tibble(column = names(col_classes), class = col_classes)
    tbl_class_char <- tbl_class %>% filter(class == "character")

    # Drop the slugDUNS columns when at least three slugDUNS*/idDUNS* columns
    # are present.
    slug_duns_all <-
      .pull_columns(data = tbl_class, params = c("^slugDUNS", "idDUNS"))
    has_both_duns <- slug_duns_all %>% str_detect("id|slug") %>% sum() >= 3
    if (has_both_duns) {
      data <- data %>%
        select(-matches("slugDUNS"))
    }

    datetime_cols <-
      .pull_columns(data = tbl_class_char, params = "^datetime")
    date_cols <-
      .pull_columns(data = tbl_class_char, params = "^date[A-Z]|^date_")

    # Descriptions: remove parentheses.
    desc_cols <-
      .pull_columns(data = tbl_class_char, params = "^description")
    if (length(desc_cols) > 0) {
      data <- data %>%
        mutate_at(desc_cols,
                  list(function(x) {
                    x %>% str_remove_all("\\(|\\)")
                  }))
    }

    logical_cols <-
      .pull_columns(data = tbl_class_char, params = "^is[A-Z]|^has[A-Z]|^is[0-9]|^has[0-9]")
    numeric_cols <-
      .pull_columns(data = tbl_class_char, params = "^ratio|^area|^size|^count[A-Z]|numberTransaction|Interval|numberRecord")

    # Amounts stored as text -> numbers.
    amount_cols <-
      .pull_columns(data = tbl_class_char, params = "^amount")
    if (length(amount_cols) > 0) {
      data <-
        data %>%
        mutate_at(amount_cols,
                  list(function(x) {
                    x %>% parse_number()
                  }))
    }
    amt_num <-
      data %>%
      select_if(is.numeric) %>%
      select(matches("^amount")) %>%
      names()

    # Officer compensation above 100,000,000 is blanked out.
    comp_cols <-
      .pull_columns(data = tbl_class_char, params = "^amountOfficerHighCompensat")
    if (length(comp_cols) > 0) {
      data <- data %>%
        mutate_at(comp_cols, as.numeric) %>%
        mutate_at(comp_cols, list(function(x) {
          case_when(x > 100000000 ~ NA_real_,
                    TRUE ~ x)
        }))
    }
    if (length(amt_num) > 0 && !unformat) {
      data <-
        data %>%
        mutate_at(amt_num,
                  list(function(x) {
                    x %>% formattable::currency(digits = 0)
                  }))
    }

    # Percent columns stored as text -> numbers, optionally display-formatted.
    pct_cols <-
      .pull_columns(data = tbl_class_char, params = c("percent", "pct"))
    if (length(pct_cols) > 0) {
      data <-
        data %>%
        mutate_at(pct_cols, list(function(x) {
          x %>% as.numeric()
        }))
    }
    pct_cols <-
      data %>%
      select_if(is.numeric) %>%
      select(matches("^percent|^pct")) %>%
      names()
    if ((length(pct_cols) > 0) && !unformat) {
      data <-
        data %>%
        mutate_at(pct_cols, list(function(x) {
          x %>% percent(digits = 2)
        }))
    }

    if (length(datetime_cols) > 0 && parse_dates) {
      data <-
        data %>%
        mutate_at(datetime_cols, list(function(x) {
          x %>% lubridate::ymd_hms()
        }))
    }
    if (length(date_cols) > 0 && parse_dates) {
      # Probe the first date column: if year-month-day parsing fails for every
      # row, treat all date columns as month-day-year. The row-count guard
      # keeps a zero-row input from evaluating `0 / 0 == 1` (NA) inside `if`.
      data_rows <- nrow(data)
      na_ymd <-
        data %>%
        pull(date_cols[1]) %>%
        lubridate::ymd(quiet = TRUE) %>%
        is.na() %>%
        sum()
      ymd_failed_everywhere <- data_rows > 0 && na_ymd == data_rows
      if (ymd_failed_everywhere) {
        data <- data %>%
          mutate_at(date_cols, list(function(x) {
            x %>% lubridate::mdy()
          }))
      } else {
        data <-
          data %>%
          mutate_at(date_cols, list(function(x) {
            x %>% lubridate::ymd()
          }))
      }
    }

    # Flag columns stored as text -> logical.
    if (length(logical_cols) > 0) {
      data <-
        data %>% mutate_at(logical_cols,
                           list(function(x) {
                             case_when(
                               str_to_upper(x) %in% c("Y", "YES") ~ TRUE,
                               str_to_upper(x) %in% c("N", "NO", "F") ~ FALSE,
                               str_to_upper(x) == "TRUE" ~ TRUE,
                               TRUE ~ as.logical(x)
                             )
                           }))
    }
    if (length(numeric_cols) > 0) {
      data <-
        data %>%
        mutate_at(numeric_cols, list(function(x) {
          x %>% as.numeric()
        }))
    }
    num_cols <-
      data %>%
      select(matches("^ratio|^area|^size|^count[A-Z]|numberTransaction|Interval|numberRecord")) %>%
      select(-matches("country")) %>%
      names()
    if (length(num_cols) > 0 && !unformat) {
      data <- data %>% mutate_at(num_cols, list(function(x) {
        x %>% comma(digits = 0)
      }))
    }

    # Case normalization: url/email columns lower-cased, every other character
    # column upper-cased except the excluded name patterns.
    to_lower_names <-
      .pull_columns(data = tbl_class_char, params = c("^url|email"))
    to_upper_names <-
      data %>%
      select_if(is.character) %>%
      select(
        -matches(
          "^url|email|transactionLastModifiedBy|transactionClosedBy|^json|slugKey|^item|^parent"
        )
      ) %>% names()
    to_upper_names <- to_upper_names %>% discard(function(x) {
      x %in% to_lower_names
    })
    if (length(to_lower_names) > 0) {
      data <- data %>% mutate_at(to_lower_names, str_to_lower)
    }
    if (length(to_upper_names) > 0) {
      data <- data %>% mutate_at(to_upper_names, str_to_upper)
    }

    data <-
      .add_name(data = data)
    if (clean_address) {
      data <-
        data %>%
        build_address(return_message = FALSE)
    }

    # Empty strings -> NA.
    data <-
      data %>%
      mutate_if(is.character,
                list(function(x) {
                  case_when(x == "" ~ NA_character_,
                            TRUE ~ x)
                }))

    # Selected datetime columns are reduced to dates and renamed
    # datetime* -> date*.
    to_date_names <-
      c(
        "datetimeContractSigned",
        "datetimeContractEffective",
        "datetimeContractCompletionCurrent",
        "datetimeContractCompletionUltimate",
        "datetimeRegistration",
        "datetimeRenewal"
      )
    to_date <- names(data)[names(data) %in% to_date_names]
    if (length(to_date) > 0) {
      data <- data %>% mutate_at(to_date, as.Date)
      names(data)[names(data) %in% to_date] <-
        names(data)[names(data) %in% to_date] %>% str_replace_all("datetime", "date")
    }

    has_contract_ob <-
      names(data) %in% c("amountContract", "amountObligation") %>% sum(na.rm = TRUE) == 2
    if (has_contract_ob) {
      data <- data %>%
        mutate(isContractObligationChanged = amountContract != amountObligation)
    }
    if (data %>% hasName("dateContractSigned")) {
      data <- data %>%
        mutate(yearObligation = dateContractSigned %>% year())
    }
    if (data %>% hasName("nameAwardeeAddress")) {
      data <- data %>%
        mutate(nameAwardeeAddress = str_to_upper(nameAwardeeAddress))
      df_awardees <-
        data$nameAwardeeAddress %>% unique() %>%
        .munge_awardees()
      data <-
        data %>%
        left_join(df_awardees, by = "nameAwardeeAddress")
    }

    # Collapse whitespace runs, force ASCII, and drop the " \032 " sequences
    # left in the text afterwards.
    data <-
      data %>%
      mutate_if(is.character,
                list(function(x) {
                  gsub("\\s+", " ", x) %>% stri_enc_toascii() %>% str_remove_all(" \032 ")
                }))

    # "NAME (SLUG)" -> nameAgencyAward + slugAgencyAward.
    if (data %>% hasName("nameAgencyAward")) {
      data <-
        data %>%
        separate(
          nameAgencyAward,
          sep = "\\(",
          fill = "right",
          into = c("nameAgencyAward", "slugAgencyAward"),
          extra = "merge"
        ) %>%
        mutate(slugAgencyAward = slugAgencyAward %>% gsub("\\)|\\(", "" , .) %>% str_squish())
    }
    data <-
      data %>%
      mutate_if(is.character, list(function(x) {
        x %>% str_remove_all('"')
      }))
    data <-
      .fix_contract_id(data = data)
    # Drop rows whose idAgencyAward holds the header text.
    if (data %>% hasName("idAgencyAward")) {
      data <- data %>% filter(idAgencyAward != "CONTRACTING AGENCY ID")
    }
    data <-
      distinct(data)
    if (no_extra) {
      return(data)
    }

    # DUNS value 0 -> NA. `replace()` keeps each column's own type, so this
    # also works when an idDUNS column is character or integer.
    duns_cols <-
      data %>% select(matches("idDUNS")) %>% names()
    if (length(duns_cols) > 0) {
      data <- data %>%
        mutate_at(duns_cols,
                  list(function(x) {
                    replace(x, which(x == 0), NA)
                  }))
    }
    v_c <-
      data %>% select(matches("nameVendor|nameContractor")) %>% names()
    if (length(v_c) > 0) {
      data <- data %>%
        mutate_at(v_c, list(function(x) {
          if_else(x == "NO DATA FROM D AND B", NA_character_, x)
        }))
    }

    # Enrichment helpers defined elsewhere in the package, applied in order.
    data <-
      .fix_duns(data)
    data <-
      .add_budget_year(data)
    data <-
      add_department_codes(data = data)
    data <-
      .add_dod_type(data = data)
    data <-
      .fix_foreign_reference(data = data)
    data <-
      .fix_sam_exceptions(data = data)
    data <-
      .add_analysis_contract(data = data)
    data <-
      .add_agency_cgacs(data = data)
    data <-
      resolve_listed_duns(data = data, exclude_bloat = exclude_bloat)
    data <-
      .add_original_dates(data = data)
    data <-
      .allocate_federal_accounts(data = data)
    data <-
      .guess_duns_type(data = data)

    if (unformat) {
      data <-
        data %>%
        mutate_if(is.numeric, as.numeric)
    }
    if (snake_names) {
      data <-
        data %>%
        janitor::clean_names(case = "snake")
    }
    data
  }
# Replace short contract ids with the IDV contract id.
#
# When `data` has both `idContract` and `idContractIDV`, it is first passed
# through `.remove_na()`; then, for rows where `idContractIDV` is present and
# `idContract` is at most 4 characters long, `idContract` takes the value of
# `idContractIDV`.
#
# data: a data frame; returned untouched when either column is missing.
# Returns `data` with `idContract` updated.
.fix_contract_id <-
  function(data) {
    needed <- c("idContract", "idContractIDV")
    if (!all(hasName(data, needed))) {
      return(data)
    }

    data <- .remove_na(data = data)

    # Re-checked because `.remove_na()` ran in between.
    # NOTE(review): presumably it can drop columns - confirm.
    if (!hasName(data, "idContractIDV")) {
      return(data)
    }

    mutate(
      data,
      idContract = case_when(
        !is.na(idContractIDV) & nchar(idContract) <= 4 ~ idContractIDV,
        TRUE ~ idContract
      )
    )
  }
# Return the entries of `data$column` that match any of `params`.
#
# data: a data frame with a character column named `column`.
# params: character vector of regular expressions; joined with "|" into a
#   single pattern.
# Returns the matching values of `column` as a vector.
.pull_columns <-
  function(data, params) {
    name_regex <- str_c(params, collapse = "|")
    matched_rows <- filter(data, str_detect(column, name_regex))
    pull(matched_rows, column)
  }
# Full cleaning pipeline for a data frame.
#
# Applies name-driven cleaning to the columns of `data`: character columns are
# converted to numbers, dates, date-times or logicals according to their name
# prefix; registrant and foreign-principal names are split into name/detail
# and classified as ENTITY or PERSON; text is case-normalized; blanks become
# NA; rows are de-duplicated; and the result is passed through the package's
# enrichment helpers.
#
# data: a data frame.
# parse_dates: if TRUE, character `datetime*` columns are parsed with
#   ymd_hms() and character `date[A-Z]*` / `date_*` columns with ymd(),
#   falling back to mdy() when the first such column cannot be read as
#   year-month-day in any row.
# clean_address: if TRUE, `data` is passed through
#   build_address(return_message = FALSE).
# unformat: if FALSE, amount, percent and count-like columns are wrapped in
#   display formats; if TRUE, every numeric column is coerced with
#   as.numeric() at the end.
# exclude_bloat: forwarded to resolve_listed_duns().
# snake_names: if TRUE, column names are converted with
#   janitor::clean_names(case = "snake").
# Returns the cleaned data frame.
.munge_data <-
  function(data,
           parse_dates = TRUE,
           clean_address = FALSE,
           unformat = TRUE,
           exclude_bloat = TRUE,
           snake_names = FALSE) {
    # Snapshot of column name -> class, taken before any conversion. All the
    # `*_cols` vectors below are derived from this snapshot.
    col_classes <-
      data %>% map_chr(function(x) {
        x %>% class() %>% str_c(collapse = "|")
      })
    tbl_class <-
      tibble(column = names(col_classes), class = col_classes)
    tbl_class_char <- tbl_class %>% filter(class == "character")

    # Drop the slugDUNS columns when at least three slugDUNS*/idDUNS* columns
    # are present.
    slug_duns_all <-
      .pull_columns(data = tbl_class, params = c("^slugDUNS", "idDUNS"))
    has_both_duns <- slug_duns_all %>% str_detect("id|slug") %>% sum() >= 3
    if (has_both_duns) {
      data <- data %>%
        select(-matches("slugDUNS"))
    }
    datetime_cols <-
      .pull_columns(data = tbl_class_char, params = "^datetime")
    date_cols <-
      .pull_columns(data = tbl_class_char, params = "^date[A-Z]|^date_")

    # Descriptions: remove parentheses.
    desc_cols <-
      .pull_columns(data = tbl_class_char, params = "^description")
    if (length(desc_cols) > 0) {
      data <- data %>%
        mutate_at(desc_cols,
                  list(function(x) {
                    x %>% str_remove_all("\\(|\\)")
                  }))
    }
    if (data %>% hasName("idFARA")) {
      data <- data %>%
        mutate(idFARA = as.numeric(idFARA))
    }
    if (data %>% hasName("codeEIN")) {
      data <-
        data %>%
        mutate(idEIN = as.numeric(codeEIN)) %>%
        select(idEIN, everything())
    }
    if (data %>% hasName("nameInCareOf")) {
      data <- data %>%
        mutate(nameInCareOf = nameInCareOf %>% str_remove_all("^% ") %>%
                 str_remove_all("C/O ") %>% str_squish())
    }
    id_cols <-
      .pull_columns(data = tbl_class_char, params = "^idDUNS|idNAICS|idPRNumber|idAccountingInstallation|idScienceOrEngineering|idCCR")
    logical_cols <-
      .pull_columns(data = tbl_class_char, params = "^is[A-Z]|^has[A-Z]|^is[0-9]|^has[0-9]")
    numeric_cols <-
      .pull_columns(data = tbl_class_char, params = "^ratio|^area|^size|^count[A-Z]|numberTransaction|Interval|numberRecord")

    # Zip codes are cut to their first five characters.
    if (data %>% hasName("zipcodePerformance")) {
      data <-
        data %>%
        mutate(zipcodePerformance = zipcodePerformance %>% substr(1, 5))
    }
    if (data %>% hasName("zipcodeVendor")) {
      data <- data %>%
        mutate(zipcodeVendor = zipcodeVendor %>% substr(1, 5))
    }

    # slugDUNS* columns become numeric idDUNS* columns.
    if (data %>% hasName("slugDUNS")) {
      data <- data %>%
        mutate(idDUNS = as.numeric(slugDUNS)) %>%
        select(-slugDUNS)
    }
    if (data %>% hasName("slugDUNSParent")) {
      data <- data %>%
        mutate(idDUNSParent = as.numeric(slugDUNSParent)) %>%
        select(-slugDUNSParent)
    }

    # Amounts stored as text -> numbers.
    amount_cols <-
      .pull_columns(data = tbl_class_char, params = "^amount")
    if (length(amount_cols) > 0) {
      data <-
        data %>%
        mutate_at(amount_cols,
                  list(function(x) {
                    x %>% parse_number()
                  }))
    }
    amt_num <-
      data %>%
      select_if(is.numeric) %>%
      select(matches("^amount")) %>%
      names()

    # Officer compensation above 100,000,000 is blanked out.
    comp_cols <-
      .pull_columns(data = tbl_class_char, params = "^amountOfficerHighCompensat")
    if (length(comp_cols) > 0) {
      data <- data %>%
        mutate_at(comp_cols, as.numeric) %>%
        mutate_at(comp_cols, list(function(x) {
          case_when(x > 100000000 ~ NA_real_,
                    TRUE ~ x)
        }))
    }
    if (length(amt_num) > 0 && !unformat) {
      data <-
        data %>%
        mutate_at(amt_num,
                  list(function(x) {
                    x %>% formattable::currency(digits = 0)
                  }))
    }

    # Percent columns stored as text -> numbers, optionally display-formatted.
    pct_cols <-
      .pull_columns(data = tbl_class_char, params = c("^percent", "^pct"))
    if (length(pct_cols) > 0) {
      data <-
        data %>%
        mutate_at(pct_cols, list(function(x) {
          x %>% as.numeric()
        }))
    }
    pct_cols <-
      data %>%
      select_if(is.numeric) %>%
      select(matches("percent|pct")) %>%
      names()
    if ((length(pct_cols) > 0) && !unformat) {
      data <-
        data %>%
        mutate_at(pct_cols, list(function(x) {
          x %>% percent(digits = 2)
        }))
    }
    if (length(datetime_cols) > 0 && parse_dates) {
      data <-
        data %>%
        mutate_at(datetime_cols, list(function(x) {
          x %>% lubridate::ymd_hms()
        }))
    }
    if (length(date_cols) > 0 && parse_dates) {
      # Probe the first date column: if year-month-day parsing fails for every
      # row, treat all date columns as month-day-year. The row-count guard
      # keeps a zero-row input from evaluating `0 / 0 == 1` (NA) inside `if`.
      data_rows <- nrow(data)
      na_ymd <-
        data %>%
        pull(date_cols[1]) %>%
        lubridate::ymd(quiet = TRUE) %>%
        is.na() %>%
        sum()
      ymd_failed_everywhere <- data_rows > 0 && na_ymd == data_rows
      if (ymd_failed_everywhere) {
        data <- data %>%
          mutate_at(date_cols, list(function(x) {
            x %>% lubridate::mdy()
          }))
      } else {
        data <-
          data %>%
          mutate_at(date_cols, list(function(x) {
            x %>% lubridate::ymd()
          }))
      }
    }

    # Flag columns stored as text -> logical.
    if (length(logical_cols) > 0) {
      data <-
        data %>% mutate_at(logical_cols,
                           list(function(x) {
                             case_when(
                               str_to_upper(x) %in% c("Y", "YES") ~ TRUE,
                               str_to_upper(x) %in% c("N", "NO", "F") ~ FALSE,
                               str_to_upper(x) == "TRUE" ~ TRUE,
                               TRUE ~ as.logical(x)
                             )
                           }))
    }
    if (length(numeric_cols) > 0) {
      data <-
        data %>%
        mutate_at(numeric_cols, list(function(x) {
          x %>% as.numeric()
        }))
    }
    num_cols <- data %>%
      select(matches("^ratio|^area|^size|^count[A-Z]|numberTransaction|Interval|numberRecord")) %>%
      select(-matches("country")) %>%
      names()
    if (length(num_cols) > 0 && !unformat) {
      data <- data %>% mutate_at(num_cols, list(function(x) {
        x %>% comma(digits = 0)
      }))
    }

    # Build namePerson from the name parts.
    # NOTE(review): when nameFirst, initialMiddle and nameLast are all present,
    # both conditions below are true and the second unite() targets the same
    # `namePerson` column again - confirm whether the three-part name is meant
    # to survive.
    if (names(data) %in% c("nameFirst", "nameLast", "initialMiddle") %>% sum(na.rm = TRUE) == 3) {
      data <- data %>%
        unite(namePerson,
              nameFirst,
              initialMiddle,
              nameLast,
              sep = " ",
              remove = FALSE)
    }
    if (names(data) %in% c("nameFirst", "nameLast") %>% sum(na.rm = TRUE) == 2) {
      data <- data %>%
        unite(namePerson,
              nameFirst,
              nameLast,
              sep = " ",
              remove = FALSE)
    }
    if (length(id_cols) > 0) {
      data <-
        data %>%
        mutate_at(id_cols, list(function(x) {
          x %>% as.numeric()
        }))
    }

    # "STRUCTURE (tax note)" -> typeCorporateStructure + isTaxExemptEntity,
    # where the flag is TRUE unless the parenthetical contains "Not".
    if (data %>% hasName("typeCorporateStructure")) {
      data <- data %>%
        separate(
          typeCorporateStructure,
          into = c("typeCorporateStructure", "isTaxExemptEntity"),
          sep = "\\(",
          fill = "right",
          extra = "merge"
        ) %>%
        mutate(isTaxExemptEntity = !isTaxExemptEntity %>% str_detect("Not"))
    }

    # Case normalization: url/email columns lower-cased, every other character
    # column upper-cased except the excluded name patterns.
    to_lower_names <-
      .pull_columns(data = tbl_class_char, params = c("^url|email"))
    to_upper_names <-
      data %>%
      select_if(is.character) %>%
      select(
        -matches(
          "^url|email|transactionLastModifiedBy|transactionClosedBy|^json|slugKey|^item|^parent"
        )
      ) %>% names()
    to_upper_names <- to_upper_names %>% discard(function(x) {
      x %in% to_lower_names
    })
    if (length(to_lower_names) > 0) {
      data <- data %>% mutate_at(to_lower_names, str_to_lower)
    }
    if (length(to_upper_names) > 0) {
      data <- data %>% mutate_at(to_upper_names, str_to_upper)
    }

    # Day counts derived from the business-start and registration dates.
    if (data %>% hasName("dateBusinessStart")) {
      data <-
        data %>%
        mutate(ageBusinessDays = (Sys.Date() - dateBusinessStart) %>% as.numeric())
    }
    if (data %>% hasName("datetimeRegistration")) {
      data <-
        data %>%
        mutate(lengthGovernmentRelationshipDays = (Sys.Date() - as.Date(datetimeRegistration)) %>% as.numeric())
    }
    if (data %>% hasName("datetimeRegistration") &&
        data %>% hasName("dateBusinessStart")) {
      data <- data %>%
        mutate(
          daysPostFoundingPriorToGovernmentWork = (as.Date(datetimeRegistration) - dateBusinessStart) %>% as.integer()
        )
    }

    # Registrant names: normalize known special cases, split into name and
    # detail on the separators in `change` (or on "(" when the name ends with
    # ")"), then classify each distinct name as ENTITY or PERSON and flip
    # "LAST, FIRST" person names to "FIRST LAST".
    if (data %>% hasName("nameRegistrant")) {
      change <-
        c("D/B/A", " / ", " - ", ", A DIVISION OF ", "\\--", " DBA ") %>% str_c(collapse = "|")
      data <- data %>%
        mutate(
          nameRegistrant =
            nameRegistrant %>%
            str_replace_all("U.S. - EMIRATES", "US-EMIRATES") %>% str_replace_all("VAN FLEET - MEREDITH GROUP",
                                                                                  "VAN FLEET-MEREDITH GROUP") %>%
            str_replace_all(" U.S.-CHINA - SHANTOU", " U.S.-CHINA SHANTOU") %>%
            str_replace_all(
              "NICHOLS - DEZENHALL COMMUNICATIONS MANAGEMENT GROUP, LTD.",
              "NICHOLS-DEZENHALL COMMUNICATIONS MANAGEMENT GROUP, LTD."
            ) %>%
            str_replace_all(
              "U.S. - SOUTHERN SUDAN DEVELOPMENT COMPANY",
              "US SOUTHERN SUDAN DEVELOPMENT COMPANY"
            ) %>%
            # Parentheses are escaped (and optional) so the pattern matches the
            # literal parenthesised name; unescaped they form a regex group
            # and the literal text is never matched.
            str_replace_all(
              "PROTECT \\(?PRUDENT RESIDENTS OPPOSED ELEC. CABLE TRANSMISSION\\)?",
              "The Prudent Residents Opposed to Electric Cable Transmission"
            ) %>% str_to_upper() %>%
            gsub(change, "^", .),
          lastChar = nameRegistrant %>% substr(nchar(nameRegistrant),
                                               nchar(nameRegistrant)),
          nameRegistrant = case_when(
            lastChar == ")" ~ nameRegistrant %>% gsub("\\(", "^", .),
            TRUE ~ nameRegistrant
          )
        ) %>%
        mutate(nameRegistrant = nameRegistrant %>% gsub("\\^^", "#", .)) %>%
        separate(
          nameRegistrant,
          into = c("nameRegistrant", "nameRegistrantDetail"),
          sep = "\\^|#",
          fill = "right",
          extra = "merge"
        ) %>%
        mutate_at(c("nameRegistrant", "nameRegistrantDetail"),
                  function(x) {
                    x %>% str_remove_all("\\)|\\'|F/K/A") %>% str_squish()
                  }) %>%
        select(-lastChar)
      data <- data %>%
        separate(
          nameRegistrant,
          into = c("nameRegistrant", "nr2"),
          extra = "merge",
          fill = "right",
          sep = "\\("
        ) %>%
        mutate(nameRegistrantDetail = case_when(
          is.na(nameRegistrantDetail) ~ nr2,
          TRUE ~ nameRegistrantDetail
        )) %>%
        separate(
          nameRegistrantDetail,
          into = c("nameRegistrantDetail", "nr2"),
          extra = "merge",
          fill = "right",
          sep = "\\("
        ) %>%
        select(-nr2) %>%
        mutate_if(is.character,
                  str_squish)
      # A name is a PERSON only when it has exactly one comma and hits none of
      # the organization keywords.
      df_reg_entities <-
        data %>%
        distinct(nameRegistrant) %>%
        mutate(
          nameRegistrantClean = nameRegistrant %>% str_remove_all("\\.|\\-"),
          countCommas = nameRegistrantClean %>% str_count("\\,"),
          typeRegistrant = case_when(
            countCommas != 1 ~ "ENTITY",
            nameRegistrant %>% str_detect(
              "JETRO|REFORMISTA|SIDERURGICA|\\, S.A.|&|\\, P.A.|REFORMISTA\\ LLC| INC| CORP|LTD|LLP|NEW YORK|LOS ANGELES|PC|CHICAGO|LP|PLLC|PLC|COMPANY| OFFICE | COMMUNICATION | COUNCIL | NATIONAL | GOVERNMENT| FOUNDATION|ORGANIZATION|OFFICE|COMMUNICATIONS|COMMITTEE|PARTNERSHIP|RELATIONS|INFORMATION|AGENCY|BUREAU|COMMISSION|TOURIST|TRADE|REVOLUTIONARY|REVOLUCIONARIO|LIBERACION|MINISTRY|SOCIETIES|DEFENSE|GOVERNMENT|LAW FIRM|GROUP| USA|DELGATION|DINAMICA"
            ) ~ "ENTITY",
            nameRegistrantClean %>%
              str_detect("PARTNERS|\\, LLC|\\, PC|STRATEGIES|\\, LP|\\, LLP") ~ "ENTITY",
            TRUE ~ "PERSON"
          )
        ) %>%
        select(-countCommas)
      df_reg_entities <-
        df_reg_entities %>%
        filter(typeRegistrant != "ENTITY") %>%
        separate(
          nameRegistrantClean,
          into = c("nameLast", "nameFirst"),
          extra = "merge",
          fill = "right",
          sep = "\\,"
        ) %>%
        mutate_all(str_squish) %>%
        unite(nameRegistrantClean, nameFirst, nameLast, sep = " ") %>%
        bind_rows(df_reg_entities %>% filter(typeRegistrant == "ENTITY"))
      # Swap in the cleaned name while keeping the original column order.
      data <- data %>%
        left_join(df_reg_entities, by = "nameRegistrant") %>%
        select(-nameRegistrant, nameRegistrant = nameRegistrantClean) %>%
        select(one_of(names(data)), everything())
      data <-
        data %>%
        mutate(nameRegistrantDetail = nameRegistrantDetail %>% str_remove_all('\\"|\\.|\\,'))
    }
    if (data %>% hasName("nameRegistrantEntity")) {
      data <- data %>%
        mutate(
          nameRegistrantEntity =
            nameRegistrantEntity %>%
            gsub("\\)|\\(", "", .) %>%
            str_remove_all('\\"|\\.|\\,|\\- ')
        )
    }

    # Foreign-principal names: same split / classify / flip treatment as the
    # registrant names, with a longer separator and keyword list.
    if (data %>% hasName("nameForeignPrincipal")) {
      change <-
        c(
          "D/B/A",
          "\\ ON BEHALF OF",
          " VIA ",
          "THROUGH ",
          " / ",
          " - ",
          ", A DIVISION OF ",
          "\\--",
          " DBA ",
          " ON BEHALF ",
          " THROUGH ",
          " AS AN AFFILIATE OF "
        ) %>% str_c(collapse = "|")
      data <-
        data %>%
        mutate(
          nameForeignPrincipal =
            nameForeignPrincipal %>%
            gsub(change, "^", .),
          lastChar = nameForeignPrincipal %>% substr(nchar(nameForeignPrincipal),
                                                     nchar(nameForeignPrincipal)),
          nameForeignPrincipal = case_when(
            lastChar == ")" ~ nameForeignPrincipal %>% gsub("\\(", "^", .),
            TRUE ~ nameForeignPrincipal
          )
        ) %>%
        mutate(nameForeignPrincipal = nameForeignPrincipal %>% gsub("\\^^", "#", .)) %>%
        separate(
          nameForeignPrincipal,
          into = c("nameForeignPrincipal", "nameForeignPrincipalDetail"),
          sep = "\\^|\\#",
          fill = "right",
          extra = "merge"
        ) %>%
        mutate_at(c("nameForeignPrincipal", "nameForeignPrincipalDetail"),
                  function(x) {
                    x %>% str_remove_all("\\)|\\'|F/K/A|THROUGH|THROUGH THE|VIA") %>%
                      str_squish()
                  }) %>%
        mutate(
          lastChar = nameForeignPrincipal %>% substr(nchar(nameForeignPrincipal),
                                                     nchar(nameForeignPrincipal)),
          # Drop a trailing comma.
          nameForeignPrincipal = case_when(
            lastChar == "," ~ nameForeignPrincipal %>% substr(1, nchar(nameForeignPrincipal) - 1),
            TRUE ~ nameForeignPrincipal
          ),
          nameForeignPrincipalDetail = nameForeignPrincipalDetail %>% str_remove_all(change)
        ) %>%
        select(-lastChar)
      data <-
        data %>%
        separate(
          nameForeignPrincipal,
          into = c("nameForeignPrincipal", "nr2"),
          sep = "\\(",
          fill = "right",
          extra = "merge"
        ) %>%
        mutate(nameForeignPrincipalDetail = case_when(
          is.na(nameForeignPrincipalDetail) ~ nr2,
          TRUE ~ nameForeignPrincipalDetail
        )) %>%
        mutate_if(is.character,
                  str_squish) %>%
        select(-one_of("nr2"))
      df_reg_entities <-
        data %>%
        distinct(nameForeignPrincipal) %>%
        mutate(
          nameForeignPrincipalClean = nameForeignPrincipal %>% str_remove_all("\\.|\\ - ") %>% gsub("\\s+", " ", .),
          countCommas = nameForeignPrincipal %>% str_count("\\,"),
          typeForeignPrincipal = case_when(
            countCommas != 1 ~ "ENTITY",
            nameForeignPrincipal %>%
              str_detect(
                "SOCIETE|SOJAT|BIDZINA IVANISHVILI|PHILLIMORE|INFOPLAN|SWISSAIR|CORPORACION|CHEMISCHEN|LOTERIA|AMERICANA|LITHUANIAN|PORTUGUESA|DEUTSCHER|ASEA|SELENIA|CARLOFORTE|CLEMENTE|DISTRIBUIDORA|CONGRESS|CORDELEROS|COMINCO|AIRBUS|MANAGING|REFUGEES|ATLANTIC|REFUGEES|ESQUIRE|TEA |NUKLEAR|EDITORIAL|, C. POR A.|AGRUPACION|FEDERACION|COOPERATIVE|HOTEL|UNDERWRITER|ORDNANCE|AIRLINE|HOTEL|, S.C.|DIVISION|STREET|DIVISION|PROMOCION|ORBIS|DIVISION| A.C.|MBH|PROMOTION|OPTRONICS|B.V.|C.A.|COMERCIO|TEKNOLOJILERI|ECONOMIC|IN EXILE|NEWSPAPER|INDUSTRIAL|PHOTO|ITALIANE| AIR |A.O.|DEMOCRAT|HONORABLE|HIS EXCELLENCY|, A. C.|, S.P.A.|BANCO|BANK |GMBH|TRUST|, A.G.|, N.V.|, A.B.|, E.N.|ASOCIACION|\\, S.A.|, A.S.|&|\\, P.A.|REFORMISTA\\ LLC| INC| CORP|LTD|LLP|NEW YORK|LOS ANGELES|PC|CHICAGO|LP|PLLC|PLC|COMPANY| OFFICE | COMMUNICATION | COUNCIL | NATIONAL | GOVERNMENT| FOUNDATION|ORGANIZATION|OFFICE|COMMUNICATIONS|COMMITTEE|PARTNERSHIP|RELATIONS|INFORMATION|AGENCY|BUREAU|COMMISSION|TOURIST|TRADE|REVOLUTIONARY|REVOLUCIONARIO|LIBERACION|MINISTRY|SOCIETIES|DEFENSE|GOVERNMENT|LAW FIRM|GROUP| USA|DELGATION|DINAMICA"
              ) ~ "ENTITY",
            nameForeignPrincipalClean %>%
              str_detect(
                "MONDO|JETRO|CEMEX|DEMOCRATS| AND |PARLIMENT|SECRETARIA|INTERNATIONAL|AMBASSADOR|AUTHORITY|POLITICAL| OF |PREMIER|MINISTER|CONSUL|LIMITED|PRESIDENT|REPUBLIC|PROVINCE|CANDIDATE|EMBASSY|DEVELOPMENT|PARTNERS|\\, LLC|\\, PC|STRATEGIES|\\, LP|\\, LLP"
              ) ~ "ENTITY",
            TRUE ~ "PERSON"
          )
        ) %>%
        select(-countCommas) %>%
        unique()
      # Person names: strip titles/suffix commas, then flip "LAST, FIRST".
      df_p <- df_reg_entities %>%
        filter(typeForeignPrincipal != "ENTITY") %>%
        filter(!is.na(nameForeignPrincipalClean)) %>%
        mutate(
          nameForeignPrincipalClean = nameForeignPrincipalClean %>% str_remove_all(", RT|\\, DIRECTOR") %>%
            str_replace_all("\\, JR", "\\ JR") %>%
            str_replace_all("\\, SR", "\\ SR") %>%
            str_replace_all(", III", "\\ III")
        )
      df_p <- df_p %>%
        filter(nameForeignPrincipalClean %>% str_detect("\\,")) %>%
        separate(
          nameForeignPrincipalClean,
          into = c("nameLast", "nameFirst"),
          sep = "\\,",
          fill = "right",
          extra = "merge"
        ) %>%
        unite(nameForeignPrincipalClean, nameFirst, nameLast, sep = " ") %>%
        bind_rows(df_p %>%
                    filter(!nameForeignPrincipalClean %>% str_detect("\\,"))) %>%
        mutate_all(str_squish)
      df_reg_entities <-
        df_reg_entities %>%
        filter(typeForeignPrincipal == "ENTITY") %>%
        bind_rows(df_p)
      # Swap in the cleaned name while keeping the original column order.
      data <-
        data %>%
        left_join(df_reg_entities, by = "nameForeignPrincipal") %>%
        select(-nameForeignPrincipal, nameForeignPrincipal = nameForeignPrincipalClean) %>%
        select(one_of(names(data)), everything())
      data <-
        data %>%
        mutate(nameForeignPrincipalDetail = nameForeignPrincipalDetail %>% str_remove_all('\\"|\\.|\\,'))
    }

    # Country names are normalized in two passes.
    # NOTE(review): the str_detect() patterns containing unescaped parentheses
    # are regular expressions, so they do not match the literal parenthesised
    # text; those values are only caught by the `==` and "BURMA" rules in the
    # second pass. Left as-is because changing them would change which label
    # wins - confirm intended outputs.
    if (data %>% hasName("countryForeignPrincipal")) {
      data <-
        data %>%
        mutate(
          countryForeignPrincipal = case_when(
            countryForeignPrincipal == "CONGO (KINSHASA) (ZAIRE)" ~ "ZAIRE",
            countryForeignPrincipal == "CONGO (BRAZZAVILLE)" ~ "REPUBLIC OF THE CONGO",
            countryForeignPrincipal %>% str_detect("COTE D'IVOIRE") ~ "IVORY COAST" ,
            countryForeignPrincipal == "KOREA, DEMOCRATIC PEOPLES REPUBLIC OF" ~ "NORTH KOREA",
            countryForeignPrincipal == "KOREA, REPUBLIC OF" ~ "SOUTH KOREA",
            countryForeignPrincipal %>% str_detect("MYANMAR (BURMA)") ~ "MYANMAR",
            countryForeignPrincipal %>% str_detect("ST. CHRISTOPHER (ST. KITTS) & NEVIS") ~ "ST. KITTS & NEVIS",
            countryForeignPrincipal %>% str_detect("TIMOR-LESTE (EAST TIMOR)") ~ "EAST TIMOR",
            countryForeignPrincipal %>% str_detect("SAMOA") ~ "SAMOA",
            TRUE ~ countryForeignPrincipal
          )
        ) %>%
        mutate(
          countryForeignPrincipal = case_when(
            countryForeignPrincipal == "CONGO (KINSHASA) (ZAIRE)" ~ "ZAIRE",
            countryForeignPrincipal == "CONGO (BRAZZAVILLE)" ~ "REPUBLIC OF THE CONGO",
            countryForeignPrincipal %>% str_detect("COTE D'IVOIRE") ~ "IVORY COAST" ,
            countryForeignPrincipal %>% str_detect("KOREA DEMOCRATIC PEOPLES REPUBLIC OF") ~ "NORTH KOREA",
            countryForeignPrincipal == "KOREA REPUBLIC OF" ~ "SOUTH KOREA",
            countryForeignPrincipal %>% str_detect("MYANMAR (BURMA)") ~ "MYANMAR",
            countryForeignPrincipal %>% str_detect("ST. CHRISTOPHER (ST. KITTS) & NEVIS") ~ "ST. KITTS & NEVIS",
            countryForeignPrincipal %>% str_detect("TIMOR-LESTE (EAST TIMOR)") ~ "EAST TIMOR",
            countryForeignPrincipal == "TIMOR-LESTE (EAST TIMOR)" ~ "EAST TIMOR",
            countryForeignPrincipal %>% str_detect("SAMOA") ~ "SAMOA",
            countryForeignPrincipal %>% str_detect("BURMA") ~ "MYANMAR",
            countryForeignPrincipal %>% str_detect("GERMAN DEMOCRATIC REPUBLIC") ~ "EAST GERMANY",
            countryForeignPrincipal %>% str_detect("GERMANY FEDERAL REPUBLIC OF") ~ "WEST GERMANY",
            countryForeignPrincipal == "YEMEN PEOPLES DEMOCRATIC REPUBLIC OF YEMEN" ~ "YEMEN",
            countryForeignPrincipal == "ST. CHRISTOPHER (ST. KITTS) & NEVIS" ~ "ST. CHRISTOPHER & NEVIS",
            TRUE ~ countryForeignPrincipal
          )
        )
    }
    data <-
      .add_name(data = data)
    if (clean_address) {
      data <-
        data %>%
        build_address(return_message = FALSE)
    }

    # Empty strings -> NA.
    data <-
      data %>%
      mutate_if(is.character,
                list(function(x) {
                  case_when(x == "" ~ NA_character_,
                            TRUE ~ x)
                }))

    # Selected datetime columns are reduced to dates and renamed
    # datetime* -> date*.
    to_date_names <-
      c(
        "datetimeContractSigned",
        "datetimeContractEffective",
        "datetimeContractCompletionCurrent",
        "datetimeContractCompletionUltimate",
        "datetimeRegistration",
        "datetimeRenewal"
      )
    to_date <- names(data)[names(data) %in% to_date_names]
    if (length(to_date) > 0) {
      data <- data %>% mutate_at(to_date, as.Date)
      names(data)[names(data) %in% to_date] <-
        names(data)[names(data) %in% to_date] %>% str_replace_all("datetime", "date")
    }
    has_contract_ob <-
      names(data) %in% c("amountContract", "amountObligation") %>% sum(na.rm = TRUE) == 2
    if (has_contract_ob) {
      data <- data %>%
        mutate(isContractObligationChanged = amountContract != amountObligation)
    }
    if (data %>% hasName("dateContractSigned")) {
      data <- data %>%
        mutate(yearObligation = dateContractSigned %>% year())
    }
    if (data %>% hasName("nameAwardeeAddress")) {
      data <- data %>%
        mutate(nameAwardeeAddress = str_to_upper(nameAwardeeAddress))
      df_awardees <-
        data$nameAwardeeAddress %>% unique() %>%
        .munge_awardees()
      data <-
        data %>%
        left_join(df_awardees, by = "nameAwardeeAddress")
    }

    # Collapse whitespace runs, force ASCII, and drop the " \032 " sequences
    # left in the text afterwards.
    data <-
      data %>%
      mutate_if(is.character,
                list(function(x) {
                  gsub("\\s+", " ", x) %>% stri_enc_toascii() %>% str_remove_all(" \032 ")
                }))

    # "NAME (SLUG)" -> nameAgencyAward + slugAgencyAward.
    if (data %>% hasName("nameAgencyAward")) {
      data <-
        data %>%
        separate(
          nameAgencyAward,
          sep = "\\(",
          fill = "right",
          into = c("nameAgencyAward", "slugAgencyAward"),
          extra = "merge"
        ) %>%
        mutate(slugAgencyAward = slugAgencyAward %>% gsub("\\)|\\(", "" , .) %>% str_squish())
    }
    # Apply rm_bracket to every character column except url*/json* ones.
    bracket_cols <-
      data %>% select_if(is.character) %>% select(-matches("^url|^json")) %>% names()
    if (length(bracket_cols) > 0) {
      data <-
        data %>%
        mutate_at(bracket_cols,
                  rm_bracket)
    }
    data <-
      data %>%
      mutate_if(is.character, list(function(x) {
        x %>% str_remove_all('"')
      }))

    # Individual column type / case fixes.
    org_names <-
      data %>% select(matches("idOrganization")) %>% names()
    if (length(org_names) > 0) {
      data <-
        data %>%
        mutate_at(org_names, as.integer)
    }
    if (data %>% hasName("idOrganizationLocation")) {
      data <-
        data %>%
        mutate(idOrganizationLocation = as.integer(idOrganizationLocation))
    }
    if (data %>% hasName("numberVersion")) {
      data <-
        data %>%
        mutate(numberVersion = as.integer(numberVersion))
    }
    if (data %>% hasName("idNotice")) {
      data <-
        data %>%
        mutate(idNotice = str_to_lower(idNotice))
    }
    if (data %>% hasName("idDescription")) {
      data <-
        data %>%
        mutate(idDescription = str_to_lower(idDescription))
    }
    if (data %>% hasName("nameCommand")) {
      data <-
        data %>%
        mutate(nameCommand = case_when(nameCommand == "N/A" ~ NA_character_,
                                       TRUE ~ nameCommand))
    }
    data <-
      .fix_contract_id(data = data)
    # Drop rows whose idAgencyAward holds the header text.
    if (data %>% hasName("idAgencyAward")) {
      data <- data %>% filter(idAgencyAward != "CONTRACTING AGENCY ID")
    }
    data <-
      unique(data)

    # DUNS value 0 -> NA. `replace()` keeps each column's own type, so this
    # also works when an idDUNS column is character or integer.
    duns_cols <-
      data %>% select(matches("idDUNS")) %>% names()
    if (length(duns_cols) > 0) {
      data <- data %>%
        mutate_at(duns_cols,
                  list(function(x) {
                    replace(x, which(x == 0), NA)
                  }))
    }
    v_c <-
      data %>% select(matches("nameVendor|nameContractor")) %>% names()
    if (length(v_c) > 0) {
      data <- data %>%
        mutate_at(v_c, list(function(x) {
          if_else(x == "NO DATA FROM D AND B", NA_character_, x)
        }))
    }

    # Enrichment helpers defined elsewhere in the package, applied in order.
    data <-
      .fix_duns(data)
    data <-
      .add_budget_year(data)
    data <-
      add_department_codes(data = data)
    data <-
      .add_dod_type(data = data)
    data <-
      .fix_foreign_reference(data = data)
    data <-
      .fix_sam_exceptions(data = data)
    data <-
      .add_analysis_contract(data = data)
    data <-
      .add_agency_cgacs(data = data)
    data <-
      resolve_listed_duns(data = data, exclude_bloat = exclude_bloat)
    data <-
      .add_original_dates(data = data)
    data <-
      .allocate_federal_accounts(data = data)
    data <-
      .guess_duns_type(data = data)
    if (unformat) {
      data <-
        data %>%
        mutate_if(is.numeric, as.numeric)
    }
    if (snake_names) {
      data <-
        data %>%
        janitor::clean_names(case = "snake")
    }
    data
  }
.munge_organizations <-
function(data) {
if (!data %>% hasName("nameOrganization")) {
return(data)
}
data <-
data %>%
mutate(
nameOrganization = stri_enc_toutf8(nameOrganization),
nameOrganization = case_when(
nameOrganization %>% str_detect("YALE UNIV NEW HAVEN CT") ~ "YALE UNIVERSITY",
nameOrganization %>% str_detect("AUSTRALIAN NATIONAL UNIVERSITY") ~ "AUSTRALIAN NATIONAL UNIVERSITY",
nameOrganization %>% str_detect("ALBERT EINSTEIN COLLEGE OF MEDICINE") ~ "ALBERT EINSTEIN COLLEGE OF MEDICINE",
nameOrganization %>% str_detect(
"ARIZONA UNIVERSITY OF TUCSON|ARIZONA UNIV BOARD OF REGENTS TUCSON|ARIZONA UNIVERSITY OF TUCSON|ARIZONA, UNIVERSITY OF, TUCSON"
) ~ "UNIVERSITY OF ARIZONA",
nameOrganization %>% str_detect("ASSOCIATION FOR UNMANNED VEHIC") ~ "ASSOCIATION FOR UNMANNED VEHICLE SYSTEMS INTERNATIONAL FOUNDATION INC",
nameOrganization %>% str_detect("BOSTON UNIVERSITY") ~ "BOSTON UNIVERSITY",
nameOrganization %>% str_detect("BAHAMAS MARINE MAMMAL") ~ "BAHAMAS MARINE MAMMAL RESEARCH ORGANIZATION",
nameOrganization %in% c(
"BOARD OF REGENTS OF THE UNIVE",
"BOARD OF REGENTS OF THE UNIVERSITY|BOARD OF REGENTS OF THE UNIVERSITY OF NEBRASKA|UNIVERSITY OF NEBRASKA|BOARD OF REGENTS OF THE UNIVERSITY|BOARD OF REGENTS OF THE UNIVERSITY OF NEBRASKA|UNIVERSITY OF NEBRSKA"
) ~ "UNIVERSITY OF NEBRASKA",
nameOrganization == "CABI" ~ "CAB INTERNATIONAL",
nameOrganization %>% str_detect("COLLEGE OF WILLIAM") ~ "COLLEGE OF WILLIAM AND MARY",
nameOrganization %>% str_detect("CALIFORNIA STATE UNIVERSITY AUXILIA") ~ "CALIFORNIA STATE UNIVERSITY AUXILIARY SERVICES INC",
nameOrganization %>% str_detect(
"CALIFORNIA POLYTECHNIC|CALIFORNIA STATE POLYTECHNIC UNIVERSITY POMONA"
) ~ "CALIFORNIA POLYTECHNIC STATE UNIVERSITY",
nameOrganization %>% str_detect("CTRE NAT DE LA RECHERCHE") ~ "CTRE NAT DE LA RECHERCHE SCIENTIFIQUE",
nameOrganization %>% str_detect("CLEAR PATH FOR VETERANS") ~ "CLEAR PATH FOR VETERANS NEW ENGLAND INC",
nameOrganization %>% str_detect("CENTER FOR STRATEGIC AND INTERNATIONAL STUDIES INC.") ~ "CENTER FOR STRATEGIC AND INTERNATIONAL STUDIES",
nameOrganization %>% str_detect("CENTRE FOR MARITIME RESEARCH") ~ "CENTRE FOR MARITIME RESEARCH & EXPERIMENTATION",
nameOrganization %>% str_detect("CENTRO DE INVESTIGACION EN MATE") ~ "CENTRO DE INVESTIGACION EN MATEMATICAS AC",
nameOrganization %>% str_detect("COMMONWEALTH SCIENTIFIC AND IN") ~ "COMMONWEALTH SCIENTIFIC AND INDUSTRIAL RESEARCH ORGANISATION",
nameOrganization %>% str_detect("UNIVERSITY OF PITTSBURGH THE|PITTSBURGH, UNIVERSITY OF") ~ "UNIVERSITY OF PITTSBURGH",
nameOrganization %>% str_detect("CARNEGIE MELLON UNIVERSITY|CARNEIE MELLON UNIVERSITY") ~ "CARNEGIE MELLON UNIVERSITY",
nameOrganization %>% str_detect(
"UNIVERSITY OF CALIFORNIA - DAVIS|UNIVERSITY OF CALIFORNIA, DAVIS|CALIFORNIA, UNIVERSITY OF, DAVIS|UNIVERSITY OF CALIFORNIA DAVIS|UNIVERSITY OF CALIFORNIA, DAVIS-DAVIS CAMPUS/NEWS SERVICE"
) ~ "UNIVERSITY OF CALIFORNIA - DAVIS",
nameOrganization %>% str_detect(
"UNIVERSITY OF CALIFORNIA - IRVINE|UNIVERSITY OF CALIFORNIA IRVINE|UNIVERSITY OF CALIFORNIA, IRVINE|UNIVERSITY OF CALIFORNIA AT IRVINE"
) ~ "UNIVERSITY OF CALIFORNIA - IRVINE",
nameOrganization %>% str_detect(
"UNIVERSITY OF CALIFORNIA SANTA CRUZ|UNIVERSITY OF CALIFORNIA - SANTA CRUZ|UNIVERSITY OF CALIFORNIA, SANT|UNIVERSITY OF CALIFORNIA, SANTA CRU|UNIVERSITY OF CALIFORNIA, SANTA CRUZ"
) ~ "UNIVERSITY OF CALIFORNIA - SANTA CRUZ",
nameOrganization %>% str_detect("CHARLES DREW") ~ "CHARLES DREW UNIVERSITY OF MEDICINE AND SCIENCE",
nameOrganization %in% c(
"DEPARTMENT OF ELECTRICAL ENGI NEERI",
"DEPARTMENT OF ELECTRICAL ENGINEERING, UNIVERSIDAD DE CHILE"
) ~ "DEPARTMENT OF ELECTRICAL ENGINEERING, UNIVERSIDAD DE CHILE",
nameOrganization %>% str_detect("DUKE UNIVERSITY") ~ "DUKE UNIVERSITY",
nameOrganization %>% str_detect(
"EASTERN VIRGINIA MEDICAL COLLEGE|EASTERN VIRGINIA MEDICAL SCHOOL"
) ~ "EASTERN VIRGINIA MEDICAL COLLEGE",
nameOrganization %>% str_detect("ECOLE POLYTECHNIQUE") ~ "ECOLE POLYTECHNIQUE FÉDÉRALE DE LAUSANNE",
nameOrganization %>% str_detect("ECOLE NORMALE SUPERIEURE") ~ "ECOLE NORMALE SUPERIERE OF LYON",
nameOrganization %in% c(
"FEDERATION OF AMERICA SCIENTISTS",
"FEDERATION OF AMERICAN SCIENTISTS"
) ~ "FEDERATION OF AMERICAN SCIENTISTS",
nameOrganization %>% str_detect("FLORIDA STATE UNIV") ~ "FLORIDA STATE UNIVERSITY",
nameOrganization %>% str_detect("FLORIDA INSTITUTE OF TECHNOLOGY") ~ "FLORIDA INSTITUTE OF TECHNOLOGY",
nameOrganization %>% str_detect("FLORIDA INSTITUTE FOR HUMAN AND MAC") ~ "FLORIDA INSTITUTE FOR HUMAN AND MACHINE COGNITION INC",
nameOrganization %>% str_detect("FUNDACAO DE APOIO DA UNIVERSIDADE F") ~ "FUNDACAO DE APOIO DA UNIVERSIDADE FEDERAL DO RGS",
nameOrganization %>% str_detect("FUNDACAO DE APOIO A UNIVERSIDADE DE ") ~ "FUNDACAO DE APOIO A UNIVERSIDADE DE SAO PAULO",
nameOrganization %>% str_detect("FUNDACAO DE CIENCIA, APLICACOES E T") ~ "FUNDACAO DE CIENCIA APLICACOES E TECNOLOGIA ESPACIAIS",
nameOrganization %in% c(
"FOUNDATION FOR RESEARCH AND TE",
"FOUNDATION FOR RESEARCH AND TECHNOL"
) ~ "FOUNDATION FOR RESEARCH AND TECHNOLOGY HELLAS",
nameOrganization %>% str_detect("FOUNDATION FOR MARINE ECOLOGY AND T") ~ "FOUNDATION FOR MARINE ECOLOGY AND TELEMETRY RESEARCH",
nameOrganization %>% str_detect("FAIRBANKS NORTH STAR BOROUGH") ~ "FAIRBANKS NORTH STAR BOROUGH SCHOOL DISTRICT",
nameOrganization %>% str_detect("FREEDOM DOGS ") ~ "FREEDOM SERVICE DOGS OF AMERICA",
nameOrganization %>% str_detect("FRAUNHOFER-GESELLSCHAFT ZUR FO") ~ "FRAUNHOFER-GESELLSCHAFT ZUR FORDERUNG DER ANGEWANDTEN FORSCHUNG EV",
nameOrganization %>% str_detect("GEORGIA STATE UNIVERSITY") ~ "GEORGIA STATE UNIVERSITY",
nameOrganization %>% str_detect("GEORGIA TECH") ~ "GEORGIA TECH",
nameOrganization %>% str_detect("GEORGIA REGENTS UNIVERSITY") ~ "UNIVERSITY OF GEORGIA",
nameOrganization %>% str_detect("HEBREW UNIVERSITY OF JERUSALEM") ~ "HEBREW UNIVERSITY OF JERUSALEM",
nameOrganization %>% str_detect("HAWAII DEPARTMENT OF EDUCATION") ~ "HAWAII DEPARTMENT OF EDUCATION",
nameOrganization %>% str_detect("HANOI UNIVERSITY OF SCIENCE AND") ~ "HANOI UNIVERSITY OF SCIENCE AND TECHNOLOGY",
nameOrganization %>% str_detect("HANNAM UNIVERSITY") ~ "HANNAM UNIVERSITY",
nameOrganization %>% str_detect("HENRY M. JACKSON FOUNDATION") ~ "HENRY M. JACKSON FOUNDATION",
nameOrganization %>% str_detect("HARVARD COLLEGE|HARVARD MEDICAL") ~ "HARVARD UNIVERSITY",
nameOrganization == "HARFORD COUNTY" ~ "HARFORD COUNTY GOVERNMENT",
nameOrganization %>% str_detect("HORSHAM LRA") ~ "HORSHAM LAND REDEVELOPMENT AUTHORITY",
nameOrganization %>% str_detect("HOWARD UNIV") ~ "HOWARD UNIVERSITY",
nameOrganization %>% str_detect("ICFO-THE INSTITUTE OF PHOTONIC") ~ "ICFO-THE INSTITUTE OF PHOTONIC SCIENCES",
nameOrganization %>% str_detect("IOWA STATE UNIVERSITY") ~ "IOWA STATE UNIVERSITY",
nameOrganization %>% str_detect("IMPERIAL COLLEGE") ~ "IMPERIAL COLLEGE LONDON",
nameOrganization %>% str_detect("INSTITUTE OF MATERIALS SCIENCE") ~ "INSTITUTE OF MATERIALS SCIENCE VIETNAM ACADEMY OF SCIENCE AND TECHNOLOGYA",
nameOrganization %>% str_detect(
"INDIANA UNIVERSITY BLOOMINGTON|INDIANA UNIVERSITY AT BLOOMINGTON|TRUSTEES OF INDIANA UNIVERSITY"
) ~ "INDIANA UNIVERSITY",
nameOrganization %>% str_detect("INDIANA UNIVERSITY, INDIANAPOLIS") ~ "INDIANA UNIVERSITY AT INDIANAPOLIS",
nameOrganization %>% str_detect("INDIAN INSTITUTE OF SCIENCE") ~ "INDIAN INSTITUTE OF SCIENCE EDUCATION AND RESEARCH",
nameOrganization %>% str_detect("JOHNS HOPKINS UNIV|JOHN HOPKINS UNIVERSITY, THE") ~ "JOHNS HOPKINS UNIVERSITY",
nameOrganization %>% str_detect("KING ABDULLAH UNIVERSITY OF SCIENCE") ~ "KING ABDULLAH UNIVERSITY OF SCIENCE AND TECHNOLOGY",
nameOrganization %>% str_detect("KHON KAEN UNIVERSITY") ~ "KHON KAEN UNIVERSITY",
nameOrganization %>% str_detect("KAROLINSKA INST") ~ "KAROLINSKA INSTITUTE",
nameOrganization %>% str_detect("KOREA UNIV. RESEARCH AND BUSIN") ~ "KOREA UNIV. RESEARCH AND BUSINESS FOUNDATION",
nameOrganization %>% str_detect("KOREA INSTITUTE OF INFORMATION SECU") ~ "KOREA INSTITUTE OF INFORMATION SECURITY",
nameOrganization %>% str_detect("KOREA RESEARCH INSTITUTE OF SHIPS &") ~ "KOREA RESEARCH INSTITUTE OF SHIPS & ENGINEERING",
nameOrganization %>% str_detect("LOUISIANA STATE UNIVERSITY") ~ "LOUISIANA STATE UNIVERSITY",
nameOrganization %>% str_detect(
"LUDWIG INSTITUTE FOR CANCER RESEARCH|LATVIJAS UNIVERSITATES CIETVIELU FI"
) ~ "LUDWIG INSTITUTE FOR CANCER RESEARCH",
nameOrganization %>% str_detect("MASSACHUSETTS GENERAL HOSPITAL") ~ "MASSACHUSETTS GENERAL HOSPITAL",
nameOrganization %>% str_detect("MICHIGAN STATE UNIV") ~ "MICHIGAN STATE UNIVERSITY",
nameOrganization %>% str_detect("MCGILL UNIVERSITY") ~ "MCGILL UNIVERSITY",
nameOrganization %>% str_detect("MONTCLAIR STATE UNIVERSITY") ~ "MONTCLAIR STATE UNIVERSITY",
nameOrganization %>% str_detect("MONTANA STATE") ~ "MONTANA STATE UNIVERSITY",
nameOrganization %>% str_detect("MONTEREY BAY AQUARIUM RESEARCH") ~ "MONTEREY BAY AQUARIUM RESEARCH INSTITUTE",
nameOrganization == "MIAMI UNIVERSITY" ~ "MIAMI UNIVERSITY, OHIO",
nameOrganization %>% str_detect("MINERALS, METALS ") ~ "MINERALS, METALS & MATERIALS SOCIETY",
nameOrganization %>% str_detect("MARINE BIOLOGICAL LABORATORY") ~ "MARINE BIOLOGICAL LABORATORY",
nameOrganization %>% str_detect("NEW JERSEY INST OF TECH NEWARK") ~ "NEW JERSEY INSTITUTE OF TECHNOLOGY",
nameOrganization %>% str_detect("NATIONAL INSTITUTE FOR MATERIA") ~ "NATIONAL INSTITUTE FOR MATERIALS SCIENCE IAI",
nameOrganization %>% str_detect("NANYANG TECHNOLOGICAL UNIVERSI") ~ "NANYANG TECHNOLOGICAL UNIVERSITY",
nameOrganization %>% str_detect("NORTHWESTERN UNIVERSITY") ~ "NORTHWESTERN UNIVERSITY",
nameOrganization %>% str_detect("NORTH CAROLINA STATE") ~ "NORTH CAROLINA STATE",
nameOrganization %>% str_detect("NORTH CAROLINA AGRICULTURAL AND TEC") ~ "NORTH CAROLINA A&T STATE UNIVERSITY",
nameOrganization %>% str_detect("OFFICE OF THE GOVERNOR") ~ "OFFICE OF THE GOVERNOR GUAM",
nameOrganization %>% str_detect("OPTICAL SOCIETY OF AMERICA") ~ "OPTICAL SOCIETY OF AMERICA INCORPORATED",
nameOrganization %>% str_detect("OSPEDALE SAN RAFFAELE") ~ "OSPEDALE SAN RAFFAELE",
nameOrganization %>% str_detect("OHIO STATE UNIVERSITY") ~ "OHIO STATE UNIVERSITY",
nameOrganization %>% str_detect("OKLAHOMA STATE UNIVERSITY|OSU - INSTITUTE OF TECHNOLOGY") ~ "OKLAHOMA STATE UNIVERSITY",
nameOrganization %>% str_detect("OLD DOMINION|OLD DIMINION") ~ "OLD DOMINION UNIVERSITY",
nameOrganization %>% str_detect("OREGON HEALTH") ~ "OREGON HEALTH AND SCIENCE UNIVERSITY",
nameOrganization %>% str_detect("PENN STATE UNIVERSITY|PENNSYLVANIA STATE UNIVERSITY") ~ "PENN STATE UNIVERSITY",
nameOrganization %>% str_detect("PRESIDENT AND BOARD OF TRUSTEES OF") ~ "SANTA CLARA UNIVERSITY",
nameOrganization %>% str_detect("PRAIRIE VIEW") ~ "PRAIRIE VIEW A&M",
nameOrganization %>% str_detect("QUEEN MARY") ~ "QUEEN MARY UNIVERSITY OF LONDON",
nameOrganization %>% str_detect("ROCHESTER INSTITUTE OF TECHNOL") ~ "ROCHESTER INSTITUTE OF TECHNOLOGY",
nameOrganization %>% str_detect("RECTOR & VISITORS|VIRGINIA, UNIVERSITY OF") ~ "UNIVERSITY OF VIRGINIA",
nameOrganization %in% c("REGENTS OF THE UNIVERSITY OF") ~ "UNIVERSITY OF MICHIGAN",
nameOrganization == "REGENTS OF THE UNIVERSITY OF C" ~ "UNIVERSITY OF CALIFORNIA AT BERKELEY",
nameOrganization == "REGENTS OF THE UNIVERSITY OF M" ~ "UNIVERSITY OF MICHIGAN",
nameOrganization %>% str_detect(
"REGENTS OF THE UNIVERSITY OF CAL|THE REGENTS OF THE UNIVERSITY OF CALIFORNIA"
) ~ "UNIVERSITY OF CALIFORNIA AT BERKELEY",
nameOrganization %>% str_detect("REGENTS OF THE UNIVERSITY OF COL") ~ "UNIVERSITY OF COLORADO",
nameOrganization %>% str_detect("REGENTS OF THE UNIVERSITY OF MICHIG") ~ "UNIVERSITY OF MICHIGAN",
nameOrganization %>% str_detect(
"REGENTS OF THE UNIVERSITY OF MINNES|UNIVERSITY OF MINNESOTA|UNIVERSITY OF MINNESOTA TWIN CITIES|MINNESOTA, UNIVERSITY OF, TWIN CITIES"
) ~ "UNIVERSITY OF MINNESOTA",
nameOrganization == "RESEARCH FOUNDATION OF THE CITY UNI" ~ "RESEARCH FOUNDATION OF THE CITY UNIVERSITY OF NEW YORK",
nameOrganization %in% c(
"RESEARCH FOUNDATION FOR THE ST",
"RESEARCH FOUNDATION FOR THE STATE U"
) ~ "RESEARCH FOUNDATION FOR THE STATE UNIVERSITY OF NEW YORK THE",
nameOrganization %>% str_detect("RESEARCH INSTITUTE AT NATIONWIDE CH") ~ "RESEARCH INSTITUTE AT NATIONWIDE CHILDREN'S HOSPITAL",
nameOrganization %>% str_detect("RUTGERS") ~ "RUTGERS UNIVERSITY",
nameOrganization %>% str_detect(
"ROYAL INSTITUTION FOR THE ADVA|ROYAL INSTITUTION FOR THE ADVANCEMENT OF LEARNING"
) ~ "MCGILL UNIVERSITY",
nameOrganization %>% str_detect("RHEINISCH-WEEFAELISCH") ~ "RHEINISCH-WEEFAELISCH TECHNISCH HOCHSCHULE AACHN",
nameOrganization %>% str_detect("RENSSELAER POLYTECHNIC") ~ "RENSSELAER POLYTECHNIC INSTITUTE",
nameOrganization %>% str_detect("SOCIEDADE BRASILEIRA DE COMPU") ~ "SOCIEDADE BRASILEIRA DE COMPUTACAO",
nameOrganization %>% str_detect("SOCIEDADE PORTUGUESA DE IN") ~ "SOCIEDADE PORTUGUESA DE INOVACAO",
nameOrganization %>% str_detect("STEVENS INSTITUTE") ~ "STEVENS INSTITUTE OF TECHNOLOGY",
nameOrganization %>% str_detect("SOUTH DAKOTA SCHOOL OF MINES") ~ "SOUTH DAKOTA SCHOOL OF MINES AND TECHNOLOGY",
nameOrganization %>% str_detect("SALK INSTITUTE") ~ "SALK INSTITUTE",
nameOrganization %>% str_detect("SINGAPORE SPACE AND TECHNOLOGY") ~ "SINGAPORE SPACE AND TECHNOLOGY ASSOCIATION",
nameOrganization %>% str_detect("SINGAPORE UNIVERSITY OF TECHNOLOGY") ~ "SINGAPORE UNIVERSITY OF TECHNOLOGY",
nameOrganization %>% str_detect("SAN JOSE STATE UNIVERSITY") ~ "SAN JOSE STATE UNIVERSITY",
nameOrganization %>% str_detect("SAN DIEGO STATE UNIVERSITY") ~ "SAN DIEGO STATE UNIVERSITY",
nameOrganization %>% str_detect("SIEMENS CORPORATION") ~ "SIEMENS CORPORATION",
nameOrganization %>% str_detect("SRI INTERNATIONAL") ~ "SRI INTERNATIONAL",
nameOrganization %>% str_detect("WRIGHT STATE") ~ "WRIGHT STATE",
nameOrganization %>% str_detect("WORCESTER POLYTECHNIC") ~ "WORCESTER POLYTECHNIC INSTITUTE",
nameOrganization %>% str_detect("WALTER REED") ~ "WALTER REED NATIONAL MILITARY MEDICAL CENTER",
nameOrganization %>% str_detect("RICE UNIV") ~ "RICE UNIVERSITY",
nameOrganization %>% str_detect("WOODS HOLE") ~ "WOODS HOLE OCEANOGRAPHIC INSTITUTION",
nameOrganization %>% str_detect("WEIZMANN INSTITUTE") ~ "WEIZMANN INSTITUTE OF SCIENCE",
nameOrganization %>% str_detect("WASHINGTON UNIVERSITY") ~ "WASHINGTON UNIVERSITY ST LOUIS",
nameOrganization %>% str_detect("WASHINGTON STATE") ~ "WASHINGTON STATE",
nameOrganization %>% str_detect("WEST VIRGINIA UNIVERSITY") ~ "WEST VIRGINIA UNIVERSITY",
nameOrganization %>% str_detect("WAKE FOREST|WAKE FORREST") ~ "WAKE FOREST UNIVERSITY",
nameOrganization %>% str_detect("VANDERBILT") ~ "VANDERBILT UNIVERSITY",
nameOrganization %>% str_detect("VILLANOVA") ~ "VILLANOVA UNIVERSITY",
nameOrganization %>% str_detect("UNIAO BRASILEIRA DE EDUCACAO E ASSI") ~ "UNIAO BRASILEIRA DE EDUCACAO E ASSISTENCIA",
nameOrganization %>% str_detect("UNIVERSITI TUNKU ABDUL RAHMAN") ~ "UNIVERSITI TUNKU ABDUL RAHMAN",
nameOrganization %>% str_detect("UNIVERSITA DEGLI STUDI DI PERUGIA") ~ "UNIVERSITA DEGLI STUDI DI PERUGIA",
nameOrganization %>% str_detect("UNIVERSITY SYSTEM OF NEW HAMPS") ~ "UNIVERSITY SYSTEM OF NEW HAMPSHIRE",
nameOrganization %>% str_detect("UNIVERSITY AUXILIARY AND RESEARCH S") ~ "UNIVERSITY AUXILIARY AND RESEARCH SERVICES CORPORATION",
nameOrganization %>% str_detect("UNIVERSITY OF VERMONT") ~ "UNIVERSITY OF VERMONT",
nameOrganization %>% str_detect("UNIVERSITY OF PUERTO RICO AT MAYAGUEZ") ~ "UNIVERSITY OF PUERTO RICO - MAYAGUEZ",
nameOrganization %>% str_detect("UNIVERSITY OF BRITISH COLUMBIA") ~ "UNIVERSITY OF BRITISH COLUMBIA",
nameOrganization %>% str_detect("UNIVERSITY OF ARKANSAS AT LITTLE ROCK") ~ "UNVIERSITY OF ARKANSAS",
nameOrganization %>% str_detect("UNIVERSITY OF OREGON EUGENE OR|UNIVERSITY OF OREGON") ~ "UNIVERSITY OF OREGON",
nameOrganization %>% str_detect("UNIVERSITY OF MARYLAND BALTIMORE CO") ~ "UNIVERSITY OF MARYLAND BALTIMORE COUNTY",
nameOrganization %>% str_detect("NORGES TEKNISK-NATURVITENSKAPELIGE") ~ "NORGES TEKNISK-NATURVITENSKAPELIGE",
nameOrganization %>% str_detect("TUFTS") ~ "TUFTS UNIVERSITY",
nameOrganization %>% str_detect("TEXAS TECH") ~ "TEXAS TECH UNIVERSITY",
nameOrganization %in% c(
"TEXAS A&M UNIVERSITY - KINGSVILLE",
"TEXAS A & M UNIVERSITY KINGSVILLE",
"TEXAS A&M UNIVERSITY-KINGSVILLE"
) ~ "TEXAS A&M UNIVERSITY KINGSVILLE",
nameOrganization %in% c(
"TEXAS A & M UNIVERSITY",
"TEXAS A&M UNIVERSITY SYSTEM,THE",
"TEXAS A&M UNIVERSITY"
) ~ "TEXAS A&M UNIVERSITY",
nameOrganization %>% str_detect(
"TEXAS ENGINEERING EXPERIMENT STATION|TEXAS A&M UNIVERSITY-CORPUS CHRISTI|TEXAS A&M ENGINEERING EXPERIMENT ST|TEXAS A&M ENGINEERING EXPERIMENT STATION"
) ~ "TEXAS A&M UNIVERSITY",
nameOrganization %>% str_detect(
"THE CHANCELLOR, MASTERS AND SCHOLAR|THE CHANCELLOR, MASTER AND SCHOLARS OF THE UNIVERISTY OF CAMBRIDGE"
) ~ "UNIVERSITY OF CAMBRIDGE",
nameOrganization %>% str_detect(
"TECHNISCHE UNIVERSITAT BERLIN|TECHNISCHE UNIVERSITAET BERLIN"
) ~ "Berlin Institute of Technology" %>% str_to_upper(),
nameOrganization %>% str_detect("TECHNOLOGY MANAGEMENT TRAINING GROU") ~ "TECHNOLOGY MANAGEMENT TRAINING GROUP",
nameOrganization %>% str_detect("TECHNION ") ~ "TECHNION ISRAEL INSTITUTE OF TECHNOLOGY",
nameOrganization %>% str_detect("GEORGE WASHINGTON") ~ "GEORGE WASHINGTON UNIVERSITY",
nameOrganization %>% str_detect("SCRIPPS RESEARCH") ~ "THE SCRIPPS RESEARCH INSTITUTE",
nameOrganization %>% str_detect("COLUMBIA UNIVERSITY") ~ "COLUMBIA UNIVERSITY",
nameOrganization %>% str_detect("TATA INSTITUTE") ~ "TATA INSTITUTE OF FUNDAMENTAL RESEARCH",
nameOrganization %>% str_detect("TEMPLE UNIVERSITY") ~ "TEMPLE UNIVERSITY",
nameOrganization %>% str_detect("THE KOREAN INSTITUTE OF ELECTRICAL") ~ "THE KOREAN INSTITUTE OF ELECTRICAL AND ELECTRONIC MATERIAL ENGINEERS",
nameOrganization %>% str_detect("ELECTROCHEMICAL SOCIETY") ~ "THE ELECTROCHEMICAL SOCIETY",
nameOrganization %>% str_detect("PRINCETON") ~ "PRINCETON UNIVERSITY",
nameOrganization %>% str_detect("BOSTON COLLEGE CHESTNUT HILL MA|BOSTON COLLEGE") ~ "BOSTON COLLEGE",
nameOrganization %>% str_detect("BMS (BIOLOGIE MEDECINE SERVICES)") ~ "BIOLOGIE MEDECINE SERVICES",
nameOrganization %>% str_detect("BROWN UNIVERSITY IN PROVIDENCE") ~ "BROWN UNIVERSITY",
nameOrganization %>% str_detect("CASE WESTERN RESERVE") ~ "CASE WESTERN RESERVE UNIVERSITY",
nameOrganization %>% str_detect("CALIFORNIA STATE UNIVERSITY LONG B") ~ "CALIFORNIA STATE UNIVERSITY LONG BEACH",
nameOrganization %>% str_detect("COLLEGE OF WILLIAM AND MARY|COLLEGE OF WILLIAM & MARY THE") ~ "COLLEGE OF WILLIAM AND MARY",
nameOrganization %>% str_detect("COLORADO STATE UNIVERSITY") ~ "COLORADO STATE UNIVERSITY",
nameOrganization %>% str_detect("CONSIGLIO NAZIONALE DELLE RICERCHE") ~ "CONSIGLIO NAZIONALE DELLE RICERCHE",
nameOrganization %>% str_detect("COUNCIL FOR SCIENTIFIC & INDUSTRIAL") ~ "COUNCIL FOR SCIENTIFIC & INDUSTRIAL RESEARCH",
nameOrganization %>% str_detect("CENTER FOR COLLABORATIVE INTERDISCI") ~ "CENTER FOR COLLABORATIVE INTERDISCIPLINARY SCIENCES",
nameOrganization %>% str_detect("CURTIN UNIVERSITY") ~ "CURTIN UNIVERSITY",
nameOrganization %>% str_detect("CARLTON U") ~ "CARLETON UNIVERSITY",
nameOrganization %>% str_detect(
"VIRGINIA POLYTECHNIC|VIRGINIA TECH APPLIED RESEARCH CORP|VIRGINIA TECH APPLIED RESEARCH CORP"
) ~ "VIRGINIA POLYTECHNIC INSTITUTE",
nameOrganization %>% str_detect("CORNELL UNIVERSITY") ~ "CORNELL UNIVERSITY",
nameOrganization %>% str_detect("DEUTSCHE GESELLSCHAFT FUR ORTUNG UND NAVIGATION") ~ "DEUTSCHE GESELLSCHAFT FUR ORTUNG UND NAVIGATION",
nameOrganization %>% str_detect("UNIVERSITY OF OXFORD") ~ "UNIVERSITY OF OXFORD",
nameOrganization %>% str_detect("UNIVERSITY OF HAWAII SYSTEMS") ~ "UNIVERSITY OF HAWAII SYSTEMS HONOLULU",
nameOrganization %>% str_detect("UNIVERSITY OF OXFORD") ~ "UNIVERSITY OF OXFORD",
nameOrganization %>% str_detect("THE UNIVERSITY OF THE DISTRICT OF COLUMBIA") ~ "THE UNIVERSITY OF THE DISTRICT OF COLUMBIA",
nameOrganization %>% str_detect("THE INSTITUTE OF MARINE ENGINEERING") ~ "THE INSTITUTE OF MARINE ENGINEERING SCIENCE & TECHNOLOGY",
nameOrganization %>% str_detect("TRUSTEES OF THE COLORADO SCHOOL OF|COLORADO SCHOOL OF MINES") ~ "COLORADO SCHOOL OF MINES",
nameOrganization %>% str_detect(
"TRUSTEES OF THE UNIVERSITY OF PENNS|PENNSYLVANIA, UNIVERSITY OF"
) ~ "UNIVERSITY OF PENNSYLVANIA",
nameOrganization %>% str_detect("THE RESEARCH FOUNDATION OF STATE UN") ~ "THE RESEARCH FOUNDATION OF SUNY",
nameOrganization %>% str_detect("UW APPLIED PHYSICS LAB") ~ "UNIVERSITY OF WASHINGTON",
nameOrganization %>% str_detect("U OF ALBERTA") ~ "UNIVERSITY OF ALBERTA",
nameOrganization %>% str_detect("U OF WATERLOO|UNIVERSITY OF WATERLOO") ~ "UNIVERSITY OF WATERLOO",
nameOrganization %>% str_detect("U OF OTTAWA|UNIVERSITY OF OTTAWA") ~ "UNIVERSITY OF OTTAWA",
nameOrganization %>% str_detect("YONSEI UNIVERSITY UNIVERSITY") ~ "YONSEI UNIVERSITY UNIVERSITY",
nameOrganization %>% str_detect(
"UNDERSEA & HYPERBARIC MEDICAL SOCIE|UNDERSEA & HYPERBARIC MEDICAL SOCIETY"
) ~ "UNDERSEA & HYPERBARIC MEDICAL SOCIETY",
nameOrganization %>% str_detect("UNIVERSITY RESEARCH CO., LLC") ~ "UNIVERSITY RESEARCH CO LLC",
nameOrganization %>% str_detect("UNIVERSIT LIBRE DE BRUXELLES|UNIVERSIT?? LIBRE DE BRUXELLES") ~ "UNIVERSIT LIBRE DE BRUXELLES",
nameOrganization %>% str_detect(
"UNIVERSIDAD NACIONAL ANDRES B ELLO|UNIVERSIDAD NACIONAL ANDRES BELLO"
) ~ "UNIVERSIDAD NACIONAL ANDRES BELLO",
nameOrganization %>% str_detect(
"UNIVERSIDAD NACIONAL AUT_NOMA DE MXICO|UNIVERSIDAD NACIONAL AUTONOMA DE ME"
) ~ "UNIVERSIDAD NACIONAL AUTONOMA DE ME",
nameOrganization %>% str_detect("UNIVERSIDADE DE SAO PAULO") ~ "UNIVERSIDADE DE SAO PAULO",
nameOrganization %>% str_detect(
"UNIVERSITA DEGLI STUDI DI CAMERINO|UNIVERSITA' DEGLI STUDI DI CAMERINO"
) ~ "UNIVERSITA' DEGLI STUDI DI CAMERINO",
nameOrganization %>% str_detect(
"UNIVERSITA DEGLI STUDI DI FIRENZE|UNIVERSITA' DEGLI STUDI DI FIRENZE"
) ~ "UNIVERSITA DEGLI STUDI DI FIRENZE",
nameOrganization %>% str_detect("UNIVERSITY OF HAWAII") ~ "UNIVERSITY OF HAWAII ",
nameOrganization %>% str_detect(
"UNIVERSITY AT BUFFALO SUNY|UNIVERSITY AT BUFFALO, SUNY|STATE UNIVERSITY OF NEW YORK AT BUFFALO|UNIVERSITY AT BUFFALO, STATE UNIVERSITY OF NEW YORK|SUNY, UNIVERSITY AT BUFFALO|UNIVERSITY AT BUFFALO, SUNY"
) ~ "STATE UNIVERSITY OF NEW YORK AT BUFFALO",
nameOrganization %>% str_detect("EMBRY-RIDDLE AERONAUTICAL") ~ "EMBRY-RIDDLE AERONAUTICAL UNIVERSITY",
nameOrganization %>% str_detect("ELIZABETH CITY STATE UNIVERSIT") ~ "ELIZABETH CITY STATE UNIVERSITY",
nameOrganization %>% str_detect(
"UNIVERSITY OF TEXAS, AUSTIN|UNIVERSITY OF TEXAS SYSTEM, THE|THE UNIVERSITY OF TEXAS AT AUSTIN"
) ~ "UNIVERSITY OF TEXAS AT AUSTIN",
nameOrganization %>% str_detect(
"UNIVERSITY OF TEXAS AT SAN ANTONIO, THE|UNIVERSITY OF TEXAS AT SAN ANTONIO|UNIVERSITY OF TEXAS - SAN ANTONIO"
) ~ "UNIVERSITY OF TEXAS AT SAN ANTONIO",
nameOrganization %>% str_detect("UNIVERSITY OF TEXAS AT ARLINGT") ~ "UNIVERSITY OF TEXAS AT ARLINGTON",
nameOrganization %>% str_detect("UNIVERSITY OF TEXAS MEDICAL BRANCH") ~ "UNIVERSITY OF TEXAS MEDICAL BRANCH AT GALVESTON",
nameOrganization %>% str_detect("UNIVERSITY OF TEXAS - EL PASO") ~ "UNIVERSITY OF TEXAS AT EL PASO",
nameOrganization %>% str_detect(
"UNIVERSITY OF NORTH CAROLINA - CHAPEL HILL|UNIVERSITY OF NORTH CAROLINA AT CHAPEL HILL"
) ~ "UNIVERSITY OF NORTH CAROLINA",
nameOrganization %>% str_detect(
"UNIVERSITY OF NORTH CAROLINA AT CHARLOTTE|UNIVERSITY OF NORTH CAROLINA - CHARLOTTE|NORTH CAROLINA AT CHAPEL HILL, UNIVERSITY OF"
) ~ "UNIVERSITY OF NORTH CAROLINA AT CHARLOTTE",
nameOrganization %in% c(
"UNIVERSITY OF NORTH CAROLINA AT CHA",
"UNIVERSITY OF NORTH CAROLINA A"
) ~ "UNIVERSITY OF NORTH CAROLINA",
nameOrganization %in% c("AMERICAN UNIVERSITY, THE",
"AMERICAN UNIVERSITY") ~ "AMERICAN UNIVERSITY",
nameOrganization %>% str_detect("THE UNIVERSITY OF GEORGIA|UNIVERSITY OF GEORGIA") ~ "UNIVERSITY OF GEORGIA",
nameOrganization %>% str_detect(
"UNIVERSITY OF FLORIDA GAINESVILLE|UNIVERSITY OF FLORIDA|UNIVERSITY OF FLOIDA|FLORIDA, UNIVERSITY OF|UNIVERSITY OF FLORIDA|UNIVERSITY OF FLORIDA|UNIVERISITY OF FLORIDA"
) ~ "UNIVERSITY OF FLORIDA",
nameOrganization %>% str_detect("FLORIDA A & M UNIVERSITY") ~ "FLORIDA A&M UNIVERSITY",
nameOrganization %>% str_detect(
"MICHIGAN, UNIVERSITY OF|UNIVERSITY OF MICHIGAN - ANN ARBOR|UNIVERSITY OF MICHIGAN-ANN ARBOR"
) ~ "UNIVERSITY OF MICHIGAN",
nameOrganization %>% str_detect(
"UNIVERSITY OF OKLAHOMA|OKLAHOMA, UNIVERSITY OF, HEALTH SCIENCES CENTER"
) ~ "UNIVERSITY OF OKLAHOMA",
nameOrganization %>% str_detect("CHICAGO, UNIVERSITY OF") ~ "UNIVERSITY OF CHICAGO",
nameOrganization %>% str_detect(
"UNIVERSITY OF ILLINOIS - CHICAGO|UNIVERSITY OF ILLINOIS AT CHICAGO|ILLINOIS, UNIVERSITY OF, AT CHICAGO"
) ~ "UNIVERSITY OF ILLINOIS AT CHICAGO",
nameOrganization %>% str_detect("SOUTHERN ILLINOIS UNIVERSITY") ~ "SOUTHERN ILLINOIS UNIVERSITY",
nameOrganization %>% str_detect(
"UNIVERSITY OF ILLINOIS|BOARD OF TRUSTEES OF THE UNIVERSITY OF ILLINOIS|THE BOARD OF TRUSTEES OF THE UNIVERSITY OF ILLINOIS|UNIVERSITY OF ILLINOIS OFFICE OF BUSINESS AND FINANCIAL SERVICES|UNIVERSITY OF ILLINOIS AT URBANA-CHAMPAIGN|UNIVERSITY OF ILLINOIS CHAMPAIGN|UNIVERSITY OF ILLINOIS - URBANA|UNIVERSITY OF ILLINOIS - URBANA - CHAMPAIGN"
) ~ "UNIVERSITY OF ILLINOIS",
nameOrganization %>% str_detect("MADISON CITY BOARD OF EDUCATION") ~ "MADISON CITY SCHOOLS",
nameOrganization %>% str_detect("UNIVERSITY OF WOLLONGONG") ~ "UNIVERSITY OF WOLLONGONG",
nameOrganization %>% str_detect("CATHOLIC UNIVERSITY OF AMERICA") ~ "CATHOLIC UNIVERSITY OF AMERICA",
nameOrganization %>% str_detect("ALLEN COUNTY") ~ "ALLEN COUNTY BOARD OF COMMISSIONERS",
nameOrganization %>% str_detect(
"MIAMI UNIVERSITY OF CORAL GABLES|UNIVERSITY OF MIAMI - CORAL GABLES|MIAMI, UNIVERSITY OF, CORAL GABLES|UNIVERSITY OF MIAMI"
) ~ "UNIVERSITY OF MIAMI",
nameOrganization %>% str_detect("UNIVERSITY OF ALABAMA - TUSCALOOSA") ~ "UNIVERSITY OF ALABAMA",
nameOrganization %>% str_detect("UNIVERSITY OF UTAH, THE|UTAH, UNIVERSITY OF") ~ "UNIVERSITY OF UTAH",
nameOrganization %>% str_detect(
"UNIVERSITY OF COLORADO, BOULDER|UNIVERSITY OF COLORADO - BOULDER|UNIVERSITY OF COLORADO AT BOULDER"
) ~ "UNIVERSITY OF COLORADO",
nameOrganization %>% str_detect("UNIVERSITY OF NEW SOUTH WALES") ~ "UNIVERSITY OF NEW SOUTH WALES ",
nameOrganization %>% str_detect("UNIVERSITY OF SOUTHERN CALIFOR") ~ "UNIVERSITY OF SOUTHERN CALIFORNIA",
nameOrganization %>% str_detect(
"UNIVERSITY OF NEVADA, RENO|UNIVERSITY OF NEVADA - RENO|BOARD OF REGENTS NEVADA SYSTEM OF H|UNIVERSITY OF NEVADA RENO"
) ~ "UNIVERSITY OF NEVADA",
nameOrganization %>% str_detect("GENEVA FOUNDATION") ~ "GENEVA FOUNDATION",
nameOrganization %>% str_detect("GEORGE MASON") ~ "GEORGE MASON UNIVERSITY",
nameOrganization %>% str_detect(
"UNIVERSITY OF TENNESSEE AT KNOXVILLE|UNIVERSITY OF TENNESSEE HEALTH SCIENCE CENTER|UNIVERSITY OF TENNESSEE KNOXVILLE|UNIVERSITY OF TENNESSEE, KNOXVILLE"
) ~ "UNIVERSITY OF TENNESSEE",
nameOrganization %>% str_detect(
"THE UNIVERSITY OF MISSISSIPPI|UNIVERSITY OF MISSISSIPPI MEDICAL CENTER|MISSISSIPPI, UNIVERSITY OF, MEDICAL CENTER"
) ~ "UNIVERSITY OF MISSISSIPPI",
nameOrganization %>% str_detect(
"UNIVERSITY OF MISSOURI, COLUMBIA|UNIVERSITY OF MISSOURI - COLUMBIA|UNIVERSITY OF MISSOURI SYSTEM|MISSOURI SYSTEM UNIVERSITY OF|CURATORS OF THE UNIVERSITY OF MISSOURI|MISSOURI SYSTEM UNIVERSITY OF"
) ~ "UNIVERSITY OF MISSOURI",
nameOrganization %>% str_detect("COMMISSARIAT A ENERGIE ATOMIQUE ET") ~ "COMMISSARIAT L'ENERGIE ATOMIQUUSE",
nameOrganization %>% str_detect("UNIVERSITY OF MAINE") ~ "UNIVERSITY OF MAINE",
nameOrganization %>% str_detect("UNIVERSITY SYSTEM OF NEW HAMPSHIRE") ~ "UNIVERSITY OF NEW HAMPSHIRE",
nameOrganization %>% str_detect(
"MASSACHUSETTS UNIV AMHERST MA|UNIVERSITY OF MASSACHUSETTS AMHERST|UNIVERSITY OF MASSACHUSETTS - AMHERST|MASSACHUSETTS, UNIVERSITY OF, AMHERST"
) ~ "UNIVERSITY OF MASSACHUSETTS",
nameOrganization %>% str_detect(
"MASSACHUSETTS UNIVERSITY OF LOWELL MA|UNIVERSITY OF MASSACHUSETTS, LOWELL|UNIVERSITY OF MASSACHUSETTS - LOWELL"
) ~ "UNIVERSITY OF MASSACHUSETTS - LOWELL",
nameOrganization %>% str_detect("UNIVERSITY OF CONNECTICUT") ~ "UNIVERSITY OF CONNECTICUT",
nameOrganization %>% str_detect(
"UNIVERSITY NEW MEXICO|UNIVERSITY OF NEW MEXICO|NEW MEXICO, UNIVERSITY OF|UNIVERSITY OF NEW MEXICO ALBUQUERQUE|UNIVERSITY OF NEW MEXICO - ALBUQUERQUE"
) ~ "UNIVERSITY OF NEW MEXICO",
nameOrganization %>% str_detect("NEW MEXICO INSTITUTE OF MINING AND") ~ "NEW MEXICO INSTITUTE OF MINING AND TECHNOLOGY",
nameOrganization %>% str_detect(
"UNIVERSITY OF CALIFORNIA, SANTA BAR|UNIVERSITY OF CALIFORNIA SANTA BARBARA|UNIVERSITY OF CALIFORNIA - SANTA BARBARA|UNIVERSITY OF CALIFORNIA, SANTA BARBARA|UNIVERSITY OF CALIFORNIA, SANTA BAR|UNIVERSITY OF CALIFORNIA, SANTA BARBARA"
) ~ "UNIVERSITY OF CALIFORNIA SANTA BARBARA",
nameOrganization %>% str_detect("DIRECTED ENERGY PROFESSIONAL SOCIET") ~ "DIRECTED ENERGY PROFESSIONAL SOCIETY",
nameOrganization %>% str_detect("ECOLE POLYTECHNIQUE FEDERALE DE") ~ "ECOLE POLYTECHNIQUE FÉDÉRALE DE LAUSANNE",
nameOrganization %>% str_detect("NORTH CAROLINA STATE UNIVERSITY") ~ "NORTH CAROLINA STATE UNIVERSITY",
nameOrganization %>% str_detect("NEW JERSEY INSTITUTE OF TECH") ~ "NEW JERSEY INSTITUTE OF TECHNOLOGY",
nameOrganization %>% str_detect("NAVY LEAGUE OF THE UNITED STATES") ~ "NAVY LEAGUE OF THE UNITED STATES",
nameOrganization %>% str_detect("NATIONAL CENTRAL UNIVERSITY") ~ "NATIONAL CENTRAL UNIVERSITY",
nameOrganization %>% str_detect("MAYO CLINIC") ~ "MAYO CLINIC",
nameOrganization %>% str_detect("MASSACHUSETTS INSTITUTE OF TEC") ~ "MASSACHUSETTS INSTITUTE OF TECHNOLOGY",
nameOrganization %>% str_detect("POHANG UNIVERSITY OF SCIENCE") ~ "POHANG UNIVERSITY OF SCIENCE",
nameOrganization %>% str_detect("LELAND STANFORD JUNIOR|STANFORD UNIVERSITY") ~ "STANFORD UNIVERSITY",
nameOrganization %>% str_detect("UNIVERSITY OF CHICAGO") ~ "UNIVERSITY OF CHICAGO",
nameOrganization %>% str_detect("UNIVERSITY OF CONNECTICUT - STORRS") ~ "UNIVERSITY OF CONNECTICUT",
nameOrganization %>% str_detect("UNIVERSITY OF FLORIDA - GAINESVILLE") ~ "UNIVERSITY OF FLORIDA",
nameOrganization %>% str_detect("UNIVERSITY OF CENTRAL FLORIDA") ~ "UNIVERSITY OF CENTRAL FLORIDA",
nameOrganization %>% str_detect(
"UNIVERSITY OF CALIFORNIA LOS |UNIV OF CALIFORNIA LOS ANGELES|UNIVERSITY OF CALIFORNIA, LOS|UNIVERSITY OF CALIFORNIA - LOS ANGELES|UNIVERSITY OF CALIFORNIA AT LOS ANGELES"
) ~ "UNIVERSITY OF CALIFORNIA LOS ANGELES",
nameOrganization %>% str_detect("TULANE") ~ "TULANE UNIVERSITY",
nameOrganization %>% str_detect("PRESIDENT AND FELLOWS OF HARVA|HARVARD COLLEGE PRESIDENT") ~ "HARVARD UNIVERSITY",
nameOrganization %>% str_detect("TRUSTEES OF BOSTON UNIVERSITY") ~ "BOSTON UNIVERSITY",
nameOrganization %>% str_detect("AMERICAN INSTITUTE OF CHEMICAL ENGI") ~ "AMERICAN INSTITUTE OF CHEMICAL ENGINEERS",
nameOrganization %>% str_detect(
"UNIVERSITY OF CALIFORNIA - BERKELEY|REGENTS OF THE UNIVERSITY OF CALIFORNIA BERKELEY"
) ~ "UNIVERSITY OF CALIFORNIA AT BERKELEY",
nameOrganization %>% str_detect(
"UNIVERSITY OF ARKANSAS @ LITTLE ROCK|UNIVERSITY OF ARKANSAS SYSTEM|UNIVERSITY OF ARKANSAS"
) ~ "UNIVERSITY OF ARKANSAS AT LITTLE ROCK",
nameOrganization %>% str_detect("AMERICAN SOCIETY OF NAVAL ENGI") ~ "AMERICAN SOCIETY OF NAVAL ENGINEERS",
nameOrganization %>% str_detect("KOREA ADVANCED INSTITUTE OF SCIENCE") ~ "KOREA ADVANCED INSTITUTE OF SCIENCE AND TECHNOLOGY",
nameOrganization %>% str_detect("MARINE ENVIRONMENT AND RESOURC") ~ "MARINE ENVIRONMENT AND RESOURCES FOUNDATION",
nameOrganization %>% str_detect(
"MARYLAND UNIV COLLEGE PARK|UNIVERSITY OF MARYLAND - COLLEGE PARK"
) ~ "UNIVERSITY OF MARYLAND",
nameOrganization %>% str_detect(
"UNIVERSITY OF MARYLAND, BALTIMORE|MARYLAND, UNIVERSITY OF, BALTIMORE|UNIVERSITY OF MARYLAND - BALTIMORE|UNIVERSITY OF MARYLAND, BALTIMORE COUNTY"
) ~ "UNIVERSITY OF MARYLAND BALTIMORE COUNTY",
nameOrganization %>% str_detect("UNIVERSITY OF MARYLAND CENTER FOR E") ~ "UNIVERSITY OF MARYLAND CENTER FOR ENVIRONMENTAL SCIENCE OFFICE OF RESEARCH ADMINSTRATION AND ADVANCEMENT",
nameOrganization %>% str_detect(
"LOUISIANA STATE UNIVERSITY AND A&M COLLEGE|LOUISIANA STATE UNIVERSITY SYSTEM"
) ~ "LOUISIANA STATE UNIVERSITY",
nameOrganization %>% str_detect(
"UNIVERSITY OF WISCONSIN SYSTEM|UNIVERSITY OF WISCONSIN - MADISON|UNIVERSITY OF WISCONSIN|WISCONSIN, UNIVERSITY OF, MADISON"
) ~ "UNIVERSITY OF WISCONSIN AT MADISON",
nameOrganization %>% str_detect("UNIVERSITY OF VIRGINIA") ~ "UNIVERSITY OF VIRGINIA",
nameOrganization %>% str_detect("RESEARCH FOUNDATION OF SUNY AT STONY BROOK UNIVERSITY") ~ "STONY BROOK UNIVERSITY",
nameOrganization %>% str_detect("BROAD INSTITUTE") ~ "BROAD INSTITUTE",
nameOrganization %>% str_detect("FORT MONMOUTH ECONOMIC REVITALIZATION") ~ "FORT MONMOUTH ECONOMIC REVITALIZATION AUTHORITY",
nameOrganization %>% str_detect(
"UNIVERSITY OF CALIFORNIA, RIVERSIDE|UNIVERSITY OF CALIFORNIA - RIVERSIDE"
) ~ "UNIVERSITY OF CALIFORNIA - RIVERSIDE",
nameOrganization %>% str_detect("CITY OF RIVERBANK") ~ "CITY OF RIVERBANK",
nameOrganization %>% str_detect(
"CALIFORNIA STATE UNIVERSITY - LOS ANGELES|CALIFORNIA STATE UNIVERSITY-LOS ANGELES"
) ~ "CALIFORNIA STATE UNIVERSITY - LOS ANGELES",
nameOrganization %>% str_detect(
"UNIVERSITY OF CALIFORNIA - SAN DIEGO|UNIVERSITY OF CALIFORNIA, SAN DIEGO|UNIVERSITY OF CALIFORNIA SAN DIEGO|UNIVERSITY OF CALIFORNIA AT SAN DIEGO|CALIFORNIA, UNIVERSITY OF, SAN DIEGO|UNIVERSITY OF CALIFORNIA SAN D"
) ~ "UNIVERSITY OF CALIFORNIA - SAN DIEGO",
nameOrganization %>% str_detect(
"UNIVERSITY OF CALIFORNIA AT SAN FRANCISCO|CALIFORNIA, UNIVERSITY OF, SAN FRANCISCO|UNIVERSITY OF CALIFORNIA, SAN FRANCISCO|UNIVERSITY OF CALIFORNIA - SAN FRANCISCO"
) ~ "UNIVERSITY OF CALIFORNIA AT SAN FRANCISCO",
nameOrganization %>% str_detect(
"CUNY - CITY COLLEGE OF NEW YORK|CUNY -THE CITY COLLEGE|RFCUNY - CITY COLLEGE|CUNY -THE CITY COLLEGE"
) ~ "CUNY - CITY COLLEGE OF NEW YORK",
nameOrganization %>% str_detect("UNIVERSITY OF QUEENSLAND") ~ "UNIVERSITY OF QUEENSLAND",
nameOrganization %>% str_detect("UNIVERSITY OF IOWA|IOWA, UNIVERSITY OF|IOWA UNIV IOWA CITY") ~ "UNIVERSITY OF IOWA",
nameOrganization %>% str_detect("PONTIFICIA UNIVERSIDAD CATOL") ~ "PONTIFICIA UNIVERSIDAD CATOLICA DE CHILE",
nameOrganization %>% str_detect(
"UNIVERSITY OF KANSAS CENTER FOR RES|KANSAS, UNIVERSITY OF, CENTER FOR RESEARCH INC.|UNIVERSITY OF KANSAS|KANSAS, UNIVERSITY OF"
) ~ "UNIVERSITY OF KANSAS",
nameOrganization %>% str_detect("UNIVERSITY OF TEXAS HEALTH|TEXAS, UNIVERSITY OF") ~ "UNIVERSITY OF TEXAS AT AUSTIN",
nameOrganization %>% str_detect(
"M.D. ANDERSON CANCER CENTER, UNIVERSITY OF TEXAS|THE UNIVERSITY OF TEXAS MD ANDERSON CANCER CENTER"
) ~ "THE UNIVERSITY OF TEXAS MD ANDERSON CANCER CENTER",
nameOrganization %>% str_detect("AMERICAN VACUUM SOCIETY") ~ "AMERICAN VACUUM SOCIETY",
nameOrganization %>% str_detect("BRIGHAM AND WOMEN'S HOSPITAL") ~ "BRIGHAM AND WOMEN'S HOSPITAL",
nameOrganization %>% str_detect(
"CALIFORNIA STATE UNIVERSITY - LONG BEACH|CALIFORNIA STATE UNIVERSITY, LONG B"
) ~ "CALIFORNIA STATE UNIVERSITY - LONG BEACH",
nameOrganization %>% str_detect("CAPE COD INSTITUTE FOR SCIENCE") ~ "CAPE COD INSTITUTE FOR SCIENCE AND ENGINEERING",
nameOrganization %>% str_detect("CENTRAL KITSAP SCHOOL DISTRICT") ~ "CENTRAL KITSAP SCHOOL DISTRICT",
nameOrganization %>% str_detect(
"CHILDREN'S HOSPITAL OF PHILADELPHIA|CHILDREN'S HOSPITAL, PHILADELPHIA"
) ~ "CHILDREN'S HOSPITAL OF PHILADELPHIA",
nameOrganization %>% str_detect("CZECH TECHNICAL UNIVERSITY IN PRAGU") ~ "CZECH TECHNICAL UNIVERSITY IN PRAGUE",
nameOrganization %>% str_detect("ECOLE NATIONALE DES PONTS ET CHAUSS") ~ "ECOLE NATIONALE DES PONTS ET CHAUSSEES",
nameOrganization %>% str_detect("ENGINEERING CONFERENCES INTERNATION") ~ "ENGINEERING CONFERENCES INTERNATIONAL",
nameOrganization %>% str_detect("EUROPEAN UNDERWATER BAROMEDICAL SOC") ~ "EUROPEAN UNDERWATER BAROMEDICAL SOCIETY",
nameOrganization %>% str_detect("FAKULTET ELEKTROTEHNIKE I RACUNARST") ~ "FAKULTET ELEKTROTEHNIKE I RACUNARSTVA",
nameOrganization %>% str_detect("FORSCHUNGSZENTRUM JLICH GMBH") ~ "FORSCHUNGSZENTRUM JULICH GMBH",
nameOrganization %>% str_detect("FUNDA?AO DE DESENVOLVIMENTO DA UNICAMP FUNCAMP") ~ "FUNDAAO DE DESENVOLVIMENTO DA UNICAMP FUNCAMP",
nameOrganization %>% str_detect("FUNDACAO COORDENACAO DE PROJETO") ~ "FUNDACAO COORDENACAO DE PROJETOS PESQUISAS E EEUDOS TECNOL",
nameOrganization %>% str_detect("FUNDACION CIENCIAS EXACTAS Y NATURA") ~ "FUNDACION CIENCIAS EXACTAS Y NATURALES",
nameOrganization %>% str_detect("GEARY COUNTY UNIFIED SCHOOL DISTRICT") ~ "GEARY COUNTY UNIFIED SCHOOL DISTRICT",
nameOrganization %>% str_detect("INDIAN INSTITUTE OF TECHNOLOGY BHU") ~ "INDIAN INSTITUTE OF TECHNOLOGY BHUBANESWAR",
nameOrganization %>% str_detect("INDIAN INSTITUTE OF TECHNOLOGY KANP") ~ "INDIAN INSTITUTE OF TECHNOLOGY KANPUR",
nameOrganization %>% str_detect("INDIAN INSTITUTE OF TECHNOLOGY MADR") ~ "INDIAN INSTITUTE OF TECHNOLOGY MADRAS",
nameOrganization %>% str_detect("INDUSTRY UNIVERSITY COOPERATION FOU") ~ "INDUSTRY UNIVERSITY COOPERATION FOUNDATION HANYANG UNIVERSITY",
nameOrganization %>% str_detect("INSTYTUT FIZYKI POLSKIEJ AKADEMII N") ~ "INSTYTUT FIZYKI POLSKIEJ AKADEMII NAUK",
nameOrganization %>% str_detect("INTERNATIONAL BIODETERIORATION SOCI") ~ "INTERNATIONAL BIODETERIORATION SOCIETY",
nameOrganization %>% str_detect(
"INTERNATIONAL UNIVERSITY OF VIETNAM|INTERNATIONAL UNIVERSITY VIETNAM"
) ~ "INTERNATIONAL UNIVERSITY OF VIETNAM",
nameOrganization %>% str_detect("ISTITUTO NAZIONALE PER STUDI ED ESP") ~ "ISTITUTO NAZIONALE PER STUDI ED ESPERIENZE DI ARCHITETTURA NAVALE",
nameOrganization %>% str_detect("JAPAN ADVANCED INSTITUTE OF SCIENCE") ~ "JAPAN ADVANCED INSTITUTE OF SCIENCE AND TECHNOLOGY",
nameOrganization %>% str_detect(
"KOREA UNIVERSITY RESEARCH AND BUSIN|KOREA UNIV RESEARCH AND BUSINESS FOUNDATION"
) ~ "KOREA UNIVERSITY RESEARCH AND BUSINESS FOUNDATION",
nameOrganization %>% str_detect("MAX-PLANCK-GESELLSCHAFT ZUR FORDERUNG") ~ "MAX-PLANCK-GESELLSCHAFT ZUR FRDERUNG DER WISSENSCHAFTEN EV",
nameOrganization %>% str_detect("UNIVERSITY OF SOUTH CAROLINA|UNIVERSITY OF SOUTH CAROLIN") ~ "UNIVERSITY OF SOUTH CAROLINA",
nameOrganization %>% str_detect("MODELLING AND SIMULATION CENTRE OF") ~ "MODELLING AND SIMULATION CENTRE OF EXCELLENCE",
nameOrganization %>% str_detect("NEDERLANDSE ORGANISATIE VOOR TOEGEP") ~ "NEDERLANDSE ORGANISATIE VOOR TOEGEPAST NATUURWETENSCHAPPELIJK ONDERZOEK",
nameOrganization %>% str_detect("PONTIFICIA UNIVERSIDAD CATOLIC") ~ "PONTIFICIA UNIVERSIDAD CATOLICA DE CHILE",
nameOrganization == "RESEARCH FOUNDATION OF STATE UNIVER" ~ "STATE UNIVERSITY OF NEW YORK AT BUFFALO",
nameOrganization %>% str_detect("UNIVERSIT LIBRE DE BRUXELLES|UNIVERSIT?? LIBRE DE BRUXELLES") ~ "UNIVERSITE LIBRE DE BRUXELLES",
nameOrganization %>% str_detect("UNIVERSITA' DEGLI STUDI DI NAPOLI") ~ "UNIVERSITY OF NAPLES FEDERICO II",
nameOrganization %>% str_detect(
"UNIVERSITAT POLITECNICA DE VALENCIA|UNIVERSIDAD POLITECNICA DE VALENCIA"
) ~ "POLYTECHNIC UNIVERSITY OF VALENCIA",
nameOrganization %>% str_detect("UNIVERSIDADE FEDERAL DO RIO GRANDE") ~ "UNIVERSIDADE FEDERAL DO RIO GRANDE DO SUL",
nameOrganization %>% str_detect("UNIVERZITET U BEOGRADU-FAKULTET ORG") ~ "UNIVERZITET U BEOGRADU-FAKULTET ORGANIZACIONIH NAUKA",
nameOrganization %>% str_detect("UNIVERSITA' DI PISA|UNIVERSITY DI PISA") ~ "UNIVERSITY OF PISA",
nameOrganization %>% str_detect("UNIVERSITY OF STRATHCLYDE VIZ ROYAL") ~ "UNIVERSITY OF STRATHCLYDE VIZ ROYAL",
nameOrganization %>% str_detect("STIFTELSEN NANSEN SENTER") ~ "NANSEN CENTER",
nameOrganization %>% str_detect("ROYAL MELBOURNE INSTITUTE OF TECHNO") ~ "ROYAL MELBOURNE INSTITUTE OF TECHNOLOGY",
nameOrganization %>% str_detect(
"AGENCIA ESTATAL CONSEJO SUPERIOR DE|AGENCIA ESTATAL CONSEJO SUPERIOR DE INVESTIGACIONES CIENTIFICAS"
) ~ "AGENCIA ESTATAL CONSEJO SUPERIOR DE INVESTIGACIONES CIENTIFICAS",
nameOrganization %in% c(
"BOARD OF REGENTS OF THE UNIVERSITY",
"BOARD OF REGENTS OF THE UNIVERSITY OF NEBRASKA",
"BOARD OF REGENTS OF THE UNIVER"
) ~ "UNIVERSITY OF NEBRASKA",
nameOrganization %>% str_detect("CENTRO INTERDISCIPLINARIO DE NEUROC") ~ "CENTRO INTERDISCIPLINARIO DE NEUROCIENCIA DE VALPARAISO",
nameOrganization %>% str_detect("CENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE") ~ "CENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE",
nameOrganization %>% str_detect("GORDON RESEARCH CONFERENCES") ~ "GORDON RESEARCH CONFERENCES",
nameOrganization %>% str_detect("INST FR RECHERCHE POUR L EXPLOIT") ~ "INST FR RECHERCHE POUR L EXPLOIT MER",
nameOrganization %>% str_detect("MOUNTAIN HOME SCHOOL DISTRICT") ~ "MOUNTAIN HOME SCHOOL DISTRICT",
nameOrganization %>% str_detect("UNIVERSIT LIBRE DE BRUXELLES|UNIVERSIT?? LIBRE DE BRUXELLES") ~ "UNIVERSIT LIBRE DE BRUXELLES",
nameOrganization %>% str_detect("UNIVERSITA' DEGLI STUDI DI PERUGIA") ~ "UNIVERSITA' DEGLI STUDI DI PERUGIA",
nameOrganization == "UNIVERSITY OF CALIFORNIA," ~ "UNIVERSITY OF CALIFORNIA LOS ANGELES",
nameOrganization %>% str_detect("DA MARIS") ~ "ASSOCIACAO DA MARIS",
nameOrganization %>% str_detect("BAYLOR COLLEGE OF MEDICINE") ~ "BAYLOR UNIVERSITY",
nameOrganization %>% str_detect("BOSSIER PARISH SCHOOL") ~ "BOSSIER PARISH SCHOOL DISTRICT",
nameOrganization %in% c(
"CALIFORNIA POLYTECHNIC STATE UNIVERSITY",
"CAL POLY CORPORATION"
) ~ "CALIFORNIA POLYTECHNIC STATE UNIVERSITY",
nameOrganization %>% str_detect("APPLIED RESEARCH ASSOCIATES") ~ "APPLIED RESEARCH ASSOCIATES",
nameOrganization %>% str_detect(
"ARISTOTLE UNIVERSITY OF THESSALONIKI|ARISTOTELIO PANEPISTIMIO THESSALONI"
) ~ "ARISTOTLE UNIVERSITY OF THESSALONIKI",
nameOrganization %>% str_detect("CITY OF LONG BEACH") ~ "CITY OF LONG BEACH",
nameOrganization %>% str_detect("^EIDGEN") ~ "EIDGENOSSISCHE TECHNISCHE HOCHSCHULEETH",
nameOrganization %>% str_detect("FISH AND GAME, ALASKA DEPARTMENT OF") ~ "ALASKA DEPARTMENT OF FISH AND GAME",
nameOrganization %>% str_detect(
"LUDWIG MAXIMILIANS UNIVERSITAET|LUDWIG-MAXIMILIANS-UNIVERSITAT MUNC"
) ~ "LUDWIG MAXIMILIAN UNIVERSITY OF MUNICH",
nameOrganization %>% str_detect("MADISON COUNTY") ~ "MADISON COUNTY SCHOOLS",
nameOrganization %>% str_detect("MARQUETTE UNIV") ~ "MARQUETTE UNIVERSITY",
nameOrganization %>% str_detect(
"MONTGOMERY COUNTY OFFICES OF THE COUNTY EXECUTIVE|MONTGOMERY, COUNTY OF"
) ~ "MONTGOMERY COUNTY",
nameOrganization %>% str_detect("NATIONAL ACADEMIES OF SCIENCES|NATIONAL ACADEMY OF SCIENCES") ~ "NATIONAL ACADEMY OF SCIENCES",
nameOrganization %>% str_detect("NATIONAL BUREAU OF ECONOMIC RESEARC") ~ "NATIONAL BUREAU OF ECONOMIC RESEARCH",
nameOrganization %>% str_detect("REGENTS OF THE UNIVERSITY OF IDAHO|UNIVERSITY OF IDAHO") ~ "UNIVERSITY OF IDAHO",
nameOrganization %>% str_detect("RHEINISCH-WEEFAELISCH TECHNISCH") ~ "RHEINISCH-WEEFAELISCH TECHNISCH",
nameOrganization %>% str_detect("UNIVERSITA' DEGLI STUDI DI UDI") ~ "UNIVERSITA' DEGLI STUDI DI UDINE",
nameOrganization %>% str_detect("TOKYO UNIVERSITY OF AGRICULTURE AND") ~ "TOKYO UNIVERSITY OF AGRICULTURE",
nameOrganization %>% str_detect("UNIVERSITY OF WESTERN ONTARIO") ~ "UNIVERSITY OF WESTERN ONTARIO",
nameOrganization %>% str_detect("UNIVERSITY OF NOTRE DAME") ~ "UNIVERSITY OF NOTRE DAME",
nameOrganization %>% str_detect("UNIVERSITY OF UTAH") ~ "UNIVERSITY OF UTAH",
nameOrganization %>% str_detect("UNIVERSITY OF TULSA") ~ "UNIVERSITY OF TULSA",
nameOrganization %>% str_detect("UNIVERSITY OF TOLEDO") ~ "UNIVERSITY OF TOLEDO",
nameOrganization %>% str_detect("UNIVERSITY OF NEWCASTLE") ~ "UNIVERSITY OF NEWCASTLE",
nameOrganization %>% str_detect("UNIVERSITY OF NORTH CAROLINA AT GREENSBORO") ~ "UNIVERSITY OF NORTH CAROLINA - GREENSBORO",
nameOrganization %in% c("UNIVERSITY OF NOTH CAROLINA AT CHAPEL HILL") ~ "UNIVERSITY OF NORTH CAROLINA",
nameOrganization %>% str_detect("UNIVERSITY OF NORTH TEXAS") ~ "UNIVERSITY OF NORTH TEXAS",
nameOrganization %>% str_detect("UNIVERSITY OF HOUSTON") ~ "UNIVERSITY OF HOUSTON",
nameOrganization %>% str_detect("UNIVERSITY OF DAYTON") ~ "UNIVERSITY OF DAYTON",
nameOrganization == "UNIVERSITY OF CALIFORNIA, SAN" ~ "UNIVERSITY OF CALIFORNIA - SAN DIEGO",
nameOrganization %>% str_detect("ONSLOW COUNTY") ~ "ONSLOW COUNTY",
nameOrganization %>% str_detect("RIKEN") ~ "RIKEN BRAIN SCIENCE INSTITUTE",
nameOrganization %>% str_detect("SLOAN KETTERING") ~ "MEMORIAL SLOAN KETTERING CANCER CENTER",
nameOrganization %>% str_detect("AMERICAN CHEMICAL SOCIETY") ~ "AMERICAN CHEMICAL SOCIETY",
nameOrganization %>% str_detect("FUNCAMP") ~ "Unicamp Development Foundation" %>% str_to_upper(),
nameOrganization %>% str_detect("ADAI -") ~ "Association for the Development of Industrial Aerodynamics" %>% str_to_upper(),
TRUE ~ nameOrganization
)
)
data <- data %>%
refine_columns(entity_columns = "nameOrganization") %>%
select(-matches("slugSoundex")) %>%
select(-nameOrganization) %>%
rename(nameOrganization = nameOrganizationClean) %>%
select(one_of(names(data)))
data
}
#' Fix Malformed US Organization Columns
#'
#' Normalizes the values of a government organization column: expands
#' \code{"DEPT OF "} to \code{"DEPARTMENT OF "}, removes \code{"DOD/"}, and
#' reorders inverted names of the form \code{"<NAME>, DEPARTMENT ..."} into
#' \code{"DEPARTMENT ... <NAME>"}.
#'
#' @param data a \code{tibble}
#' @param org_col organization column name
#'
#' @return \code{data} with \code{org_col} replaced by its cleaned values and
#'   the original column order kept. \code{data} is returned untouched when it
#'   has no column named \code{org_col}.
#' @export
#'
#' @examples
#' \dontrun{
#' tibble::tibble(
#'   nameDepartment = c("AGRICULTURE, DEPARTMENT OF", "DEPT OF STATE")
#' ) %>%
#'   fix_usg_organization_col()
#' }
fix_usg_organization_col <-
  function(data, org_col = "nameDepartment") {
    if (!data %>% hasName(org_col)) {
      return(data)
    }

    # Temporary column that carries the cleaned value until it is renamed back.
    new_col <- glue("{org_col}Actual") %>% as.character()

    # Clean the distinct values only; they are joined back onto `data` below.
    # The pattern is the literal "DEPT OF ": the previous "\\DEPT OF " was the
    # regex class \D (any non-digit) followed by "EPT OF ", which also rewrote
    # unrelated text such as "KEPT OF ".
    df_depts <-
      data %>%
      distinct(!!sym(org_col)) %>%
      mutate(
        !!new_col := .data[[org_col]] %>%
          str_replace_all("DEPT OF ", "DEPARTMENT OF ") %>%
          str_remove_all("DOD/")
      )

    # Inverted names ("AGRICULTURE, DEPARTMENT OF") are split at the first
    # ", " and glued back together with the two halves swapped.
    df_inverted <-
      df_depts %>%
      filter(.data[[new_col]] %>% str_detect("\\, DEPARTMENT")) %>%
      separate(
        !!sym(new_col),
        sep = "\\, ",
        into = c("part2", "part1"),
        extra = "merge",
        fill = "right"
      ) %>%
      unite(!!sym(new_col), part1, part2, sep = " ")

    df_depts <-
      df_depts %>%
      filter(!(.data[[new_col]] %>% str_detect("\\, DEPARTMENT"))) %>%
      bind_rows(df_inverted)

    # Swap the raw column for the cleaned one and restore the column order.
    data %>%
      left_join(df_depts, by = org_col) %>%
      select(-one_of(org_col)) %>%
      rename(!!org_col := !!sym(new_col)) %>%
      select(one_of(names(data)), everything())
  }
# Memoised worker behind download_excel_file(): downloads `url` to a temporary
# file, reads it with read_excel() and returns the parsed data. Extra
# arguments in `...` are forwarded to read_excel(). Because the function is
# wrapped in memoise::memoise(), repeated calls with identical arguments reuse
# the cached result instead of downloading again.
.download_excel_file <-
  memoise::memoise(function(url = "https://www.sbir.gov/awards/annual-reports/xls?xls_table=SBIR_count&dataid=SbirAnnualReportsSummarySqlYearSbir",
                            has_col_names = FALSE,
                            ...) {
    tmp <- tempfile()
    # Registered before the download so the temporary file is removed even
    # when the download or the parse step fails.
    on.exit(unlink(tmp), add = TRUE)
    curl::curl_download(url, tmp)
    read_excel(tmp, col_names = has_col_names, ...)
  })
#' Download excel file
#'
#' Downloads an Excel workbook from \code{url} and returns its parsed
#' contents. The work is delegated to a memoised helper, so calling this again
#' with the same arguments returns the cached result.
#'
#' @param url url of excel file
#' @param has_col_names passed to \code{read_excel()} as \code{col_names};
#'   if \code{TRUE} the first row is used as the column names
#' @param ... additional arguments forwarded to \code{read_excel()}
#'
#' @return the data read from the downloaded workbook
#' @export
#'
#' @examples
#' \dontrun{
#' download_excel_file(has_col_names = TRUE)
#' }
download_excel_file <-
  function(url = "https://www.sbir.gov/awards/annual-reports/xls?xls_table=SBIR_count&dataid=SbirAnnualReportsSummarySqlYearSbir",
           has_col_names = FALSE,
           ...) {
    .download_excel_file(url = url, has_col_names = has_col_names, ...)
  }
# Splits one award address string into an awardee name and the remaining
# address text.
#
# address_award: a single string (the `if (is.na(...))` guard below expects a
#   length-one value).
#
# Returns a one-row tibble that always contains `nameAwardeeAddress` (the
# input as given), plus `nameAwardee` when at least one token is found and
# `addressAwardee` when more than one token is found.
.munge_awardee <-
function(address_award = "RJS CONSTRUCTION, INC., 974 EAST STREET, WASHOUGAL, WA 98671 US") {
# Missing input: nothing to parse, so only the input column is returned.
if (is.na(address_award)) {
return(tibble(nameAwardeeAddress = address_award))
}
parts <-
address_award %>%
str_to_title() %>%
str_remove_all("\\.") %>%
# NOTE(review): the "." here is an unescaped regex wildcard, so this removes
# every comma together with the character in front of it — confirm intent.
gsub(".,", "", .) %>%
# NOTE(review): the patterns in the next three steps are upper-case but run
# after str_to_title(), so they look unlikely to match the title-cased text —
# confirm whether the case conversion was meant to happen later.
str_replace_all(", INC|, INC,", " INC") %>%
str_replace_all(", LLC|L.L.C", " LLC") %>%
# The alternation includes an escaped space, so the text is split at every
# space (as well as at commas and semicolons); empty tokens are dropped below.
str_split("C/O|\\ |\\,|DBA|;|<BR>") %>% flatten_chr() %>% str_squish() %>%
discard(function(x) {
x == ""
})
# A single token is treated as the awardee name with no address part.
if (length(parts) == 1) {
return(tibble(nameAwardeeAddress = address_award, nameAwardee = parts))
}
# First token is the name; the remaining tokens are re-joined with single
# spaces to form the address.
nameAwardee <- parts[1]
addressAwardee <-
parts[2:length(parts)] %>% str_c(collapse = " ")
tibble(nameAwardeeAddress = address_award, nameAwardee, addressAwardee)
}
# Vectorised wrapper around .munge_awardee(): parses every address string in
# `address_awards` and row-binds the one-row results into a single tibble.
.munge_awardees <-
  function(address_awards = "RJS CONSTRUCTION, INC., 974 EAST STREET, WASHOUGAL, WA 98671 US") {
    map_dfr(address_awards, .munge_awardee)
  }
# Builds a randomised request identity. Returns a one-row tibble with
#   urlReferer: "http://" + 8 to 15 random lower-case letters + one of
#               ".com", ".gov", ".org"
#   userAgent : one user-agent string drawn from the pool below, excluding
#               entries that match "bot|slurp"
# Random draws happen in this order: user agent, top-level domain, slug
# length, then one draw per slug letter.
.generate_url_reference <-
  function() {
    agent_pool <-
      c(
        "Mozilla/5.0 (Linux; Android 7.0; SM-G892A Build/NRD90M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/60.0.3112.107 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 7.0; SM-G930VC Build/NRD90M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/58.0.3029.83 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 6.0.1; SM-G935S Build/MMB29K; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/55.0.2883.91 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 6.0.1; SM-G920V Build/MMB29K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.98 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 6P Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 7.1.1; G8231 Build/41.2.A.0.219; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/59.0.3071.125 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 6.0.1; E6653 Build/32.2.A.0.253) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.98 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 6.0; HTC One X10 Build/MRA58K; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/61.0.3163.98 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 6.0; HTC One M9 Build/MRA58K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.98 Mobile Safari/537.36",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A5370a Safari/604.1",
        "Mozilla/5.0 (iPhone9,3; U; CPU iPhone OS 10_0_1 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/14A403 Safari/602.1",
        "Mozilla/5.0 (iPhone9,4; U; CPU iPhone OS 10_0_1 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/14A403 Safari/602.1",
        "Mozilla/5.0 (Apple-iPhone7C2/1202.466; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1A543 Safari/419.3",
        "Mozilla/5.0 (Windows Phone 10.0; Android 6.0.1; Microsoft; RM-1152) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Mobile Safari/537.36 Edge/15.15254",
        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; RM-1127_16056) AppleWebKit/537.36(KHTML, like Gecko) Chrome/42.0.2311.135 Mobile Safari/537.36 Edge/12.10536",
        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 950) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Mobile Safari/537.36 Edge/13.10586",
        "Mozilla/5.0 (Linux; Android 7.0; Pixel C Build/NRD90M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/52.0.2743.98 Safari/537.36",
        "Mozilla/5.0 (Linux; Android 6.0.1; SGP771 Build/32.2.A.0.253; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/52.0.2743.98 Safari/537.36",
        "Mozilla/5.0 (Linux; Android 6.0.1; SHIELD Tablet K1 Build/MRA58K; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/55.0.2883.91 Safari/537.36",
        "Mozilla/5.0 (Linux; Android 7.0; SM-T827R4 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.116 Safari/537.36",
        "Mozilla/5.0 (Linux; Android 5.0.2; SAMSUNG SM-T550 Build/LRX22G) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/3.3 Chrome/38.0.2125.102 Safari/537.36",
        "Mozilla/5.0 (Linux; Android 4.4.3; KFTHWI Build/KTU84M) AppleWebKit/537.36 (KHTML, like Gecko) Silk/47.1.79 like Chrome/47.0.2526.80 Safari/537.36",
        "Mozilla/5.0 (Linux; Android 5.0.2; LG-V410/V41020c Build/LRX22G) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/34.0.1847.118 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
        "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1",
        "Mozilla/5.0 (CrKey armv7l 1.5.16041) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.0 Safari/537.36",
        "Roku4640X/DVP-7.70 (297.70E04154A)",
        "Mozilla/5.0 (Linux; U; Android 4.2.2; he-il; NEO-X5-116A Build/JDQ39) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Safari/534.30",
        "Mozilla/5.0 (Linux; Android 5.1; AFTS Build/LMY47O) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/41.99900.2250.0242 Safari/537.36",
        "Dalvik/2.1.0 (Linux; U; Android 6.0.1; Nexus Player Build/MMB29T)",
        "AppleTV6,2/11.1",
        "AppleTV5,3/9.1.1",
        "Mozilla/5.0 (Nintendo WiiU) AppleWebKit/536.30 (KHTML, like Gecko) NX/3.0.4.2.12 NintendoBrowser/4.3.1.11264.US",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; XBOX_ONE_ED) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Xbox; Xbox One) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Mobile Safari/537.36 Edge/13.10586",
        "Mozilla/5.0 (PlayStation 4 3.11) AppleWebKit/537.73 (KHTML, like Gecko)",
        "Mozilla/5.0 (PlayStation Vita 3.61) AppleWebKit/537.73 (KHTML, like Gecko) Silk/3.2",
        "Mozilla/5.0 (Nintendo 3DS; U; ; en) Version/1.7412.EU",
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
        "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
        "Mozilla/5.0 (X11; U; Linux armv7l like Android; en-us) AppleWebKit/531.2+ (KHTML, like Gecko) Version/5.0 Safari/533.2+ Kindle/3.0+",
        "Mozilla/5.0 (Linux; U; en-US) AppleWebKit/528.5+ (KHTML, like Gecko, Safari/528.5+) Version/4.0 Kindle/3.0 (screen 600x800; rotate)"
      )

    # Entries matching "bot|slurp" are filtered out before drawing.
    is_crawler <- str_detect(agent_pool, "bot|slurp")
    picked_agent <- sample(agent_pool[!is_crawler], 1)

    picked_tld <- sample(c('.com', '.gov', '.org'), 1)

    # Random host name: 8-15 letters, each drawn independently.
    slug_length <- sample(8:15, 1)
    slug <-
      seq_len(slug_length) %>%
      map_chr(function(i) {
        sample(letters, 1)
      }) %>%
      paste0(collapse = '')

    tibble(
      urlReferer = paste0('http://', slug, picked_tld),
      userAgent = picked_agent
    )
  }
# Applies display formatting to the numeric columns of `data`:
#   * numeric columns matching "amount[A-Z]" go through
#     formattable::currency(digits = 0)
#   * numeric columns matching "pct|percent" go through percent(digits = 2)
#     (presumably formattable::percent — confirm which package supplies it)
# Both column sets are resolved before any column is changed; currency
# formatting is applied first. Returns the modified `data`.
.format_data <-
  function(data) {
    numeric_data <- select_if(data, is.numeric)
    currency_names <- names(select(numeric_data, matches("amount[A-Z]")))
    percent_names <- names(select(numeric_data, matches("pct|percent")))

    as_currency <- function(x) {
      formattable::currency(x, digits = 0)
    }
    as_percent <- function(x) {
      percent(x, digits = 2)
    }

    if (length(currency_names) > 0) {
      data <- mutate_at(data, currency_names, list(as_currency))
    }
    if (length(percent_names) > 0) {
      data <- mutate_at(data, percent_names, list(as_percent))
    }
    data
  }
#' Format data
#'
#' Applies display formatting to a tibble based on column names:
#' \itemize{
#'   \item numeric columns matching \code{"^percent|^pct|percent"} are
#'     formatted with \code{percent(digits = 2)}
#'   \item character columns matching
#'     \code{"^ratio|^area|^size|^count[A-Z]|numberTransaction"} (excluding
#'     names matching \code{"county|country|countries"}) are parsed with
#'     \code{readr::parse_number()} and formatted with
#'     \code{formattable::comma(digits = 0)}
#'   \item numeric columns matching \code{"^amount"} are formatted with
#'     \code{formattable::currency(digits = 0)}
#' }
#'
#' @param data a \code{tibble}
#'
#' @return \code{data} with the matching columns reformatted
#' @export
#'
#' @examples
#' \dontrun{
#' tibble::tibble(amountContract = 1500, pctShare = 0.25) %>%
#'   format_data()
#' }
format_data <-
  function(data) {
    # Amount and percent columns are resolved before any column is modified.
    amount_cols <-
      data %>%
      select(matches("^amount")) %>%
      select_if(is.numeric) %>%
      names()
    pct_cols <-
      data %>%
      select(matches("^percent|^pct|percent")) %>%
      select_if(is.numeric) %>%
      names()

    if (length(pct_cols) > 0) {
      data <-
        data %>%
        mutate_at(pct_cols, list(function(x) {
          x %>% percent(digits = 2)
        }))
    }

    # Count-like columns that arrived as text: parse to numbers, then format.
    numeric_cols <-
      data %>%
      select(matches("^ratio|^area|^size|^count[A-Z]|numberTransaction")) %>%
      select(-matches("county|country|countries")) %>%
      select_if(is.character) %>%
      names()
    if (length(numeric_cols) > 0) {
      data <-
        data %>%
        mutate_at(numeric_cols, list(function(x) {
          x %>%
            as.character() %>%
            readr::parse_number() %>%
            formattable::comma(digits = 0)
        }))
    }

    if (length(amount_cols) > 0) {
      data <-
        data %>%
        mutate_at(amount_cols, list(function(x) {
          # The column must be passed explicitly; calling currency() without
          # it fails with a missing-argument error.
          formattable::currency(x, digits = 0)
        }))
    }
    data
  }
# Collapses tab/newline-delimited text into one upper-case string: splits `x`
# on tabs and newlines, drops empty pieces, upper-cases the rest and joins
# them with single spaces.
.discard_text <- function(x) {
  pieces <- flatten_chr(str_split(x, "\t|\n"))
  kept <- discard(pieces, function(piece) {
    piece == ""
  })
  str_c(str_to_upper(kept), collapse = " ")
}
# Returns the whitespace-squished text of every node in `page` selected by
# the CSS selector `css`.
.parse_for_text <- function(page, css = ".soln") {
  matched_nodes <- html_nodes(page, css = css)
  str_squish(html_text(matched_nodes))
}
# Collects the HTML attributes of every <div> in `page` and row-binds them:
# each div's attribute set is flattened into a data frame via flatten_df()
# and the per-div results are combined with map_dfr().
.get_page_attribute_data <-
  function(page) {
    div_attrs <- html_attrs(html_nodes(page, "div"))
    map_dfr(seq_along(div_attrs), function(idx) {
      flatten_df(div_attrs[idx])
    })
  }
# Extracts the text found at CSS selector `css` in `page` and labels it.
#
# page        : parsed HTML document or node set
# css         : CSS selector of the element(s) holding the value
# actual_name : label stored in the `nameActual` column of the result
#
# Returns a tibble with columns `nameActual` and `value` (one row per matched
# node), or invisible NULL when the selector matches nothing.
.parse_css_name <-
  function(page,
           css = "#dnf_class_values_procurement_notice__solicitation_number__widget",
           actual_name = "idSolicitation") {
    if (page %>% html_nodes(css) %>% length() == 0) {
      return(invisible())
    }
    # Uses this file's `.parse_for_text` helper; the previous call went to an
    # un-dotted `parse_for_text`, which this file does not define.
    value <- .parse_for_text(page, css = css)
    tibble(nameActual = actual_name, value)
  }
#' FBO Bizopps IDs
#'
#' Lookup table that maps government field identifiers found on FBO
#' opportunity pages to the column names used by this package.
#'
#' @return \code{tibble} with one row per identifier and the columns
#'   \code{idGovernment} (government field id) and \code{nameActual}
#'   (package column name)
#' @export
#'
#' @examples
#' \dontrun{
#' dictionary_bizopps_ids()
#' }
dictionary_bizopps_ids <-
  function() {
    # Written as explicit `government id = package name` pairs so the two
    # columns cannot drift out of step. The earlier parallel vectors had 44
    # ids but 81 names (37 unrelated raw API field names trailed the name
    # list), which made tibble() fail on incompatible column lengths.
    id_lookup <- c(
      "solicitation_number" = "idSolicitation",
      "procurement_type" = "typeProcurement",
      "description" = "descriptionProcurement",
      "office_address" = "addressContractingOffice",
      "place_of_performance" = "addressPlaceOfPerformance",
      "primary_poc" = "contactPersonPrimary",
      "secondary_poc" = "contactPersonSecondary",
      "original_posted_date" = "datePostedOriginal",
      "posted_date" = "datePosted",
      "response_deadline" = "datetimeDeadline",
      "original_response_deadline" = "datetimeDeadlineOriginal",
      "archive_type" = "typeArchive",
      "original_archive_date" = "dateArchiveOriginal",
      "archive_date" = "dateArchive",
      "original_set_aside" = "typeSetAsideOriginal",
      "set_aside" = "typeSetAside",
      "classification_code" = "codeClassification",
      "naics_code" = "codeNAICS",
      "contract_award_date" = "dateContractAwarded",
      "contract_award_number" = "idContract",
      "contractor_awarded_name" = "nameContractWinner",
      "contractor_awarded_duns" = "idDUNSContractWinner",
      "contractor_awarded_address" = "addressContractWinner",
      "contract_award_amount" = "amountContract",
      "contractor_awardee_text" = "nameContractWinner",
      "office_address_text" = "addressContractWinner",
      "poc_text" = "contactPersonPrimary",
      "contract_line_item_number" = "idContractLine",
      "ja_statutory" = "descriptionJA",
      "additional_info_link" = "urlLinkInfo",
      "place_of_performance_text" = "addressPlaceOfPerformance",
      "delivery_order_number" = "idDeliveryInfo",
      "fair_opp_ja" = "descriptionFairOpportunity",
      "dnf_class_values_procurement_notice_archive_procurement_type" = "typeProcurement",
      "dnf_class_values_procurement_notice_archive_posted_date" = "datePosted",
      "dnf_class_values_procurement_notice_archive_archive_type" = "typeArchive",
      "dnf_class_values_procurement_notice_archive_archive_date" = "dateArchive",
      "dnf_class_values_procurement_notice_archive_classification_code" = "codeClassification",
      "dnf_class_values_procurement_notice_archive_naics_code" = "codeNAICS",
      "dnf_class_values_procurement_notice_archive_original_posted_date" = "datePostedOriginal",
      "dnf_class_values_procurement_notice_archive_original_response_deadline" = "datetimeDeadline",
      "dnf_class_values_procurement_notice_archive_original_set_aside" = "typeSetAsideOriginal",
      "dnf_class_values_procurement_notice_archive_set_aside" = "typeSetAside",
      "dnf_class_values_procurement_notice_archive_response_deadline" = "timezoneResponseDeadline"
    )

    tibble(
      idGovernment = names(id_lookup),
      nameActual = unname(id_lookup)
    )
  }
#' FBO feature name dictionary
#'
#' Lookup table that maps raw field names from FBO pages, API responses and
#' export files to the column names used by this package.
#'
#' @return \code{tibble} with one row per raw field name and the columns
#'   \code{nameGovernment} (raw name) and \code{nameActual} (package column
#'   name)
#' @export
#'
#' @examples
#' \dontrun{
#' dictionary_bizopps_names()
#' }
dictionary_bizopps_names <-
  function() {
    # Each entry is `raw government name = package column name`, in the same
    # order as the rows of the returned tibble.
    name_lookup <- c(
      "Solicitation Number" = "idSolicitation",
      "Notice Type" = "typeNotice",
      "Contract Award Date" = "dateContractAward",
      "Contract Award Number" = "idContractAward",
      "Contract Award Dollar Amount" = "amountContract",
      "Contractor Awarded Name" = "nameVendor",
      "Contractor Awarded DUNS" = "idDUNSVendor",
      "Contractor Awarded Address" = "addressContractor",
      "Contracting Office Address" = "addressContractingOffice",
      "Original Posted Date" = "datePostingOriginal",
      "Posted Date" = "datePosted",
      "Response Date" = "dateResponse",
      "Original Response Date" = "dateResponseOriginal",
      "Archiving Policy" = "policyArchiving",
      "Original Archive Date" = "dateArchiveOriginal",
      "Archive Date" = "dateArchive",
      "Original Set Aside" = "typeSetAsideOriginal",
      "Set Aside" = "typeSetAside",
      "Classification Code" = "codeClassification",
      "NAICS Code" = "idNAICS",
      "Primary Point of Contact." = "descriptionContractingPointContact",
      "idRow" = "idRow",
      "isCanceled" = "isCanceled",
      "publishDate" = "datetimePublished",
      "title" = "nameSolicitation",
      "isActive" = "isActive",
      "responseDate" = "datetimeResponse",
      "cleanSolicitationNumber" = "idSolicitationClean",
      "archiveDate" = "datetimeArchive",
      "_rScore" = "removeR",
      "_type" = "typeAPI",
      "indexedDate" = "datetimeIndex",
      "solicitationNumber" = "idSolicitation",
      "modifiedDate" = "datetimeModified",
      "_id" = "idAPI",
      "parentNoticeId" = "idNoticeParent",
      "originalPublishDate" = "datetimePublishedOriginal",
      "originalResponseDate" = "datetimeResponseOriginal",
      "rScore" = "scoreMatch",
      "typeMatch" = "slugMatch",
      "id" = "idNotice",
      "opportunityId" = "idNotice",
      "attachmentId" = "idAttachment",
      "resourceId" = "idResource",
      "attachmentOrder" = "numberAttachment",
      "fileExists" = "hasFile",
      "name" = "nameFile",
      "type" = "typeAttachment",
      "postedDate" = "datePosted",
      "accessLevel" = "levelAccess",
      "exportControlled" = "isExportControlled",
      "explicitAccess" = "hasExplicitAccess",
      "uri" = "uriAttachment",
      "description" = "descriptionAttachment",
      "mimeType" = "typeFile",
      "size" = "sizeFile",
      "deletedDate" = "dateDeleted",
      "deletedFlag" = "deletedFlag",
      "accessStatus" = "statusAccess",
      "archived" = "isArchived",
      "cancelled" = "isCancelled",
      "latest" = "isLatest",
      "deleted" = "isDeleted",
      "createdDate" = "datetimeCreated",
      "modifiedBy" = "modifiedBy",
      "createdBy" = "createdBy",
      "totalCount" = "countTotal",
      "version" = "numberVersion",
      "organizationId" = "idOrganizationSAM",
      "classificationCode" = "codeProductService",
      "response" = "datetimeResponse",
      "responseTz" = "timezoneResponse",
      "descriptionId" = "idDescription",
      "modifiedOn" = "datetimeModified",
      "body" = "body",
      "organizationLocationId" = "idOrganizationLocation",
      "date" = "dateAward",
      "amount" = "amountAward",
      "number" = "idAward",
      "duns" = "idDUNSAwardee",
      "city.code" = "idCitySAM",
      "city.name" = "city",
      "country.code" = "countryPerformance",
      "country.name" = "removeCountry",
      "state.code" = "state",
      "state.name" = "removeSatate",
      "lineItemNumber" = "codeLineItem",
      "city" = "city",
      "zip" = "zipcode",
      "streetAddress" = "addressStreet",
      "setAsideId" = "idSetAside",
      "setAsideCode" = "codeSetAside",
      "setAsideName" = "nameSetAside",
      "active" = "isActive",
      "organizationIds" = "idOrganizationSAM",
      "sortIndex" = "idSortIndex",
      "legacyFBOCode" = "codeLegacyFBO",
      "activeStartDate" = "dateActiveStart",
      "orgKey" = "idOrganizationSAM",
      "categoryDesc" = "typeOrganization",
      "categoryId" = "codeCategory",
      "cfdaCode" = "codeCFDA",
      "endDate" = "datetimeEnd",
      "fpdsCode" = "codeFPDS",
      "fpdsOrgId" = "idFPDS",
      "cgac" = "idAgency",
      "fullParentPath" = "idOrganizationSAMParent",
      "fullParentPathName" = "nameDepartmentParent",
      "isSourceFpds" = "isSourceFpds",
      "lastModifiedBy" = "lastModifiedBy",
      "lastModifiedDate" = "datetimeLastModified",
      "modStatus" = "statusModification",
      "orgCode" = "codeAgency",
      "shortName" = "slugAgency",
      "l1ShortName" = "slugDepartment",
      "summary" = "summaryOffice",
      "level" = "idLevel",
      "code" = "idAgencyFREC",
      "sendEmail" = "hasEmail",
      "l1Name" = "nameDepartment",
      "l1OrgKey" = "idOrganizationSAMLDepartment",
      "agencyName" = "nameAgency",
      "cfdaBur" = "idCFDABureau",
      "cfdaOmb" = "idOMB",
      "isSourceCfda" = "isSourceCFDA",
      "isSourceCwCfda" = "isSourceCWCFDA",
      "sourceCfdaPk" = "sourceCFDAPK",
      "logoUrl" = "urlLogo",
      "a11TacCode" = "codeTACA11",
      "ombAgencyCode" = "codeOMBAgency",
      "tas2Code" = "codeTAS2",
      "tas3Code" = "codeTAS3",
      "startDate" = "datetimeStart",
      "ingestedOn" = "datetimeIngested",
      "sourceParentCfdaPk" = "uriCFDAPK",
      "parentOrgKey" = "idOrganizationParentSAMKey",
      "l2Name" = "nameOfficeAgency",
      "parentOrg" = "nameAgencyParentMaster",
      "l2OrgKey" = "idOrganizationSAMAgency",
      "ombBureauCode" = "codeOMBBureau",
      "l5Name" = "nameOffice",
      "l3Name" = "nameCommandMajor",
      "l4Name" = "nameCommandSub",
      "aacCode" = "codeAAC",
      "regionCode" = "idRegion",
      "NoticeId" = "idNotice",
      "Title" = "nameSolicitation",
      "Sol#" = "idSolicitation",
      "Department/Ind.Agency" = "nameDepartment",
      "Sub-Tier" = "nameCommandSub",
      "Office" = "nameOffice",
      "PostedDate" = "datetimePublished",
      "Type" = "typeNotice",
      "BaseType" = "typeNoticeBase",
      "ArchiveType" = "typeArchive",
      "ArchiveDate" = "dateArchive",
      "SetASideCode" = "slugSetAside",
      "SetASide" = "typeSetAside",
      "ResponseDeadLine" = "datetimeResponse",
      "NaicsCode" = "idNAICS",
      "ClassificationCode" = "idSolicitationGroup",
      "PopStreetAddress" = "addressStreetPerformance",
      "PopCity" = "cityPerformance",
      "PopState" = "codeStatePerformance",
      "PopZip" = "zipcodePerformance",
      "PopCountry" = "countryPerformance",
      "Active" = "isActive",
      "AwardNumber" = "idAward",
      "AwardDate" = "dateAward",
      "Award$" = "amountAward",
      "Awardee" = "nameAwardee",
      "PrimaryContactTitle" = "titlePrimaryContact",
      "PrimaryContactFullname" = "namePrimaryContact",
      "PrimaryContactEmail" = "emailPrimaryContact",
      "PrimaryContactPhone" = "phonePrimaryContact",
      "PrimaryContactFax" = "faxPrimaryContact",
      "SecondaryContactTitle" = "titleSecondaryContact",
      "SecondaryContactFullname" = "nameSecondaryContact",
      "SecondaryContactEmail" = "emailSecondaryContact",
      "SecondaryContactPhone" = "phoneSecondaryContact",
      "SecondaryContactFax" = "faxSecondaryContact",
      "OrganizationType" = "typeOrganization",
      "State" = "stateOrganization",
      "City" = "cityOrganization",
      "ZipCode" = "zipcodeOrganization",
      "CountryCode" = "countryOrganization",
      "AdditionalInfoLink" = "hasAdditionalInfoLink",
      "Link" = "urlOpportunity",
      "Description" = "descriptionSolicitation"
    )

    tibble(
      nameGovernment = names(name_lookup),
      nameActual = unname(name_lookup)
    )
  }
# Map each government id to its `nameActual` entry in the table returned by
# `dictionary_bizopps_ids()`. Ids with no matching `idGovernment` row are
# reported through `message()` and returned unchanged.
.resolve_bizopp_ids <- function(gov_ids) {
  dict_ids <- dictionary_bizopps_ids()
  lookup_name <- function(id) {
    hits <- filter(dict_ids, idGovernment == id)
    if (nrow(hits) == 0) {
      message(glue::glue("Missing {id} in dictionary"))
      return(id)
    }
    # Several rows may share an id; the first distinct name wins.
    unique(pull(hits, nameActual))[[1]]
  }
  map_chr(gov_ids, lookup_name)
}
# Split one concatenated agency label into its parts.
#
# The label is cut at the FIRST place where a lower-case letter or a closing
# parenthesis is immediately followed by an upper-case letter. Everything up to
# the boundary becomes `nameAgency`, everything after it becomes
# `divisionAgency`, and the untouched input is kept in `detailsAgency`.
# When the label contains ")" the agency name is further split on "(" into
# `nameAgency` and `slugAgency`.
#
# `agency` must be a single string. A label without a boundary (or an NA
# label) yields a one-row tibble with only `nameAgency` and `detailsAgency`.
.munge_agency <-
  function(agency) {
    split_regex <- "[a-z][A-Z]|\\)[A-Z]"
    # NA would otherwise make the `if` condition itself NA and abort.
    if (is.na(agency) || !str_detect(agency, split_regex)) {
      return(tibble(nameAgency = agency, detailsAgency = agency))
    }

    # The pattern always matches exactly two characters: the last character of
    # the agency name and the first character of the division.
    boundary <- str_locate(agency, split_regex)
    boundary_start <- min(boundary)
    boundary_end <- max(boundary)
    word_end_1 <- substr(agency, boundary_start, boundary_start)
    word_start_2 <- substr(agency, boundary_end, boundary_end)

    # n = 2 keeps everything after the first boundary in one piece; splitting
    # on every boundary would truncate the division at the next case change.
    words <- str_split(agency, split_regex, n = 2)[[1]]

    # The split consumed the two boundary characters, so put them back.
    name_agency <- str_c(words[1], word_end_1)
    division <- str_c(word_start_2, words[2])

    data <-
      tibble(
        nameAgency = name_agency,
        divisionAgency = division,
        detailsAgency = agency
      ) %>%
      mutate_all(str_squish)

    has_slug <- str_detect(agency, "\\)")
    if (has_slug) {
      data <-
        data %>%
        separate(
          nameAgency,
          into = c("nameAgency", "slugAgency"),
          extra = "merge",
          fill = "right",
          sep = "\\("
        ) %>%
        mutate(slugAgency = slugAgency %>% str_remove_all("\\)")) %>%
        mutate_all(str_squish)
    }
    data
  }
# Run `.munge_agency()` over every label in `agencies` and row-bind the
# per-label results into a single data frame.
.munge_agencies <-
  function(agencies) {
    map_dfr(agencies, .munge_agency)
  }
#' Munge Data
#'
#' Cleans a tibble. This is the exported entry point; all work is delegated
#' to the internal \code{.munge_data()} with the arguments passed by name.
#'
#' @param data a \code{tibble}
#' @param parse_dates if \code{TRUE} parses dates
#' @param clean_address if \code{TRUE} parses addresses
#' @param unformat if \code{TRUE} removes formattable format
#' @param exclude_bloat logical flag forwarded unchanged to
#'   \code{.munge_data()}, which defines its effect
#' @param snake_names if \code{TRUE} returns names in snake case
#'
#' @return \code{tibble}
#' @export
munge_data <-
  function(data,
           parse_dates = TRUE,
           clean_address = FALSE,
           unformat = FALSE,
           exclude_bloat = FALSE,
           snake_names = FALSE) {
    data <-
      .munge_data(
        data = data,
        parse_dates = parse_dates,
        clean_address = clean_address,
        unformat = unformat,
        snake_names = snake_names,
        exclude_bloat = exclude_bloat
      )
    data
  }
#' Clean entity data
#'
#' Normalises an entity-name column. "CARE OF" is rewritten to "C/O", the text
#' after "C/O" is moved into a \code{<entity_column>CareOf} column, and a
#' merged/de-duplicated version of the name is added as
#' \code{<entity_column>Clean} using \code{refinr::key_collision_merge()} and,
#' optionally, \code{refinr::n_gram_merge()}.
#'
#' @param data \code{tibble}
#' @param entity_column name of the column to clean
#' @param use_business_suffix if \code{TRUE} use business suffix
#' @param use_n_gram_merge if \code{TRUE} uses n_gram_merge
#' @param edit_threshold edit threshold
#' @param ignore_words vector of words to ignore
#'
#' @return \code{data} with the added columns. Columns that are entirely
#'   \code{NA} after the split are dropped. If \code{entity_column} is not a
#'   column of \code{data}, \code{data} is returned unchanged.
#' @export
clean_entity_data <-
  function(data,
           entity_column = "nameOwnerPrimary",
           use_business_suffix = TRUE,
           use_n_gram_merge = TRUE,
           edit_threshold = 1,
           ignore_words = c(
             "ASSOCIATIONS",
             "ASSOCIATES",
             "ASSOCIATIONS",
             "LLC",
             "LL",
             "ACQUISITION",
             "ACQUISITIONS"
           )) {
    if (!hasName(data, entity_column)) {
      return(data)
    }

    co_col <- str_c(entity_column, "CareOf")
    data <-
      data %>%
      mutate(
        !!entity_column := str_replace_all(!!sym(entity_column), "CARE OF", "C/O")
      ) %>%
      tidyr::separate(
        !!sym(entity_column),
        into = c(entity_column, co_col),
        extra = "merge",
        fill = "right",
        # Literal "C/O"; a leading "\\C" escape is not a literal "C" in the
        # regex engines tidyr may use.
        sep = "C/O"
      ) %>%
      mutate_if(is.character, str_squish) %>%
      # Drops every all-NA column, which removes the care-of column when no
      # row contained "C/O".
      dplyr::select(which(colMeans(is.na(.)) < 1))

    new_col <- paste0(entity_column, "Clean")

    # Upper-case, strip commas/periods and collapse the whitespace that the
    # removal can leave behind, so that near-identical names collide.
    entity_values <-
      data[[entity_column]] %>%
      str_to_upper() %>%
      str_squish() %>%
      str_remove_all("\\,|\\.") %>%
      gsub("\\s+", " ", .)

    ignores <- str_to_upper(ignore_words)
    new_clean <-
      refinr::key_collision_merge(
        entity_values,
        ignore_strings = ignores,
        bus_suffix = use_business_suffix
      )
    if (use_n_gram_merge) {
      new_clean <-
        refinr::n_gram_merge(
          new_clean,
          ignore_strings = ignores,
          edit_threshold = edit_threshold
        )
    }

    data %>%
      mutate(!!new_col := new_clean)
  }
# Load an .rda source into a private environment and return the first object
# it contains, as a tibble when `return_tibble` is TRUE and as stored
# otherwise. `file` is handed straight to `load()`.
.import_rda_file <-
  function(file = NULL,
           return_tibble = TRUE) {
    if (length(file) == 0) {
      stop("Please enter a file path")
    }
    # A scratch environment keeps the loaded objects out of the caller's scope.
    scratch <- new.env()
    first_name <- load(file, envir = scratch)[1]
    if (!return_tibble) {
      return(scratch[[first_name]])
    }
    dplyr::as_tibble(scratch[[first_name]])
  }
# Open a curl connection to `url` and hand it to `.import_rda_file()`, which
# loads the remote .rda content. `return_tibble` is forwarded unchanged.
.curl_url <-
  function(url = "https://github.com/abresler/FRED_Dictionaries/blob/master/data/fred_series_data.rda?raw=true",
           return_tibble = TRUE) {
    con <- curl::curl(url)
    # Registered before loading so the connection is released even when the
    # download or the load fails part-way through.
    on.exit(close(con), add = TRUE)
    .import_rda_file(file = con, return_tibble = return_tibble)
  }
#' Read RDA File
#'
#' Reads an RDA file from a local path or from an http(s) URL.
#'
#' @param file location of file; a string starting with \code{http://} or
#'   \code{https://} is downloaded, anything else is treated as a local path
#' @param return_tibble if \code{TRUE} returns tibble
#'
#' @return the object read by \code{.curl_url()} (URL) or
#'   \code{.import_rda_file()} (local path)
#' @export
read_rda <-
  function(file = NULL,
           return_tibble = TRUE) {
    if (length(file) == 0) {
      stop("Please enter a file")
    }
    # Anchor on the scheme: a bare "http" substring also occurs in ordinary
    # local file names and must not trigger a download.
    is_url <- grepl("^https?://", file, ignore.case = TRUE)
    if (is_url) {
      data <- .curl_url(url = file, return_tibble = return_tibble)
    } else {
      data <-
        .import_rda_file(file = file, return_tibble = return_tibble)
    }
    data
  }
#' Create a block of text
#'
#' Collapses the values of several text columns into one upper-cased,
#' ASCII-only text column per id and joins it back onto \code{data}. The new
#' column is named after \code{id_column} with a leading "id" replaced by
#' "text" (for example \code{idItem} gives \code{textItem}).
#'
#' @param data a \code{tibble}
#' @param id_column id column
#' @param text_columns vector of text columns
#'
#' @return \code{data} with the text column added by a left join on
#'   \code{id_column}
#' @export
create_text_block <-
  function(data, id_column, text_columns) {
    text_col <- str_replace_all(id_column, "^id", "text")
    df_text <-
      data %>%
      select(one_of(id_column, text_columns)) %>%
      # Select by helper rather than by bare variable so that a data column
      # that happens to be called `id_column` cannot be picked up instead.
      gather(item, value, -one_of(id_column)) %>%
      select(one_of(id_column), value) %>%
      arrange(!!sym(id_column)) %>%
      filter(!is.na(value)) %>%
      mutate(
        value = value %>%
          stringi::stri_enc_toascii() %>%
          # "\032" is the control character removed right after the ASCII
          # conversion step.
          str_remove_all("\032") %>%
          str_replace_all("\\|", " ") %>%
          str_squish() %>%
          str_to_upper() %>%
          str_remove_all("\\_|\\~|--") %>%
          stringi::stri_unescape_unicode() %>%
          stringi::stri_replace_all_charclass("\\p{WHITE_SPACE}", " ")
      ) %>%
      group_by(!!sym(id_column)) %>%
      # Identical snippets under the same id are only kept once.
      distinct() %>%
      summarise(!!text_col := str_c(value, collapse = ". ")) %>%
      ungroup()
    data %>%
      left_join(df_text, by = id_column)
  }
#' Munge a tibble
#'
#' Runs \code{.munge_data()} on \code{data}, decodes \code{idAssistance} into
#' \code{codeAssistance} and \code{typeAssistance} when that column is
#' present, and optionally cleans entity columns.
#'
#' @param data a \code{tibble()}
#' @param munge_entities if \code{TRUE}, runs \code{.munge_organizations()}
#'   when \code{data} has a \code{nameOrganization} column and
#'   \code{refine_columns()} when \code{entity_columns} is supplied
#' @param parse_dates if \code{TRUE} parse dates
#' @param unformat if \code{TRUE} removes formattable formats
#' @param entity_columns character vector of columns handed to
#'   \code{refine_columns()}; \code{NULL} skips that step
#' @param use_business_suffix,phonics_methods,phonics_length,use_n_gram_merge,edit_threshold,ignore_words,numgram,weight
#'   forwarded by name to \code{refine_columns()}, which defines their meaning
#' @param ... further arguments forwarded to \code{refine_columns()}
#'
#' @return the munged \code{data}
#' @export
tbl_munge <- function(data,
                      munge_entities = FALSE,
                      parse_dates = TRUE,
                      unformat = FALSE,
                      entity_columns = NULL,
                      use_business_suffix = TRUE,
                      phonics_methods = "soundex",
                      phonics_length = 16L,
                      use_n_gram_merge = TRUE,
                      edit_threshold = 1,
                      ignore_words = NULL,
                      numgram = 2,
                      weight = c(d = 0.33,
                                 i = 0.33,
                                 s = 1,
                                 t = 0.5),
                      ...) {
  data <-
    .munge_data(
      data,
      unformat = unformat,
      parse_dates = parse_dates,
      clean_address = FALSE
    )

  if (hasName(data, "idAssistance")) {
    # Ids without a branch below fall through to NA in both columns.
    data <- data %>%
      mutate(
        codeAssistance = case_when(
          idAssistance == 2 ~ "A",
          idAssistance == 3 ~ "A",
          idAssistance == 4 ~ "B",
          idAssistance == 5 ~ "B",
          idAssistance == 6 ~ "C",
          idAssistance == 7 ~ "E",
          idAssistance == 8 ~ "F",
          idAssistance == 9 ~ "G",
          idAssistance == 10 ~ "D",
          idAssistance == 11 ~ NA_character_
        ),
        typeAssistance = case_when(
          idAssistance == 2 ~ "BLOCK GRANT",
          idAssistance == 3 ~ "FORMULA GRANT",
          idAssistance == 4 ~ "PROJECT GRANT",
          idAssistance == 5 ~ "COOPERATIVE AGREEMENT",
          idAssistance == 6 ~ "DIRECT PAYMENT FOR SPECIFIED USE, AS A SUBSIDY OR OTHER NON-REIMBURSABLE DIRECT FINANCIAL AID",
          idAssistance == 7 ~ "DIRECT LOAN",
          idAssistance == 8 ~ "GUARANTEED/INSURED LOAN",
          idAssistance == 9 ~ "INSURANCE",
          idAssistance == 10 ~ "DIRECT PAYMENT WITH UNRESTRICTED USE (RETIREMENT, PENSION, VETERANS BENEFITS, ETC.)",
          idAssistance == 11 ~ "OTHER REIMBURSABLE, CONTINGENT, INTANGIBLE, OR INDIRECT FINANCIAL ASSISTANCE"
        )
      )
  }

  if (munge_entities && hasName(data, "nameOrganization")) {
    data <- .munge_organizations(data)
  }

  if (munge_entities && length(entity_columns) > 0) {
    data <-
      refine_columns(
        data = data,
        entity_columns = entity_columns,
        use_business_suffix = use_business_suffix,
        phonics_methods = phonics_methods,
        phonics_length = phonics_length,
        use_n_gram_merge = use_n_gram_merge,
        edit_threshold = edit_threshold,
        ignore_words = ignore_words,
        numgram = numgram,
        weight = weight,
        ...
      )
  }
  data
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.