# This script contains functions for importing common bibliographic formats into R
# user-accessible function
read_bibliography <- function(
filename,
return_df = TRUE
){
invisible(Sys.setlocale("LC_ALL", "C"))
on.exit(invisible(Sys.setlocale("LC_ALL", "")))
if(missing(filename)){
stop("filename is missing with no default")
}
file_check <- unlist(lapply(filename, file.exists))
if(any(!file_check)){
stop("file not found")
}
if(length(filename) > 1){
result_list <- lapply(filename, function(a, df){
read_bibliography_internal(a, df)
},
df = return_df
)
names(result_list) <- filename
if(return_df){
result <- merge_columns(result_list)
result$filename <- unlist(
lapply(seq_len(length(result_list)),
function(a, data){
rep(names(data)[a], nrow(data[[a]]))
},
data = result_list
))
if(any(colnames(result) == "label")){
result$label <- make.unique(result$label)
}
return(result)
}else{
result <- do.call(c, result_list)
return(result)
}
}else{
return(
read_bibliography_internal(filename, return_df)
)
}
}
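# Example usage (illustrative only; "search_1.ris" and "search_2.bib" are hypothetical file names):
# df <- read_bibliography("search_1.ris") # a single file returns a data.frame
# df_merged <- read_bibliography(c("search_1.ris", "search_2.bib")) # multiple files are merged, with a 'filename' column added
# bib <- read_bibliography("search_1.ris", return_df = FALSE) # returns a 'bibliography' object instead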
# underlying workhorse function
read_bibliography_internal <- function(
filename,
return_df = TRUE
){
if(grepl(".csv$", filename)){
result <- revtools_csv(filename)
if(!return_df){
result <- as.bibliography(result)
}
}else{
# import the raw text of the file
z <- tryCatch(
{
scan(filename,
sep = "\t",
what = "character",
quote = "",
quiet = TRUE,
blank.lines.skip = FALSE
)
},
warning = function(w){
stop(
"file import failed: data type not recognized by read_bibliography",
call. = FALSE
)
},
error = function(e){
stop(
"file import failed: data type not recognized by read_bibliography",
call. = FALSE
)
}
)
Encoding(z) <- "latin1"
z <- gsub("<[[:alnum:]]{2}>", "", z) # remove <xx> byte markers left over from the encoding conversion above
# detect whether the file is bib-like or ris-like by comparing the frequency of curly brackets (bib) and " - " tag separators (ris) in the first rows
nrows <- min(c(200, length(z)))
zsub <- z[seq_len(nrows)]
n_brackets <- length(grep("\\{", zsub))
n_dashes <- length(grep(" - ", zsub))
if(n_brackets > n_dashes){
result <- read_bib(z) # simple case - no further work needed
}else{ # ris format can be inconsistent; custom code needed
if(grepl(".ciw$", filename)){
tag_type <- "wos"
}else{
tag_type <- "ris"
}
z_dframe <- prep_ris(z, detect_delimiter(zsub))
# import appropriate format
if(any(z_dframe$ris == "PMID")){
result <- read_medline(z_dframe)
}else{
result <- read_ris(z_dframe, tag_type)
}
}
if(return_df){
result <- as.data.frame(result)
}
}
return(result)
}
rollingsum <- function(a, n = 2L){
tail(cumsum(a) - cumsum(c(rep(0, n), head(a, -n))), -n + 1)
}
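# worked example: rollingsum() sums each window of n consecutive values, so with the default n = 2
# rollingsum(c(1, 0, 0, 1, 1)) returns c(1, 0, 1, 2), i.e. 1+0, 0+0, 0+1, 1+1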
# detect the delimiter between references, checking first for lines that start with "ER"
detect_delimiter <- function(x){
if(any(grepl("^ER", x))){
delimiter <- "endrow"
}else{
# special break: same character repeated >6 times, no other characters
char_list <- strsplit(x, "")
char_break_test <- unlist(
lapply(char_list,
function(a){length(unique(a)) == 1 & length(a > 6)}
)
)
if(any(char_break_test)){
delimiter <- "character"
}else{
# use space as a ref break (last choice)
space_break_check <- unlist(lapply(
char_list,
function(a){all(a == "" | a == " ")}
))
if(any(space_break_check)){
delimiter <- "space"
}else{
stop("import failed: unknown reference delimiter")
}
}
}
return(delimiter)
}
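# illustrative examples (hypothetical inputs) of how each delimiter is detected:
# detect_delimiter(c("TY  - JOUR", "AU  - Smith, J.", "ER  - ")) # "endrow"
# detect_delimiter(c("PT J", "AU Smith, J.", "--------")) # "character"
# detect_delimiter(c("PT J", "AU Smith, J.", "")) # "space"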
prep_ris <- function(
z,
delimiter
){
# detect tags
tags <- regexpr(
"^([[:upper:]]{2,4}|[[:upper:]]{1}[[:digit:]]{1})\\s{0,}-{0,2}\\s{0,}",
perl = TRUE,
z
)
z_dframe <- data.frame(
text = z,
row = seq_along(z),
match_length = attr(tags, "match.length"),
stringsAsFactors = FALSE
)
z_list <- split(z_dframe, z_dframe$match_length)
z_list <- lapply(z_list, function(a){
n <- a$match_length[1]
if(n < 0){
result <- data.frame(
ris = "",
text = a$text,
row_order = a$row,
stringsAsFactors = FALSE
)
}else{
result <- data.frame(
ris = sub("\\s{0,}-\\s{0,}|^\\s+|\\s+$", "", substr(a$text, 1, n)),
text = gsub("^\\s+|\\s+$", "", substr(a$text, n+1, nchar(a$text))),
row_order = a$row,
stringsAsFactors = FALSE
)
}
return(result)
})
z_dframe <- do.call(rbind, z_list)
z_dframe <- z_dframe[order(z_dframe$row), ]
# replace tag information for delimiter == character | space
if(delimiter == "character"){ # i.e. a single character repeated many times
z_dframe$ris[which(
unlist(lapply(
strsplit(z, ""),
function(a){
length(unique(a)) == 1 & length(a > 6)
}
))
)] <- "ER"
}
if(delimiter == "space"){
z_dframe$ris[which(z_dframe$ris == "" & z_dframe$text == "")] <- "ER"
# ensure multiple consecutive empty rows are removed
z_rollsum <- rollingsum(z_dframe$ris == "ER")
if(any(z_rollsum > 1)){
z_dframe <- z_dframe[which(z_rollsum <= 1), ]
}
}
if(delimiter == "endrow"){
# work out what the most common starting tag is
z_dframe$ref <- c(0, cumsum(z_dframe$ris == "ER")[
seq_len(nrow(z_dframe)-1)]
) # split by reference
start_tags <- unlist(lapply(
split(z_dframe$ris, z_dframe$ref),
function(a){a[which(a != "")[1]]}
))
start_tag <- names(which.max(xtabs(~ start_tags )))
# use the start tag and "ER" rows to delimit each reference
row_df <- data.frame(
start = which(z_dframe$ris == start_tag),
end = which(z_dframe$ris == "ER")
)
z_list <- apply(
row_df,
1,
function(a){c(a[1]:a[2])}
)
z_list <- lapply(
z_list,
function(a, lookup){lookup[a, ]},
lookup = z_dframe
)
z_dframe <- as.data.frame(
do.call(rbind, z_list)
)
}
# cleaning
z_dframe$ref <- c(0, cumsum(z_dframe$ris == "ER")[
seq_len(nrow(z_dframe)-1)]
) # split by reference
z_dframe <- z_dframe[which(z_dframe$text != ""), ] # remove empty rows
z_dframe <- z_dframe[which(z_dframe$ris != "ER"), ] # remove end rows
z_dframe$text <- trimws(z_dframe$text)
# fill missing tags
z_split <- split(z_dframe, z_dframe$ref)
z_split <- lapply(z_split, function(a){
if(a$ris[1] == ""){
a$ris[1] <- "ZZ"
}
accum_ris <- Reduce(c, a$ris, accumulate = TRUE)
a$ris <- unlist(lapply(
accum_ris,
function(b){
good_vals <- which(b != "")
b[good_vals[length(good_vals)]]
}))
return(a)
})
z_dframe <- as.data.frame(
do.call(rbind, z_split)
)
return(z_dframe)
}
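# illustrative example (hypothetical input): for a tagged, "ER"-delimited file such as
# z <- c("TY  - JOUR", "AU  - Smith, J.", "ER  - ")
# prep_ris(z, detect_delimiter(z)) returns a data.frame with one row per tagged line,
# with columns ris ("TY", "AU"), text ("JOUR", "Smith, J."), row_order and ref
# (a running reference index); empty and "ER" rows are removed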
read_medline <- function(x){
x_merge <- merge(x,
tag_lookup(type = "medline"),
by = "ris",
all.x = TRUE,
all.y = FALSE
)
x_merge <- x_merge[order(x_merge$row_order), ]
# convert into a list, where each reference is a separate entry
x_split <- split(x_merge[c("bib", "text")], x_merge$ref)
x_final <- lapply(x_split, function(a){
result <- split(a$text, a$bib)
if(any(names(result) == "abstract")){
result$abstract <- paste(result$abstract, collapse = " ")
}
if(any(names(result) == "title")){
if(length(result$title) > 1){
result$title <- paste(result$title, collapse = " ")
}
}
if(any(names(result) == "term_other")){
names(result)[which(names(result) == "term_other")] <- "keywords"
}
if(any(names(result) == "date_published")){
result$year <- substr(result$date_published, start = 1, stop = 4)
}
if(any(names(result) == "article_id")){
doi_check <- grepl("doi", result$article_id)
if(any(doi_check)){
result$doi <- strsplit(result$article_id[which(doi_check)], " ")[[1]][1]
}
}
return(result)
})
names(x_final) <- unlist(lapply(x_final, function(a){a$pubmed_id}))
class(x_final) <- "bibliography"
return(x_final)
}
# generate unique labels for entries, using as much author, year & journal data as possible
generate_bibliographic_names <- function(x){
nonunique_names <- unlist(lapply(x, function(a){
name_vector <- rep("", 3)
if(any(names(a) == "author")){
name_vector[1] <- strsplit(a$author[1], ",")[[1]][1]
}
if(any(names(a) == "year")){
name_vector[2] <- a$year[1]
}
if(any(names(a) == "journal")){
journal_info <- strsplit(a$journal, " ")[[1]]
name_vector[3] <- paste(
substr(journal_info, 1, min(nchar(journal_info), 4)),
collapse = "")
}
name_vector <- name_vector[which(name_vector != "")]
if(length(name_vector) == 0){
return("ref")
}else{
return(paste(name_vector, collapse = "_"))
}
}))
# where this is not possible, give a 'ref1' style result
if(any(nonunique_names == "ref")){
rows_tr <- which(nonunique_names == "ref")
nonunique_names[rows_tr] <- create_index("ref", length(rows_tr))
}
# ensure names are unique
if(length(unique(nonunique_names)) < length(nonunique_names)){
nonunique_names <- make.unique(nonunique_names, sep = "_")
}
return(nonunique_names)
}
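# illustrative example (hypothetical entry): an entry with author "Smith, J.",
# year "2004" and journal "Ecology" is labelled "Smith_2004_Ecol";
# entries lacking all three fields fall back to "ref"-style labels via create_index()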
# RIS
read_ris <- function(x, tag_type = "ris"){
# merge data with lookup info, to provide bib-style tags
x_merge <- merge(x, tag_lookup(type = tag_type),
by = "ris",
all.x = TRUE,
all.y = FALSE)
x_merge <- x_merge[order(x_merge$row_order), ]
# store ris tags with no .bib equivalent rather than discarding them
if(any(is.na(x_merge$bib))){
rows_tr <- which(is.na(x_merge$bib))
x_merge$bib[rows_tr] <- x_merge$ris[rows_tr]
if(all(is.na(x_merge$row_order))){
start_val <- 0
}else{
start_val <- max(x_merge$row_order, na.rm = TRUE)
}
x_merge$row_order[rows_tr] <- as.numeric(
as.factor(x_merge$ris[rows_tr])
) + start_val
}
# method to systematically search for year data
year_check <- regexpr("^\\d{4}$", x_merge$text)
if(any(year_check > 0)){
check_rows <- which(year_check > 0)
year_strings <- as.numeric(x_merge$text[check_rows])
# for entries with a bib entry labelled year, check that there aren't multiple years
if(any(x_merge$bib[check_rows] == "year", na.rm = TRUE)){
# check for repeated year information
year_freq <- xtabs(~ ref, data = x_merge[which(x_merge$bib == "year"), ])
if(any(year_freq > 1)){
year_df <- x_merge[which(x_merge$bib == "year"), ]
year_list <- split(nchar(year_df$text), year_df$ris)
year_4 <- abs(4 - unlist(lapply(year_list, mean)))
# relabel 'year' entries whose ris tag is not the one closest to 4 characters as 'year_additional'
incorrect_rows <- which(
x_merge$ris != names(which.min(year_4)[1]) &
x_merge$bib == "year"
)
x_merge$bib[incorrect_rows] <- "year_additional"
}
}else{
possible_rows <- which(
year_strings > 0 &
year_strings <= as.numeric(format(Sys.Date(), "%Y")) + 1
)
tag_frequencies <- as.data.frame(
xtabs(~ x_merge$ris[check_rows[possible_rows]]),
stringsAsFactors = FALSE
)
colnames(tag_frequencies) <- c("tag", "n")
# now work out what proportion of entries under each tag contain year data
# compare against number of references to determine likelihood of being 'the' year tag
tag_frequencies$prop <- tag_frequencies$n/(max(x_merge$ref)+1) # number of references
if(any(tag_frequencies$prop > 0.9)){
year_tag <- tag_frequencies$tag[which.max(tag_frequencies$prop)]
rows.tr <- which(x_merge$ris == year_tag)
x_merge$bib[rows.tr] <- "year"
x_merge$row_order[rows.tr] <- 3
# the following code was necessary when strings >4 characters long were detected
# x_merge$text[rows.tr] <- substr(
# x = x_merge$text[rows.tr],
# start = year_check[rows.tr],
# stop = year_check[rows.tr]+3
# )
}
}
}
# use code from blog.datacite.org for doi detection
# then return a consistent format - i.e. no www.dx.doi.org/ etc.
# regexpr("/^10.d{4,9}/[-._;()/:A-Z0-9]+$/i", test) # original code
# doi_check <- regexpr("/10.\\d{4,9}/", x_merge$text) # my version
# if(any(doi_check > 0)){
# check_rows <- which(doi_check > 0)
# x_merge$bib[check_rows] <- "doi"
# x_merge$row_order[check_rows] <- 11
# x_merge$text[check_rows] <- substr(
# x = x_merge$text[check_rows],
# start = doi_check[check_rows]+1,
# stop = nchar(x_merge$text[check_rows])
# )
# }
# ensure author data are drawn from a single ris tag
if(any(x_merge$bib == "author")){
lookup.tags <- xtabs( ~ x_merge$ris[which(x_merge$bib == "author")])
if(length(lookup.tags) > 1){
replace_tags <- names(which(lookup.tags < max(lookup.tags)))
replace_rows <- which(x_merge$ris %in% replace_tags)
x_merge$bib[replace_rows] <- x_merge$ris[replace_rows]
if(all(is.na(x_merge$row_order))){
start_val <- 0
}else{
start_val <- max(x_merge$row_order, na.rm = TRUE)
}
x_merge$row_order[replace_rows] <- start_val + as.numeric(
as.factor(x_merge$ris[replace_rows])
)
}
}
# convert into a list, where each reference is a separate entry
x_split <- split(x_merge[c("bib", "ris", "text", "row_order")], x_merge$ref)
# convert to list format
x_final <- lapply(x_split, function(a){
result <- split(a$text, a$bib)
# YEAR
if(any(names(result) == "year")){
if(any(nchar(result$year) >= 4)){
year_check <- regexpr("\\d{4}", result$year)
if(any(year_check > 0)){
result$year <- substr(
x = result$year[which(year_check>0)],
start = year_check[1],
stop = year_check[1]+3
)
}else{
result$year <- ""
}
}else{
result$year <- ""
}
}
# TITLE
if(any(names(result) == "title")){
if(length(result$title) > 1){
if(result$title[1] == result$title[2]){
result$title <- result$title[1]
}else{
result$title <- paste(result$title, collapse = " ")
}
}
result$title <- gsub("\\s+", " ", result$title) # remove multiple spaces
result$title <- sub("\\.$", "", result$title) # remove final full stops
}
# JOURNAL
if(any(names(result) == "journal")){
unique_journals <- unique(result$journal)
if(length(unique_journals)>1){
unique_journals <- unique_journals[order(
nchar(unique_journals),
decreasing = FALSE
)]
result$journal <- unique_journals[1]
result$journal_secondary <- paste(
unique_journals[c(2:length(unique_journals))],
collapse = "; "
)
}else{
result$journal <- unique_journals
}
result$journal <- gsub("\\s+", " ", result$journal) # remove multiple spaces
result$journal <- sub("\\.$", "", result$journal) # remove final full stops
}
# ABSTRACT
if(length(result$abstract) > 1){
result$abstract <- paste(result$abstract, collapse = " ")
result$abstract <- gsub("\\s+", " ", result$abstract) # remove multiple spaces
}
# PAGE NUMBER
if(any(names(result) == "pages")){
if(length(result$pages) > 1){
result$pages <- paste(sort(result$pages), collapse = "-")
}
}
entry_order <- unlist(lapply(
names(result),
function(b, initial){
initial$row_order[which(initial$bib == b)[1]]
},
initial = a
))
final_result <- result[order(entry_order)]
return(final_result)
})
names(x_final) <- generate_bibliographic_names(x_final)
class(x_final) <- "bibliography"
return(x_final)
}
# BIB
read_bib <- function(x){
# which lines start with "@" (i.e. begin a new entry)?
group_vec <- rep(0, length(x))
row_id <- which(regexpr("^@", x) == 1)
group_vec[row_id] <- 1
group_vec <- cumsum(group_vec)
# work out row names
ref_names <- gsub(".*\\{|,$", "", x[row_id])
ref_type <- gsub(".*@|\\{.*", "", x[row_id])
# split by reference
x_split <- split(x[-row_id], group_vec[-row_id])
length_vals <- unlist(lapply(x_split, length))
x_split <- x_split[which(length_vals > 3)]
x_final <- lapply(x_split, function(z){
# first use a stringent lookup term to locate only tagged rows
delimiter_lookup <- regexpr(
"^[[:blank:]]*([[:alnum:]]|[[:punct:]])+[[:blank:]]*=[[:blank:]]*\\{+",
z
)
delimiter_rows <- which(delimiter_lookup != -1)
other_rows <- which(delimiter_lookup == -1)
delimiters <- data.frame(
row = delimiter_rows,
location = regexpr("=", z[delimiter_rows])
)
split_tags <- apply(delimiters, 1, function(a, lookup){
c(
row = as.numeric(a[1]),
tag = substr(
x = lookup[a[1]],
start = 1,
stop = a[2] - 1
),
value = substr(
x = lookup[a[1]],
start = a[2] + 1,
stop = nchar(lookup[a[1]])
)
)
},
lookup = z
)
entry_dframe <- rbind(
as.data.frame(
t(split_tags),
stringsAsFactors = FALSE
),
data.frame(
row = other_rows,
tag = NA,
value = z[other_rows],
stringsAsFactors = FALSE
)
)
entry_dframe$row <- as.numeric(entry_dframe$row)
entry_dframe <- entry_dframe[order(entry_dframe$row), c("tag", "value")]
if(any(entry_dframe$value == "}")){
entry_dframe <- entry_dframe[seq_len(which(entry_dframe$value == "}")[1]-1), ]
}
if(any(entry_dframe$value == "")){
entry_dframe <- entry_dframe[-which(entry_dframe$value == ""), ]
}
# remove whitespace
entry_dframe <- as.data.frame(
lapply(entry_dframe, trimws),
stringsAsFactors = FALSE
)
# remove 1 or more opening brackets
entry_dframe$value <- gsub("^\\{+", "", entry_dframe$value)
# remove 1 or more closing brackets followed by zero or more punctuation marks
entry_dframe$value <- gsub("\\}+[[:punct:]]*$", "", entry_dframe$value)
# convert each entry to a list
label_group <- rep(0, nrow(entry_dframe))
tag_rows <- which(entry_dframe$tag != "")
label_group[tag_rows] <- 1
tag_names <- entry_dframe$tag[tag_rows]
entry_list <- split(
entry_dframe$value,
cumsum(label_group)+1
)
names(entry_list) <- tolower(
gsub("^\\s+|\\s+$", "", tag_names)
)
entry_list <- lapply(entry_list,
function(a){paste(a, collapse = " ")}
)
if(any(names(entry_list) == "author")){
if(length(entry_list$author) == 1){
entry_list$author <- strsplit(entry_list$author, " and ")[[1]]
}
}
return(entry_list)
})
# add type
x_final <- lapply(
seq_len(length(x_final)),
function(a, type, data){
c(type = type[a], data[[a]])
},
type = ref_type,
data = x_final
)
names(x_final) <- ref_names
class(x_final) <- "bibliography"
return(x_final)
}
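# illustrative example (hypothetical input): a minimal .bib record, as it appears after scan(),
# parses to a one-entry 'bibliography' object:
# x <- c(
#   "@article{Smith2004,",
#   "author = {Smith, J. and Jones, K.},",
#   "title = {An example title},",
#   "year = {2004}",
#   "}"
# )
# read_bib(x)
# # $Smith2004 contains type = "article", author = c("Smith, J.", "Jones, K."),
# # title = "An example title" and year = "2004"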