#' @title fromFILEStoSERIES function
#' @description This function helps to organize the Belgium data by adding extra "Series" row and reformatting data in a proper format.
#' @param dat data frame
#' @param series_title variable name "Series title"
#' @param files variable name for "Files"
#' @param series_scope_note variable name for "Series scope note"
#' @param series_date_range variable name for "Hoover date range"
#' @param scope_and_content variable name for "Scope and content"
#' @param box_barcode variable name for "Box Barcode"
#' @param ckey variable name for "Ckey"
#' @param top_container variable name for "Top container"
#' @param processing_information variable name for "Processing information"
#' @param lang_encoding sets system's locale to specific language (by default "English")
#' @param add_articles vector of extra articles to be excluded from alphabetical ordering
#' @param remove_special_characters remove special characters, i.e. horizontal brackets (TRUE by default)
#' @param alphabetizewithinbrackets alphabetize by a string within square brackets (FALSE by default)
#' @param diacriticslatinization transform characters with diacritics to Latin for alphabetization (TRUE by default)
#' @param USextension file name change to reflect specifics of US data
#' @param ... other parameters
#' @export
#' @importFrom stringdist stringdist
#' @import dplyr
#' @import readxl
#' @import xlsx
#' @import stringi
#' @return Returns altered dataframe.
#' @examples
#'
#' library(HooverArchives)
#' library(readxl)
#' library(xlsx)
#'
#' #Load data and create indices
#'
#' #Open Sheet 1
#' dat2.1<-read.xlsx(system.file("BelgiumData.xlsx", package="HooverArchives"), sheetIndex=1, header=FALSE, encoding = "utf-8")
#' dat2.1[]<-lapply(dat2.1, as.character)
#' colnames(dat2.1)<-as.character(dat2.1[3,])
#' dat2.1<-dat2.1[-(1:3),-c(1,14)];
#' dat2.1$indexW<-dat2.1$`Item title`
#'
#' #Open Sheet 2
#' dat2.2<-read.xlsx(system.file("BelgiumData.xlsx", package="HooverArchives"), sheetIndex=2, header=TRUE, encoding = "utf-8")
#' dat2.2$indexW<- dat2.2$`Packet.Catalog.Title`
#'
#' #Merge two dataframe using BuildIndex and Merge_data functions
#' index_matches<-buildIndex(dat2.1$indexW,dat2.2$indexW,
#' index_simplify=TRUE,
#' fuzzy_matching=TRUE,
#' index_hashing=FALSE)
#' mdat<-mergeData(dat2.1,dat2.2, index_matches)
#'
#' #Use fromFILEStoSERIES() to add the Series row
#' coverted.dat<-fromFILEStoSERIES(dat=mdat,
#' series_title="Series title",
#' files="index",
#' series_scope_note="Series scope note",
#' series_date_range="Hoover date range",
#' scope_and_content="Scope.and.content",
#' problems_notes="Series scope note",
#' box_barcode="Box_Barcode",
#' ckey="Ckey.x",
#' top_container="Final.Box..")
#' coverted.dat$Date<-dateReformatter(coverted.dat$Date)
#' convertedtoArchivesSpace<-subset(coverted.dat, select=c("Title", "Hierarchical_Relationship", "Processing_Information",
#' "CkeyV", "Description_Level", "Date", "Top_Container_[indicator]",
#' "Box_Barcode", "Scope_and_content"), value=TRUE)
#' #Save file in xlsx to preserve diacritic characters
#' #write.xlsx(convertedtoArchivesSpace, "convertedtoArchivesSpace.xlsx", sheetName = "ArchivesSpace", col.names = TRUE)
fromFILEStoSERIES<-function(dat=NULL,
series_title=NULL,
files=NULL,
series_scope_note=NULL,
series_date_range=NULL,
scope_and_content=NULL,
box_barcode=NULL,
top_container=NULL,
processing_information=NULL,
ckey=NULL,
lang_encoding="English",
add_articles=NULL,
remove_special_characters=TRUE,
alphabetizewithinbrackets=FALSE,
diacriticslatinization=TRUE,
USextension=FALSE,
...){
dat[] <- lapply(dat, as.character)
Sys.setlocale('LC_ALL', lang_encoding)
if(isTRUE(remove_special_characters)){
remove_sc = "\U00fe20|\U00fe21|\U00361"
remove_sc<-paste(remove_special_characters, remove_sc, collapse="|")
remove_sc<-gsub("^\\s+","", remove_sc)
dat <- as.data.frame(apply(dat, 2, function(x) gsub(remove_sc, "", x, perl=TRUE)), stringsAsFactors = FALSE)
}
dat[] <- lapply(dat, as.character)
cNames <- colnames(dat)
series_titleV <- dat[,cNames%in%series_title]
series_scope_noteV <- dat[,cNames%in%series_scope_note]
series_date_rangeV <- dat[,cNames%in%series_date_range]
processing_informationV <- dat[,cNames%in%processing_information]
CkeyV <- dat[,cNames%in%ckey]
filesV <- dat[,cNames%in%files]
k=1
series_titleV<-gsub("^\\s+|\\s+$", "", series_titleV)
series_scope_noteV<-gsub("^\\s+|\\s+$", "", series_scope_noteV)
series_titleV<-gsub("^\\s+|\\s+$", "", series_titleV)
TitleS <- paste(series_titleV, '. (', series_scope_noteV, ", ", series_date_rangeV, ")", sep="")
TitleS <- gsub("\\(+|\\(+\\s*\\(+", "(", TitleS); TitleS <- gsub("\\)+|\\)+\\s*\\)+", ")", TitleS)
dat$TitleF <- filesV
dat$Group <- NA
dat$Title <- NA
dat$'Hierarchical Relationship' <- NA
dat$'Description Level' <- NA
dat$CkeyV<-NA
superdat <- data.frame(matrix(NA, dim(dat)[1]*2, dim(dat)[2]))
colnames(superdat)<-colnames(dat)
gr=1
for (i in 1:length(unique(dat$TitleF))){
repObs<-sum(dat$TitleF%in%unique(dat$TitleF)[i])
if(repObs>1){
superdat[k,] <- NA
superdat$Title[k]<-TitleS[which(dat$TitleF%in%unique(dat$TitleF)[i])[1]];
superdat$CkeyV[k]<-CkeyV[which(dat$TitleF%in%unique(dat$TitleF)[i])[1]];
superdat$'Hierarchical Relationship'[k]<-1
superdat$'Description Level'[k]<-"Series"
superdat[(k+1):(k+repObs),] <- dat[which(dat$TitleF%in%unique(dat$TitleF)[i]),];
superdat[(k+1):(k+repObs),'Title'] <- dat[dat$TitleF%in%unique(dat$TitleF)[i],'TitleF'];
superdat[(k+1):(k+repObs),'Hierarchical Relationship'] <- 2
superdat[(k+1):(k+repObs),'Description Level'] <- "File"
superdat[k:(k+repObs), 'Group']<-gr
k=k+repObs+1;
}else{
superdat[k,] <- NA
superdat$Title[k]<-TitleS[which(dat$TitleF%in%unique(dat$TitleF)[i])[1]];
superdat$CkeyV[k]<-CkeyV[which(dat$TitleF%in%unique(dat$TitleF)[i])[1]];
superdat$'Hierarchical Relationship'[k]<-1
superdat$'Description Level'[k]<-"Series"
superdat[(k+1):(k+2),] <- dat[which(dat$TitleF%in%unique(dat$TitleF)[i]),];
superdat[(k+1):(k+2),'Title'] <- dat[dat$TitleF%in%unique(dat$TitleF)[i],'TitleF'];
superdat[(k+1):(k+2),'Hierarchical Relationship'] <- 2
superdat[(k+1):(k+2),'Description Level'] <- "File"
superdat[k:(k+2), 'Group']<-gr
k=k+2
}
gr=gr+1
}
colnames(superdat)<-gsub(" |\\.", "_", colnames(dat))
if(!is.null(top_container)){
superdat$"Top_Container_[indicator]"<- superdat[,names(superdat)%in%gsub("\\.", "_", top_container)]
}
superdat <- superdat[!is.na(superdat$Group),]
if(USextension){
superdat$Title[superdat$Description_Level!="Series"]<-paste(gsub(":", "",
gsub("\\s+(?=:)", "", gsub("(?<=:)\\d+", "", superdat$Title[superdat$Description_Level!="Series"], perl=TRUE), perl=TRUE)), ")", sep="")
}
#reordering
remove_articles<-function(x){
Caps <- function(s) {
unique(c(s,
paste(toupper(substring(s, 1,1)), substring(s, 2), sep=""),
paste(toupper(s), sep="")))
}
articles<-Caps(c("l'", "l", "le", "la", "les", "un", "une", "des", "du", "de", "la", "der", "die", "das", "ein", "eine", "het", add_articles))
x[is.na(x)] <- "NA"
first <- regmatches(x, regexpr("(\\w+)", x))
x[first %in% articles]<-gsub("(^\\w+\\s+)|([[:alpha:]])'", "", x[first%in%articles])
#x <- gsub("\\s*\\([^\\)]+\\)","", x)
x <- tolower(x)
x <- gsub("\\s+", " ", x)
x <- gsub("^\\s+", "", x)
return(x)}
if(alphabetizewithinbrackets){
##rem_nat_char_Title <- gsub("[[:punct:]]| ", "", gsub("^.+(?=[\\[\\{])", "", remove_articles(superdat$Title), perl=TRUE))
#rem_nat_char_Title <- gsub("([\\s])|[[:punct:]]", "\\1", gsub("^.+(?=[\\[\\{])", "", remove_articles(superdat$Title), perl=TRUE))
#rem_nat_char_Title <- gsub('[0-9]+', '', rem_nat_char_Title)
#rem_nat_char_Title <- gsub("\\s+", " ", rem_nat_char_Title)
#rem_nat_char_Title <- gsub("^\\s+|\\s+$", "", rem_nat_char_Title)
#superdat$indexN <- remove_articles(rem_nat_char_Title)
#subset for data with brackets
dwithbrackets<-grepl("\\[|\\]", remove_articles(superdat$Title))
rem_nat_char_Title<-rep(NA, nrow(superdat))
rem_nat_char_Title[dwithbrackets] <- gsub("([\\s])|[[:punct:]]", "\\1", gsub("^.+(?=[\\[\\{])", "", remove_articles(superdat$Title[dwithbrackets]), perl=TRUE))
rem_nat_char_Title[dwithbrackets] <- gsub('[0-9]+', '', rem_nat_char_Title[dwithbrackets])
rem_nat_char_Title[dwithbrackets] <- gsub("\\s+", " ", rem_nat_char_Title[dwithbrackets])
rem_nat_char_Title[dwithbrackets] <- gsub("^\\s+|\\s+$", "", rem_nat_char_Title[dwithbrackets])
superdat$indexN[dwithbrackets] <- remove_articles(rem_nat_char_Title[dwithbrackets])
#subset for data without brackets
dwithoutbrackets<-!grepl("\\[|\\]", remove_articles(superdat$Title))
superdat$indexN[dwithoutbrackets] <- gsub("([\\s])|[[:punct:]]", "\\1", remove_articles(superdat$Title[dwithoutbrackets]), perl=TRUE)
superdat$indexN[dwithoutbrackets] <- gsub('[0-9]+', '', superdat$indexN[dwithoutbrackets])
superdat$indexN[dwithoutbrackets] <- gsub("\\s+", " ", superdat$indexN[dwithoutbrackets])
superdat$indexN[dwithoutbrackets] <- gsub("^\\s+|\\s+$", "", superdat$indexN[dwithoutbrackets])
}else{
superdat$indexN <- gsub("([\\s])|[[:punct:]]", "\\1", remove_articles(superdat$Title), perl=TRUE)
#superdat$indexN <- gsub("\u00B4", "", superdat$indexN)
superdat$indexN <- gsub('[0-9]+', '', superdat$indexN)
superdat$indexN <- gsub("\\s+", " ", superdat$indexN)
superdat$indexN <- gsub("^\\s+|\\s+$", "", superdat$indexN)
if(diacriticslatinization){
Encoding(superdat[["indexN"]]) <- "UTF-8"
superdat$indexN <- stri_trans_general(superdat$indexN,"Latin-ASCII")
}
}
index_frame<-superdat%>%select(Hierarchical_Relationship, Group, indexN)%>%
filter(Hierarchical_Relationship==1)%>%
arrange(indexN)%>%
mutate(GroupN = 1:n())
superdat$GroupN<-sapply(superdat$Group, function(x) index_frame$GroupN[index_frame$Group%in%x])
superdat <- arrange(superdat, GroupN, Hierarchical_Relationship, indexN, Date)
#g The scope and contents notes CHANGES:
if(!is.null(scope_and_content)){
superdat[,gsub("\\.|\\s+", "_", scope_and_content)]<-
gsub("many numbers missing", "many issues missing", superdat[,gsub("\\.", "_", scope_and_content)])
}
if(!is.null(scope_and_content)){
superdat[,gsub("\\.|\\s+", "_", scope_and_content)]<-
gsub("\\s+[Xx]\\s+", "", superdat[,gsub("\\.", "_", scope_and_content)])
}
superdat[is.na(superdat)] <- ""
for (name in colnames(superdat[,sapply(superdat, is.character)])){
Encoding(superdat[[name]]) <- "UTF-8"}
#remove rows with NAs in Title
superdat <- superdat[!grepl("(^NA\\.*$)|(^NA\\.)", superdat$Title),]
superdat$Title <- gsub("\\(+|\\(+\\s*\\(+", "(", superdat$Title)
superdat$Title <- gsub("\\)+|\\)+\\s*\\)+", ")", superdat$Title)
return(superdat)}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.