# scripts/clean.scottish.capital.R In corybrunson/scottish.capital: Data from Scott and Hughes' *The Anatomy of Scottish Capital*

```library(igraph)

# COMPANY NAMES

# Vector of company names
companies <- unlist(sapply(scottish.capital,
function(g) V(g)\$name[V(g)\$type]))
length(unique(companies))

# Convert " and " to " & "
companies <- gsub(' and ', ' & ', companies)
length(unique(companies))

# Convert all "William" company names to "Wm"
companies <- gsub('William', 'Wm', companies)
length(unique(companies))

# Changed names
wh.rep <- grep('\\(formerly', companies)
name.dat <- unique(data.frame(
old = I(gsub('^.* \\(formerly (.*)\\)\$', '\\1', companies[wh.rep])),
new = I(gsub('^(.*) \\(formerly .*\\)\$', '\\1', companies[wh.rep]))
))
# Where else do the original names appear?
wh.old <- lapply(1:nrow(name.dat), function(i) {
setdiff(grep(name.dat\$old[i], companies), wh.rep)
})
# Where else do the changed names appear?
wh.new <- lapply(1:nrow(name.dat), function(i) {
grep(name.dat\$new[i], companies)
})
# Replace all instances with changed names
for(i in 1:nrow(name.dat)) {
companies[union(wh.old[[i]], wh.new[[i]])] <- name.dat\$new[i]
}
length(unique(companies))

# Combine companies that begin the same way
uniq.companies <- unique(companies)
wh.prefix <- lapply(1:length(uniq.companies), function(i) {
grep(paste0('^', uniq.companies[i]), uniq.companies)
})
wh.prefix <- wh.prefix[which(sapply(wh.prefix, length) > 1)]
prefix.companies <- lapply(wh.prefix, function(vec) uniq.companies[vec])
prefix.tab <- lapply(prefix.companies, function(vec) {
table(companies[which(companies %in% vec)])
})
for(i in 1:length(prefix.tab)) {
name <- names(prefix.tab[[i]])[which(prefix.tab[[i]] ==
max(prefix.tab[[i]]))[1]]
companies[which(companies %in% names(prefix.tab[[i]]))] <- name
}
length(unique(companies))

# Rename event nodes
i <- 1
for(j in 1:length(scottish.capital)) {
wh.event <- which(V(scottish.capital[[j]])\$type)
V(scottish.capital[[j]])\$name[wh.event] <-
companies[i:(i + length(wh.event) - 1)]
i <- i + length(wh.event)
}

# DIRECTOR NAMES

# Vector of director names
directors <- unlist(sapply(scottish.capital,
function(g) V(g)\$name[!V(g)\$type]))
length(unique(directors))

# Convert " & " to " and "
directors <- gsub(' & ', ' and ', directors)
length(unique(directors))

# Combine directors that begin the same way (BEWARE OF REG EXPR CHARACTERS)
uniq.directors <- unique(directors)
wh.prefix <- lapply(1:length(uniq.directors), function(i) {
grep(paste0('^', uniq.directors[i]), uniq.directors)
})
wh.prefix <- wh.prefix[which(sapply(wh.prefix, length) > 1)]
prefix.directors <- lapply(wh.prefix, function(vec) uniq.directors[vec])
prefix.tab <- lapply(prefix.directors, function(vec) {
table(directors[which(directors %in% vec)])
})
for(i in 1:length(prefix.tab)) {
name <- names(prefix.tab[[i]])[which(prefix.tab[[i]] ==
max(prefix.tab[[i]]))[1]]
directors[which(directors %in% names(prefix.tab[[i]]))] <- name
}
length(unique(directors))

# Presence of an honorific
hons <- c('Sir', 'Maj\\. Gen\\.', 'Col\\.', 'Hon\\.')
for(hon in hons) {
# Determine which names come both with and without the honorific
w.hon <- grep(paste0('^', hon, ' '), directors)
wo.hon <- sub(paste0('^', hon, ' (.*)\$'), '\\1', directors[w.hon])
wwo.hon <- wo.hon[which(wo.hon %in% directors)]
# Provided at most one interval is skipped over between appearances,
# replace all instances with the one with the honorific
for(wwo in wwo.hon) {
wh.graph <- which(sapply(scottish.capital, function(g) {
any(grepl(paste0('^(', hon, ' ){0,1}', wwo, '\$'),
V(g)\$name))
}))
if(length(wh.graph) > 1) {
if(max(diff(wh.graph)) > 2 | max(wh.graph) - min(wh.graph) > 3) next
}
wh.dirs <- which(grepl(paste0('^(', hon, ' ){0,1}', wwo, '\$'),
directors))
directors[wh.dirs] <- paste0(hon, ' ', wwo)
}
}

# Manual name equivalences across intervals
equivs <- c(
'^Sir R\\.(W\\.){0,1} Anstruther\$',
'^Sir A\\. {0,1}S(\\.|teven) Bilsland\$',
'^Sir A\\.(C\\.){0,1} Blair\$',
'^(Sir ){0,1}J(as|ames){0,1}. {0,1}(C|I). Campbell\$',
'^Sir J(\\.|ohn )T. Cargill\$',
'^J(\\.|ohn) Cowan\$',
'^H\\.(U\\.){0,1} Cunningham\$',
'^Sir Maurice (E\\. ){0,1}Denny\$',
'^J\\.(A\\.){0,1} Dewar\$',
'^Lord (George|Nigel) Douglas-Hamilton\$',
'^R(\\.|alph )(W\\. ){0,1}Dundas\$',
'^Sir R(\\.|obert) Erskine-Hill\$',
'^R\\.(E\\.){0,1} Findlay\$',
'^(R|T)\\.D\\. Findlay\$',
'^(Sir ){0,1}H(\\.|ugh) Fraser\$',
'^A\\.(B\\.){0,1} Gilroy\$',
'^Sir L\\.(G|J)\\. Grant\$',
'^Sir G\\.(C\\.){0,1} Harvie-Watt\$',
'^(Capt\\. ){0,1}J\\.(F\\.){0,1}H\\. Houldsworth\$',
'^B\\.(G|J)\\. Ivory\$',
'^H(\\.|enry) Lithgow\$',
'^M\\. Ma{0,1}cDougall\$',
'^Sir A\\.(F\\.){0,1} McDonald\$',
'^Sir J(\\.|ohn) Muir\$',
'^Sir H(\\.|ugh) Rose\$',
'^R\\.(H|M)\\. Sinclair\$',
'^D\\.(J\\.){0,1} Smith\$',
'^Sir D(\\.|ouglas) Thomson\$',
'^(J|T)\\.W\\. Tod\$',
'^Sir Ernest (M\\. ){0,1}Wedderburn\$',
'^Sir G(\\.|eorge) Williamson\$',
'(B|H)\\.C\\. Wilson\$',
'^C\\.F\\.(J\\.){0,1} Younger\$',
'^(Sir ){0,1}W(\\.|m )McE\\. Younger\$'
)
for(equiv in equivs) {
# Provided no intervals are skipped over between appearances...
wh.graphs <- which(sapply(scottish.capital, function(g) {
any(grep(equiv, V(g)\$name))
}))
if(length(wh.graphs) > 1)
if(any(diff(wh.graphs) > 1)) {
print(equiv)
next
}
# ...replace each instance with the most recent version to appear twice,
# or else the most recent
wh <- grep(equiv, directors)
tab <- table(directors[wh])
tw <- names(tab)[which(tab > 1)]
ok <- directors[wh[which(directors[wh] %in% tw)]]
use <- if(length(ok) > 0) ok[length(ok)] else directors[wh[length(wh)]]
directors[wh] <- use
}
length(unique(directors))

# Rename nodes according to renamed directors
i <- 0
sc <- 1
while(i < length(directors)) {
wh <- which(!V(scottish.capital[[sc]])\$type)
j <- length(wh)
V(scottish.capital[[sc]])\$name[wh] <- directors[(i + 1):(i + j)]
i <- i + j
sc <- sc + 1
}

# Combine nodes with same name
for(i in 1:length(scottish.capital)) {
scottish.capital[[i]] <- contract.vertices(
scottish.capital[[i]],
factor(V(scottish.capital[[i]])\$name),
'first'
)
}

# Remove directors that only appear once
for(i in 1:length(scottish.capital)) {
wh <- which(
(!V(scottish.capital[[i]])\$type) & (degree(scottish.capital[[i]]) == 1)
)
if(length(wh) == 0) next
print('Deleted directors (appear only once)')
print(c(i, V(scottish.capital[[i]])\$name[wh]))
scottish.capital[[i]] <- delete.vertices(scottish.capital[[i]], wh)
}

save(list = 'scottish.capital', file = 'data/scottish.capital.rda')

rm(list = ls())
```
corybrunson/scottish.capital documentation built on May 13, 2019, 10:52 p.m.