#' Find common patterns (substrings) shared by a set of strings
#'
#' Enumerates every substring of at least three characters in each element of
#' `strings.vec`, counts how often each one occurs, and writes the results as
#' tab-delimited text files in the current working directory:
#' one `<name>_f2up.txt` file holding patterns that occur at least twice, and
#' one `<name>_<cutoff>up.txt` file per percentage cutoff in
#' `seq(low, high, by = interval)`. `<name>` is the deparsed expression that
#' was passed as `strings.vec`.
#'
#' Output columns: `Pattern`, `Freq_grp` (all occurrences), `Percent_grp`
#' (`Freq_grp` / number of strings), `Length`, `Freq_str` (occurrences counted
#' at most once per string), `Percent_str`, and, when `eveChar.df` is given,
#' `Event_name`.
#'
#' @param strings.vec Vector of strings to analyse (coerced with
#'   `as.character()`). Must be non-empty and free of `NA`.
#' @param low,high,interval Percentage cutoffs; a file is written for each
#'   value of `seq(low, high, by = interval)`, keeping the patterns whose
#'   `Freq_grp` / number-of-strings ratio is at least that percentage.
#' @param eveChar.df Optional lookup table. Column 1 holds event names and
#'   column 2 the single characters that stand for them. When supplied, each
#'   pattern is translated character by character and the comma-separated
#'   result is appended as column `Event_name`.
#' @return A character string reporting the number of input strings and the
#'   number of distinct patterns found.
#' @export
CommonPattern <-
function(strings.vec, low = 5, high = 25, interval = 5, eveChar.df){
  ##### 1. Prepare names and validate input
  # Capture the caller's expression before touching the argument; collapse
  # because deparse() may return several lines for a long expression.
  strings.vec_name <- paste(deparse(substitute(strings.vec)), collapse = "")
  convert_events <- !missing(eveChar.df)

  strings <- as.character(strings.vec)
  numStrings <- length(strings)
  if (numStrings == 0L) {
    stop("'strings.vec' must contain at least one string.", call. = FALSE)
  }
  if (anyNA(strings)) {
    stop("'strings.vec' must not contain missing values.", call. = FALSE)
  }

  ##### 2. Local helpers
  # All substrings of length >= 3 of a single string (none if it is shorter).
  substrings_of <- function(s) {
    n <- nchar(s)
    if (n < 3L) {
      return(character(0))
    }
    n_starts <- n - 2L
    # start i is paired with every end in (i + 2):n
    starts <- rep(seq_len(n_starts), times = n_starts:1L)
    ends <- starts + 1L + sequence(n_starts:1L)
    substring(s, starts, ends)
  }

  # Format a ratio as a percentage string, e.g. 0.5 -> "50.00%".
  percent <- function(x, digits = 2, format = "f", ...) {
    if (length(x) == 0L) {
      return(character(0)) # paste0() would otherwise return a lone "%"
    }
    paste0(formatC(100 * x, format = format, digits = digits, ...), "%")
  }

  # Translate each pattern into comma-separated event names; characters with
  # no entry in the lookup table are kept unchanged.
  to_event_names <- function(patterns) {
    if (is.data.frame(eveChar.df)) {
      event_names <- eveChar.df[[1]]
      event_chars <- eveChar.df[[2]]
    } else {
      event_names <- eveChar.df[, 1]
      event_chars <- eveChar.df[, 2]
    }
    # as.character() so that factor columns are mapped by label
    event_names <- as.character(event_names)
    event_chars <- as.character(event_chars)
    vapply(strsplit(patterns, split = ""), function(chars) {
      idx <- match(chars, event_chars)
      hit <- !is.na(idx)
      chars[hit] <- event_names[idx[hit]]
      paste(chars, collapse = ",")
    }, character(1), USE.NAMES = FALSE)
  }

  ##### 3. Get substrings and their frequencies
  subStr.list <- lapply(strings, substrings_of)

  # 3.1 'all': every occurrence counts
  all.tab <- table(as.character(unlist(subStr.list, use.names = FALSE)))
  patterns <- as.character(names(all.tab))
  freq_grp <- as.vector(all.tab)

  # 3.2 'once': a pattern counts at most once per string,
  # e.g. in 'abcdabc', 'abc' is only counted once.
  once.tab <- table(as.character(
    unlist(lapply(subStr.list, unique), use.names = FALSE)
  ))
  # align by pattern rather than by row position
  freq_str <- as.vector(once.tab)[match(patterns, names(once.tab))]

  ##### 4. Assemble and sort the result table
  pattRatio <- freq_grp / numStrings
  subStr_result_out.df <- data.frame(
    Pattern = patterns,
    Freq_grp = freq_grp,
    Percent_grp = percent(pattRatio),
    pattRatio = pattRatio, # kept numeric: used for filtering, never written
    Length = nchar(patterns),
    Freq_str = freq_str,
    Percent_str = percent(freq_str / numStrings),
    stringsAsFactors = FALSE
  )
  # longest first, then most frequent; remaining ties keep table() order
  subStr_result_out.df <- subStr_result_out.df[
    order(-subStr_result_out.df$Length, -subStr_result_out.df$Freq_grp), ,
    drop = FALSE
  ]
  numPatternTotal <- nrow(subStr_result_out.df)
  out_cols <- c("Pattern", "Freq_grp", "Percent_grp", "Length",
                "Freq_str", "Percent_str")

  ##### 5. Case 1: patterns with frequency >= 2 (no conversion to event names)
  out.f2.name <- paste0(strings.vec_name, "_f2up", ".txt")
  utils::write.table(
    subStr_result_out.df[subStr_result_out.df$Freq_grp > 1, out_cols,
                         drop = FALSE],
    sep = "\t", row.names = FALSE, col.names = TRUE, file = out.f2.name
  )

  ##### 6. Case 2+: one file per percentage cutoff
  cutoff.seq <- seq(low, high, by = interval)
  # Divide instead of multiplying by 0.01 so that a cutoff such as 35 compares
  # equal to a ratio such as 7/20 in floating point.
  cutoff.seq.n <- cutoff.seq / 100
  # two-digit label for every cutoff
  cutoff.seq.c <- sprintf("%02d", cutoff.seq)
  out.file.names <- paste0(strings.vec_name, "_", cutoff.seq.c, "up", ".txt")

  for (i in seq_along(cutoff.seq)) {
    keep <- subStr_result_out.df$pattRatio >= cutoff.seq.n[i]
    cutoff.df <- subStr_result_out.df[keep, out_cols, drop = FALSE]
    if (convert_events) {
      # optional: add the patterns translated back to event names
      cutoff.df$Event_name <- to_event_names(cutoff.df$Pattern)
    }
    utils::write.table(cutoff.df, sep = "\t", row.names = FALSE,
                       col.names = TRUE, file = out.file.names[i])
  }

  ##### 7. End of function reminder
  cat(" Files with different percentages of common patterns are exported.\n",
      "Patterns occur at least twice are exported in a separate file.\n",
      sep = "")
  sprintf("Number of original strings: %d; Total number of substrings (patterns): %d.", numStrings, numPatternTotal)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.