#' @title Ranks the similarities between a given query bed file with multiple bed files (database files) based on the nucleotides length
#' @description The function compares a given query bed file with multiple bed files (database files), and ranks the relative similarity between each file pairing, computed using jaccard indexes, where 0 has no similarities and 1 has an identical file. The function can also provide a significance value (p-value) of the similarity index based on user selection.
#' @param queryBed The file path of a query bed file to be compared to the database files.
#' @param database_dir The directory of a folder containing database files to be used for comparison with the query file.
#' @param output_path The file path where the output .csv file will be generated.
#' @param output_frm_DatabaseFileSize The file was generated from "DatabaseFileSize" function. Please also include the directory path of this file. The file contains the two columns: 1) BED file names in the database, and 2) pre-calculated length of binding sizes for each BED file
#' @export
#' @return A dataframe that shows the similarities of the query file to the database files ranked from greatest to least.
#' @examples
#' rankSimilarity(queryBed ="/dir/querybed.txt",database_dir="/dir/database_dir",output_path="/dir/output_path", output_frm_DatabaseFileSize ="/dir/Database_BindingSize.csv)
rankSimilarity = function(queryBed = queryBed,database_dir=database_dir,output_path=output_path, output_frm_DatabaseFileSize=output_frm_DatabaseFileSize){
if(is_missing(queryBed)) {
stop("The query BED file is missing. Please use the following format rankSimilarity('/dir/querybed.txt','/dir/database_dir','/dir/output_path', '/dir/Database_BindingSize.csv')")
}
if(is_missing(database_dir)) {
stop("The database director is missing . Please use the following format rankSimilarity('/dir/querybed.txt','/dir/database_dir','/dir/output_path', '/dir/Database_BindingSize.csv')")
}
if(is_missing(output_path)) {
stop("The output_path is missing . Please use the following format rankSimilarity('/dir/querybed.txt','/dir/database_dir','/dir/output_path', '/dir/Database_BindingSize.csv')")
}
if(is_missing(output_frm_DatabaseFileSize)) {
stop("The output_frm_DatabaseFileSize is missing. Please use the following format rankSimilarity('/dir/querybed.txt','/dir/database_dir','/dir/output_path', '/dir/Database_BindingSize.csv')")
}
print("rankSimilarity start")
#levels = list.files(database_dir, recursive = TRUE, pattern=".csv")
levels = list.files(database_dir, recursive = TRUE)
indexes = list()
#calculate size of the total binding region in query bed file
query_length = overlapCalc(queryBed,queryBed)
#read the database_size file generated by the preprocess step: DatabaseFileSize(database_dir, output_path) function
database_size = read.csv(output_frm_DatabaseFileSize, header = TRUE)
#Use hash to keeo tract the length of binding sites of each BED file in the database
hashvalue_size = database_size$size
hashkey_dbfilename = as.character(database_size$filename)
# vectorize assign, get and exists for convenience
assign_hash <- Vectorize(assign, vectorize.args = c("x", "value"))
get_hash <- Vectorize(get, vectorize.args = "x")
exists_hash <- Vectorize(exists, vectorize.args = "x")
# initialize hash
hash <- new.env(hash = TRUE, parent = emptyenv(), size = 100L)
# assign values to keys
databasehash = assign_hash(hashkey_dbfilename, hashvalue_size, hash)
#calculate jaccard index between the query BED file (queryBed) and each BED file in the database (database_dir).
#query_length is the size of the total binding region in query bed file
#hash contains the name of database BED file as key and its corresponding size of binding region as value
indexes = lapply(levels, FUN = jaccardfun, queryBed = queryBed, database_dir = database_dir, query_length = query_length, hash = hash)
jaccard_df = as.data.frame(do.call(rbind,indexes))
jaccard_df = jaccard_df[order(as.numeric(as.character(jaccard_df$jaccard_index)), decreasing = T),]
dir.create(file.path(output_path), showWarnings = FALSE)
write.csv(jaccard_df,paste(output_path,"/",gsub("^.*/", "", queryBed),"_jaccard_nt.csv",sep=""))
print(paste(output_path,"/",gsub("^.*/", "", queryBed),"_jaccard_nt.csv",sep=""))
print("file created!")
}
#calculate jaccard index between query queryBed file and single database file from database_dir
jaccardfun = function(databasefile = databasefile, queryBed = queryBed, database_dir= database_dir, query_length=query_length, hash = hash){
pwd = paste(database_dir,databasefile,sep="/")
#calculate length of intersection between queryBed and database file
num_overlap = overlapCalc(queryBed, pwd)
#length of query queryBed is one of the input parameter to this function
A_bindinglength = query_length
#length of database file is acquired from hash table using the database file name
B_bindinglength = hash[[databasefile]]
#jaccard index formula
num_union = A_bindinglength + B_bindinglength - num_overlap
jaccard_id = num_overlap/num_union
#jaccard_index
#percentage --- ANB/A and ANB/B
#ANB/A
A_similarity = (num_overlap)/A_bindinglength
B_similarity = (num_overlap)/B_bindinglength
#each row in output has four columns: 1) Hit - database file name; 2) Jaccard Index; 3) overlap percent in A; 4) overlap percent in B
line = c(bedfile = databasefile,jaccard_index = jaccard_id,percentage_A = A_similarity,percentage_B=B_similarity)
return (line)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.