GPSmatch: Compares and Ranks the Similarities of Bed Files

Documented in rankSimilarity

#' @title Ranks the similarities between a given query bed file with multiple bed files (database files) based on the nucleotides length
#' @description The function compares a given query bed file with multiple bed files (database files), and ranks the relative similarity between each file pairing, computed using jaccard indexes, where 0 has no similarities and 1 has an identical file. The function can also provide a significance value (p-value) of the similarity index based on user selection.
#' @param queryBed The file path of a query bed file to be compared to the database files.
#' @param database_dir The directory of a folder containing database files to be used for comparison with the query file.
#' @param output_path The file path where the output .csv file will be generated.
#' @param output_frm_DatabaseFileSize The file was generated from "DatabaseFileSize" function. Please also include the directory path of this file. The file contains the two columns: 1) BED file names in the database, and 2) pre-calculated length of binding sizes for each BED file
#' @export
#' @return A dataframe that shows the similarities of the query file to the database files ranked from greatest to least.
#' @examples
#' rankSimilarity(queryBed ="/dir/querybed.txt",database_dir="/dir/database_dir",output_path="/dir/output_path", output_frm_DatabaseFileSize ="/dir/Database_BindingSize.csv)

rankSimilarity = function(queryBed = queryBed,database_dir=database_dir,output_path=output_path, output_frm_DatabaseFileSize=output_frm_DatabaseFileSize){
  if(is_missing(queryBed)) {
    stop("The query BED file is missing. Please use the following format rankSimilarity('/dir/querybed.txt','/dir/database_dir','/dir/output_path', '/dir/Database_BindingSize.csv')")
  }
  if(is_missing(database_dir)) {
    stop("The database director is missing . Please use the following format rankSimilarity('/dir/querybed.txt','/dir/database_dir','/dir/output_path', '/dir/Database_BindingSize.csv')")
  }
  if(is_missing(output_path)) {
    stop("The output_path is missing . Please use the following format rankSimilarity('/dir/querybed.txt','/dir/database_dir','/dir/output_path', '/dir/Database_BindingSize.csv')")

  }
  if(is_missing(output_frm_DatabaseFileSize)) {
    stop("The output_frm_DatabaseFileSize is missing. Please use the following format rankSimilarity('/dir/querybed.txt','/dir/database_dir','/dir/output_path', '/dir/Database_BindingSize.csv')")
  }

  print("rankSimilarity start")

  #levels = list.files(database_dir, recursive = TRUE, pattern=".csv")
  levels = list.files(database_dir, recursive = TRUE)
  indexes = list()

  #calculate size of the total binding region in query bed file
  query_length = overlapCalc(queryBed,queryBed)

  #read the database_size file generated by the preprocess step: DatabaseFileSize(database_dir, output_path) function
  database_size = read.csv(output_frm_DatabaseFileSize, header = TRUE)

  #Use hash to keeo tract the length of binding sites of each BED file in the database
  hashvalue_size = database_size$size
  hashkey_dbfilename = as.character(database_size$filename)


  # vectorize assign, get and exists for convenience
  assign_hash <- Vectorize(assign, vectorize.args = c("x", "value"))
  get_hash <- Vectorize(get, vectorize.args = "x")
  exists_hash <- Vectorize(exists, vectorize.args = "x")

  # initialize hash
  hash <- new.env(hash = TRUE, parent = emptyenv(), size = 100L)
  # assign values to keys
  databasehash = assign_hash(hashkey_dbfilename, hashvalue_size, hash)


  #calculate jaccard index between the query BED file (queryBed) and each BED file in the database (database_dir).
  #query_length is the size of the total binding region in query bed file
  #hash contains the name of database BED file as key and its corresponding size of binding region as value
  indexes = lapply(levels, FUN = jaccardfun, queryBed = queryBed, database_dir = database_dir, query_length = query_length, hash = hash)

  jaccard_df = as.data.frame(do.call(rbind,indexes))
  jaccard_df = jaccard_df[order(as.numeric(as.character(jaccard_df$jaccard_index)), decreasing = T),]

  dir.create(file.path(output_path), showWarnings = FALSE)
  write.csv(jaccard_df,paste(output_path,"/",gsub("^.*/", "", queryBed),"_jaccard_nt.csv",sep=""))
  print(paste(output_path,"/",gsub("^.*/", "", queryBed),"_jaccard_nt.csv",sep=""))
  print("file created!")
}


#calculate jaccard index between query queryBed file and single database file from database_dir
jaccardfun = function(databasefile = databasefile, queryBed = queryBed, database_dir= database_dir, query_length=query_length, hash = hash){
  pwd = paste(database_dir,databasefile,sep="/")

  #calculate length of intersection between queryBed and database file
  num_overlap = overlapCalc(queryBed, pwd)
  #length of query queryBed is one of the input parameter to this function
  A_bindinglength = query_length
  #length of database file is acquired from hash table using the database file name
  B_bindinglength = hash[[databasefile]]

  #jaccard index formula
  num_union = A_bindinglength + B_bindinglength - num_overlap
  jaccard_id = num_overlap/num_union
  #jaccard_index

  #percentage --- ANB/A and ANB/B
  #ANB/A
  A_similarity = (num_overlap)/A_bindinglength
  B_similarity = (num_overlap)/B_bindinglength

  #each row in output has four columns: 1) Hit - database file name; 2) Jaccard Index; 3) overlap percent in A; 4) overlap percent in B
  line = c(bedfile = databasefile,jaccard_index = jaccard_id,percentage_A = A_similarity,percentage_B=B_similarity)
  return (line)
}