library(GPSmatch)
require(readxl)
require(tidyverse)
require(data.table)
require(rlang)
###The simulated files were generated by shell script "ShuffleFiles.sh" with certain percentage of binding coordinates replaced by randomly selected coordinates from any random files in the database (e.g. 90% in this example)
###This R script was used to run GPSmatch rankSimilarity() function on every simulated files (100 files) in a folder, (e.g Shuffle.0.9.posDIR in this example).
###If the top hit matches the original file where the shuffled files derived from, it shows that GPSmatch can still identify the correct BED file in the database, even the query files were replaced by random binding sites from other BED files in the database.
###The accuracy and the average/sd jaccard indexes of the top hit from 100 shuffled files were summarized (with 90% replacement in this example)
#Step 1: This only needs to be run once for the same database
databaseFileSize(database_dir= "filedir/Col3DIR_SplitDataBase_mergeregions", "filedir/test")
file_list <- list.files(path="filedir/Shuffle.0.9.posDIR", pattern=".csv")
# Run rankSimilarity() to get the jaccard index between simulated file and each of the file in the database. For each simulated BED file, individual output file was created to save the sorted jaccard index of this query BED file against each of the files in the database. All output files will be saved in the output directory "Shuffle.0.9.posDIR_Result".
for (i in 1:length(file_list)){
filename = paste("filedir/Shuffle.0.9.posDIR/",file_list[i], sep = "")
print (filename)
rankSimilarity(filename, "filedir/Col3DIR_SplitDataBase_mergeregions","filedir/Shuffle.0.9.posDIR_Result", "filedir/Col3DIR_SplitDataBase_mergeregions_BindingSize.csv")
}
### Summary the top hit accuracy and their jaccard index value for all the simulation files
dffinal = c(rep(1:6))
file_list2 <- list.files(path="filedir/Shuffle.0.9.posDIR_Result")
for (i in 1:length(file_list2)){
filename = paste("filedir/Shuffle.0.9.posDIR_Result/",file_list2[i], sep = "")
df <- read.csv(file=filename,nrows=1) #get the top hits for each query file
df$OrginalFile = file_list2[i]
dffinal= rbind(dffinal, df)
}
dffinal = dffinal[-1,]
#check the accuracy of the top hits
Sum = 0
for (row in 1:length(file_list)) {
if(isTRUE(grepl(dffinal$bedfile[row],dffinal$OrginalFile[row]))){ #check if the top hit matches the source file of the shuffled file
dffinal$correctness[row] = 1
}else{
dffinal$correctness[row] = 0
}
}
write.table(dffinal, file = "filedir/Summary_Shuffle.0.9.posDIR_Result_082621.csv")
###### negative 105 files
file_list <- list.files(path="filedir/Shuffle105.0.90.negDIR", pattern=".csv")
# Run rankSimilarity() to get the jaccard index between netative control query file and each of the file in the database. For each query BED file, individual output file was created to save the sorted jaccard index of this query BED file against each of the files in the database. All output files will be saved in the output directory "Shuffle.0.9.posDIR_Result".
for (i in 1:length(file_list)){
filename = paste("filedir/Shuffle105.0.90.negDIR/",file_list[i], sep = "")
print (filename)
rankSimilarity(filename, "filedir/Col3DIR_SplitDataBase_mergeregions","filedir/Shuffle105.0.90.negDIR_Result", "filedir/Col3DIR_SplitDataBase_mergeregions_BindingSize.csv")
}
### Summary the top hit accuracy and their jaccard index value for all the simulation files
dffinal = c(rep(1:6))
file_list2 <- list.files(path="filedir/Shuffle105.0.90.negDIR_Result")
for (i in 1:length(file_list2)){
filename = paste("filedir/Shuffle105.0.90.negDIR_Result/",file_list2[i], sep = "")
df <- read.csv(file=filename,nrows=1)
df$OrginalFile = file_list2[i]
dffinal= rbind(dffinal, df)
}
dffinal = dffinal[-1,]
#check the accuracy of the top hits and create a new column "correctness" to record the result.
Sum = 0
for (row in 1:length(file_list2)) {
if(isTRUE(grepl(dffinal$bedfile[row],dffinal$OrginalFile[row]))){
dffinal$correctness[row] = 1
}else{
dffinal$correctness[row] = 0
}
}
write.table(dffinal, file = "filedir/Summary_Shuffle105.0.90.negDIR_Result_082621.csv")
sum(dffinal$correctness)
mean(dffinal$jaccard_index)
sd(dffinal$jaccard_index)
std = function(x) sd(x)/sqrt(length(x))
std(dffinal$jaccard_index)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.