#' mergeSRAFast Function
#'
#' @param ref The file reference positons
#' @param gff1 The gff file which is the reference gff
#' @param gff2 the gff file that needs rearranging
#' @param filenum1 number or name for the first gff file
#' @param filenum2 number or name for the second gff file
#' @keywords combines multiple gff files from different genomes
#' @export
#' @examples
#' mergeSRAFast()
mergeSRAFast <- function(ncRNAgff, time.it = T, quiet = F, filenum1 = "1", filenum2 = "2", initial_data = T, align = T, no_merge =F){
# Setup and tests ---------------------------------------------------------
test_setup <- F
if(test_setup == T){
load(file = "~/bin/r_git/R/mergeSRAData.Rda")
ncRNAgff <- mergeSRAData[["ncRNAgff"]]
filenum1 <- mergeSRAData[["filenum1"]]
filenum2 <- mergeSRAData[["filenum2"]]
initial_data <- mergeSRAData[["initial_data"]]
align <- mergeSRAData[["align"]]
quiet <- F
time.it <- T
i <- 0
test_setup <- T
no_merge <- F
}
ptm <- proc.time()
# Data frame setup ----------------
ncRNAgff <- ncRNAgff%>%mutate(change = ifelse(start < end, F, T))%>%
mutate(start.tmp = end)%>%
mutate(end.tmp = start)%>%
mutate(start = ifelse(change == T, start.tmp, start))%>%
mutate(end = ifelse(change == T, end.tmp, end))%>%
select(-start.tmp, -end.tmp, -change)
ncRNAgff <- ncRNAgff%>%arrange(start)
current_feature <- F #is there a current feature being written?
new_feature <- F
counter <- 0
for(i in 1:(nrow(ncRNAgff))){
if(ncRNAgff$start[i] < 0 && align == T){
start_val <- ncRNAgff$start[i]
start_i <- i
end_val <- ncRNAgff$end[i]
counter <- counter + 1
next
}
if(current_feature == F){
start_val <- ncRNAgff$start[i]
start_i <- i
end_val <- ncRNAgff$end[i]
}else{
if(ncRNAgff$end[i] > end_val){
end_val <- ncRNAgff$end[i]
}
}
if(i == nrow(ncRNAgff)){
counter <- counter + 1
current_feature <- F
}else{
aa <- min(c(ncRNAgff$start[i], ncRNAgff$start[start_i] ))
bb <- max(c(ncRNAgff$start[i], ncRNAgff$start[start_i] ))
cc <- min(c(ncRNAgff$end[i], ncRNAgff$end[start_i] ))
dd <- max(c(ncRNAgff$end[i], ncRNAgff$end[start_i] ))
prop_val <- (cc - bb)/(dd - aa)
if(end_val > ncRNAgff$start[i + 1] && prop_val > 0.25){
if(ncRNAgff$end[i + 1] > end_val){
end_val <- ncRNAgff$end[i + 1]
}
current_feature <- T
}else{
counter <- counter + 1
current_feature <- F
}
}
}
##Data will be written into this format
mergedDat <- data.frame(sequence = rep(as.character("0"), counter), feature = rep(as.character("0"), counter),
start = rep(as.integer("0"), counter), end = rep(as.integer("0"), counter),
strand = rep(as.character("0"), counter), file_names = rep(as.character("start_row"), counter),
row_numbers = rep(as.character("0"), counter), prop_overlap = rep(as.numeric(0), counter), feature_match = rep(F, counter),
number_of_features = rep(as.integer("0"), counter),
score = rep(as.character("0"), counter),
new_feature = rep(F, counter),
number_of_rnaseq_files = rep(as.integer("0"), counter),
id1 = rep(as.character("0"), counter),
id2 = rep(as.character("0"), counter),
set_val_1 = rep(as.character("0"), counter),
set_val_2 = rep(as.character("0"), counter),
file_id = rep(as.character("0"), counter),
stringsAsFactors = F)
##loop through the combined gff files and combine features that overlap
current_feature <- F #is there a current feature being written?
new_feature <- F
counter <- 0
#i <-175
# Main loop ---------------------------------------------------------------
i <- 1
for(i in 1:(nrow(ncRNAgff))){
if(counter + 1 > nrow(mergedDat)){
print(counter)
print(i)
print(nrow(ncRNAgff))
print(nrow(mergedDat))
}
if(quiet ==F){
printRemaining(i <- i, length = nrow(ncRNAgff) - 1, increment = 5)
}
##write the feature without checks if it could not be remapped
##This only applies if theree is a need to align
if(ncRNAgff$start[i] < 0 && align == T){
##define the start and end of the feature
start_val <- ncRNAgff$start[i]
start_i <- i
end_val <- ncRNAgff$end[i]
feature_matched <- F ##this is always false here as no other features have been compared
prop_val <- NA ##prop val is always 1 if there is no feature matched
#select the current feature from the dataframe
idRows <- ncRNAgff[i,]
##keep the ids from rows of the dataframe that match the filenum (file name)
##The purpose of this is to keep track of each original file and row number for each feature as continued merging is done
id1_val <- paste(idRows$id[idRows$file_id == filenum1], collapse = "-")
id2_val <- paste(idRows$id[idRows$file_id == filenum2], collapse = "-")
##assign each missing file as an id of filenum_0
##this needs to be done for each individual file so that it appears as filenum1_0-filenum2_0 not filenum1-filenum2_0
##as this will make tracking which files contributed easier
if(id1_val == ""){
id_vals <- unlist(strsplit(filenum1, "-"))
id_vals <- paste(id_vals, "_0", sep = "")
id1_val <- paste(id_vals, collapse = "-")
}
if(id2_val == ""){
id_vals <- unlist(strsplit(filenum2, "-"))
id_vals <- paste(id_vals, "_0", sep = "")
id2_val <- paste(id_vals, collapse = "-")
}
##take the current set value and if one does not exist then set to 1
set_val_1 <- ifelse(filenum1 == ncRNAgff$file_id[i], ncRNAgff$set_val[i], 0)
set_val_2 <- ifelse(filenum2 == ncRNAgff$file_id[i], ncRNAgff$set_val[i], 0)
##Number of features
##Combine each set of ids and get a unique list of each id available
id_val <- paste(id1_val, id2_val, sep = "-")
idList <- unique(unlist(strsplit(id_val, "-")))
##loop through each ID and check if it is real or a place holder. All real IDs are 1 and placeholders are 0
for(j in 1:length(idList)){
number <- unlist(strsplit(as.character(idList[j]), "_"))
if(length(number) < 3){
idList[j] <- as.numeric(0)
}else{
number <- number[3]
if(number == "0"){
idList[j] <- as.numeric(0)
}else{
idList[j] <- as.numeric(1)
}
}
}
number_of_features <- sum(as.numeric(idList))
aa <- min(c(ncRNAgff$start[i], ncRNAgff$start[start_i] ))
bb <- max(c(ncRNAgff$start[i], ncRNAgff$start[start_i] ))
cc <- min(c(ncRNAgff$end[i], ncRNAgff$end[start_i] ))
dd <- max(c(ncRNAgff$end[i], ncRNAgff$end[start_i] ))
prop_val <- (cc - bb)/(dd - aa)
#print(aa)
#print(bb)
#print(cc)
#print(dd)
##get the set values from old features
counter <- counter + 1
mergedDat$sequence[counter] <- ncRNAgff$sequence[i]
mergedDat$feature[counter] <- ncRNAgff$feature[i]
mergedDat$start[counter] <- start_val
mergedDat$end[counter] <- end_val
mergedDat$strand[counter] <- ncRNAgff$strand[i]
mergedDat$file_names[counter] <- paste(unique(ncRNAgff$file_id[start_i:i]), collapse = ",")
mergedDat$row_numbers[counter] <- as.character(paste(c(start_i:i), collapse = ","))
mergedDat$prop_overlap[counter] <- prop_val
mergedDat$feature_match[counter] <- feature_matched
mergedDat$number_of_features[counter] <- number_of_features
mergedDat$score[counter] <- as.character(ncRNAgff$score[i])
mergedDat$new_feature[counter] <- !(F %in% ncRNAgff$new_feature[start_i:i])
mergedDat$number_of_rnaseq_files[counter] <- sum(as.integer(ncRNAgff$number_of_rnaseq_files[start_i:i]))
mergedDat$id1[counter] <- id1_val
mergedDat$id2[counter] <- id2_val
mergedDat$set_val_1[counter] <- as.character(set_val_1)
mergedDat$set_val_2[counter] <- as.character(set_val_2)
mergedDat$file_id[counter] <- paste(filenum1, "-", filenum2, sep = "")
##this is the end of the loop writing a feature that cannot be mapped
next
}
##if there is no current feature then set a new start value
if(current_feature == F){
start_val <- ncRNAgff$start[i]
start_i <- i
end_val <- ncRNAgff$end[i]
}else{
##check if the existing values are outside the range of the initial values
if(ncRNAgff$end[i] > end_val){
end_val <- ncRNAgff$end[i]
}
}
##check if the current end value overlaps with the next starting value and update the end value if it does
if(i == nrow(ncRNAgff)){
##check if the subsequent feature was contained within the first feature
aa <- min(c(ncRNAgff$start[i], ncRNAgff$start[start_i] ))
bb <- max(c(ncRNAgff$start[i], ncRNAgff$start[start_i] ))
cc <- min(c(ncRNAgff$end[i], ncRNAgff$end[start_i] ))
dd <- max(c(ncRNAgff$end[i], ncRNAgff$end[start_i] ))
prop_val <- (cc - bb)/(dd - aa)
##check if there was more than one feature overlapping
feature_matched <- ifelse(length(unique(ncRNAgff$file_id[start_i:i])) > 1, T, F)
#select the current feature from the dataframe
idRows <- ncRNAgff[start_i:i,]
##keep the ids from rows of the dataframe that match the filenum (file name)
##The purpose of this is to keep track of each original file and row number for each feature as continued merging is done
id1_val <- paste(idRows$id[idRows$file_id == filenum1], collapse = "-")
id2_val <- paste(idRows$id[idRows$file_id == filenum2], collapse = "-")
##assign each missing file as an id of filenum_0
##this needs to be done for each individual file so that it appears as filenum1_0-filenum2_0 not filenum1-filenum2_0
##as this will make tracking which files contributed easier
if(id1_val == ""){
id_vals <- unlist(strsplit(filenum1, "-"))
id_vals <- paste(id_vals, "_0", sep = "")
id1_val <- paste(id_vals, collapse = "-")
}
if(id2_val == ""){
id_vals <- unlist(strsplit(filenum2, "-"))
id_vals <- paste(id_vals, "_0", sep = "")
id2_val <- paste(id_vals, collapse = "-")
}
##take the current set value and if one does not exist then set to 1
set_val_1 <- idRows$set_val[idRows$file_id == filenum1]
set_val_2 <- idRows$set_val[idRows$file_id == filenum2]
set_val_1 <- interset_all(set_val_1)
set_val_2 <- interset_all(set_val_2)
if(set_val_1 == ""){
set_val_1 <- "0"
}
if(set_val_2 == ""){
set_val_2 <- "0"
}
##Number of features
##Combine each set of ids and get a unique list of each id available
id_val <- paste(id1_val, id2_val, sep = "-")
idList <- unique(unlist(strsplit(id_val, "-")))
##loop through each ID and check if it is real or a place holder. All real IDs are 1 and placeholders are 0
for(j in 1:length(idList)){
number <- unlist(strsplit(as.character(idList[j]), "_"))
if(length(number) < 3){
idList[j] <- as.numeric(0)
}else{
number <- number[3]
if(number == "0"){
idList[j] <- as.numeric(0)
}else{
idList[j] <- as.numeric(1)
}
}
}
number_of_features <- sum(as.numeric(idList))
aa <- min(c(ncRNAgff$start[i], ncRNAgff$start[start_i] ))
bb <- max(c(ncRNAgff$start[i], ncRNAgff$start[start_i] ))
cc <- min(c(ncRNAgff$end[i], ncRNAgff$end[start_i] ))
dd <- max(c(ncRNAgff$end[i], ncRNAgff$end[start_i] ))
prop_val <- (cc - bb)/(dd - aa)
#print(aa)
#print(bb)
#print(cc)
#print(dd)
# if(prop_val != 1){
# print(prop_val)
# }
counter <- counter + 1
mergedDat$sequence[counter] <- ncRNAgff$sequence[i]
mergedDat$feature[counter] <- ncRNAgff$feature[i]
mergedDat$start[counter] <- start_val
mergedDat$end[counter] <- end_val
mergedDat$strand[counter] <- ncRNAgff$strand[i]
mergedDat$file_names[counter] <- paste(unique(ncRNAgff$file_id[start_i:i]), collapse = ",")
mergedDat$row_numbers[counter] <- as.character(paste(c(start_i:i), collapse = ","))
mergedDat$prop_overlap[counter] <- prop_val
mergedDat$feature_match[counter] <- feature_matched
mergedDat$number_of_features[counter] <- number_of_features
mergedDat$score[counter] <- as.character(ncRNAgff$score[i])
mergedDat$new_feature[counter] <- !(F %in% ncRNAgff$new_feature[start_i:i])
mergedDat$number_of_rnaseq_files[counter] <- sum(as.integer(ncRNAgff$number_of_rnaseq_files[start_i:i]))
mergedDat$id1[counter] <- id1_val
mergedDat$id2[counter] <- id2_val
mergedDat$set_val_1[counter] <- as.character(set_val_1)
mergedDat$set_val_2[counter] <- as.character(set_val_2)
mergedDat$file_id[counter] <- paste(filenum1, "-", filenum2, sep = "")
##feature has been added so start again
current_feature <- F
new_feature <- F
}else{
aa <- min(c(ncRNAgff$start[i], ncRNAgff$start[start_i] ))
bb <- max(c(ncRNAgff$start[i], ncRNAgff$start[start_i] ))
cc <- min(c(ncRNAgff$end[i], ncRNAgff$end[start_i] ))
dd <- max(c(ncRNAgff$end[i], ncRNAgff$end[start_i] ))
prop_val <- (cc - bb)/(dd - aa)
if(end_val > ncRNAgff$start[i + 1] && prop_val > 0.25){
# if(prop_val != 1){
# print(prop_val)
# }
if(ncRNAgff$end[i + 1] > end_val){
end_val <- ncRNAgff$end[i + 1]
}
##now looking at an ongoing feature
current_feature <- T
}else{ ##if the next feature is not overlapping, start writing out the existing featur
##check if the subsequent feature was contained within the first feature
##check if there was more than one feature overlapping
feature_matched <- ifelse(length(unique(ncRNAgff$file_id[start_i:i])) > 1, T, F)
#select the current feature from the dataframe
idRows <- ncRNAgff[start_i:i,]
##keep the ids from rows of the dataframe that match the filenum (file name)
##The purpose of this is to keep track of each original file and row number for each feature as continued merging is done
id1_val <- paste(idRows$id[idRows$file_id == filenum1], collapse = "-")
id2_val <- paste(idRows$id[idRows$file_id == filenum2], collapse = "-")
##assign each missing file as an id of filenum_0
##this needs to be done for each individual file so that it appears as filenum1_0-filenum2_0 not filenum1-filenum2_0
##as this will make tracking which files contributed easier
if(id1_val == ""){
id_vals <- unlist(strsplit(filenum1, "-"))
id_vals <- paste(id_vals, "_0", sep = "")
id1_val <- paste(id_vals, collapse = "-")
}
if(id2_val == ""){
id_vals <- unlist(strsplit(filenum2, "-"))
id_vals <- paste(id_vals, "_0", sep = "")
id2_val <- paste(id_vals, collapse = "-")
}
##take the current set value and if one does not exist then set to 1
set_val_1 <- idRows$set_val[idRows$file_id == filenum1]
set_val_2 <- idRows$set_val[idRows$file_id == filenum2]
set_val_1 <- interset_all(set_val_1)
set_val_2 <- interset_all(set_val_2)
if(set_val_1 == ""){
set_val_1 <- "0"
}
if(set_val_2 == ""){
set_val_2 <- "0"
}
##Number of features
##Combine each set of ids and get a unique list of each id available
id_val <- paste(id1_val, id2_val, sep = "-")
idList <- unique(unlist(strsplit(id_val, "-")))
##loop through each ID and check if it is real or a place holder. All real IDs are 1 and placeholders are 0
for(j in 1:length(idList)){
number <- unlist(strsplit(as.character(idList[j]), "_"))
if(length(number) < 3){
idList[j] <- as.numeric(0)
}else{
number <- number[3]
if(number == "0"){
idList[j] <- as.numeric(0)
}else{
idList[j] <- as.numeric(1)
}
}
}
number_of_features <- sum(as.numeric(idList))
aa <- min(c(ncRNAgff$start[i], ncRNAgff$start[start_i] ))
bb <- max(c(ncRNAgff$start[i], ncRNAgff$start[start_i] ))
cc <- min(c(ncRNAgff$end[i], ncRNAgff$end[start_i] ))
dd <- max(c(ncRNAgff$end[i], ncRNAgff$end[start_i] ))
prop_val <- (cc - bb)/(dd - aa)
#print(aa)
#print(bb)
#print(cc)
#print(dd)
counter <- counter + 1
mergedDat$sequence[counter] <- ncRNAgff$sequence[i]
mergedDat$feature[counter] <- ncRNAgff$feature[i]
mergedDat$start[counter] <- start_val
mergedDat$end[counter] <- end_val
mergedDat$strand[counter] <- ncRNAgff$strand[i]
mergedDat$file_names[counter] <- paste(unique(ncRNAgff$file_id[start_i:i]), collapse = ",")
mergedDat$row_numbers[counter] <- as.character(paste(c(start_i:i), collapse = ","))
mergedDat$prop_overlap[counter] <- prop_val
mergedDat$feature_match[counter] <- feature_matched
mergedDat$number_of_features[counter] <- number_of_features
mergedDat$score[counter] <- as.character(ncRNAgff$score[i])
mergedDat$new_feature[counter] <- !(F %in% ncRNAgff$new_feature[start_i:i])
mergedDat$number_of_rnaseq_files[counter] <- sum(as.integer(ncRNAgff$number_of_rnaseq_files[start_i:i]))
mergedDat$id1[counter] <- id1_val
mergedDat$id2[counter] <- id2_val
mergedDat$set_val_1[counter] <- as.character(set_val_1)
mergedDat$set_val_2[counter] <- as.character(set_val_2)
mergedDat$file_id[counter] <- paste(filenum1, "-", filenum2, sep = "")
##feature has been added so start again
current_feature <- F
new_feature <- F
}
}
}
# Tidy up and finish ------------------------------------------------------
mergedDat <- mergedDat%>%filter(number_of_features > 0, file_names != "start_row")
mergedDat <- mergedDat%>%mutate(change = ifelse(start < end, F, T))%>%
mutate(start.tmp = end)%>%
mutate(end.tmp = start)%>%
mutate(start = ifelse(change == T, start.tmp, start))%>%
mutate(end = ifelse(change == T, end.tmp, end))%>%
select(-start.tmp, -end.tmp, -change)
mergedDat <- mergedDat%>%filter(!is.na(sequence))
mergedDat[is.na(mergedDat)] <- 0
runningTime <- proc.time() - ptm
if(time.it){
if(quiet == F){
printRunningTime(runningTime = runningTime)
}
}
return(mergedDat)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.