# R/MDOP.R

# Defines functions readpath filetypeasinteger readmaxfoldersize cmd_line_instruction_directory cmd_line_instruction_file target_file_list recursive_copy max_packs copy_by_list degap rank_seq head_derep seq_derep multi_to_single_fasta

# Documented in cmd_line_instruction_directory cmd_line_instruction_file copy_by_list degap head_derep max_packs multi_to_single_fasta rank_seq readmaxfoldersize readpath recursive_copy seq_derep target_file_list

###### Molecular Data Organization for Publication (MDOP) ###### 

# Written by Rob Young and Jiaojia Yu at the University of Guelph in Ontario Canada, October 2019

################################ SELECT TARGET DIRECTORY AND FLAGGING THE OPERATING SYSTEM FUNCTION ###########
  
# Ask the user to pick a folder through a graphical dialog and return its path.
#
# The console instruction is shown first (cmd_line_instruction_directory()).
# In an interactive Windows session the choose.dir() dialog is used; in every
# other case a "choose folder" dialog is requested through osascript.
#
# Returns: the selected folder path (character), or NA when the dialog
#   produced no output (for example because it was cancelled).
readpath <- function()
{

  cmd_line_instruction_directory()

  if (interactive() && .Platform$OS.type == "windows") {
    target_dir <- choose.dir(getwd(), "Choose a suitable folder")
  } else {
    # A per-session temporary file is used instead of a fixed name in /tmp so
    # that two sessions running at the same time cannot overwrite each
    # other's answer; it is always removed when the function exits.
    answer_file <- tempfile("R_folder")
    on.exit(unlink(answer_file), add = TRUE)

    system(paste0("osascript -e 'tell app \"R\" to POSIX path of (choose folder with prompt \"Choose Folder:\")' > ",
                  shQuote(answer_file)),
           intern = FALSE, ignore.stderr = TRUE)

    p <- if (file.exists(answer_file)) {
      readLines(answer_file, warn = FALSE)
    } else {
      character(0)
    }

    # Plain if/else: the condition is a scalar, ifelse() is not needed.
    target_dir <- if (length(p) > 0) p[1] else NA
  }

  return(target_dir)

}

############################################## SELECTION OF THE FILE TYPE FUNCTION ############################
  
# Prompt on the console until the user enters a valid file-type code.
#
# Only the first character of each reply is inspected; the prompt is repeated
# until that character is one of 0, 1, 2 or 3.
#
# Returns: an integer, where 0 = cancel, 1 = jpg, 2 = ab1, 3 = fas/fasta.
# Raises: an error when the session is not interactive. readline() returns ""
#   immediately in that case, so the prompt loop could never terminate.
filetypeasinteger <- function()
{

  if (!interactive()) {
    stop("filetypeasinteger() needs an interactive session; ",
         "supply the file type as an argument instead.", call. = FALSE)
  }

  repeat {

    n <- substr(readline(prompt = "Enter a selection in the form of an integer, 0=cancel, 1=jpg, 2=ab1, 3=fas/fasta: "), 1, 1)

    # Anchored so that exactly one digit in the accepted range is required.
    if (grepl("^[0-3]$", n)) {
      break
    }

  }

  as.integer(n)

}


############################################## SELECTION OF THE MAX SIZE OF FILE FUNCTION ############################
  
# Prompt on the console for the maximum folder size and return it.
#
# The prompt is repeated until the reply consists only of digits and fits in
# an R integer.
#
# Returns: the entered value as an integer.
# Raises: an error when the session is not interactive. readline() returns ""
#   immediately in that case, so no valid reply could ever be read.
readmaxfoldersize <- function()
{
  if (!interactive()) {
    stop("readmaxfoldersize() needs an interactive session; ",
         "supply the maximum folder size as an argument instead.",
         call. = FALSE)
  }

  # A loop replaces the original self-recursion, so repeated invalid replies
  # do not deepen the call stack.
  repeat {
    n <- readline(prompt = "Enter an integer for max size of a file folder: ")

    if (grepl("^[0-9]+$", n)) {
      # as.integer() yields NA (with a warning) for values beyond the integer
      # range; such replies are treated as invalid and asked for again.
      size <- suppressWarnings(as.integer(n))
      if (!is.na(size)) {
        return(size)
      }
    }
  }
}


############################################## INSTRUCTION FUNCTION FOLDER ############################
  
# Show the "select the working directory" instruction on the console and wait
# for the user to press enter.
#
# Returns (invisibly): the first character of whatever was typed, or "" when
#   nothing was typed.
cmd_line_instruction_directory <- function()
{
  reply <- readline(prompt = "Select the working directory in the next pop-up window. Hit enter key to continue...")
  invisible(substr(reply, 1, 1))
}


############################################## INSTRUCTION FUNCTION FILE ############################
  
# Show the "select the file" instruction on the console and wait for the user
# to press enter.
#
# Returns (invisibly): the first character of whatever was typed, or "" when
#   nothing was typed.
cmd_line_instruction_file <- function()
{
  reply <- readline(prompt = "Select the file you would like to work with in the next pop-up window. Hit enter key to continue...")
  invisible(substr(reply, 1, 1))
}



#This function creates a list of all JPG, AB1, or FASTA files in one directory (including sub-directories).
########################################### target_file_list FUNCTION #####################################################################

# Write a tab-delimited list of every JPG, AB1 or FASTA file found in
# target_dir and all of its sub-directories.
#
# Arguments:
#   target_dir  Folder to scan. When left at the placeholder default the
#               folder is requested through readpath().
#   file_type   0 = cancel, 1 = jpg, 2 = ab1, 3 = fas/fasta. When left at the
#               placeholder default it is requested through
#               filetypeasinteger().
#
# Output: <yyyymmdd>_target_file_list_<JPG|AB1|FAS>.txt inside target_dir,
#   one row per file with two columns: file name and normalized full path.
#   Rows follow the directory order of list.dirs() and, within a directory,
#   the order of list.files().
#
# Returns (invisibly): the completion message, or NULL when cancelled.
target_file_list <- function(target_dir="target_dir", file_type="file_type")
{

  # Using a shared function across multiple tools, obtain a target folder location
  if (identical(target_dir, "target_dir")) {
    target_dir <- readpath()
  }

  # Using a shared function across multiple tools, get the type of target file (.jpg, .ab1, or .fas)
  if (identical(file_type, "file_type")) {
    file_type <- filetypeasinteger()
  }

  # One table drives all three file types instead of three copied branches.
  # The FASTA pattern groups both extensions behind the dot; the previous
  # alternation also matched any name merely ending in "fasta".
  file_specs <- list(
    "1" = list(label = "JPG", pattern = "[.][Jj][Pp][Gg]$"),
    "2" = list(label = "AB1", pattern = "[.][Aa][Bb]1$"),
    "3" = list(label = "FAS", pattern = "[.]([Ff][Aa][Ss]|[Ff][Aa][Ss][Tt][Aa])$")
  )
  type_key <- as.character(file_type)[1]
  spec <- if (is.na(type_key)) NULL else file_specs[[type_key]]

  # 0 (cancel) and any unrecognised code stop here with the cancel message.
  if (is.null(spec)) {
    print("Script aborted by user through selection of 0 = Cancel")
    return(invisible(NULL))
  }

  if (!is.character(target_dir) || length(target_dir) != 1 ||
      is.na(target_dir) || !dir.exists(target_dir)) {
    stop("'target_dir' must be the path of an existing directory.",
         call. = FALSE)
  }

  # Current Date - for file naming use. The format is purely numeric, so it
  # does not depend on the session locale.
  date <- format(Sys.Date(), "%Y%m%d")

  # Full paths are used throughout so the working directory is never changed.
  dirs <- list.dirs(target_dir, full.names = TRUE, recursive = TRUE)
  matches <- unlist(lapply(dirs, list.files,
                           pattern = spec$pattern, full.names = TRUE))
  if (is.null(matches)) {
    matches <- character(0)
  }

  listing <- data.frame(file_list = basename(matches),
                        path_list = normalizePath(matches),
                        stringsAsFactors = FALSE)

  # The table is written once, after all directories have been scanned.
  out_name <- paste0(date, "_target_file_list_", spec$label, ".txt")
  write.table(listing, file = file.path(target_dir, out_name), quote = FALSE,
              col.names = FALSE, row.names = FALSE, sep = "\t")

  print(paste0("Task complete, look for output file ", out_name,
               " at this location ", target_dir))
}


# This function copies all files with specific extension (JPG, AB1, or FAS) into a new directory. [RECURSIVELY]
##################################### recursive_copy FUNCTION ##############################################################
# Copy every JPG, AB1 or FASTA file found in target_dir and all of its
# sub-directories into one new folder inside target_dir.
#
# Arguments:
#   target_dir  Folder to scan. When left at the placeholder default the
#               folder is requested through readpath().
#   file_type   0 = cancel, 1 = jpg, 2 = ab1, 3 = fas/fasta. When left at the
#               placeholder default it is requested through
#               filetypeasinteger().
#
# Output: folder <yyyymmdd>_recursive_copy_<JPG|AB1|FAS> inside target_dir.
#   Existing files are never overwritten, so when two source files share a
#   name only the first one encountered is copied; a warning lists the rest.
#
# Returns (invisibly): the completion message, or NULL when cancelled.
recursive_copy <- function(target_dir="target_dir", file_type="file_type")
{

  # Using a shared function across multiple tools, obtain a target folder location
  if (identical(target_dir, "target_dir")) {
    target_dir <- readpath()
  }

  # Using a shared function across multiple tools, get the type of target file (.jpg, .ab1, or .fas)
  if (identical(file_type, "file_type")) {
    file_type <- filetypeasinteger()
  }

  # One table drives all three file types instead of three copied branches.
  # The FASTA pattern groups both extensions behind the dot; the previous
  # alternation also matched any name merely ending in "fasta".
  file_specs <- list(
    "1" = list(label = "JPG", pattern = "[.][Jj][Pp][Gg]$"),
    "2" = list(label = "AB1", pattern = "[.][Aa][Bb]1$"),
    "3" = list(label = "FAS", pattern = "[.]([Ff][Aa][Ss]|[Ff][Aa][Ss][Tt][Aa])$")
  )
  type_key <- as.character(file_type)[1]
  spec <- if (is.na(type_key)) NULL else file_specs[[type_key]]

  # 0 (cancel) and any unrecognised code stop here with the cancel message.
  if (is.null(spec)) {
    print("Script aborted by user through selection of 0 = Cancel")
    return(invisible(NULL))
  }

  if (!is.character(target_dir) || length(target_dir) != 1 ||
      is.na(target_dir) || !dir.exists(target_dir)) {
    stop("'target_dir' must be the path of an existing directory.",
         call. = FALSE)
  }

  # Current Date - for file naming use. The format is purely numeric, so it
  # does not depend on the session locale.
  date <- format(Sys.Date(), "%Y%m%d")

  # Get the list of directories before the destination folder is created.
  # Full paths are used throughout so the working directory is never changed
  # and a relative target_dir keeps working.
  dirs <- list.dirs(target_dir, full.names = TRUE, recursive = TRUE)

  folder_name <- paste0(date, "_recursive_copy_", spec$label)
  dest <- file.path(target_dir, folder_name)
  if (!dir.exists(dest)) {
    dir.create(dest)
  }

  # A destination left over from an earlier run on the same day must not be
  # treated as a source folder.
  dirs <- dirs[normalizePath(dirs, mustWork = FALSE) !=
                 normalizePath(dest, mustWork = FALSE)]

  sources <- unlist(lapply(dirs, list.files,
                           pattern = spec$pattern, full.names = TRUE))
  if (is.null(sources)) {
    sources <- character(0)
  }

  copied <- file.copy(sources, dest)

  # file.copy() reports failures only through its return value; surface them.
  if (any(!copied)) {
    warning(sum(!copied), " file(s) were not copied (the name already exists ",
            "in the destination or the copy failed): ",
            paste(sources[!copied], collapse = ", "), call. = FALSE)
  }

  print(paste0("Task complete, look for output file folder ", folder_name,
               " at this location ", target_dir))
}


# This function packages multiple target files into folders of a maximum size value
##################################### max_packs FUNCTION ##############################################################
# Distribute the JPG, AB1 or FASTA files that sit directly in target_dir over
# numbered folders whose content stays below a maximum size.
#
# Arguments:
#   target_dir       Folder holding the files. When left at the placeholder
#                    default it is requested through readpath().
#   file_type        0 = cancel, 1 = jpg, 2 = ab1, 3 = fas/fasta. When left at
#                    the placeholder default it is requested through
#                    filetypeasinteger().
#   max_folder_size  Maximum folder size in MB (1 MB = 1e6 bytes). When left
#                    at the placeholder default it is requested through
#                    readmaxfoldersize().
#
# Output: folders <yyyymmdd>_max_packs_<JPG|AB1|FAS>_<n> inside target_dir.
#   Files are taken in list.files() order; a new folder is started when adding
#   the next file would reach max_folder_size. A single file larger than the
#   limit is placed in a folder of its own. Nothing is copied when the whole
#   of target_dir (all files, recursively) is not larger than the limit.
#
# Returns (invisibly): the message that was printed, or NULL when cancelled.
max_packs <- function(target_dir="target_dir", file_type="file_type", max_folder_size="max_folder_size")
{

  # Using a shared function across multiple tools, obtain a target folder location
  if (identical(target_dir, "target_dir")) {
    target_dir <- readpath()
  }

  # Using a shared function across multiple tools, get the type of target file (.jpg, .ab1, or .fas)
  if (identical(file_type, "file_type")) {
    file_type <- filetypeasinteger()
  }

  # The FASTA pattern groups both extensions behind the dot; the previous
  # alternation also matched any name merely ending in "fasta".
  file_specs <- list(
    "1" = list(label = "JPG", pattern = "[.][Jj][Pp][Gg]$"),
    "2" = list(label = "AB1", pattern = "[.][Aa][Bb]1$"),
    "3" = list(label = "FAS", pattern = "[.]([Ff][Aa][Ss]|[Ff][Aa][Ss][Tt][Aa])$")
  )
  type_key <- as.character(file_type)[1]
  spec <- if (is.na(type_key)) NULL else file_specs[[type_key]]

  # 0 (cancel) and any unrecognised code stop here instead of failing later
  # on an undefined file list.
  if (is.null(spec)) {
    print("Script aborted by user through selection of 0 = Cancel")
    return(invisible(NULL))
  }

  # Using a shared function across multiple tools, get the maximum desired folder size
  if (identical(max_folder_size, "max_folder_size")) {
    max_folder_size <- readmaxfoldersize()
  }
  max_folder_size <- suppressWarnings(as.numeric(max_folder_size))
  if (length(max_folder_size) != 1 || is.na(max_folder_size)) {
    stop("'max_folder_size' must be a single number (size in MB).",
         call. = FALSE)
  }

  if (!is.character(target_dir) || length(target_dir) != 1 ||
      is.na(target_dir) || !dir.exists(target_dir)) {
    stop("'target_dir' must be the path of an existing directory.",
         call. = FALSE)
  }

  # Current Date - for file naming use. The format is purely numeric, so it
  # does not depend on the session locale.
  date <- format(Sys.Date(), "%Y%m%d")

  bytes_per_mb <- 1e6

  # Obtain the total size of the target directory (every file, recursively)
  all_files <- list.files(target_dir, all.files = TRUE, recursive = TRUE,
                          full.names = TRUE)
  target_directory_mb <- sum(file.size(all_files), na.rm = TRUE) / bytes_per_mb

  # If the target file folder is equal or less than the max size abort script
  if (target_directory_mb <= max_folder_size) {
    return(invisible(print("The folder is smaller than the maximum upload size limit and this script is not needed.")))
  }

  # Names and sizes come from the SAME non-recursive listing so that each
  # size belongs to the file it is paired with.
  file_names <- list.files(target_dir, pattern = spec$pattern)
  file_names <- file_names[!dir.exists(file.path(target_dir, file_names))]
  file_mb <- file.size(file.path(target_dir, file_names)) / bytes_per_mb
  file_mb[is.na(file_mb)] <- 0

  # First pass: decide which numbered folder every file belongs to.
  folder_of <- integer(length(file_names))
  folder_index <- 1L
  folder_mb <- 0
  files_in_folder <- 0L
  for (i in seq_along(file_names)) {
    # Start a new folder only when the current one already holds something;
    # an oversized first file therefore does not leave an empty folder behind.
    if (files_in_folder > 0L && (folder_mb + file_mb[i]) >= max_folder_size) {
      folder_index <- folder_index + 1L
      folder_mb <- 0
      files_in_folder <- 0L
    }
    folder_of[i] <- folder_index
    folder_mb <- folder_mb + file_mb[i]
    files_in_folder <- files_in_folder + 1L
  }

  # Second pass: create the folders that are needed and copy the files.
  if (length(file_names) > 0) {
    folder_paths <- file.path(
      target_dir,
      paste0(date, "_max_packs_", spec$label, "_", seq_len(folder_index))
    )
    for (folder in folder_paths) {
      if (!dir.exists(folder)) {
        dir.create(folder)
      }
    }

    copied <- file.copy(file.path(target_dir, file_names),
                        file.path(folder_paths[folder_of], file_names))

    # file.copy() reports failures only through its return value.
    if (any(!copied)) {
      warning(sum(!copied), " file(s) were not copied: ",
              paste(file_names[!copied], collapse = ", "), call. = FALSE)
    }
  }

  print(paste("Task complete. Find file folders with a max size of", max_folder_size, "MB at", target_dir))
}

# This function copies a group of selected files based on a supplied list by the user
##################################### copy_by_list FUNCTION ##############################################################
# Copy the files named in a list file from target_dir into a new folder.
#
# Arguments:
#   target_dir  Folder that holds the files to copy. When left at the
#               placeholder default it is requested through readpath().
#   file_list   Path of a tab-delimited text file without a header whose
#               entries are file names relative to target_dir. A relative
#               path is resolved against target_dir. When left at the
#               placeholder default the file is chosen through file.choose().
#
# Output: folder <yyyymmdd>_copy_by_list inside target_dir containing the
#   copies. Entries that could not be copied are reported in a warning.
#
# The working directory is switched to target_dir only for the duration of
# the call and restored afterwards.
#
# Returns (invisibly): the completion message.
copy_by_list <- function(target_dir="target_dir", file_list="file_list" )
{

  # Using a shared function across multiple tools, obtain a target folder location
  if (identical(target_dir, "target_dir")) {
    target_dir <- readpath()
  }

  # Obtain a target file location
  if (identical(file_list, "file_list")) {

    # This is giving terminal instructions to select the file wanting to be used
    cmd_line_instruction_file()

    # Choose the list file you would like to work on:
    file_list <- file.choose()

  }

  if (!is.character(target_dir) || length(target_dir) != 1 ||
      is.na(target_dir) || !dir.exists(target_dir)) {
    stop("'target_dir' must be the path of an existing directory.",
         call. = FALSE)
  }

  # Make the folder absolute before changing directory so that the paths
  # built from it below stay valid, and restore the caller's directory on exit.
  target_dir <- normalizePath(target_dir, winslash = "/", mustWork = TRUE)
  old_wd <- setwd(target_dir)
  on.exit(setwd(old_wd), add = TRUE)

  # Every field is read as text: without this, numeric-looking names such as
  # "007" are converted to numbers and no longer match the file on disk.
  list_table <- read.delim(file_list, header = FALSE, sep = "\t", dec = ".",
                           colClasses = "character",
                           na.strings = character(0),
                           stringsAsFactors = FALSE)

  # Flatten row by row (all columns), dropping empty padding cells.
  wanted <- as.vector(t(as.matrix(list_table)))
  wanted <- wanted[!is.na(wanted) & nzchar(wanted)]

  # Current Date - for file naming use. The format is purely numeric, so it
  # does not depend on the session locale.
  date <- format(Sys.Date(), "%Y%m%d")

  # Make a file folder to contain all results from the program
  final_loc <- file.path(target_dir, paste0(date, "_copy_by_list"))
  if (!dir.exists(final_loc)) {
    dir.create(final_loc)
  }

  # Copy all listed files in one vectorised call
  path_from <- file.path(target_dir, wanted)
  copied <- file.copy(path_from, final_loc)

  # file.copy() reports failures only through its return value.
  if (any(!copied)) {
    warning(sum(!copied), " listed file(s) were not copied (missing, or ",
            "already present in the destination): ",
            paste(wanted[!copied], collapse = ", "), call. = FALSE)
  }

  print(paste0("Task complete. Find the files in the folder ", date,
               "_copy_by_list at ", target_dir))
}


# This function removes gaps from all sequences in the selected file. 
##################################### degap FUNCTION ##############################################################
# Remove the gap character "-" from every sequence of a single-line FASTA
# file (one header line followed by one sequence line per record).
#
# Arguments:
#   target_dir  Folder that receives the output. When left at the placeholder
#               default it is requested through readpath().
#   Seq_file    Path of the FASTA file; a relative path is resolved against
#               target_dir. When left at the placeholder default the file is
#               chosen through file.choose().
#
# Output: <yyyymmdd>_degap.fas inside target_dir. Header lines are written
#   unchanged.
#
# The working directory is switched to target_dir only for the duration of
# the call and restored afterwards.
#
# Returns (invisibly): the completion message.
# Raises: an error when the file does not hold complete header/sequence pairs.
degap <- function(target_dir="target_dir", Seq_file="Seq_file")
{

  # Using a shared function across multiple tools, obtain a target folder location
  if (identical(target_dir, "target_dir")) {
    target_dir <- readpath()
  }

  if (!is.character(target_dir) || length(target_dir) != 1 ||
      is.na(target_dir) || !dir.exists(target_dir)) {
    stop("'target_dir' must be the path of an existing directory.",
         call. = FALSE)
  }

  # Make the folder absolute before changing directory so the output path
  # stays valid, and restore the caller's directory on exit.
  target_dir <- normalizePath(target_dir, winslash = "/", mustWork = TRUE)
  old_wd <- setwd(target_dir)
  on.exit(setwd(old_wd), add = TRUE)

  # Obtain a target file location
  if (identical(Seq_file, "Seq_file")) {

    # This is giving terminal instructions to select the file wanting to be used
    cmd_line_instruction_file()

    # Choose the fasta file you would like to work on:
    Seq_file <- file.choose()

  }

  # Read the file line by line. FASTA is not a table: parsing it with
  # read.table() fails on headers that contain spaces, quotes or "#".
  fasta_lines <- trimws(readLines(Seq_file, warn = FALSE))
  fasta_lines <- fasta_lines[nzchar(fasta_lines)]

  if (length(fasta_lines) == 0 || length(fasta_lines) %% 2 != 0) {
    stop("The FASTA file must contain header/sequence pairs, ",
         "one line each.", call. = FALSE)
  }

  # Odd lines are headers, even lines are sequences
  Header0 <- fasta_lines[c(TRUE, FALSE)]
  Sequence <- fasta_lines[c(FALSE, TRUE)]

  # Taking the sequences and removing the gap characters
  degapped <- gsub("-", "", Sequence, fixed = TRUE)

  # Current Date - for file naming use. The format is purely numeric, so it
  # does not depend on the session locale.
  date <- format(Sys.Date(), "%Y%m%d")

  # rbind() puts headers in row 1 and sequences in row 2; reading the matrix
  # column by column interleaves them back into FASTA order.
  Write_to_file_out_str <- file.path(target_dir, paste0(date, "_degap.fas"))
  writeLines(as.vector(rbind(Header0, degapped)), Write_to_file_out_str)

  # Print the completion message
  print(paste0("Task complete, look for output file ", date,
               "_degap.fas at this location ", target_dir))
}


# This function ranks sequences by length in a fasta formatted file. 
##################################### rank_seq FUNCTION ##############################################################
# Sort the records of a single-line FASTA file (one header line followed by
# one sequence line per record) from shortest to longest sequence.
#
# Arguments:
#   target_dir  Folder that receives the output. When left at the placeholder
#               default it is requested through readpath().
#   Seq_file    Path of the FASTA file; a relative path is resolved against
#               target_dir. When left at the placeholder default the file is
#               chosen through file.choose().
#
# Output: <yyyymmdd>_rank_seq.fas inside target_dir. Lengths are measured
#   with gap characters "-" removed, but sequences are written as they were
#   read. Records of equal length keep their original relative order.
#
# The working directory is switched to target_dir only for the duration of
# the call and restored afterwards.
#
# Returns (invisibly): the completion message.
# Raises: an error when the file does not hold complete header/sequence pairs.
rank_seq <- function(target_dir="target_dir", Seq_file="Seq_file")
{

  # Using a shared function across multiple tools, obtain a target folder location
  if (identical(target_dir, "target_dir")) {
    target_dir <- readpath()
  }

  # Obtain a target file location
  if (identical(Seq_file, "Seq_file")) {

    # This is giving terminal instructions to select the file wanting to be used
    cmd_line_instruction_file()

    # Choose the fasta file you would like to work on:
    Seq_file <- file.choose()

  }

  if (!is.character(target_dir) || length(target_dir) != 1 ||
      is.na(target_dir) || !dir.exists(target_dir)) {
    stop("'target_dir' must be the path of an existing directory.",
         call. = FALSE)
  }

  # Make the folder absolute before changing directory so the output path
  # stays valid, and restore the caller's directory on exit.
  target_dir <- normalizePath(target_dir, winslash = "/", mustWork = TRUE)
  old_wd <- setwd(target_dir)
  on.exit(setwd(old_wd), add = TRUE)

  # Current Date - for file naming use. The format is purely numeric, so it
  # does not depend on the session locale.
  date <- format(Sys.Date(), "%Y%m%d")

  # Read the file line by line. FASTA is not a table: parsing it with
  # read.table() fails on headers that contain spaces, quotes or "#".
  fasta_lines <- trimws(readLines(Seq_file, warn = FALSE))
  fasta_lines <- fasta_lines[nzchar(fasta_lines)]

  if (length(fasta_lines) == 0 || length(fasta_lines) %% 2 != 0) {
    stop("The FASTA file must contain header/sequence pairs, ",
         "one line each.", call. = FALSE)
  }

  # Odd lines are headers, even lines are sequences
  Header0 <- fasta_lines[c(TRUE, FALSE)]
  Sequence <- fasta_lines[c(FALSE, TRUE)]

  # Getting the length of the sequences (gaps removed) so we can sort by it.
  # The lengths stay numeric: sorting them as text would place "10" before "9".
  Seq_len <- nchar(gsub("-", "", Sequence, fixed = TRUE))
  by_length <- order(Seq_len)

  # rbind() puts headers in row 1 and sequences in row 2; reading the matrix
  # column by column interleaves them back into FASTA order.
  Write_to_file_out_str <- file.path(target_dir, paste0(date, "_rank_seq.fas"))
  writeLines(as.vector(rbind(Header0[by_length], Sequence[by_length])),
             Write_to_file_out_str)

  print(paste0("Task complete, look for output file ", date,
               "_rank_seq.fas at this location ", target_dir))
}

# This function dereplicates sequences based on headers. 
##################################### head_derep FUNCTION ##############################################################
# Dereplicate a single-line FASTA file (one header line followed by one
# sequence line per record) by header: of several records that share a
# header only the first is kept.
#
# Arguments:
#   target_dir  Folder that receives the output. When left at the placeholder
#               default it is requested through readpath().
#   Seq_file    Path of the FASTA file; a relative path is resolved against
#               target_dir. When left at the placeholder default the file is
#               chosen through file.choose().
#
# Output: <yyyymmdd>_head_derep.fas inside target_dir.
#
# The working directory is switched to target_dir only for the duration of
# the call and restored afterwards.
#
# Returns (invisibly): the completion message.
# Raises: an error when the file does not hold complete header/sequence pairs.
head_derep <- function(target_dir="target_dir", Seq_file="Seq_file")
{

  # Using a shared function across multiple tools, obtain a target folder location
  if (identical(target_dir, "target_dir")) {
    target_dir <- readpath()
  }

  if (!is.character(target_dir) || length(target_dir) != 1 ||
      is.na(target_dir) || !dir.exists(target_dir)) {
    stop("'target_dir' must be the path of an existing directory.",
         call. = FALSE)
  }

  # Make the folder absolute before changing directory so the output path
  # stays valid, and restore the caller's directory on exit.
  target_dir <- normalizePath(target_dir, winslash = "/", mustWork = TRUE)
  old_wd <- setwd(target_dir)
  on.exit(setwd(old_wd), add = TRUE)

  # Obtain a target file location
  if (identical(Seq_file, "Seq_file")) {

    # This is giving terminal instructions to select the file wanting to be used
    cmd_line_instruction_file()

    # Choose the fasta file you would like to work on:
    Seq_file <- file.choose()

  }

  # Read the file line by line. FASTA is not a table: parsing it with
  # read.table() fails on headers that contain spaces, quotes or "#".
  fasta_lines <- trimws(readLines(Seq_file, warn = FALSE))
  fasta_lines <- fasta_lines[nzchar(fasta_lines)]

  if (length(fasta_lines) == 0 || length(fasta_lines) %% 2 != 0) {
    stop("The FASTA file must contain header/sequence pairs, ",
         "one line each.", call. = FALSE)
  }

  # Current Date - for file naming use. The format is purely numeric, so it
  # does not depend on the session locale.
  date <- format(Sys.Date(), "%Y%m%d")

  # Odd lines are headers, even lines are sequences
  Header0 <- fasta_lines[c(TRUE, FALSE)]
  Sequence <- fasta_lines[c(FALSE, TRUE)]

  # dereplicate based on Header (first occurrence wins)
  keep <- !duplicated(Header0)

  # rbind() puts headers in row 1 and sequences in row 2; reading the matrix
  # column by column interleaves them back into FASTA order.
  Write_to_file_out_str <- file.path(target_dir, paste0(date, "_head_derep.fas"))
  writeLines(as.vector(rbind(Header0[keep], Sequence[keep])),
             Write_to_file_out_str)

  # Print the completion message
  print(paste0("Task complete, look for output file ", date,
               "_head_derep.fas at this location ", target_dir))
}

# This function dereplicates a fasta file based on sequences
##################################### seq_derep FUNCTION ##############################################################
# Dereplicate a single-line FASTA file (one header line followed by one
# sequence line per record) by sequence: of several records that share an
# identical sequence line only the first is kept.
#
# Arguments:
#   target_dir  Folder that receives the output. When left at the placeholder
#               default it is requested through readpath().
#   Seq_file    Path of the FASTA file; a relative path is resolved against
#               target_dir. When left at the placeholder default the file is
#               chosen through file.choose().
#
# Output: <yyyymmdd>_seq_derep.fas inside target_dir.
#
# The working directory is switched to target_dir only for the duration of
# the call and restored afterwards.
#
# Returns (invisibly): the completion message.
# Raises: an error when the file does not hold complete header/sequence pairs.
seq_derep <- function(target_dir="target_dir", Seq_file="Seq_file")
{

  # Using a shared function across multiple tools, obtain a target folder location
  if (identical(target_dir, "target_dir")) {
    target_dir <- readpath()
  }

  # Obtain a target file location
  if (identical(Seq_file, "Seq_file")) {

    # This is giving terminal instructions to select the file wanting to be used
    cmd_line_instruction_file()

    # Choose the fasta file you would like to work on:
    Seq_file <- file.choose()

  }

  if (!is.character(target_dir) || length(target_dir) != 1 ||
      is.na(target_dir) || !dir.exists(target_dir)) {
    stop("'target_dir' must be the path of an existing directory.",
         call. = FALSE)
  }

  # Make the folder absolute before changing directory so the output path
  # stays valid, and restore the caller's directory on exit.
  target_dir <- normalizePath(target_dir, winslash = "/", mustWork = TRUE)
  old_wd <- setwd(target_dir)
  on.exit(setwd(old_wd), add = TRUE)

  # Current Date - for file naming use. The format is purely numeric, so it
  # does not depend on the session locale.
  date <- format(Sys.Date(), "%Y%m%d")

  # Read the file line by line. FASTA is not a table: parsing it with
  # read.table() fails on headers that contain spaces, quotes or "#".
  fasta_lines <- trimws(readLines(Seq_file, warn = FALSE))
  fasta_lines <- fasta_lines[nzchar(fasta_lines)]

  if (length(fasta_lines) == 0 || length(fasta_lines) %% 2 != 0) {
    stop("The FASTA file must contain header/sequence pairs, ",
         "one line each.", call. = FALSE)
  }

  # Odd lines are headers, even lines are sequences
  Header0 <- fasta_lines[c(TRUE, FALSE)]
  Sequence <- fasta_lines[c(FALSE, TRUE)]

  # dereplicate based on sequences (first occurrence wins)
  keep <- !duplicated(Sequence)

  # rbind() puts headers in row 1 and sequences in row 2; reading the matrix
  # column by column interleaves them back into FASTA order.
  Write_to_file_out_str <- file.path(target_dir, paste0(date, "_seq_derep.fas"))
  writeLines(as.vector(rbind(Header0[keep], Sequence[keep])),
             Write_to_file_out_str)

  print(paste0("Task complete, look for output file ", date,
               "_seq_derep.fas at this location ", target_dir))
}



# This function changes a multi-line fasta file into a single line fasta format
##################################### multi_to_single_fasta FUNCTION ##############################################################
# Convert a multi-line FASTA file into single-line FASTA: every header line
# is followed by exactly one line holding its concatenated sequence.
#
# Arguments:
#   target_dir  Folder that receives the output. When left at the placeholder
#               default it is requested through readpath().
#   Seq_file    Path of the FASTA file; a relative path is resolved against
#               target_dir. When left at the placeholder default the file is
#               chosen through file.choose().
#
# Output: <yyyymmdd>_multi_to_single.fas inside target_dir. A line is a
#   header when it starts with ">". Lines before the first header are
#   ignored; a header without sequence lines is followed by an empty line.
#
# The working directory is switched to target_dir only for the duration of
# the call and restored afterwards.
#
# Returns (invisibly): the completion message.
# Raises: an error when the file contains no header line.
multi_to_single_fasta <- function(target_dir="target_dir", Seq_file ="Seq_file")
{

  # Using a shared function across multiple tools, obtain a target folder location
  if (identical(target_dir, "target_dir")) {
    target_dir <- readpath()
  }

  # Obtain a target file location
  if (identical(Seq_file, "Seq_file")) {

    # This is giving terminal instructions to select the file wanting to be used
    cmd_line_instruction_file()

    # Choose the fasta file you would like to work on:
    Seq_file <- file.choose()

  }

  if (!is.character(target_dir) || length(target_dir) != 1 ||
      is.na(target_dir) || !dir.exists(target_dir)) {
    stop("'target_dir' must be the path of an existing directory.",
         call. = FALSE)
  }

  # Make the folder absolute before changing directory so the output path
  # stays valid, and restore the caller's directory on exit.
  target_dir <- normalizePath(target_dir, winslash = "/", mustWork = TRUE)
  old_wd <- setwd(target_dir)
  on.exit(setwd(old_wd), add = TRUE)

  # Current Date - for file naming use. The format is purely numeric, so it
  # does not depend on the session locale.
  date <- format(Sys.Date(), "%Y%m%d")

  # Read the file line by line. FASTA is not a table: parsing it with
  # read.table() fails on headers that contain spaces, quotes or "#".
  fasta_lines <- trimws(readLines(Seq_file, warn = FALSE))
  fasta_lines <- fasta_lines[nzchar(fasta_lines)]

  # Anchored test: only a leading ">" marks a header.
  is_header <- grepl("^>", fasta_lines)
  if (!any(is_header)) {
    stop("No FASTA header line (starting with '>') was found in the file.",
         call. = FALSE)
  }

  # The running count of headers gives every line the number of the record
  # it belongs to; 0 marks lines that precede the first header.
  record_id <- cumsum(is_header)
  n_records <- sum(is_header)
  is_body <- !is_header & record_id > 0

  # Concatenate the sequence lines of each record. Declaring all record
  # numbers as factor levels keeps records without sequence lines (as "").
  pieces <- split(fasta_lines[is_body],
                  factor(record_id[is_body], levels = seq_len(n_records)))
  sequences <- vapply(pieces, paste, character(1), collapse = "",
                      USE.NAMES = FALSE)

  # rbind() puts headers in row 1 and sequences in row 2; reading the matrix
  # column by column interleaves them back into FASTA order.
  Write_to_file_out_str <- file.path(target_dir,
                                     paste0(date, "_multi_to_single.fas"))
  writeLines(as.vector(rbind(fasta_lines[is_header], sequences)),
             Write_to_file_out_str)

  print(paste0("Task complete, look for output file ", date,
               "_multi_to_single.fas at this location ", target_dir))
}
# rgyoung6/Molecular-Data-Organization-for-Publication-MDOP documentation built on Jan. 21, 2020, 12:12 a.m.