OmicsMetaData: OmicsMetaData: an R-package for interoperable and re-usable biodiversity 'omics (meta)data

Documented in FileNames.to.Table prep.metadata.ENA renameSequenceFiles sync.metadata.sequenceFiles

#==============================================================
# Author Maxime Sweetlove
# Part of the POLA3R website (successor or mARS.biodiversity.aq)
# version 1.0 (2020-01-28)
# file encoding UTF-8
#
#==============================================================
# Tools for submitting nucleotide sequence data to EMBL-ENA
#==============================================================
#' collect the names of sequence files in a folder
#' @author Maxime Sweetlove
#' @family data archiving functions
#' @usage FileNames.to.Table(file.dir, paired=TRUE, seq.file.extension=".fastq.gz", 
#'   pairedEnd.extension=c("_1", "_2"))
#' @description makes a table with the all the names of sequence files in a folder. see details.
#' @param file.dir a character string. The path to the directory where the sequence files are stored
#' @param paired boolean. wether or not the sequence files are paired-end (forward _1, reverse_2) or single-end
#' @param seq.file.extension a character string. The file-extension of the sequence files
#' @param pairedEnd.extension a character vector of length 2. If the data is paired-end data, specify the forward (first element of the vector) and reverse (second) extension tags here. Default is c("_1", "_2")
#' @details fastq files from sequencing facilities often come with long and complex file names that were automatically generated by the sequencer machine and no longer resemble the original name of the sample. 
#'   This function is part of tools that help to get file names and easily convert them back into the original file names. It makes a table with the all the names of sequence files in a folder that can be saved as a CSV file.
#'   Using a text editor or excell, the user can then fill in the column with new name (that is, the desired name), and then use the renameSequenceFiles() function to rename the files.
#' @return a data frame with a column "OldName" that list the sequence file names, an empty column "NewName" to be filled in by the user and a column "FileName" with the full file name.
#' @examples 
#' \donttest{
#' FileNames.to.Table(file.dir="path/to/the/sequenceFilesFolder", paired=TRUE, 
#'                    seq.file.extension=".fastq.gz",
#'                    pairedEnd.extension=c("_1", "_2"))
#' }
#' @export
FileNames.to.Table <- function(file.dir=NULL, paired=TRUE, seq.file.extension=".fastq.gz",
                               pairedEnd.extension=c("_1", "_2")){

  fileNames <- list.files(file.dir)

  paired.fw_ext <- pairedEnd.extension[1]
  paired.rv_ext <- pairedEnd.extension[2]

  if(paired){
    names_fw <- fileNames[grepl(paste(paired.fw_ext, seq.file.extension, "$", sep=""), fileNames)]
    names_rv <- fileNames[grepl(paste(paired.rv_ext, seq.file.extension, "$", sep=""), fileNames)]
    names_core_fw <- gsub(paste(paired.fw_ext, seq.file.extension, "$", sep=""), "", names_fw)
    names_core_rv <- gsub(paste(paired.rv_ext, seq.file.extension, "$", sep=""), "", names_rv)
    names(names_fw)<-names_core_fw
    names(names_rv)<-names_core_rv

    if(length(intersect(names_core_fw, names_core_rv))==length(names_core_fw)){
      #re-order
      names_fw <- names_fw[order(factor(names(names_fw), levels=tolower(names(names_fw))))]
      names_rv <- names_rv[order(factor(names(names_fw), levels=tolower(names(names_fw))))]
      names_out <- data.frame(NewName=rep("", length(names_fw)), OldName=names(names_fw),
                              ForwardName=names_fw, ReverseName=names_rv,
                              stringsAsFactors = FALSE)
    }else{
      #non-matching file names for paired-end reads
      names_fw_extra <- setdiff(names(names_fw), names(names_rv))
      names_rv_extra <- setdiff(names(names_rv), names(names_fw))
      names_comb <- intersect(names(names_fw), names(names_rv))

      if(length(names_fw_extra)>0){
        fw_out <- data.frame(NewName="", OldName=names_fw_extra,
                               ForwardName=paste(names_fw_extra, paired.fw_ext, seq.file.extension, sep=""),
                               ReverseName="",
                               stringsAsFactors = FALSE)
      }else{
        fw_out <- data.frame(matrix(ncol=4, nrow=0))
        colnames(fw_out) <- c("NewName", "OldName", "ForwardName", "ReverseName")
      }
      if(length(names_rv_extra)>0){
        rv_out <- data.frame(NewName="", OldName=names_rv_extra,
                             ForwardName="",
                             ReverseName=paste(names_rv_extra, paired.rv_ext, seq.file.extension, sep=""),
                             stringsAsFactors = FALSE)
      }else{
        rv_out <- data.frame(matrix(ncol=4, nrow=0))
        colnames(rv_out) <- c("NewName", "OldName", "ForwardName", "ReverseName")
      }
      if(length(comb_out)>0){
        comb_out <- data.frame(NewName="", OldName=names_comb,
                               ForwardName=paste(names_comb, paired.fw_ext, seq.file.extension, sep=""),
                               ReverseName=paste(names_comb, paired.rv_ext, seq.file.extension, sep=""),
                               stringsAsFactors = FALSE)
      }else{
        comb_out <- data.frame(matrix(ncol=4, nrow=0))
        colnames(comb_out) <- c("NewName", "OldName", "ForwardName", "ReverseName")
      }
      names_out <- rbind(fw_out, rv_out, comb_out)
    }
  }else{
    names_ext <- fileNames[grepl(paste(seq.file.extension, "$", sep=""), fileNames)]
    names_core <- gsub(paste(seq.file.extension, "$", sep=""), "", names_ext)
    names(names_ext)<-names_core

    names_out <- data.frame(NewName="", OldName=names(names_ext),
                            FileName=names_ext,
                            stringsAsFactors = FALSE)
  }
  return(names_out)
}


#' renames sequence files in a directory
#' 
#' @author Maxime Sweetlove
#' @family data archiving functions
#' @description  This function changes the filenames in the given directory according to the information provided in name_df
#' @usage renameSequenceFiles(name_df, colNameOld="OldName", 
#'   colNameNew="NewName", file.dir=NULL, paired=TRUE, 
#'   seq.file.extension=".fastq.gz", ask.input=TRUE, 
#'   pairedEnd.extension=c("_1", "_2"))
#' @param name_df a data.frame. The dataframe that lists the original sample same (without file extension) alongside the new file names (also without their file extension)
#' @param colNameOld a character string. The name of the column with the original file names
#' @param colNameNew a character string. The name of the column with the new file names
#' @param file.dir a character string. The path to the directory where the sequence files are stored
#' @param paired boolean. wether or not the sequence files are paired-end (forward _1, reverse_2) or single-end
#' @param seq.file.extension a character string. The file-extension of the sequence files
#' @param ask.input boolean. Will give a warning message before continuing.
#' @param pairedEnd.extension a character vector of length 2. If the data is paired-end data, specify the forward (first element of te vector) and reverse (second) extension tags here. Default is c("_1", "_2")
#' @details fastq files from sequencing facilities often come with long and complex file names that were automatically generated by the sequencer machine and no longer resemble the original name of the sample. This function is part of tools that help to get file names and easily convert them back into the original file names. It makes use of a table where each sequence file name is linked to a new name desired by the user. This can be generated by the FileNames.to.Table function.
#' @return the number of files changed.
#' @examples 
#' \dontrun{
#' fileNamesTable_rename <- data.frame(NewName=c("s1renamed", "s2renamed"), 
#'                                     OldName=c("seq_sample1", "seq_sample2"))
#' renameSequenceFiles(name_df=fileNamesTable_rename, colNameOld="OldName", 
#'                     colNameNew="NewName", file.dir=file_dir, paired=TRUE, 
#'                     seq.file.extension=".fastq.gz", ask.input=TRUE, 
#'                     pairedEnd.extension=c("_1", "_2"))
#' }
#' @export
renameSequenceFiles <- function(name_df, colNameOld="OldName", colNameNew="NewName",
                                 file.dir=NULL, paired=TRUE, seq.file.extension=".fastq.gz",
                                 ask.input=TRUE, pairedEnd.extension=c("_1", "_2")){
  if(ask.input){
    message("Warnings:\n\t- This function will rename files, this is irreversible.\n\t- This function assumes the provided old names correspond with the files in the given directory.\n\t\tThis can be checked with checked sync.metadata.sequenceFiles()\nContinue? (y/n)")
    ctu <- readLines(con = getOption("mypkg.connection"), n=1)
    if(tolower(ctu) %in% c("n", "no")){
      stop("you chose to stop the function")
    }
  }

  raw_fileNames <- list.files(file.dir)
  name_df <- name_df[!name_df[, colNameOld]=="",]

  paired.fw_ext <- pairedEnd.extension[1]
  paired.rv_ext <- pairedEnd.extension[2]

  if(paired){
    OldName_1 <- unlist(unname(sapply(name_df[, colNameOld], FUN=function(x){
      paste(x, paired.fw_ext, seq.file.extension, sep="")
    })))
    OldName_2 <- unlist(unname(sapply(name_df[, colNameOld], FUN=function(x){
      paste(x, paired.rv_ext, seq.file.extension, sep="")
    })))
    NewName_1 <- unlist(unname(sapply(name_df[, colNameNew], FUN=function(x){
      paste(x, paired.fw_ext, seq.file.extension, sep="")
    })))
    NewName_2 <- unlist(unname(sapply(name_df[, colNameNew], FUN=function(x){
      paste(x, paired.rv_ext, seq.file.extension, sep="")
    })))
    Names <- data.frame(Old=c(OldName_1, OldName_2), New=c(NewName_1, NewName_2))
  }else{
    OldNames <- unlist(unname(sapply(name_df[, colNameOld], FUN=function(x){
      paste(x, seq.file.extension, sep="")
    })))
    NewNames <- unlist(unname(sapply(name_df[, colNameNew], FUN=function(x){
      paste(x, seq.file.extension, sep="")
    })))
    Names <- data.frame(Old=OldNames, New=NewNames)
  }

  Names <- Names[Names$Old %in% intersect(Names$Old, raw_fileNames),]
  numFiles=0
  for(fl in Names$Old ){
    fl_pth <- paste(file.dir, fl, sep="/")
    fl_new <- Names[Names$Old==fl,]$New
    if(length(fl_new)>0){
      fl_new_pth <- paste(file.dir, fl_new, sep="/")
      file.rename(fl_pth, fl_new_pth)
      numFiles<-numFiles+1
    }
  }
  
  message(paste("renamed", numFiles, "files"))
  return(numFiles)
}


#' Check if all samples in a dataframe have sequence data
#' @author Maxime Sweetlove
#' @family data archiving functions
#' @usage sync.metadata.sequenceFiles(Names, file.dir=NULL, 
#'   paired=TRUE, seq.file.extension=".fastq.gz", 
#'   pairedEnd.extension=c("_1", "_2"))
#' @description checks if all records in the metadata file match with filenames in a given directory
#' @param Names a vector or a MIxS.metadata class object. A character vector with the sample names (without a file extension) or the MIxS.metadata object of which the sample names (original_name) should be compared to the sequence file names
#' @param file.dir a character string. The path to the directory where the sequence files are stored
#' @param paired boolean. wether or not the sequence files are paired-end (forward _1, reverse_2) or single-end
#' @param seq.file.extension a character string. The file-extension of the sequence files
#' @param pairedEnd.extension a character vector of length 2. If the data is paired-end data, specify the forward (first element of te vector) and reverse (second) extension tags here. Default is c("_1", "_2")
#' @details Nucleotide sequence datasets typically contain a large number of samples, too much to check manually. This function will check if all the names that are expected in the sequence data files are actually found. Reasons for mismatches may include typos, accedentally deleting files or records, errors in naming or copying files,...
#' @return a list of two. Two lists with the non-matching file names. redundant_metadata shows the names found in the metadata but not among the sequences, redundant_seqFiles shows sequence file names that were not listed in the metadata.
#' @examples 
#' \donttest{
#' sync.metadata.sequenceFiles(Names="seq_sample1", file.dir="user/path/to/sequenceFilesFolder",
#'                             paired=TRUE, seq.file.extension=".fastq.gz",
#'                             pairedEnd.extension=c("_1", "_2"))
#' }
#' @export
sync.metadata.sequenceFiles <- function(Names, file.dir=NULL,
                                        paired=TRUE, seq.file.extension=".fastq.gz",
                                        pairedEnd.extension=c("_1", "_2")){

  paired.fw_ext <- pairedEnd.extension[1]
  paired.rv_ext <- pairedEnd.extension[2]

  # 1. check input and get expected names
  if(check.valid.MIxS.metadata(Names)){
    metadata <- Names@data
    if(!any(rownames(metadata) %in% c(1:nrow(metadata))) &&
       length(intersect(rownames(metadata), clean_fileNames))!=0){
      # attempt 1: use the rownames of metadata
      exp_names <- rownames(metadata)
    }else if("original_name" %in% colnames(metadata) &&
             length(intersect(metadata$original_name, clean_fileNames))!=0){
      # attempt 2: use original_name
      exp_names <- metadata$original_name
    }else{
      stop("could not find the sample names in the metadata object")
    }
  }else{
    exp_names <- as.character(Names)
  }

  # 3. get and check the observed file names
  raw_fileNames <- list.files(file.dir)
  # only keep the sequence files
  raw_fileNames <- raw_fileNames[grepl(seq.file.extension, raw_fileNames)]

  if(paired){
    clean_fileNames <- unname(gsub(paste(paired.fw_ext, seq.file.extension,sep=""), "", raw_fileNames))
    clean_fileNames <- clean_fileNames[!grepl(seq.file.extension, clean_fileNames)]

    ## check if all files are paired
    files_rv_check <- unname(gsub(paste(paired.rv_ext, seq.file.extension,sep=""), "", raw_fileNames))
    files_rv_check <- files_rv_check[!grepl(seq.file.extension, files_rv_check)]
    files_diff <- c(setdiff(clean_fileNames, files_rv_check), setdiff(files_rv_check, clean_fileNames))
    if(length(files_diff)!=0){
      stop(paste("non-matching forward and reverse files found for:\n",paste(files_diff, collapse="\n"), sep="" ))
    }
  } else{
    clean_fileNames <- gsub(seq.file.extension, "", raw_fileNames)
  }

  # 4. check if expected and observed file names match
  wrn <- "" #warning messages
  out <- list(redundant_metadata=NULL, redundant_seqFiles=NULL) #output: two lists with the non-matching file names
  # 4.1 in expected but not in observed : warning and/or delete
  exp_diff <- setdiff(exp_names, clean_fileNames)
  if(length(exp_diff) != 0){
    out$redundant_metadata <- exp_diff
    if(length(exp_diff)>10){
      wrn <- paste(wrn,
                   length(exp_diff),
                   " samples were not found among the sequence files. Here are the first few:\n\t",
                   paste(exp_diff[1:10], collapse="\n\t"), "\n\t...\n", sep="")
    } else{
      wrn <- paste(wrn,
                   "the following samples samples were not found among the sequence files:\n\t",
                   paste(exp_diff, collapse="\n\t"), sep="")
    }
  }
  # 4.2 in observed but not in expected : just warning
  obs_diff <- setdiff(clean_fileNames, exp_names)
  if(length(obs_diff) != 0){
    if(paired){
      obs_diff_1 <- unname(sapply(obs_diff, FUN=function(x){paste(x, paired.fw_ext, seq.file.extension, sep="")}))
      obs_diff_2 <- unname(sapply(obs_diff, FUN=function(x){paste(x, paired.rv_ext, seq.file.extension, sep="")}))
      obs_diff <- c(obs_diff_1, obs_diff_2)
    }else{
      obs_diff <- unname(sapply(obs_diff, FUN=function(x){paste(x, seq.file.extension, sep="")}))
    }
    out$redundant_seqFiles <- obs_diff
    if(length(obs_diff)>10){
      wrn <- paste(wrn,
                   length(obs_diff),
                   " sequence files did not occur in the sample list. Here are the first few:\n\t",
                   paste(obs_diff[1:10], collapse="\n\t"),"\n\t...", sep="")
    } else{
      wrn <- paste(wrn,
                   "the following sequence files did not occur in the sample list:\n\t",
                   paste(obs_diff, collapse="\n\t"), sep="")
    }
  }

  # 5 finalize
  wrn <- gsub("^\n", "", wrn)
  if(nchar(wrn)>0){
    warning(wrn)
  }else{
    message("all records in the metadata matched with sequence files.\n")
  }

  return(out)
}


#' Formatting metadata and environmental data to upload to EMBL-ENA
#' 
#' @author Maxime Sweetlove
#' @family data archiving functions
#' @description converts metadata into the right format (a tab separated text file) to submit data to ENA (version Januari 2020)
#' @param metadata a MIxS.metadata class object. The object to be written as text file suited for submission to ENA.
#' @param file.name a character string. A name (without a file type extension) to use for the output files.
#' @param dest.dir a character string. The file path to the directory where the output files must be written. If left blank files are written to the working directory.
#' @param sample_unique_name_prefix a character string. The unique name prefix to append to the sample names (to tie them all together). Required for ENA submissions
#' @param checklist_accession a character string. The name of a MIxS environmetal package or it's ENA checklist accession number.
#' @param tax_name a character string. The scientific name of a taxon targeted in the data (applicable to all the samples). Same as "subspecf_gen_lin" or "scientific_name" in the MIxS.metadata input.
#' @param ask.input boolean. Whether or not to ask for user input to make decisions or solve problems that arise during the reformatting (e.g. missing data,...)
#' @param insert.size a character string. The size of the reads (in number of basepairs), if applicable to all samples.
#' @param library.layout a character string. The layout of the library, either PAIRED or SINGLE, if applicable to all samples.
#' @param library.strategy a character string. The library strategy (e.g. AMPLICON, WGS,...), if applicable to all samples.
#' @param library.selection a character string. The method used to select for, enrich or or screen the material being sequenced (e.g. PCR)?, if applicable to all samples.
#' @param seq.file.extension a character string. The extension for the sequence files. Default is .fastq.gz
#' @param pairedEnd.extension a character vector of length 2. If the data is paired-end data, specify the forward (first element of the vector) and reverse (second) extension tags here. Default is c("_1", "_2")
#' @usage prep.metadata.ENA(metadata, dest.dir, file.name,
#'   sample_unique_name_prefix=NA, checklist_accession=NA, 
#'   tax_name=NA, ask.input=TRUE, insert.size=NA, library.layout=NA, 
#'   library.strategy=NA, library.selection=NA, 
#'   seq.file.extension=".fastq.gz", 
#'   pairedEnd.extension=c("_1", "_2"))
#' @importFrom tidyr separate
#' @importFrom utils write.table
#' @details This function will reformat metadata for submission to ENA. Specifically made for ecological environmental studies 
#'   (e.g. amplicon sequencing, shotgun metagenomics,...), with additional QCs build in for Antarctic and Southern Ocean data. 
#'   The assumption is taken that the dataset has already been subjected a quality controll.
#' @return a *.tsv file with the sample metadata, and a \*_runInfo.tsv file with the technical data, written to the destination directory.
#' @examples 
#' \dontrun{
#' sampleNames <- c("sample_1", "sample_2")
#' test_MIxS <- new("MIxS.metadata",
#'                   data = data.frame(var1=c(1,2), var2=c(3,4), 
#'                                     eventID=sampleNames, 
#'                                     target_gene=c("16S", "18S"), 
#'                                     subspecf_gen_lin=c("Bacteria", "Eukaryota"),
#'                                     seq_meth=c("Illumina MiSeq", "Illumina MiSeq"),
#'                                     row.names=sampleNames),
#'                   section = c(var1="section1", var2="section1", eventID="miscellaneous",
#'                               target_gene="miscellaneous", subspecf_gen_lin="miscellaneous",
#'                               seq_meth="miscellaneous"),
#'                   units = c(var1="unit1", var2="unit2", eventID="alphanumeric",
#'                             target_gene="alphanumeric", subspecf_gen_lin="alphanumeric",
#'                             seq_meth="alphanumeric"),
#'                   env_package = "water",
#'                   type = "versatile",
#'                   QC = TRUE)
#' prep.metadata.ENA(metadata=test_MIxS, dest.dir=getwd(), file.name="testthat",
#'                   sample_unique_name_prefix=NA, checklist_accession=NA,
#'                   tax_name=NA, ask.input=FALSE,
#'                   insert.size=NA, library.layout=NA,
#'                   library.strategy=NA, library.selection=NA,
#'                   seq.file.extension=".fastq.gz", pairedEnd.extension=c("_1", "_2"))
#' }
#' @export
prep.metadata.ENA <- function(metadata, dest.dir=NULL, file.name=NULL,
                              sample_unique_name_prefix=NA, checklist_accession=NA,
                              tax_name=NA, ask.input=TRUE,
                              insert.size=NA, library.layout=NA,
                              library.strategy=NA, library.selection=NA,
                              seq.file.extension=".fastq.gz", pairedEnd.extension=c("_1", "_2")
){
  warningmessages<-c()

  # 1. check input
  if(check.valid.MIxS.metadata(metadata)){
    temp_unique_name_prefix <- deparse(substitute(metadata))
    metaunits <- metadata@units
    env_package <- metadata@env_package
    metadata <- metadata@data
  }else{
    stop("metadata argument must be a MIxS.metadata class object.
         Common dataframes should be converted to MIxS.metadata with
         the data.frame.to.MIxS.metadata function.")
  }
  # 2. check output destination
  if(!is.character(file.name) | c(NULL,NA) %in% file.name | length(file.name)>1 | grepl("/", file.name)){
    stop("No correct file name provided in the file.name argument.")
  }else{
    if(grepl(".txt$", file.name)){
      file.name <- gsub(".txt$", "", file.name)
    }else if(grepl(".csv$", file.name)){
      file.name <- gsub(".csv$", "", file.name)
    }
  }

  if(!is.null(dest.dir)){
    if(!dir.exists(file.path(dest.dir))){
      stop("Could not find the directory provided in the name argument.")
    }
    dest.dir<-gsub("/$","", dest.dir)
  }else{
    dest.dir <- getwd()
  }

  # 3. checklist_accession
  if(any(c(NULL,NA, "NA") %in% checklist_accession)){
    checklist_accession <- env_package
    if(checklist_accession %in% c("not_specified", "multiple_packages", "")){
      checklist_accession <- ENA_checklistAccession[ENA_checklistAccession$env_package=="miscellaneous_natural_or_artificial_environment",]$ena_checklist_accession
      warningmessages <- multi.warnings("The environmental package of the input data was not specified or refered to multiple packages. The ENA default was therfore selected. This can be overridden by providing input for the checklist_accession argument.",
                                        warningmessages)
    } else{
      checklist_accession <-  as.character(ENA_checklistAccession[ENA_checklistAccession$env_package==checklist_accession,]$ena_checklist_accession)
    }
  }else if(length(checklist_accession)==1){
    if(checklist_accession %in% ENA_checklistAccession$env_package){
      checklist_accession <- as.character(ENA_checklistAccession[ENA_checklistAccession$env_package==checklist_accession,]$ena_checklist_accession)
    }else if(! checklist_accession %in% ENA_checklistAccession$ena_checklist_accession){
      stop("The input for the checklist_accession argument was invalid.
           Must be a MIxS package (check spelling, use of underscores) or
           an ENA checklist accession number that refers to a MIxS package.")
    }
    } else{
      stop("The input for the checklist_accession argument was invalid.")
    }
  out.line01 <- paste("#checklist_accession", checklist_accession, sep='\t')

  # 4. sample_unique_name_prefix
  if(c(NULL,NA) %in% sample_unique_name_prefix | length(sample_unique_name_prefix)>1){
    if(ask.input){
      message("Please provide a unique name prefix (sample_unique_name_prefix argument)\n")
      sample_unique_name_prefix <- readLines(con = getOption("mypkg.connection"), n=1)
    }else{
      sample_unique_name_prefix <- temp_unique_name_prefix
    }
  }
  out.line02 <- paste("#unique_name_prefix", sample_unique_name_prefix, sep='\t')

  # 5. the actual data:
  ena_metadata <- data.frame(row.names=rownames(metadata)) #the values
  ena_variable <- c() #the row with the variable names (use of spaces by ENA makes is difficult to include with the values)
  ena_units <- c() #the unit row

  # 5.1 fixed term sample_alias (original sample name)
  if("original_name" %in% colnames(metadata)){
    ena_metadata$sample_alias<-metadata$original_name
  }else if("misc_param" %in% colnames(metadata)){
    msc <- strsplit(metadata[1,]$misc_param, ";")
    msc <- unlist(lapply(msc, function(x){strsplit(x, ":")}))
    if("original_name" %in% msc){
      sample_title <-c()
      for(stl in 1:nrow(metadata)){
        sName <- strsplit(metadata[stl,]$misc_param, ";")[[1]]
        sName <- sName[grepl("original_name:", sName)]
        sName <- strsplit(sName, ":")[[1]][2]
        if(sName=="NA" | sName=="NULL"){
          sName <- ""
        }
        sample_title <- c(sample_title, sName)
      }
      ena_metadata$sample_alias <- sample_title
    }
  }else{
    ena_metadata$sample_alias<-rownames(metadata)
  }
  ena_variable <- c(ena_variable, "sample_alias")
  ena_units <- c(ena_units, "#units")

  # 5.2 fixed term taxon: common_name
  # = The taxonomic classification (based on the ENA taxIDs) of the sample
  # for molecular data: assume environmental metagnemome unless enough evidence geven for organism/taxon
  if(c(NULL,NA) %in% tax_name){ #see if input was provided
    #look for a clue in the data
    c_taxa<-intersect(c("subspecf_gen_lin", "scientific_name"), colnames(metadata))
    if(length(c_taxa)>=1){
      tax_name <- metadata[,colnames(metadata)==c_taxa[1]]
    }else{
      if(ask.input){
        message("No Target taxon found:\n\tPlease type a taxon name (e.g. Bacteria, Cyanobacteria, Synechococcus,...). Or type n to ignore.\n")
        tax_name <- readLines(con = getOption("mypkg.connection"), n=1)
        if(!tax_name %in% c("n", "N")){
          tax_name <- rep(tax_name, nrow(metadata))
        } else{
          tax_name <- rep("not collected", nrow(metadata))
        }
      }else{
        tax_name <- rep("not collected", nrow(metadata))
      }
    }
  }else{
    tax_name <- rep(tax_name, nrow(metadata))
  }
  # common tax name
  ena_metadata$common_name <- tax_name
  ena_variable <- c(ena_variable, "common_name")
  ena_units <- c(ena_units, "")

  # 5.3 fixed term taxID and scientific tax name
  # taxID
  # investigation_type has priority because if sample is environmental,
  # a unknown variety and number of organisms will be present, thus one of the environmental
  # metagenome taxIDs should be taken
  if("investigation_type" %in% colnames(metadata)){
    ena_metadata$tax_name <- sapply(metadata$investigation_type, function(x){
      x <- gsub("mimarks-survey", "metagenome", x)
      return(x)})
    if(!all(unique(ena_metadata$tax_name)=="metagenome")){
      for(itx in 1:length(ena_metadata$tax_name)){
        if(!ena_metadata[itx,]$tax_name=="metagenome"){
          ena_metadata[itx,]$tax_name <- tax_name[itx]
        }
      }
    }
  }else{
    ena_metadata$tax_name <- as.character(tax_name) #just to create the column
  }
  # first fill in the scientific_name, already present as ena_metadata$tax_name
  ena_variable <- c(ena_variable, "scientific_name")
  ena_units <- c(ena_units, "")

  # then convert scientific_name to its tax_id
  ena_metadata$tax_IDs <- ena_metadata$tax_name
  ena_metadata$tax_IDs <- sapply(ena_metadata$tax_IDs, FUN=function(x){commonTax.to.NCBI.TaxID(x)})
  ena_variable <- c(ena_variable, "tax_id")
  ena_units <- c(ena_units, "")

  # 5.4 fixed term sample_title	(original sample name)
  if("original_name" %in% colnames(metadata)){
    ena_metadata$sample_title<-metadata$original_name
  }else if("misc_param" %in% colnames(metadata)){
    msc <- strsplit(metadata[1,]$misc_param, ";")
    msc <- unlist(lapply(msc, function(x){strsplit(x, ":")}))
    if("original_name" %in% msc){
      sample_title <-c()
      for(stl in 1:nrow(metadata)){
        sName <- strsplit(metadata[stl,]$misc_param, ";")[[1]]
        sName <- sName[grepl("original_name:", sName)]
        sName <- strsplit(sName, ":")[[1]][2]
        if(sName=="NA" | sName=="NULL"){
          sName <- ""
        }
        sample_title <- c(sample_title, sName)
      }
      ena_metadata$sample_title <- sample_title
    }
  }else{
    ena_metadata$sample_title<-rownames(metadata)
  }
  ena_variable <- c(ena_variable, "sample_title")
  ena_units <- c(ena_units, "")

  # 5.5 fixed term sample_description
  if("sample_description" %in% colnames(metadata)){
    ena_metadata$sample_description<-metadata$sample_description
  }else{
    if(ask.input){
      message("No sample description was found:\n\tPlease type a sample description. Or type n to ignore.\n")
      descr <- readLines(con = getOption("mypkg.connection"), n=1)
      if(!descr %in% c("n", "N")){
        ena_metadata$sample_description<-rep(descr, nrow(metadata))
      }else{
        ena_metadata$sample_description<-rep("", nrow(metadata))
      }
    }
  }
  ena_variable <- c(ena_variable, "sample_description")
  ena_units <- c(ena_units, "")

  # 5.6 geographic location (to get it on the right place)
  if("geo_loc_name" %in% colnames(metadata)){
    # for geographic location, only the names in the ENA_geoloc vector are allowed, oherwise ENA trows an error at submission
    # check if valid:
    if(any(!unique(metadata$geo_loc_name) %in% ENA_geoloc)){
      # not valid location names
      geoloc_data <- data.frame(geoloc_orig = as.character(unique(metadata$geo_loc_name)),
                                geoloc_ena = as.character(unique(metadata$geo_loc_name)),
                                stringsAsFactors = FALSE)
      geoloc_data <- geoloc_data[!geoloc_data$geoloc_orig=="",] #empty values will be handled at the end
      if(any(grepl(":", geoloc_data$geoloc_orig)) |
         any(grepl("|", geoloc_data$geoloc_orig)) |
         any(grepl(",", geoloc_data$geoloc_orig)) |
         any(grepl(";", geoloc_data$geoloc_orig))){
        # probably a hierarchical order of nested locations
        geoloc_data$geoloc_orig <- sapply(geoloc_data$geoloc_orig, FUN=function(x){
          gsub("|",":",x, fixed=TRUE)})
        geoloc_data$geoloc_orig <- sapply(geoloc_data$geoloc_orig, FUN=function(x){
          gsub(",",":",x)})
        geoloc_data$geoloc_orig <- sapply(geoloc_data$geoloc_orig, FUN=function(x){
          gsub(";",":",x)})
        for(l in 1:nrow(geoloc_data)){
          loc <- strsplit(geoloc_data[l,1], ":")[[1]]
          loc <- trimws(loc, which="both")
          i_loc <- intersect(tolower(loc), tolower(ENA_geoloc))
          if(length(i_loc)<1){
            bad_input <- TRUE
          }else if(length(i_loc)>1){
            if("antarctica" %in% tolower(i_loc) & "southern ocean" %in% tolower(i_loc)){
              i_loc <- "Southern Ocean"
              bad_input <- FALSE
            } else{
              bad_input <- TRUE
            }
          }else{
            i_loc <- ENA_geoloc[which(grepl(i_loc, tolower(ENA_geoloc)))]
            bad_input <- TRUE
          }
          if(bad_input & ask.input){
            if(ask.input){
              while(bad_input){
                message(paste("Illegal geographic location name found for \"",geoloc_data[l,1],"\".\n\tPlease type the correct name.\n\tType n to ignore.\n\tType h to see the allowed terms first.", sep=""))
                loc_input <- readLines(con = getOption("mypkg.connection"), n=1)
                if(!loc_input %in% c("n", "N", "h", "H") & loc_input %in% ENA_geoloc){
                  i_loc<- loc_input
                  bad_input <- FALSE
                }else if(loc_input %in% c("n", "N")){
                  i_loc <- ""
                  bad_input <- FALSE
                }else if(loc_input %in% c("h", "H")){
                  print(sort(ENA_geoloc))
                }
              }
            }else{
              i_loc <- ""
            }
          }
          geoloc_data[l,2] <- as.character(i_loc)
        }
      }
      ena_metadata$geo_loc_name <- unname(unlist(sapply(as.character(metadata$geo_loc_name), FUN=function(x){
        if(x %in% geoloc_data$geoloc_orig){
          x<-geoloc_data[geoloc_data$geoloc_orig==x,]$geoloc_ena
        }else{
          x<-""
        }
        return(x)
      })))

    }else{
      ena_metadata$geo_loc_name <- metadata$geo_loc_name
    }
    ena_variable <- c(ena_variable, get.ENAName("geo_loc_name"))
    ena_units <- c(ena_units, "")
  } else{
    ena_metadata$geo_loc_name <- rep("not collected", nrow(ena_metadata))
    ena_variable <- c(ena_variable, get.ENAName("geo_loc_name"))
    ena_units <- c(ena_units, "")
  }

  # 5.7 latlon DD
  if("decimalLatitude" %in% colnames(metadata) &
     "decimalLongitude" %in% colnames(metadata)){
    ena_metadata$decimalLatitude <- metadata$decimalLatitude
    ena_metadata$decimalLongitude <- metadata$decimalLongitude
  }else if("lat_lon" %in% colnames(metadata)){
    metadata <- separate(metadata, "lat_lon", sep=" ", into=c("decimalLatitude", "decimalLongitude"))
    ena_metadata$decimalLatitude <- metadata$decimalLatitude
    ena_metadata$decimalLongitude <- metadata$decimalLongitude
  }else{
    warningmessages <- multi.warnings("The data lacks geographical coordinates.", warningmessages)
    ena_metadata$decimalLatitude <- rep("not collected", nrow(ena_metadata))
    ena_metadata$decimalLongitude <- rep("not collected", nrow(ena_metadata))
  }
  ena_variable <- c(ena_variable, get.ENAName("decimalLatitude"), get.ENAName("decimalLongitude"))
  ena_units <- c(ena_units, "DD", "DD")

  # 5.8 convert remaining MIxS terms to ENA variants
  redundant_cols <- which(colnames(metadata) %in% c("sample_description", "lat_lon", "decimalLatitude",
                                                    "decimalLongitude", "geo_loc_name",
                                                    "original_name", "subspecf_gen_lin", "scientific_name"))
  if(length(redundant_cols)>0){
    metadata_reduced <- metadata[,-redundant_cols]
    metaunits <- metaunits[-redundant_cols]
    for(cl in 1:ncol(metadata_reduced)){
      clName <- colnames(metadata_reduced[cl])
      ena_name <- get.ENAName(clName)
      if(length(ena_name) == 0 || ena_name==""){
        ena_name <- clName
      }
      ena_metadata[,clName] <- c(as.character(metadata_reduced[,colnames(metadata_reduced) %in% clName]))
      ena_variable <- c(ena_variable, ena_name)
      ena_units <- c(ena_units, gsub("alphanumeric", "", metaunits[cl]))
    }
  }


  # 6. convert missing data for required fields to ENA accepted term "not collected"
  #ENArequired_terms <- c("lat_lon", "decimalLatitude", "decimalLongitude", "geo_loc_name",
  #                       "collection_date", "seq_meth", "investigation_type", "alt",
  #                       "elev", "depth")
  for(required_val in colnames(ena_metadata)){
    ena_metadata[,required_val]<- sapply(ena_metadata[,required_val], function(x){
      x<-as.character(x)
      if(x==""|is.na(x)|x=="ND"|x=="NA"){
        x<-"not collected"}
      return(x)})
  }

  if("host_taxid" %in% colnames(ena_metadata)){
    ena_metadata$host_taxid<- sapply(ena_metadata$host_taxid, function(x){
      x<-as.character(x)
      if(x==""){
        x<-"NCBI:txid12908"}
      return(x)})
  }


  # 7. the runInfo data (= technical details of the samples)
  # 7.1. sample_alias and creating the runInfo data frame
  ena_runInfo <- data.frame(sample_alias = ena_metadata$sample_alias,
                            instrument_model = "", library_name = "", library_source = "",
                            library_selection = "", library_strategy = "", library_layout = "",
                            design_description = "", library_construction_protocol = "",
                            insert_size = "", forward_file_name = "", forward_file_md5 = "",
                            reverse_file_name = "", reverse_file_md5 = "",
                            stringsAsFactors = FALSE)

  # 7.2 instrument_model
  if("seq_meth" %in% colnames(metadata)){
    ena_runInfo$instrument_model <- metadata$seq_meth
  }

  if(any(! unique(ena_runInfo$instrument_model) %in% ENA_instrument) |
     !"seq_meth" %in% colnames(metadata) &
     ask.input){
    bad_input <- TRUE
    while(bad_input){
      message("No valid sequencing instrument model found:\n\tPlease type the correct instrument model.\n\tType n to ignore.\n\tType h to see the allowed instrument models first.\n")
      descr <- readLines(con = getOption("mypkg.connection"), n=1)
      if(!descr %in% c("n", "N", "h", "H") & descr %in% ENA_instrument){
        ena_runInfo$instrument_model <- rep(descr, nrow(ena_runInfo))
        bad_input <- FALSE
      } else if(descr %in% c("h", "H")){
        print(ENA_instrument)
      } else if(descr %in% c("n", "N")){
        ena_runInfo$instrument_model <- rep("", nrow(ena_runInfo))
        bad_input <- FALSE
      }
    }
  }


  # 7.3. library_name
  if("original_name" %in% colnames(metadata)){
    ena_runInfo$library_name <- metadata$original_name
  }

  # 7.4. library_source
  if("library_source" %in% colnames(metadata)){
    ena_runInfo$library_source <- toupper(metadata$library_source)
  }else if(ask.input){
    bad_input<-TRUE
    while(bad_input){
      message("Could the data be cathegorized as:\n\ta) metagenomic\n\tb) metatranscriptomic\n\tc) genomic\n\td)viral RNA\n\te)other\n\tType one of the letters (a-e) or n to ignore.\n")
      descr <- readLines(con = getOption("mypkg.connection"), n=1)
      if(tolower(descr) %in% c("a", "b", "c", "d", "e")){
        if(tolower(descr)=="a"){
          descr <- "METAGENOMIC"
        }else if(tolower(descr)=="b"){
          descr <- "METATRANSCRIPTOMIC"
        }else if(tolower(descr)=="c"){
          descr <- "GENOMIC"
        }else if(tolower(descr)=="d"){
          descr <- "VIRAL RNA"
        }else if(tolower(descr)=="e"){
          descr <- "OTHER"
        }
        ena_runInfo$library_source <- rep(descr, nrow(ena_runInfo))
        bad_input<-FALSE
      }else if(descr %in% c("n", "N")){
        bad_input<-FALSE
      }
    }

  }else if("investigation_type" %in% colnames(metadata)){
    # metagenomic
    ena_runInfo$library_source <- gsub("mimarks-survey", "METAGENOMIC", metadata$investigation_type)
    ena_runInfo$library_source <- gsub("metagenome", "METAGENOMIC", ena_runInfo$library_source)
  }

  if(any(! unique(ena_runInfo$library_source)  %in% c("GENOMIC", "METAGENOMIC",
                                                      "OTHER", "TRANSCRIPTOMIC",
                                                      "METATRANSCRIPTOMIC", "VRIAL RNA",
                                                      "SYNTHETIC", ""))){
    stop("Invalid input for \"library_source\"\n\tOnly following terms allowed: GENOMIC, METAGENOMIC,\n\tOTHER, TRANSCRIPTOMIC, METATRANSCRIPTOMIC, VRIAL RNA, or SYNTHETIC")
  }

  # 7.5. library_selection
  if(!is.na(library.selection) && library.selection %in% ENA_select){
    ena_runInfo$library_selection <- rep(library.selection, nrow(ena_runInfo))
  }else if(ask.input){
    bad_input <- TRUE
    while(bad_input){
      message("What method was used to select for, enrich or or screen the material being sequenced (e.g. PCR)?\n\tType the appropriate method.\n\tType n to ignore.\n\tType h to see all the allowed methods.\n")
      descr <- readLines(con = getOption("mypkg.connection"), n=1)
      if(!descr %in% c("n", "N", "h", "H") & descr %in% ENA_select){
        ena_runInfo$library_selection <- rep(descr, nrow(ena_runInfo))
        bad_input <- FALSE
      } else if(descr %in% c("h", "H")){
        print(ENA_select)
      } else if(descr %in% c("n", "N")){
        ena_runInfo$library_selection <- rep("unspecified", nrow(ena_runInfo))
        bad_input <- FALSE
      }
    }
  }

  # 7.6. library_strategy
  if(!is.na(library.strategy) && library.strategy %in% ENA_strat){
    ena_runInfo$library_strategy <- rep(library.strategy, nrow(ena_runInfo))
  }else if("run_type" %in% colnames(metadata)){
    ena_runInfo$library_strategy <- metadata$run_type
  }else if("library_strategy" %in% colnames(metadata)){
    ena_runInfo$library_strategy <- metadata$library_strategy
  }
  if(any(! unique(ena_runInfo$library_strategy) %in% ENA_strat) &
     ask.input){
    bad_input <- TRUE
    while(bad_input){
      message("No valid library strategy found:\n\tPlease type the correct strategy (e.g. AMPLICON, WGS,...).\n\tType n to ignore.\n\tType h to see the allowed strategies.\n")
      descr <- readLines(con = getOption("mypkg.connection"), n=1)
      if(!descr %in% c("n", "N", "h", "H") & descr %in% ENA_strat){
        ena_runInfo$library_strategy <- rep(descr, nrow(ena_runInfo))
        bad_input <- FALSE
      } else if(descr %in% c("h", "H")){
        print(ENA_strat)
      } else if(descr %in% c("n", "N")){
        ena_runInfo$library_strategy <- rep("", nrow(ena_runInfo))
        bad_input <- FALSE
      }
    }
  }

  # 7.7. library_layout SINGLE or PAIRED
  if(!is.na(library.layout) && library.layout %in% c("PAIRED", "SINGLE")){
    ena_runInfo$library_layout <- rep(library.layout, nrow(ena_runInfo))
  }else if("library_layout" %in% colnames(metadata)){
    ena_runInfo$library_layout <- metadata$library_layout
  }else if(ask.input){
    bad_input<-TRUE
    while(bad_input){
      message("What was the library layout?.\n\ta) paired-end\n\tb) single-end\n\tType the appropriate letter (a-b).\n")
      descr <- readLines(con = getOption("mypkg.connection"), n=1)
      if(descr %in% c("a", "A", "b", "B")){
        descr <- gsub("a", "PAIRED", tolower(descr))
        descr <- gsub("b", "SINGLE", tolower(descr))
        ena_runInfo$library_layout <- rep(descr, nrow(ena_runInfo))
        bad_input<-FALSE
      }else{
        message("wrong input\n")
      }
    }
  }

  # 7.8. design_description
  if("experimental_factor" %in% colnames(metadata)){
    ena_runInfo$design_description <- metadata$experimental_factor
  }

  # 7.9. library_construction_protocol
  if("lib_const_meth" %in% colnames(metadata)){
    ena_runInfo$library_construction_protocol <- metadata$lib_const_meth
  }

  # 7.10. insert_size
  ## required parameter
  if(!is.na(insert.size)){
    ena_runInfo$insert_size <- rep(insert.size, nrow(ena_runInfo))
  }else if("insert_size" %in% colnames(metadata)){
    ena_runInfo$insert_size <- metadata$insert_size
  }else if(ask.input){
    message("What was the insert size of the reads?.\n\tType the appropriate number, or type n to ignore.\n")
    descr <- readLines(con = getOption("mypkg.connection"), n=1)
    if(grepl("[0-9]",descr)){
      ena_runInfo$insert_size <- rep(descr, nrow(ena_runInfo))
    }
  }

  # 7.11. forward_file_name / reverse_file_name
  if("original_name" %in% colnames(metadata)){
    # assuming the original names corresponds to the sequence data file names
    for(i in 1:nrow(ena_runInfo)){
      if(ena_runInfo[i,]$library_layout == "PAIRED"){
        # assuming _1 for forward files and _2 for reverse files
        ena_runInfo[i,colnames(ena_runInfo)=="forward_file_name"] <- paste(metadata[i,]$original_name, pairedEnd.extension[1], seq.file.extension, sep="")
        ena_runInfo[i,colnames(ena_runInfo)=="reverse_file_name"] <- paste(metadata[i,]$original_name, pairedEnd.extension[2], seq.file.extension, sep="")
      }else if(ena_runInfo[i,]$library_layout == "SINGLE"){
        ena_runInfo[i,colnames(ena_runInfo)=="forward_file_name"] <- paste(metadata[i,]$original_name, seq.file.extension, sep="")
        ena_runInfo[i,colnames(ena_runInfo)=="reverse_file_name"] <- paste(metadata[i,]$original_name, seq.file.extension, sep="")
      }
    }
  }


  # 8. put everything together for the sample metadata
  # 8.1 lines 3 to 5
  out.line03 <- paste(ena_variable, collapse='\t')
  out.line04 <- paste("#template", paste(rep("", ncol(ena_metadata)-1), collapse="\t"), sep='\t') #paste(c(as.character(ena_metadata[1,][-1]))
  out.line05 <- paste(ena_units, collapse='\t')

  # 8.2 line >5
  out.line99 <- apply(ena_metadata, 1, function(x) paste(x, collapse = "\t"))
  out.line99 <- paste(out.line99, collapse="\n")

  # 8.3 combine everything
  SampleMetadata_output <- paste(out.line01, out.line02, out.line03, out.line04, out.line05, out.line99, sep="\n")

  # 8.4 the runinfodata
  out.runInfo1 <- paste(colnames(ena_runInfo), collapse = "\t")
  out.runInfo2 <- apply(ena_runInfo, 1, function(x) paste(x, collapse = "\t"))
  out.runInfo2 <- paste(out.runInfo2, collapse="\n")
  runInfo_output <- paste(out.runInfo1, out.runInfo2, sep="\n")

  # 9. finalize
  metadataFile <- paste(dest.dir, "/", file.name, ".tsv", sep="")
  runInfoFile <- paste(dest.dir, "/", file.name, "_runInfo.tsv", sep="")

  write.table(SampleMetadata_output, file=metadataFile,
              col.names = FALSE, row.names = FALSE, quote = FALSE, fileEncoding = "UTF-8")
  message(paste("The sample metadata has been written to ", metadataFile, "\n",sep=""))

  write.table(runInfo_output, file=runInfoFile,
              col.names = FALSE, row.names = FALSE, quote = FALSE, fileEncoding = "UTF-8")
  message(paste("The technical runInfo data has been written to ", runInfoFile, "\n",sep=""))

  }
biodiversity-aq/OmicsMetaData documentation built on Dec. 19, 2021, 9:44 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
biodiversity-aq/OmicsMetaData
OmicsMetaData: an R-package for interoperable and re-usable biodiversity 'omics (meta)data

R/Format_sequenceData_ENA.R
In biodiversity-aq/OmicsMetaData: OmicsMetaData: an R-package for interoperable and re-usable biodiversity 'omics (meta)data

Defines functions prep.metadata.ENA sync.metadata.sequenceFiles renameSequenceFiles FileNames.to.Table

Documented in FileNames.to.Table prep.metadata.ENA renameSequenceFiles sync.metadata.sequenceFiles

R Package Documentation

Browse R Packages

We want your feedback!

biodiversity-aq/OmicsMetaData OmicsMetaData: an R-package for interoperable and re-usable biodiversity 'omics (meta)data

R/Format_sequenceData_ENA.R In biodiversity-aq/OmicsMetaData: OmicsMetaData: an R-package for interoperable and re-usable biodiversity 'omics (meta)data

Defines functions prep.metadata.ENA sync.metadata.sequenceFiles renameSequenceFiles FileNames.to.Table

Documented in FileNames.to.Table prep.metadata.ENA renameSequenceFiles sync.metadata.sequenceFiles

R Package Documentation

Browse R Packages

We want your feedback!

biodiversity-aq/OmicsMetaData
OmicsMetaData: an R-package for interoperable and re-usable biodiversity 'omics (meta)data

R/Format_sequenceData_ENA.R
In biodiversity-aq/OmicsMetaData: OmicsMetaData: an R-package for interoperable and re-usable biodiversity 'omics (meta)data