R/readData.R

Defines functions unionDNASets getSequencesByRegions getSequenceByRegion readNameToRegions readGenome

Documented in getSequenceByRegion getSequencesByRegions readGenome readNameToRegions unionDNASets

#' Read genome data from user
#'
#' A function that read genome sequence .fasta file from user by inputting
#' directories
#'
#' @param fastaFile A string indicating the path of .fasta file
#' @param nameToRegionsFile A string indicating the path of name to region .csv file
#'
#' @return Returns a DNAStringSet with sequences from fastaFile are add.
#'
#' @examples
#' # Example 1
#' # Load MN985325.1.fasta and MT066156.1.fasta file in inst/extdata
#' fastaPath1 <- system.file("extdata", "MN985325.1.fasta",
#'   package="Cov2Comparator")
#' fastaPath2 <- system.file("extdata", "MT066156.1.fasta",
#'   package="Cov2Comparator")
#' nameToRegionsFile <- system.file("extdata", "nameToCountry.txt",
#'   package="Cov2Comparator")
#' fasta1 <- readGenome(fastaPath1, nameToRegionsFile)
#'
#' @references
#'H. Pagès, P. Aboyoun, R. Gentleman and S. DebRoy (2020). Biostrings:
#'Efficient manipulation of biological strings.
#'R package version 2.58.0. https://bioconductor.org/packages/Biostrings
#'
#' @export
#' @importFrom Biostrings readDNAStringSet
#' @importFrom Biostrings DNAStringSet
readGenome <- function(fastaFile, nameToRegionsFile = NULL) {
  # check input types and if file exits
  if (class(fastaFile) != "character") {
    stop("Fasta file path should be a string")
  }
  if (! file.exists(fastaFile)) {
    stop("fastaFile not exist")
  }
  userSequence <- Biostrings::readDNAStringSet(fastaFile)
  if (is.null(nameToRegionsFile)) {
    return(userSequence)
  }
  if (! file.exists(nameToRegionsFile)) {
    stop("nameToRegionsFile not exist")
  }
  # read nameToRegionsFile
  nameToRegions <- readNameToRegions(nameToRegionsFile = nameToRegionsFile)
  if (length(userSequence) == 0) {
    stop("Fasta File contains no sequence")
  }
  newNames <- vector(mode='character', length=length(userSequence))
  for (i in seq_along(userSequence)) {
    # assign name as "accessionID region"
    id <- strsplit(names(userSequence)[i], " ")[[1]][1]
    region <- nameToRegions[nameToRegions['Name'] == id][2]
    newNames[i] <- paste(id, region, sep = " ")
  }
  # assign names (ids) to genomes
  names(userSequence) <- newNames
  return(Biostrings::DNAStringSet(userSequence))
}
#' Read a name to region dataframe. A helper function for readGenome
#'
#' A function that read name to region dataframe from user by inputting
#' directories.
#'
#' @param nameToRegionsFile A string indicating the path of name to region .csv file
#'
#' @return Returns a DNAStringSet with sequences from fastaFile are add.
#'
#' @references
#'H. Pagès, P. Aboyoun, R. Gentleman and S. DebRoy (2020). Biostrings:
#'Efficient manipulation of biological strings.
#'R package version 2.58.0. https://bioconductor.org/packages/Biostrings
#'
#' @importFrom utils read.csv
#' @keywords internal

readNameToRegions <- function(nameToRegionsFile) {
  # check input
  if (! is.character(nameToRegionsFile)) {
    stop("nameToRegionsFile path should be a string")
  }
  nameToRegions <- utils::read.csv(nameToRegionsFile, header = FALSE)
  names(nameToRegions) <- c("Name", "Region")
  return(nameToRegions)
}


################################################################################


#' Read genome sequence data from user by inputing region
#'
#' A function that takes region as input. It will look up the accessionID of
#' SARS-COV2 of this region. Then it will retrieve the sequence from NCBI
#' database online and return as a DNAStringSet object.
#'
#' @param region A string indicating the interested region in which SARS-COV2
#' are located that you want to study
#'
#' @return Returns a DNAStringSet with sequences.
#'
#' @examples
#' # Example 1
#' # Load MT066156.1 (accessionID of SARS-COV2 in Italy) by
#' # using "Italy" as input
#' region <- "Italy"
#' italySequence <- getSequenceByRegion(region)
#'
#' @references
#'H. Pagès, P. Aboyoun, R. Gentleman and S. DebRoy (2020). Biostrings:
#'Efficient manipulation of biological strings.
#'R package version 2.58.0. https://bioconductor.org/packages/Biostrings
#'
#'Paradis E. & Schliep K. (2019). ape 5.0: an environment for modern
#'phylogenetics and evolutionary
#'analyses in R. Bioinformatics 35: 526-528.
#'
#' @export
#' @importFrom Biostrings DNAStringSet
#' @importFrom ape read.GenBank


getSequenceByRegion <- function(region) {
  # check input
  if (! is.character(region)) {
    stop("Please input a character as argument")
  }
  accessionIDToRegion <- get0("accessionIDToRegion",
                              envir = asNamespace("Cov2Comparator"))
  # accessionIDToRegion <- Cov2Comparator:::accessionIDToRegion
  # check if region is valid
  if (! is.element(region, accessionIDToRegion$Region)) {
    stop("Sorry, currently data do not contain your interested region yet. \n
         Please input regions from following: (Canada, Italy, Wuhan, USA,
         Kenya, Bahrain, Germany, Pakistan, Britain,
        Thailand)")
  }
  accessionId <- accessionIDToRegion[accessionIDToRegion['Region'] == region][1]
  # Use ape package to retrieve sequence online
  dnabin <- ape::read.GenBank(accessionId, as.character = TRUE)
  sequenceString <- paste(dnabin[[1]], collapse = "")
  # modify type
  aastringSet <- Biostrings::DNAStringSet(sequenceString)
  names(aastringSet) <- paste(accessionId, region, sep = " ")
  return(Biostrings::DNAStringSet(aastringSet))
}


################################################################################


#' Read multiple genome sequence data from user by inputting a vector of regions
#'
#' A function that takes regions as input. It will look up the accessionIDs of
#' SARS-COV2 of this region. Then it will retrieve the sequences from NCBI
#' database online and return as a DNAStringSet object.
#'
#' @param regions A vector of strings indicating the interested regions in
#' which SARS-COV2 are located that you want to study
#'
#' @return Returns a DNAStringSet with sequences.
#'
#' @examples
#' # Example 1
#' # Load MT066156.1 (accessionID of SARS-COV2 in Italy) by
#' # using "Italy" as input
#' regions <- c("Italy", "Canada", "USA")
#' Sequence <- getSequencesByRegions(regions)
#'
#' @references
#'H. Pagès, P. Aboyoun, R. Gentleman and S. DebRoy (2020). Biostrings:
#'Efficient manipulation of biological strings.
#'R package version 2.58.0. https://bioconductor.org/packages/Biostrings
#'
#'Paradis E. & Schliep K. 2019. ape 5.0: an environment for modern
#'phylogenetics and evolutionary
#'analyses in R. Bioinformatics 35: 526-528.
#'
#' @export
#' @importFrom Biostrings DNAStringSet
getSequencesByRegions <- function(regions) {
  # check input
  if (length(regions) < 1) {
    stop("Please input a vector containing at least one element")
  }
  if (is.element(FALSE, is.element(regions, accessionIDToRegion$Region))) {
    stop("Your input contain region has not been supported yet")
  }
  # get sequence of first region first
  newNames <- vector(mode='character', length=length(regions))
  sequences <- getSequenceByRegion(regions[1])
  newNames[1] <- names(sequences)[1]
  if (length(regions) == 1) {
    return(Biostrings::DNAStringSet(sequences))
  }
  # get sequence for rest of regions
  regions <- regions[-1]
  for (i in seq_along(regions)) {
    iseq <- getSequenceByRegion(regions[i])
    newNames[i + 1] <- names(iseq)[1]
    sequences = union(sequences, iseq)
  }
  names(sequences) <- newNames
  return(Biostrings::DNAStringSet(sequences))
}


################################################################################


#' Union two DNAStringSet
#'
#' A function that takes two DNAStringSet and union them into one.
#'
#' @param set1 A DNAStringSet contains DNA sequence
#' @param set2 A DNAStringSet contains DNA sequence
#'
#' @return Returns a DNAStringSet with sequences contained in set1 and set2.
#'
#' @examples
#' # Example 1
#' library(Biostrings)
#' set1 <- Biostrings::DNAStringSet("ATCGATCG")
#' set2 <- Biostrings::DNAStringSet("ATTTTTTT")
#' set3 <- Biostrings::DNAStringSet("ATCGATTT")
#' setTotal <- unionDNASets(set1, set2)
#'
#' @references
#'H. Pagès, P. Aboyoun, R. Gentleman and S. DebRoy (2020). Biostrings:
#'Efficient manipulation of biological strings.
#'R package version 2.58.0. https://bioconductor.org/packages/Biostrings
#'
#'Paradis E. & Schliep K. 2019. ape 5.0: an environment for modern
#'phylogenetics and evolutionary
#'analyses in R. Bioinformatics 35: 526-528.
#'
#' @export
#' @importFrom Biostrings DNAStringSet
#' @importFrom Biostrings union

unionDNASets <- function(set1, set2) {
  if (is.null(set1)) {
    return(set2)
  }
  if (is.null(set2)) {
    return(set1)
  }
  return(Biostrings::DNAStringSet(Biostrings::union(set1, set2)))
}

# [END]
quick2063706271/Cov2Comparator documentation built on Dec. 22, 2021, 10:59 a.m.