R/merge_vipr_ncbi.R

Defines functions format_vipr_date

Documented in format_vipr_date

#! /usr/bin/env Rscript

library(tidyverse)
library(magrittr)

# === Functions

#' Format ViPR dates (MM/DD/YYYY) into some form of YYYY-MM-DD, with unknown set to XX
#' @param vc Character vector containing the vipr date such as "01/30/2022" or "01/2022"
#' @param delim Delimiter of date values, which is "/" by default but could also be "-"
#' @return The formatted date in "YYYY-MM-DD" format
#' @export
format_vipr_date <- function(vc, delim="/"){
  # If vc is NA, return early
  if (is.na(vc)){
    return("XXXX-XX-XX")
  }
  if ( vc == "-N/A-") {
    return("XXXX-XX-XX")
  }
  # If vc is delimited
  if (grepl(delim, vc)) {
    vc_temp <- vc %>%
      stringr::str_split(., delim, simplify = T) %>%
      as.vector(.)
    # MM/DD/YYYY
    if (length(vc_temp)==3){
      vc <- vc_temp %>% 
        { c(.[3], .[1], .[2]) } %>%
        paste(., collapse = "-", sep = "")
    }
    # MM/YYYY
    if (length(vc_temp)==2){
      vc <- vc_temp %>% 
        { c(.[2], .[1] , "XX") } %>%
        paste(., collapse = "-", sep = "")
    }
    
    return(vc)
  }
  # YYYY
  if (str_length(vc)==4){
    return(paste(vc, "-XX-XX", sep=""))
  }
  # If not recognized, return original date string
  return(vc)
}

#' Format NCBI dates (YYYY-MM-DD) into some form of YYYY-MM-DD, with unknown set to XX
#' @param vc Character vector containing the ncbi date such as "2022-01-30" or "2022-01"
#' @return The formatted date in "YYYY-MM-DD" format
#' @export
format_ncbi_date <- function(vc){
  # If NA, return early
  if(is.na(vc)){
    return(vc)
  }
  # If only YYYY, add unknown month day
  if(str_length(vc)==4){
    return(paste(vc, "-XX-XX", sep=""))
  }
  # If only YYYY-MM, add unknown day
  if(str_length(vc)==7){
    return(paste(vc, "-XX", sep=""))
  }
  # If not recognized, return original date string
  return(vc)
}

# # TODO: generalize this, read in dictionary haha
# guess_genotype <- function(strain){
#   if(grepl("RSVA", strain) | grepl("RSV_A", strain) | grepl("RSV-A", strain) | grepl("/A/", strain)){
#     return("A")
#   }
#   if(grepl("RSVB", strain) | grepl("RSV_B", strain) | grepl("RSV-B", strain) | grepl("/B/", strain)){
#     return("B")
#   }
#   return("")
# }


#' Read in delimited data
#' Drop any empty columns
#' Remove spaces from column names
#' @param filename Character vector containing path to file
#' @param delim Delimiter of file, tab by default
#' @param type Basic is a basic delimited file, but special handling for ncbi or vipr
#' @return The cleaned dataframe
#' @export
read_delim_file <- function(filename, delim="\t", type="basic", col_names=TRUE) {
  df <- readr::read_delim(
    filename, 
    delim=delim,
    col_names=col_names,
    col_types = cols(.default = "c")                  # convert to characters
  ) %>%
    discard(~all(is.na(.) | . == "" | . == "-N/A-"| . == "?"))  # drop empty cols
  
  isNAtable = df == "-N/A-"
  df[isNAtable] = NA
  isNAtable = df == "?"
  df[isNAtable] = NA
  
  # Format Column names
  names(df) = names(df) %>% 
    tolower(.) %>% 
    gsub(" ", "_", .)
  
  # VIPR specific
  if(type == "vipr") {
    df = df %>%
      group_by(genbank_accession) %>%
      mutate(
        collection_date = collection_date %>% format_vipr_date(vc=., delim="/"),
        genbank=genbank_accession,
        genotype_vipr=`subtype/genotype_(vipr)`
      ) %>%
      ungroup(.) %>%
      select(-genbank_accession, -`subtype/genotype_(vipr)`, -virus_type, -mol_type) %>%
      select(genbank, collection_date, strain_name, country, everything())
  }
  
  # Fauna specific
  if(type == "fauna") {
    df = df %>%
      mutate(
        genbank=accession,
        accession=NULL,
        virus=NULL,
        db=NULL,
        segment=NULL,
        collection_date=date,
        strain_name=strain,
        date=NULL,
        strain=NULL
        )
  }
  
  if(type == "ncbi") {
    df = df %>%
      group_by(accession) %>%
      mutate(
        collection_date = collection_date %>% format_ncbi_date(vc=.),
        genbank=accession,
        species=NULL,
        genus=NULL,
        family=NULL,
        molecule_type=NULL,
        sequence_type=NULL,
        strain_name=case_when( grepl("strain", genbank_title) ~ gsub(".*strain ","",genbank_title) %>% 
                                                                gsub(", complete genome","",.) %>%
                                                                gsub(", partial genome","",.)),
        genotype=case_when(!is.na(genotype) ~ genotype,
                           grepl("genotype", genbank_title) ~ gsub(".*genotype ","",genbank_title) %>% 
                             gsub(" .*","",.))
      ) %>%
      ungroup(.) %>%
      select(-accession) %>%
      select(genbank, collection_date, country, everything())
  }
  
  return(df)
}


#' Helps harmonize or expand column names, preparing for a summarize merge later
#' @param data Data frame
#' @param cname Vector of strings of column names that need to exist in data frame
#' @return New data frame containing a merge of original columns and cname
#' @export
fncols <- function(data, cname) {
  add <- cname[!cname %in% names(data)]
  if (length(add) != 0) data[add] <- NA
  data
}

#' Merge unique items in column by group, probably used in a summarize function
#' @param vc Vector of values for a particular column
#' @param delim Delimiter between unique values, will split original string into delims, and smash them together again
#' @return New string of unique values, delimited by delim
#' @export
uniqMerge <- function(vc, delim = ",") {
  # Drop NA, drop empty strings, combine together
  vc <- vc %>%
    na.omit(.) %>%
    stringi::stri_remove_empty_na(.) %>%
    unique(.) %>%
    paste(., collapse = delim, sep = "")
  if (grepl(delim, vc)) {
    vc <- vc %>%
      stringr::str_split(., delim, simplify = T) %>%
      as.vector(.) %>%
      unique(.) %>%
      paste(., collapse = delim, sep = "")
  }
  return(vc)
}

#' Merge two dataframes
#' @param one_df First data frame to merge, col names will be listed first.
#' @param two_df Second data frame to merge, col names will be harmonized with first. 
#' @return Merged data frame
#' @export
merge_two <- function(one_df, two_df) {
  one = fncols(one_df, names(two_df))
  two = fncols(two_df, names(one_df)) %>%
    select(names(one))
  all = rbind(one, two) %>%
    group_by(genbank) %>%
    summarize_at(., vars(-group_cols()), uniqMerge) %>%
    ungroup(.)
  return(all)
}

# # === Clean Data
# readr::write_delim(keep_df, "vipr_ncbi.tsv", delim="\t")
# writexl::write_xlsx(keep_df, "vipr_ncbi.xlsx")
                 
j23414/jenplots documentation built on Aug. 28, 2022, 6:32 p.m.