R/data_input_fcn.R

Defines functions randID input_data input_table input_dna dna_detectFileType input_vcf prepGenomics input_fasta input_spatial join_spatial_data input_phyloTree input_image annotate_image input_rdata

Documented in input_data

#Function that creates a unique ID for each data item
#code from: https://stackoverflow.com/questions/42734547/generating-random-strings
randID <- function(n = 1) {
  a <- do.call(paste0, replicate(5, sample(LETTERS, n, TRUE), FALSE))
  paste0(a, sprintf("%04d", sample(9999, n, TRUE)), sample(LETTERS, n, TRUE))
}


#' Input Data
#'
#' Generic input function for inputing data, helper functions are in the back
#'
#' @param file
#' @param dataType
#' @param asObj
#' @param desc
#' @param ...
#'
#' @import dplyr
#' @export
#' @return
input_data<-function(file  = NA, dataType = NA, asObj=TRUE,desc = NA,...){
  #Only supports specific data types
  #TODO: data for sequence alignment
  if(!(dataType %in% c("tree","table","dna","spatial","image")))
    stop("Data type is not supported. Please choose one of : tree, table, dna, spatial, image. Use ?input_data to learn more about the different input types.")

  #print(as.list(match.call())) #for testing only

  #make a unique ID for each object
  dataID<-randID()

  switch(dataType,
         "table" = input_table(file,asObj,dataID,desc,...),
         "tree" = input_phyloTree(file,asObj,dataID,desc,...),
         "dna" = input_dna(file,asObj,dataID,desc,...),
         "spatial" = input_spatial(file,asObj,dataID,desc,proj4String,...),
         "image" = input_image(file,asObj,dataID,desc,...),
         "rdata" = input_rdata(file, asObj, dataID,decs,...))
}

#***************************************************************
# INPUT TABLE DATA
#***************************************************************
input_table<-function(file=NA,asObj=TRUE,dataID=NA,desc=NA,stringsAsFactors = FALSE,...){
  #autodetect file type
  if(stringr::str_detect(file,"xls$|xlsx$")){
    dat<-readxl::read_excel(path=file,...)
  }else if(stringr::str_detect(file,"csv$")){
    dat<-read.csv(file=file,stringsAsFactors = stringsAsFactors,header=TRUE,...)
  }else if(stringr::str_detect(file,"tsv$")){
    dat<-read.table(file=file,stringsAsFactors = stringsAsFactors,header=TRUE,sep="\t",...)
  }else{
    stop("GEViT currently only supports xls,xlsx,csv,and tsv table input formats.")
  }

  #little bit of column name normalization
  colnames(dat)<-tolower(colnames(dat))

  if(asObj){
    objDat<-new("gevitDataObj",
                id  = paste("table",dataID,sep="_"),
                type = "table",
                source = file,
                data = list(table = dat))
    return(objDat)
  }else{
    return(dat)
  }
}
#***************************************************************
# INPUT DNA DATA
#***************************************************************
input_dna<-function(file=NA,asObj=TRUE,dataID=NA,desc=NA,fileType=NA,...){
  #right now, user must specify a file, and not a directory
  if(dir.exists(file)){
    #actions for a directory
    fileList<-list.files(file,full.names=TRUE)

    #identify file formats
    fileType<-sapply(fileList,function(x){
      dna_detectFileType(x)
    })

    if(sum(is.na(fileType)) == length(fileType)){
      stop("There are no vcf or fasta files to upload in this directory.")
    }

    #quick check on file num and size - warn user if too many files or too large
    fileInfo<-as.data.frame(t(sapply(fileList,file.info)),stringsAsFactors = FALSE)

    #warn if it looks like all files will take up 4 gigs of RAM or more
    totalFileSizeEst<-sum(as.numeric(fileInfo$size))/(1024^3) #convert bytes to gigs
    scalingFactor <- 2 #seems reasonable, but sometimes it's as much as 10!

    if(totalFileSizeEst*scalingFactor>4){
      warning("Current input files likely to exceed 4 gigs of RAM.")
    }

    #now call yo-self!
    #put all individual files into a single DNA bin object
    outInfo<-apply(cbind(fileList,fileType),1,function(x){
      input_dna(file=x[1],fileType=x[2],asObj=FALSE)
    }) %>%
      data.table::rbindlist() %>%
      toDNABIN()

    objDat<-new("gevitDataObj",
                id  = paste("dna",dataID,sep="_"),
                type = "dna",
                source = fileList,
                data = list(dnaBin=outInfo))

    return(objDat)
  }else{
    #actions for a single file
    if(is.na(fileType)){
      fileType<-dna_detectFileType(file)
    }

    if(!(fileType %in% c("vcf","fasta")))
      stop("Only VCF and FASTA files are supported at this time")

    #run approperiate input function given the file type
    output<-switch(fileType,
                   "vcf" = input_vcf(file,format=NA,...),
                   "fasta" = input_fasta(file,format=NA,...))


    if(asObj){
      objDat<-new("gevitDataObj",
                  id  = paste("dna",dataID,sep="_"),
                  type = "dna",
                  source = file,
                  data = list(dnaBin=output))
    }else{
      return(output)
    }
  }
}

#Helper function to detect file types for DNA input
dna_detectFileType<-function(file){
  #autodetect file type
  if(stringr::str_detect(file,".vcf$|.vcf.gz$")){
    return("vcf")
  }else if(stringr::str_detect(file,".fasta$|.fasta.gz$|.fa|.fa.gz$")){
    return("fasta")
  }else{
    return(NA)
  }
}

#***********************************************
# input_vcf
# This function is very tailored to epiDRIVE, so it won't work
# well generically. But what it does is assumes a concatenated vcf
# file instead of directory. ShinyFiles does allow directory files
# to be loaded (yay!) however, the problem automatically
# add the UI and the server observer function, which is just
# tricky to get right. Using the general shiny fileINPUT
# it becomes necessary to do this step
#
input_vcf<-function(file=NA,asObj=TRUE,filter=NA,...){
  #Currently will not work with header, assume multiple files stuck together
  #Only really works where all the files at concatenated together
  if(stringr::str_detect(file,".vcf.gz$")){
    tmp<-data.table::fread(paste0('zcat < ',file))
  }else{
    tmp<-data.table::fread(file)
  }
  #for now, just take the first five columns
  tmp<-tmp[,1:5]
  colnames(tmp)<-c("SAMPID","CHROM","POS","REF","ALT")

  #rename the sampleID
  splitChar<-ifelse(stringr::str_detect(file,".vcf.gz"),"\\.vcf\\.gz","\\.vcf")
  tmp<-tmp %>%
    group_by(SAMPID) %>%
    mutate(SAMPMOD=unlist(str_split(SAMPID,splitChar))[[1]]) %>%
    mutate(CHROMPOS = paste(CHROM,POS,sep=":")) %>%
    ungroup()

  #reference sequence for each position
  refSeq<-dplyr::select(tmp,CHROMPOS,REF) %>% distinct()

  #convert DNA object
  seqList<-list()
  for(sampVal in basename(unique(tmp$SAMPMOD))){
    subSamp<-dplyr::filter(tmp,SAMPMOD == sampVal)
    idx<-match(subSamp$CHROMPOS,refSeq$CHROMPOS)
    subSeq<-refSeq
    subSeq[idx,"REF"]<-subSamp$ALT
    seqList[[sampVal]]<-subSeq$REF
  }

  remove(refSeq,tmp)
  gc()

  return(as.DNAbin(seqList))
}

#helper function for VCF that preps a directory for loading
#quite a few things to add to make it work for fasta and gz files
#currently only works on linux files
prepGenomics<-function(fileDIR=NULL,filetype = NULL){
  #currently only works for VCFS, need to extend to other file types
  if(dir.exists(fileDIR)){
    if(file.exists(sprintf("%s/epiDRIVE_allTogether.vcf",fileDIR))){
      file.remove(sprintf("%s/epiDRIVE_allTogether.vcf",fileDIR))
    }
    system(sprintf("awk '{print FILENAME,$0}' %s/*.vcf | grep -v '#' | cat > %s/epiDRIVE_allTogether.vcf",fileDIR,fileDIR))
  }
}

#***********************************************
# Input fasta files that are zipped or otherwise
#
input_fasta<-function(file=NA,...){
  if(stringr::str_detect(file,".fasta.gz$")){
    return(ape::read.FASTA(file = gzfile(file), type="DNA"))
  }else{
    return(ape::read.FASTA(file = file, type="DNA"))
  }
}

#***************************************************************
# INPUT SHAPE FILE DATA
#***************************************************************
input_spatial<-function(file=NA,asObj=TRUE,dataID=NA,desc=NA,...){

  if(!stringr::str_detect(file,"shp$")){
    stop("Currently, the input file only loads shapefiles. Is your map an image file? Please choose set dataType to 'image' in order to load it properly")
  }

  #check in the directory to make sure supporting files are there
  #check dbf or csv file for metadata
  nc <- sf::st_read(file, quiet = TRUE)

  #create an object
  if(asObj){
    sfc_meta<-NA
    sfc_geodat<-nc

    #if there is any metadata, split it out into a data frame
    if(ncol(nc)>1){
      sfc_idx<-sapply(sapply(nc,class),function(x){"sfc" %in% x})
      sfc_geodat<-nc[,which(sfc_idx)]

      nc<-as.data.frame(nc)
      sfc_meta<-nc[,which(!sfc_idx)]

      #make a unique identifier that links these together
      #they all should be in order.
      itemID<-paste("polyID",randID(nrow(sfc_geodat)),sep = "_")
      sfc_geodat$minPolyID<-itemID
      sfc_meta$minPolyID<-itemID
    }

    objDat<-new("gevitDataObj",
                id  = paste("spatial",dataID,sep="_"),
                type = "spatial",
                source = file,
                data = list(geometry=sfc_geodat,
                            metadata = sfc_meta))
    return(objDat)
  }else{
    return(nc)
  }
}

#Helper function to join spatial file

join_spatial_data<-function(...,obj_names = NULL){

  spatial_obj<-list(...)

  #spatial object variables
  dataID<-paste("spatial",randID(),sep="_")

  geo_data<-NULL
  geo_metadata<-NULL


  for(idx in 1:length(spatial_obj)){

    obj<-spatial_obj[[idx]]

    if(obj@type !="spatial") next

    geo_tmp<-obj@data$geometry
    geo_meta_tmp<-if(!is.null(obj@data$metadata)) obj@data$metadata else NULL

    item_id<-if(is.null(obj_names)) obj@id else obj_names[idx]

    ##DEV NOTE: Assumes shape file only has geometry column
    # When read in as a shape file
    #adding to the geometry item
    geo_tmp_col<-colnames(geo_tmp)

    geo_tmp$minID<-rep(item_id,times = nrow(geo_tmp))

    if(is.null(geo_data)){
      geo_data<-geo_tmp
    }else{
      #normall, I could prefer to use st_join, but it doesn't support
      #full joins so I'll just do this manuall
      geo_data<-rbind(geo_tmp,geo_data)
    }

    #adding to metadata item
    if(is.null(geo_meta_tmp) | all(is.na(geo_meta_tmp))) next

    geo_meta_tmp$minID<-rep(item_id,times = nrow(geo_tmp))

    if(is.null(geo_metadata)){
      geo_metadata<-geo_meta_tmp
    }else{
      geo_metadata<-dplyr::full_join(geo_metadata,geo_meta_tmp)
    }
    #geo_meta_col<-colnames(geo_meta_tmp)
    #geo_meta_tmp<-cbind(rep(item_id,times = nrow(geo_tmp)),geo_meta_tmp)
    #colnames(geo_meta_tmp)<-c("minID",geo_tmp_col)

    #geo_metadata<-rbind(geo_metadata,geo_meta_tmp)
  }

  source_info<-sapply(spatial_obj,function(x){x@id})
  objDat<-new("gevitDataObj",
              id  = dataID,
              type = "spatial",
              source = paste(source_info,collapse =";"),
              data = list(geometry=geo_data,
                          metadata = geo_metadata))
  return(objDat)
}


#***************************************************************
# INPUT PHYLOGENETIC TREE DATA
#***************************************************************
input_phyloTree<-function(file=NA,asObj=TRUE,dataID=NA,desc=NA,sepLabel = NA,metadataFile=NULL,...){
  #Make sure that tree has the right format to load
  if(!stringr::str_detect(file,"tree$|nwk$|tre$|newick$|nexus$")){
    stop("Phylogenetic tree file cannot be loaded. Please ensure that your tree has a .tree, .tre, .nwk, or .nexus format.")
  }

  #Try to load the tree
  tree<-treeio::read.tree(file=file)
  #not sure why the tryCatch below is not working
  #tree<-tryCatch(treeio::read.tree(file=file,...),
  #               error = function(e) stop("Could not load tree."))


  #Create a separate data frame the contains the tipData
  #If there is a bunch of information stored in the tip data
  #use the sep-label command to split it into a table
  if(!is.na(sepLabel)){
    #if it is a special character add escape
    if(grepl('[[:punct:]]', sepLabel))
      sepLabel<-paste0("\\",sepLabel)

    #Think more about how you want to handle the error messages
    tipDat<-tryCatch(do.call(rbind,strsplit(tree$tip.label,sepLabel)),
                     error = function(e) return(NULL))
  }else{
    tipDat<-tree$tip.label
  }

  #if the user added metadata load too
  if(!is.null(metadataFile)){
    metadata<-input_table(file = metadataFile,asObj=FALSE)

    #quick scan for column that matches node labels to make it easier to link data later
    tipLab<-tree$tip.label
    containsLabs<-apply(metadata,2,function(x){sum(tipLab %in% x)})
    if(max(containsLabs) == 0){
      warning("Current metadata file does not contain column matching tree labels")
      linkVar <- NULL
    }else{
      #Best guess of which columns contain the label information
      linkVar<-names(which(containsLabs == max(containsLabs)))
    }
  }
  #return the tree and parsed tip data (if that's what the user wants)
  #Is there more I should get out of a tree?

  if(asObj){
    objDat<-new("gevitDataObj",
                id  = paste("phyloTree",dataID,sep="_"),
                type = "phyloTree",
                source = file,
                data = list(tree=tree)
    )

    if(!is.null(tipDat))
      objDat@data$tipData<-tipDat

    if(!is.null(metadataFile)){
      objDat@source<-c(objDat@source,metadataFile)
      objDat@data$metadata<-metadata

      objDat@data$linkVar<-linkVar
    }


    return(objDat)
  }else{
    return(list(tree=tree,nodeDat = tipDat,metadata = metadata))
  }
}

#***************************************************************
# INPUT IMAGE DATA
#***************************************************************
input_image<-function(file=NA,asObj=TRUE,dataID=NA,desc=NA,...){
  img<-magick::image_read(path=file)

  #all images get resized so that they are mangeable to work with
  #arbitrarily set this 1000 pixels for max height width. Note that
  #rescaling will preserve the aspect ratio
  #smaller size also lets it load faster
  img<-magick::image_resize(img, "1000x1000")

  #get details for later
  imgDetails<-magick::image_info(img)

    #just return the image
  warning("To use this image, please be sure to have separate file that links the image to data in pixel space. If you would like to CREATE an annotation file, run the 'annotate_image' command.")

  if(asObj){
    objDat<-new("gevitDataObj",
                id  = paste("image",dataID,sep="_"),
                type = "image",
                source = file,
                data = list(data=img,imgDetails = imgDetails,metadata = NULL)
    )

    return(objDat)
  }else{
    return(list(data=img,imgDetails = imgDetails,metadata=NULL))
  }
}

#helper method to annotate FEATURES within an image
#'@export
annotate_image<-function(img = NULL,imgDetails=NULL,outfile = NULL,overwrite_meta=FALSE){
  # If user does not provide a file name, make one up
  if(is.null(outfile)){
    outfile="annotated_image_file.csv"
  }

  if(class(img) == "gevitDataObj"){
    annote_dat<-runApp(annotate_app(img@data$data,img@data$imgDetails))
    #
  }else{
    annote_dat<-shiny::runApp(annotate_image_app(img,imgDetails))
  }
  #cleaning up the annotatation data
  annote_dat<-data.frame(elemID =annote_dat[,1],
                         x = as.numeric(annote_dat[,2]),
                         y = as.numeric(annote_dat[,3]),
                         xmax = as.numeric(annote_dat[,4]),
                         ymax = as.numeric(annote_dat[,5]),
                         element_name = annote_dat[,6],
                         type = annote_dat[,7],
                         stringsAsFactors = FALSE)

  if(class(img) == "gevitDataObj"){
    if(!is.null(img@data$metadata) & !overwrite_meta){
      # ------- TO DO: IF IT CAN'T RBIND ..then ... DIE GRACEFULLY # -------
      #if there's existing metadata, add to it, don't overwrite it
      annote_dat<-rbind(img@data$metadata,annote_dat)
    }

    img@data$metadata<-annote_dat
    return(img)
  }else{
    return(annote_dat)
  }
}

#***************************************************************
# INPUT R DATA
# Essentially, input any data that is outputted by R.
# There are specific object files that are loaded
#***************************************************************
input_rdata<-function(file=NA,asObj=TRUE,dataID=NA,desc=NA,...){
  #Load data from R
  if(stringr::str_detect(file,"rdata$ | rda")){
    dat<-try(load(file))
  }else if(stringr::str_detect(file,"rds$")){
    dat<-try(readRDS(file))
  }else{
    stop("Right now, epiDRIVE can only read data that has been saved by the save() function (file extension rdata, or rda) or by the saveRDS() function (file extension, rds)")
  }

  objName<-dat

  #check how many entitities are being loaded and create gevitR objects for use
  #use get and assign to read and set new objec
  for(i in 1:length(dat)){
    tmp<-get(dat[[i]])
    tmpClass<-class(tmp)

    #give it a unique dataID
    #support specific types - not everything, otherwise that's a nightmare
    tmpClass<-switch(tmpClass,
                     "phylo" = "tree",
                     "data.frame" = "table",
                     "igraph" = "nodeLink",
                     "hclust" = "tree",
                     NULL)

    if(is.null(tmpClass)){
      warning(sprintf("'%s' not loaded - the '%s' data type is currently not supported by epiDRIVE",class(tmp),objName[i]))
    }


    #converting objects into the gevitObject so that it can be used
    #with the rest of the analysis

    tmp<-new("gevitDataObj",
             id  = paste(tmpClass,dataID,sep="_"),
             type = tmpClass,
             source = file,
             data = list(data = tmp)
    )

    #reassign the environment variable to the gevit object
    #does not return anything
    assign(objName[i],tmp)
  }
}
sfisher4/gevitR documentation built on Feb. 10, 2020, 6:29 p.m.