R/load_pheno.R

Defines functions load_pheno

Documented in load_pheno

#' A function for making the raw data (CSV file) with the long format into a wide format
#'
#' This function will load from CSV and then make the long format into a wide format
#'
#' @param work.book.name The raw data file (CSV format) with long format
#' @param data.dir The path to the data directory
#' @param work.dir The path to the work directory
#'
#' @return pheno.name The data with a wide format
#'
#' @examples
#' @export
#'

load_pheno <- function(work.book.name, data.dir, work.dir) {
  setwd(data.dir)
  # pheno.name <- read.csv(work.book.name, header = TRUE) ## load CSV sheet
  # new data is labeled .csv but is tab delimted
  # pheno.name <- read.table(work.book.name, sep="\t", header = TRUE) ## was giving errors
  pheno.name <- read.delim(paste0(data.dir,work.book.name), sep="\t", header = TRUE)

  pheno.name <- as.data.frame(pheno.name)
  #colnames(pheno.name) = make.names(colnames(pheno.name))


  if (work.book.name == "FLOW") {
    ### FLOW data has columns with + and - (positive and neg) these get substituted with '.' making two columns with same name.  This should be changed
    pheno.name$Name <- gsub("\\-","_neg_",pheno.name$Name)
    pheno.name$Name <- gsub("\\+","_pos_",pheno.name$Name)
    pheno.name$Name <- gsub("\\%","Percent",pheno.name$Name)
  }

  ### 	Remove Duplicates from data
  ###	This is keep the first value and remove the second value
  ### code from http://stackoverflow.com/questions/9944816/unique-on-a-dataframe-with-only-selected-columns
  pheno.name <- pheno.name[!duplicated(pheno.name[,c("OrganismKey", "Mouse.ID", "Date.of.Birth", "JR.", "Sex", "Strain.Name","Genotype","Room.origin", "GenotypeSymbol","Job.ID", "TestCode","Life.Status","Exit.Reason", "DateReceived", "DateComplete","Name")]),]

  #### can also use - I did not use it.
  #pheno.name <- pheno.name[!duplicated(pheno.name[,c(1:16)]),]

  ####	Long to Wide here
  ####	Plyr will calculate summary stats if there are repeated rows - be careful
  pheno.name <- dcast(pheno.name, OrganismKey + Mouse.ID + Date.of.Birth + JR.+ Sex + Strain.Name + Genotype + Room.origin + GenotypeSymbol + Job.ID + TestCode + Life.Status + Exit.Reason + DateReceived + DateComplete ~ Name, value.var = "Value")

  colnames(pheno.name) = make.names(colnames(pheno.name))

  pheno.name$StrainGeno <- paste(pheno.name$Strain.Name, pheno.name$GenotypeSymbol) ### creates new factor that is combination of strain name and genotype symbol - use this for plotting

  ### Remove all animals with Genotype.Symbol is +/+
  remove_list <- which(pheno.name$GenotypeSymbol == "+/+")
  print(paste("Removing ", length(remove_list)," animals that don't have +/+ Genotype.", sep = ""))
  pheno.name <- pheno.name[-remove_list,]
  rm(remove_list)

  ### Change all +/- genotypes to -/+
  change_list <- which(pheno.name$GenotypeSymbol == "+/-")
  print(paste("Changing ", length(change_list)," animals that have +/- to -/+ Genotype.", sep = ""))
  pheno.name$GenotypeSymbol[change_list] <- "-/+"
  rm(change_list)

  ### Remove all animals with Genotype.Symbol is missing (means that genotype failed or is not complete)
  remove_list = intersect(which(pheno.name$GenotypeSymbol == ""), which(pheno.name$Strain.Name != "C57BL/6NJ"))
  print(paste("Removing ", length(remove_list)," animals that don't have any Genotype data.", sep = ""))
  pheno.name <- pheno.name[-remove_list,]
  rm(remove_list)

  ### DOB gets converted to number, check this - this does not happens all systems
  #pheno.name$Date.of.Birth <- as.Date(pheno.name$Date.of.Birth , origin = "1899-12-30")


  #### ADD ES Cell info
  #### HeartWeight and CBC - add ES cell information for modeling
  ### there is a private mutation in B6NJ that causes low heart weight in controls and may effect other phenotypes.
  ### to overcome this use the center or ES cell line as a covariate.
  pheno.name$EScell <- ifelse(grepl("^C57BL/6NJ$", pheno.name$Strain.Name), "C57BL/6NJ",
                             ifelse(grepl("Vlcg", pheno.name$Strain.Name), "Vlcg",
                                    ifelse(grepl("Wtsi", pheno.name$Strain.Name), "Wtsi",
                                           ifelse(grepl("em1J", pheno.name$Strain.Name), "em1J",
                                                  ifelse(grepl("Hmgu", pheno.name$Strain.Name), "Hmgu",
                                                         ifelse(grepl("Mbp", pheno.name$Strain.Name), "Mbp",
                                                                ifelse(grepl("tm1b", pheno.name$Strain.Name), "Misc",
                                                                       ifelse(grepl("tm1", pheno.name$Strain.Name), "Misc",
                                                                              ifelse(grepl("COIN", pheno.name$Strain.Name), "Misc",
                                                                                     ifelse(grepl("Lutzy", pheno.name$Strain.Name), "Misc",
                                                                                            "PROBLEM - Vivek FIX"))))))))))

  pheno.name$EScell <- as.factor(pheno.name$EScell)

  ### remove StrainGeno < 8
  StrainGeno.Freq <- as.data.frame(table(pheno.name$StrainGeno))
  colnames(StrainGeno.Freq) <- c("StrainGeno","Counts")
  StrainGeno2rm <- as.character(StrainGeno.Freq[which(StrainGeno.Freq$Counts < 8),1])

  print(paste("Removing ", length(StrainGeno2rm)," Strains with n < 8.", sep = ""))


  fname <- gsub(work.book.name, pattern = "csv",replacement = "StrainGenoFreq.csv")
  write.csv(file=fname, StrainGeno.Freq, quote = FALSE, row.names = FALSE)

  fname <- gsub(work.book.name,pattern = "csv",replacement = "StrainGenoRemoved.csv")
  StrainGeno.Freq.rm <- StrainGeno.Freq[which(StrainGeno.Freq$Counts < 8),]
  write.csv(file=fname, StrainGeno.Freq.rm, quote = FALSE, row.names = FALSE)
  pheno.name <- pheno.name[-c(which(pheno.name$StrainGeno %in% StrainGeno2rm)),]

  setwd(work.dir)
  return(pheno.name)
}
dleelab/KompUtils documentation built on May 13, 2017, 3:31 a.m.