R/cgms_processing.R

Defines functions CGMS_processor

Documented in CGMS_processor

#' CGMS data processor
#'
#' Reads every raw CGMS Excel workbook found in a directory, tidies each
#' worksheet, stacks all worksheets of all workbooks and writes the result to
#' `CGMStidy.csv` inside that same directory.
#'
#' For every worksheet the function adds two columns: `visit`, which is the
#' worksheet name from its first capital "V" onwards (upper-cased), and
#' `subjno`, which is the three characters found 17 to 15 positions from the
#' end of the workbook path. Worksheets whose first column is not named "ID"
#' are treated as carrying an extra header: their first two rows are dropped,
#' the columns are renamed and the Excel serial date-times are converted.
#' All values are written as text.
#'
#' Only Excel workbooks are read, so the `CGMStidy.csv` produced by an earlier
#' run (or any other stray file) does not break a re-run. Empty worksheets are
#' skipped. If no data is found a header-only CSV is written.
#'
#' @param inputdir directory to process. It should hold the raw CGMS Excel
#'   workbooks; the output CSV is written here as well.
#' @return `NULL`, invisibly. Called for its side effect of writing
#'   `CGMStidy.csv` into `inputdir`.
#' @keywords processing
#' @export
#' @examples
#' \dontrun{
#' CGMS_processor("path/to/raw_cgms_workbooks")
#' }
CGMS_processor <- function(inputdir) {

  # Column names in the order the tidied sheets carry them, and the order in
  # which they are written to the CSV.
  tidy_names <- c("id", "time", "recordtype", "historic_glucose",
                  "visit", "subjno")
  output_order <- c("subjno", "visit", "recordtype", "time", "id",
                    "historic_glucose")

  # Read one worksheet and return it as an all-character data frame with
  # `tidy_names` as column names, or NULL when the worksheet is empty.
  tidy_one_sheet <- function(path, sheet, subjectno) {
    tempdf <- readxl::read_excel(path = path, sheet = sheet)

    # Nothing to tidy on a blank worksheet
    if (ncol(tempdf) == 0L) {
      return(NULL)
    }

    # Sometimes the sheet has a header which is removed here if it exists
    if (!identical(names(tempdf)[1], "ID")) {
      tempdf <- tempdf[c(-1, -2), ]
      names(tempdf) <- c("ID", "Time", "Record Type",
                         "Historic Glucose (mg/dL)")

      # The extra header makes the date column come in as Excel serial
      # numbers, which are converted back to date-times here
      tempdf$Time <- janitor::excel_numeric_to_date(
        as.numeric(tempdf$Time),
        date_system = "modern",
        include_time = TRUE
      )
    }

    tempdf$Time <- as.character(tempdf$Time)

    # Grab visit from tab name. Trims the string so it is standard
    visit_start <- stringr::str_locate(sheet, "V")[1]
    tempdf$visit <- toupper(stringr::str_sub(sheet, start = visit_start))
    tempdf$subjectno <- subjectno

    # Sheets are stacked by position, so the column count has to be exact
    if (ncol(tempdf) != length(tidy_names)) {
      stop("Sheet '", sheet, "' of '", path, "' has ", ncol(tempdf),
           " columns after tidying; expected ", length(tidy_names), ".",
           call. = FALSE)
    }

    # Everything becomes text so sheets with differing column types stack
    columns <- lapply(as.list(tempdf), as.character)
    names(columns) <- tidy_names
    as.data.frame(columns, stringsAsFactors = FALSE)
  }

  # Get all workbook paths; ignore non-Excel files (such as a CGMStidy.csv
  # left by a previous run) and Excel's "~$" lock files
  filelist <- list.files(inputdir,
                         pattern = "\\.(xlsx|xlsm|xls)$",
                         full.names = TRUE,
                         ignore.case = TRUE)
  filelist <- filelist[!startsWith(basename(filelist), "~$")]

  # Tidy every sheet of every workbook
  per_file <- lapply(filelist, function(path) {
    # Get subj id from filename (relies on a fixed-length file name ending)
    subjectno <- stringr::str_sub(path, start = -17, end = -15)
    lapply(readxl::excel_sheets(path = path), function(sheet) {
      tidy_one_sheet(path, sheet, subjectno)
    })
  })
  tidy_sheets <- Filter(Negate(is.null), unlist(per_file, recursive = FALSE))

  # Stack all sheets in one go; fall back to an empty frame with the right
  # columns when there was nothing to read
  if (length(tidy_sheets) > 0L) {
    fulllist <- do.call(rbind, tidy_sheets)
  } else {
    fulllist <- as.data.frame(
      matrix(character(0), nrow = 0L, ncol = length(tidy_names),
             dimnames = list(NULL, tidy_names)),
      stringsAsFactors = FALSE
    )
  }

  # reorder columns
  fulllist <- fulllist[, output_order, drop = FALSE]

  write.csv(fulllist,
            file.path(inputdir, "CGMStidy.csv"),
            row.names = FALSE)

  invisible(NULL)
}
hamsamilton/cgms.analysis documentation built on March 29, 2020, 12:57 a.m.