R/mhealth.convert.R

Defines functions mhealth.convert

#' @name mhealth.convert
#' @title Convert dataframe to be compatible with mhealth specification
#' @description Convert dataframe to mhealth specification. The column order of converted dataframe would be required_cols, group_cols. Columns that are not specified in the input arguments will be dropped.
#' @param file_type mhealth specification file types, see `mhealth$filetype` for all supported types
#' @param required_cols the required columns of file types that are in order. E.g. for annotation, timestamp, start and end time and annotation name columns are required in the order; for sensor, timestamp and necessary numeric columns are required; for feature, timestamp, start and end time and numeric or categoric features are required; for event, timestamp, start and end time and numeric or categoric info columns are required.
#' @param rename_cols rename column names in results as provided string vector (except for the first timestamp column)
#' @param group_cols the columns used for indexing or grouping. Will be converted to numeric or if not, character. Default is null, which means no group columns.
#' @param datetime_format in string. Used to convert tiemstamp and start and stop time columns to date object
#' @param timezone in location string. Used to convert timestamp and start and stop time columns to date object
#' @export
#' @import stringr lubridate

mhealth.convert = function(df,
                           file_type,
                           required_cols,
                           rename_cols = NULL,
                           group_cols = NULL,
                           datetime_format = NULL,
                           timezone = "UTC"
                           ) {
  options(digits.secs=3)
  if(missing(file_type) || missing(required_cols)){
    message(sprintf(
      "\n
      File type or required columns are missing"
    ))
    return(NULL)
  }

  # validate required_cols format
  required_cols = .convert.column_input(df, required_cols)

  # validate required_cols are compatible with file_type
  if(is.null(required_cols)) return(NULL)
  if(length(required_cols) < 2 && file_type %in% c(mhealth$filetype$sensor, mhealth$filetype$event)){
    message(sprintf(
      "\n
      There should be at least two required columns for %s file",
      file_type
    ))
    return(NULL)
  }else if(length(required_cols) < 4 && file_type %in% c(mhealth$filetype$annotation, mhealth$filetype$feature)){
    message(sprintf(
      "\n
      There should be at least four required columns for %s file",
      file_type
    ))
    return(NULL)
  }


  # convert group columns
  group_cols = setdiff(group_cols, required_cols)
  group_cols = .convert.column_input(df, group_cols)

  # reorder columns
  if(!is.null(group_cols)){
    df = df[c(required_cols, group_cols)]
    required_cols = 1:length(required_cols)
    group_cols = (length(required_cols) + 1):(length(required_cols) + length(group_cols))
  }else{
    df = df[c(required_cols)]
    required_cols = 1:length(required_cols)
  }

  # timestamp column
  df = .convert.datecolumn(
    df,
    required_cols[1],
    datetime_format = datetime_format,
    timezone = timezone,
    new_name = mhealth$column$TIMESTAMP
  )
  if(is.null(df)) return(df)

  # start and end time column
  if (file_type == mhealth$filetype$annotation ||
             file_type == mhealth$filetype$feature) {
    df = .convert.datecolumn(
      df,
      required_cols[2],
      datetime_format = datetime_format,
      timezone = timezone,
      new_name = mhealth$column$START_TIME
    )
    if(is.null(df)) return(df)

    df = .convert.datecolumn(
      df,
      required_cols[3],
      datetime_format = datetime_format,
      timezone = timezone,
      new_name = mhealth$column$STOP_TIME
    )
    if(is.null(df)) return(df)
  }

  # convert annotation name column
  if (file_type == mhealth$filetype$annotation) {
    # convert annotation column to string and rename
    df = .convert.column_to(df, required_cols[4], class_type = "character", new_name = mhealth$column$ANNOTATION_NAME)
    if(is.null(df)) return(df)
  }

  # convert other columns
  if(file_type == mhealth$filetype$sensor){
    rest_cols = required_cols[-1]
  }
  else if(file_type == mhealth$filetype$annotation){
    rest_cols = required_cols[-(1:4)]
  }else if(file_type == mhealth$filetype$feature){
    rest_cols = required_cols[-(1:3)]
  }else if(file_type == mhealth$filetype$event){
    rest_cols = required_cols[-1]
  }

  for(n in rest_cols){
    if(is.character(df[1,n]) || is.numeric(df[1,n])) next;
    if(!is.factor(df[1, n])){
      temp_df = .convert.column_to(df, col = n, class_type = "numeric")
    }else{
      temp_df = .convert.column_to(df, col = n, class_type = "character")
    }
    if(is.null(temp_df)) return(NULL)
    if(is.na(temp_df[1, n])){
      if(filetype == mhealth$filetype$sensor){
        message(sprintf(
          "\n
          Required column %s should be convertible to numeric",
          names(temp_df)[n]
        ))
        return(NULL)
      }
      # try to convert to character
      temp_df = .convert.column_to(temp_df, col = n, class_type = "character")
      if(is.null(temp_df)) return(NULL)
      if(is.na(temp_df[1, n])){
        message(sprintf(
          "\n
          Don't know how to convert required column %s to numeric or character",
          names(temp_df)[n]
        ))
        return(NULL)
      }else{
        df = temp_df
      }
    }else{
      df = temp_df
    }
  }

  if(!is.null(group_cols)){
    for(n in group_cols){
      if(!is.factor(df[1, n])){
        temp_df = .convert.column_to(df, col = n, class_type = "numeric")
      }else{
        temp_df = .convert.column_to(df, col = n, class_type = "character")
      }
      if(is.character(df[1,n])) next;
      if(is.null(temp_df)) return(NULL)
      if(is.na(temp_df[1, n])){
        if(filetype == mhealth$filetype$sensor){
          message(sprintf(
            "\n
            Group column %s should be convertible to numeric, ignore",
            names(temp_df)[n]
          ))
        }
        # try to convert to character
        temp_df = .convert.column_to(temp_df, col = n, class_type = "character")
        if(is.null(temp_df)) return(NULL)
        if(is.na(temp_df[1, n])){
          message(sprintf(
            "\n
            Don't know how to convert group column %s to numeric or character. ignore",
            names(temp_df)[n]
          ))
        }else{
          df = temp_df
        }
      }else{
        df = temp_df
      }
    }
  }

  # rename columns
  if(!is.null(rename_cols)){
    col_names = colnames(df)
    col_names[2:(length(rename_cols)+1)] = rename_cols
    colnames(df) = col_names
  }else{
    message("rename cols are not valid, ignore it")
  }

  # convert all column headers to uppercase, and exclude illegal characters
  df = .convert.legit_column(df)
  if(is.null(df)) return(NULL)

  return(df)
}
qutang/mHealthR documentation built on May 26, 2019, 1:31 p.m.