R/safety_summary.R

Defines functions create.safety_summary print.safety_summary df_to_char safety_summary

Documented in create.safety_summary df_to_char print.safety_summary safety_summary

#' Calculate frequency tables from a rectangular data frame with one row per subject-event
#'
#' @param data a data set containing the following columns: \code{subjid, term, soc, serious, related, fatal, group}.
#' See \code{\link{safety}} for more details.
#' @param exposed a numeric vector giving the numbers of subjects exposed in each group.
#' This needs to be supplied directly by the user, and cannot be inferred from the input \code{data}
#' with one row per patient-event.
#' To ensure the ordering is correct either, name the vector with names matching the values in \code{data$group}, or
#' ensure that the data$group is an ordered factor, or relying on alphabetical ordering of the values in
#' \code{data$group}
#' @param excess_deaths a numeric vector giving the number of extra deaths not reported within \code{data}. Defaults to 0.
#' @param freq_threshold a value on a percentage scale at which to remove events if the incidence falls below. Defaults to 0
#' @param soc_index a character vector either "meddra" or "soc_term", which is used to identify if the soc variable in data gives the numerical meddra code or the description in English.
#' @param na.action a function that indicates what should happen if the data contain missing values. The default is \code{\link{na.fail}}  as both repositories will not accept any missing values in the upload. Alternatives could be \code{\link{na.omit}}, \code{\link{na.exclude}}, or \code{\link{na.pass}}.
#' @return a list of three dataframes: GROUP, SERIOUS, NON_SERIOUS. Each contains the summary statistics required by EudraCT, and is suitable for export.
#'
#' @seealso \code{\link{eudract_convert}} \code{\link{simple_safety_xml}}
#'
#' @export
#' @importFrom dplyr group_by summarise left_join mutate select rename ungroup %>%
#' @importFrom magrittr %<>%
#' @importFrom stats na.fail
#' @example example/canonical.R



safety_summary <- function(data, exposed, excess_deaths=0, 
                           freq_threshold=0, soc_index=c("meddra","soc_term"),
                           na.action=na.fail
                           ){
  #check the names of data

  var_names <- c("subjid", "term", "soc", "serious", "related", "fatal", "group")
  name_check <- var_names %in% names(data)
  if( !all(name_check)){
    stop( paste("your input data are missing the following variables:",
                paste(var_names[!name_check], collapse=", ")))
  }
  name_check <-  names(data) %in% var_names
  data <- data[,name_check]
  
  # remove any trailing white space
  data <- lapply(data, function(x){ if(is.character(x)){trimws(x)}else{x}} ) %>% 
    as.data.frame
  if( !is.null(names(exposed))) {
    names(exposed) <- exposed %>% names %>% trimws
  }
  
  # check for NA or empty strings
  if( anyNA(data) | any(data=="")){
   warning("Your input data contain missing values or empty strings. 
           The output XML is likely to be rejected by EudraCT or ClinicalTrials, 
           or will not report some of the adverse events.")
  }
  data <- lapply(data, function(x){
    if( is.character(x)){ifelse(x=="", NA, x)
      }else{x} }) %>% as.data.frame
  data <- na.action(data)
  
  # check if name and length of exposed matches
  group_names <- unique(as.character(data$group))
  if( length(exposed) < length(group_names)){
    stop("the argument 'exposed' has fewer elements than the number of groups")
  }
  if( !is.null(names(exposed)) && !all( group_names %in% names(exposed))){
    stop("the names of 'exposed' do not match up to the values in 'data$group'")
  }
  if( length(group_names)< length(exposed)){
    warning("There are groups with no events")
    if( is.null(names(exposed))){ stop("The 'exposed' argument needs to be a named vector")}
    # this line should make rows with zeroes where needed.
    group_names  <- names(exposed)
  }

  if(is.null(names(exposed))){ names(exposed) <- group_names}
  exposed_df <- data.frame(
    subjectsExposed = exposed,
    group = factor(names(exposed),levels=group_names)
  )


  data$group <- factor(as.character(data$group), levels=group_names)

  #Check length of group name text
  if( min(nchar(group_names))<4){stop("Group names must be at least 4 characters in length.")}

  # check length, value and names of excess_deaths
  if( length(excess_deaths)==1 && length(group_names)>1 && excess_deaths==0){
    excess_deaths <- rep(0, length(group_names))
  }
  if( length(excess_deaths)!= length(group_names)){
    stop("the argument 'excess_deaths' needs to be the same length as the number of groups")
  }
  if( !is.null(names(excess_deaths)) && !all( names(excess_deaths) %in% group_names)  ){
    stop("the names of 'excess_deaths' do not match up to the values in 'data$group'")
  }
  if(is.null(names(excess_deaths))){ names(excess_deaths) <- group_names}
  excess_deaths_df <- data.frame(
    excess_deaths = excess_deaths,
    group = factor(names(excess_deaths), levels=group_names)
  )


  # check the soc_index
  soc_index <- match.arg(soc_index)


  # Group level statistics

  group <- data %>% group_by(subjid,group) %>%
    summarise(
      serious_any=any(serious==1),
      nonserious_any=any(serious==0),
      deaths=any(fatal==1)
    ) %>% group_by(group) %>%
    summarise(
      subjectsAffectedBySeriousAdverseEvents = sum(serious_any),
      subjectsAffectedByNonSeriousAdverseEvents = sum(nonserious_any),
      deathsResultingFromAdverseEvents = sum(deaths)
    )


  group %<>%
    tidyr::complete(group, fill=list(
      "subjectsAffectedBySeriousAdverseEvents"=0,
      "subjectsAffectedByNonSeriousAdverseEvents"=0,
      "deathsResultingFromAdverseEvents"=0
    ))

 group %<>%
    left_join(exposed_df, by="group") %>%
    left_join(excess_deaths_df, by="group" )%>%
    mutate(
      deathsAllCauses=deathsResultingFromAdverseEvents+excess_deaths
    ) %>%
    select( group,subjectsAffectedBySeriousAdverseEvents,subjectsAffectedByNonSeriousAdverseEvents,
            deathsResultingFromAdverseEvents ,subjectsExposed,deathsAllCauses
    ) %>%
    rename("title"="group")
 #check the validity of the exposed values
 check <- group %>%
   mutate(
     check=subjectsExposed-pmax(subjectsAffectedBySeriousAdverseEvents,
                                subjectsAffectedByNonSeriousAdverseEvents,
                                deathsResultingFromAdverseEvents,
                                deathsAllCauses)
   ) %>%
   summarise( check=any(check<0))
 if( check$check){stop("'exposed' argument has elements that are too small")}

  ans <- list(GROUP=as.data.frame(group))

  # Nonserious term-level statistics
  if( any(!as.logical(data$serious))){
    non_serious <- data %>% dplyr::filter(!serious) %>%
      group_by(term, soc, group) %>%
      summarise(subjectsAffected=length(unique(subjid)),
                occurrences=dplyr::n()
      ) %>% ungroup %>% 
      tidyr::complete(group, tidyr::nesting(term, soc), 
                      fill=list("subjectsAffected"=0, "occurrences"=0)) %>%
      rename("groupTitle"="group") %>%
      left_join(soc_code, by=c("soc"=soc_index)) %>% ungroup() %>%
      select(groupTitle,subjectsAffected, occurrences,term,eutctId)

    # filter
    if( freq_threshold>0){
      non_serious %<>% left_join(group, by=c("groupTitle"="title")) %>%
        mutate( rate=subjectsAffected/subjectsExposed*100) %>% group_by(term,eutctId) %>%
        dplyr::filter( max(rate)>=freq_threshold) %>% ungroup %>%
        select(groupTitle,subjectsAffected, occurrences,term,eutctId)
    }
    ans <- c(ans, list( NON_SERIOUS=as.data.frame(non_serious)))
  } else{ warning("There are no non-serious events")}

  # Serious term-level statistics.
  if( any(as.logical(data$serious))){
    serious <- data %>% dplyr::filter(as.logical(serious)) %>%
      group_by(term, soc, group) %>%
      summarise(subjectsAffected=length(unique(subjid)),
                occurrences=dplyr::n(),
                occurrencesCausallyRelatedToTreatment=sum(related),
                deaths=sum(fatal),
                deathsCausallyRelatedToTreatment=sum(fatal*related)
      ) %>% ungroup %>% 
      tidyr::complete(group, tidyr::nesting(term, soc),
                      fill=list("subjectsAffected"=0,
                                "occurrences"=0,
                                "occurrencesCausallyRelatedToTreatment"=0,
                                "deaths"=0,
                                "deathsCausallyRelatedToTreatment"=0
                      )
      ) %>%
      rename("groupTitle"="group") %>%
      left_join(soc_code, by=c("soc"=soc_index)) %>% ungroup() %>%
      select(groupTitle,subjectsAffected, occurrences,term,eutctId,
             occurrencesCausallyRelatedToTreatment, deaths,
             deathsCausallyRelatedToTreatment
      )
    ans <- c(ans, list(SERIOUS=as.data.frame(serious)))
  } else{ warning("There are no serious events")}

  ans <- lapply(ans, df_to_char)
  class(ans) <- "safety_summary"
  ans
}

#' function to make a data frame be entirely character vectors
#'
#' @param df a data frame
#' @keywords internal
df_to_char <- function(df){
  df <- as.data.frame(lapply(df, as.character), stringsAsFactors = FALSE)
}

#' print method for safety summary object
#' @param x a safety_summary object
#' @param ... extra arguments for the generic print method
#' @export
#' @keywords internal
#' @importFrom utils head

print.safety_summary <- function(x,...){
  old_options <- options()
  on.exit( options(old_options))
  options(scipen=999)
  cat("Group-Level Statistics\n\n")
  print(x$GROUP)
  cat("\nNon-serious event-level statistics (intial rows)\n\n")
  print(head(x$NON_SERIOUS))
  cat("\nSerious event-level statistics (intial rows)\n\n")
  print(head(x$SERIOUS))

}

#' function that creates a safety_summary object from individual data.frames
#' @param group a data frame that contains the group-level statistics
#' @param non_serious a data frame that contains the non-serious term-group level statistics
#' @param serious a data frame that contains the serious term-group level statistics
#' @return  a safety_summary object
#' @export

create.safety_summary <- function(group,non_serious, serious){
  group_names <- c("title","subjectsAffectedBySeriousAdverseEvents",
                   "subjectsAffectedByNonSeriousAdverseEvents", "deathsResultingFromAdverseEvents",
                   "subjectsExposed","deathsAllCauses")
  name_check <- all(group_names %in% names(group)) && ncol(group)==length(group_names)
  if(! name_check){
    stop( "group data.frame does not have the correct variable names")
  }
  non_serious_names <- c("groupTitle","subjectsAffected" ,"occurrences","term", "eutctId" )
  name_check <- all(non_serious_names %in% names(non_serious)) && ncol(non_serious)==length(non_serious_names)
  if( !name_check){
    stop( "non_serious data.frame does not have the correct variable names")
  }
  serious_names <- c(non_serious_names,  "occurrencesCausallyRelatedToTreatment",
                     "deaths","deathsCausallyRelatedToTreatment" )
  name_check <- all(serious_names %in% names(serious)) && ncol(serious)==length(serious_names)
  if( !name_check){
    stop( "serious data.frame does not have the correct variable names")
  }



  ans <- list(GROUP=group, NON_SERIOUS=non_serious, SERIOUS=serious)
  ans <- lapply(ans, df_to_char)
  class(ans)="safety_summary"
  ans
}


if(getRversion() >= "2.15.1"){
  utils::globalVariables(
    c("deaths", "deathsAllCauses", "deathsCausallyRelatedToTreatment",
      "deathsResultingFromAdverseEvents" ,"eutctId" ,"fatal", "groupTitle" ,
      "nonserious" ,"occurrences", "occurrencesCausallyRelatedToTreatment", "rate",
      "related" ,"soc", "soc_code" ,"subjectsAffected",
      "subjectsAffectedByNonSeriousAdverseEvents",
      "subjectsAffectedBySeriousAdverseEvents", "subjectsExposed", "subjid", "term",
      "serious_any","nonserious_any"
      ))
}
shug0131/eudract_pkg documentation built on June 1, 2025, 6:10 p.m.