R/SeroData.R

Defines functions category.parameters testInteger find.category print.SeroData compute.age.groups SeroData

Documented in compute.age.groups SeroData

#' @title Create a dataset containing the serological data and other information
#'
#' @description This function contains the definition of the class \code{SeroData}. It requires the age and seropositivity of individuals, and accepts more input parameters. 
#'
#' @author Nathanael Hoze \email{nathanael.hoze@gmail.com}
#' 
#' @param age_at_sampling A vector of integers containing the age of the sampled individuals at the time of sampling. Must be equal or greater than 1.  
#' 
#' @param Y A vector containing the seropositivy status of the sampled individuals. It can be in a numeric form (1 or 0) or boolean (\code{TRUE} or \code{FALSE}). This vector must have the same size as \code{age}.
#'  
#' @param age_class Integer. The length in years of the age classes. Default = 1. 
#' 
#' @param max_age Integer. The maximal age considered for the individuals in the sample. Individuals older than \code{max_age} are set equal to \code{max_age}. 
#' 
#' @param sampling_year Integer. Defines the sampling year.  It can be a single value or a vector of the same size as the number of sampled individuals. If it is a single value, all the sampled individuals have the same sampling year. Default = 2017. 
#' 
#' @param location An optional character factor defining the sampling location. It can be a single chain of characters or a vector of the same size as the number of sampled individuals. Default = \code{NULL}.
#' 
#' @param sex An optional character factor defining the sex of the individuals. It can be a single chain of characters or a vector of the same size as the number of sampled individuals. Default = \code{NULL}. 
#'
#' @param category Character. An optional element containing the name of the categories and defining the category of the individuals. 
#' This feature is used when fitting the models assuming  different risks of infection for the different categories. It can be a single character element or a matrix of characters with row size equal to the number of individuals and column size equal to the number of different category classes. Default = "Category 1".
#'
#' @param reference.category Character. A vector containing the name of the reference categories. It must have the same length as the number of columns of \code{category}. By default, it will take as reference the most common element in each category.
#' 
#' @param ... Additional arguments (not used).
#'  
#' 
#' @return A list with the class \code{SeroData}, which contains the
#' following items:
#'
#' \itemize{
#'
#' \item age: A vector of integers containing the age of the sampled individuals at the time of the latest sampling, of length N.
#' 
#' \item age_at_sampling: A vector of integers containing the age of the sampled individuals at the time of sampling, of length N.
#' 
#' \item Y: A vector of seropositivity status, of length N.
#' 
#' \item N: The number of individuals considered.
#'
#' \item A: The maximal age.
#'
#' \item NGroups: The total number of age groups.
#'
#' \item sampling_year: A vector of sampling years, of length N.
#'
#' \item location: A vector of the sampling location, of length N.
#'
#' \item sex: A vector of the sex of each individual, of length N.
#'
#' }
#'
#' @export
#' @examples
#' 
#' ## A very simple example of a serological survey with three individuals: 
#' data = SeroData(age_at_sampling = c(10,32,24), Y=c(0,1,1), max_age = 50, age_class = 1, sampling_year = 2017)
#' 
#' seroprevalence(data = data)
#' data2 = subset(data,c(1,3))
#' seroprevalence(data = data2, age_class = 5) 
#' 
#' ## A example using categories 
#' sex= c('male', 'male', 'female')
#' data = SeroData(age_at_sampling = c(10,32,24), Y=c(0,1,1),  category= sex )
#' 
#' # defining the reference category 'female'
#'
#' data = SeroData(age_at_sampling = c(10,32,24), Y=c(0,1,1),  category= sex, reference.category='female')
#' 
#' # Grouping individuals by age categories 1-10, 11-20, 21-30, etc. 
#' data = SeroData(age_at_sampling = c(10,32,24), Y=c(0,1,1),  category= sex, reference.category='female', age_class = 10)


SeroData <- function(age_at_sampling,
                     age=NULL,
                     Y,         
                     age_class = 1,
                     max_age = NULL,
                     sampling_year = NULL,
                     location = NULL,
                     sex = NULL,
                     category = "Category 1",
                     reference.category = NULL, ## add in documentation  +  define default
                     class1 = NULL,
                     class2 = NULL,
                     ...){
  # Error control
  #add  Check : Y and age_at_sampling must have the same length also when Y is multidimensional
  if(dim(as.matrix(Y))[1] != dim(as.matrix(age_at_sampling))[1] ){
    stop("Error : Y and age_at_sampling must have the same length") 
  }
  
  if(sum(is.na(Y))){
    stop("Error : remove NA from the seropositivity Y") 
  }
  if(sum(is.na(age_at_sampling))){
    stop("Error : remove NA from the age age_at_sampling") 
  }
 
  if(!testInteger(age_at_sampling)){ 
    stop("age_at_sampling must be given as integer")
  }
  
  if(min(age_at_sampling)<=0){ 
    stop("age_at_sampling must have only positive numbers (>=1)")
  }
  
  if(!typeof(category)=="character"){
    stop("'category' must contain characters")
  }
  
  if (is.null(sampling_year)){
    sampling_year <- rep(2017,1,length(age_at_sampling))
  }else if(length(sampling_year) == 1){
    sampling_year <- rep(sampling_year,1,length(age_at_sampling))
  }
  
  if (is.null(location)){
    location <- rep('NA',1,length(age_at_sampling))
  }else if(length(location) == 1){
    location <- rep(location,1,length(age_at_sampling))
  }
  
  if (is.null(sex)){
    sex <- rep("All", 1, length(age_at_sampling))
  }else if(length(sex) == 1){
    sex <- rep(sex,1,length(age_at_sampling))
  }
  
  if(length(category) == 1){
    category <- as.matrix(rep(category,1,length(age_at_sampling)))
  }
  
  if(is.null(age)){
    age = age_at_sampling+max(sampling_year)-sampling_year
  } 
  
  if (is.null(max_age)){
    max_age <- max(age)
  }  
  
  # setting max_age above the largest multiple of an age category (e.g above 5, 10, 15 if age_class=5)
  max_age = ceiling(max_age/age_class)*age_class
  
  age[which(age>max_age)] <- max_age
  age_at_sampling[which(age_at_sampling>max_age)] <- max_age 
  age.groups <-compute.age.groups(age = age,sampling_year = sampling_year)
   
  class1 = matrix(0, nrow = max_age, ncol = age.groups$NAgeGroups)
  class2= matrix(0, nrow = max_age, ncol = age.groups$NAgeGroups)
  for(i in 1:age.groups$NAgeGroups){
    
    class1[,i] = as.matrix(rep(seq(1,max_age,age_class), each = age_class) ) 
    class2[,i] = as.matrix(age_class-1 + rep(seq(1,max_age,age_class), each = age_class) ) 
  } 
  
  N=length(age)
  
  param.category =  category.parameters(category=category, N=N, reference.category=reference.category)
  
  data <- list( A = max_age,
                N = N,
                Y = Y,
                age = age,
                age_at_sampling = age_at_sampling,
                sampling_year = sampling_year,
                location = location,
                sex = sex,
                category = param.category$category,
                categoryindex=param.category$categoryindex,
                MatrixCategory = param.category$MatrixCategory,
                Ncategory = param.category$Ncategory,
                maxNcategory=param.category$maxNcategory,
                Ncategoryclass=param.category$Ncategoryclass,
                unique.categories=param.category$unique.categories,
                Ncat.unique = param.category$Ncat.unique,
                category.position.in.table=param.category$category.position.in.table,
                reference.category=param.category$reference.category,
                NGroups = max_age, 
                NAgeGroups = age.groups$NAgeGroups, 
                age_at_init =  as.array(age.groups$age_at_init), # CHECK THIS 25/09/2018 
                age_group  = age.groups$age_group,
                class1  = class1,
                class2 = class2)
  
  
  class(data) <- 'SeroData'
  
  return(data)
}


#' @rdname SeroData
#' @export
compute.age.groups <- function(age,sampling_year){
  
  # HERE ADD CLASS1 AND CLASS2
  N=length(age)
  a = max(sampling_year)-sampling_year +1
  age_group = rep(0,N)
  u  = unique(a) 
  for(i in 1:length(u)){
    age_group[which(u[i] == a) ] =i
  } 
  age_at_init =u
  NAgeGroups = length(u)
  
  output <- list(age_at_init=age_at_init, NAgeGroups=NAgeGroups, age_group=age_group)
  return(output)
}



#' @export
print.SeroData <- function(data,...){
  
  
  df <-data.frame(age = data$age,
                  age_at_sampling = data$age_at_sampling,
                  Y=data$Y,
                  sampling_year = data$sampling_year,
                  location = data$location,
                  sex = data$sex,
                  category  = data$category )
  cat("<SeroData object>\n")
  cat(sprintf("%i serological samples ", data$N),'\n')
  cat(sprintf("Max age: %i ", data$A),'\n')
  print(df)
  
}


find.category <- function(x, categories){
  return(tail(which(x-categories>0), 1L))
}

testInteger <- function(x){
  test <- all.equal(x, as.integer(x), check.attributes = FALSE)
  if(test == TRUE){ return(TRUE) }
  else { return(FALSE) }
}

#' @export
category.parameters <- function(category,N, reference.category){
  
  category =  as.matrix(category)
  
  Ncategoryclass =  dim(category)[2]
  
  if(is.null(reference.category)){
    reference.category = c()
    for(i in 1:Ncategoryclass){
      most.common=  names(sort(summary(as.factor(category[,i])), decreasing=T)[1])
      reference.category=c(reference.category, most.common )  # add the most common element as reference
    }
  }
  
  if(Ncategoryclass != length(reference.category)){
    stop("'reference.category' and 'category' don't have the same number of elements ")
  }
  
  # # if reference.category doesn't exist in the list 
  # 
  # for(i in 1:Ncategoryclass){
  #   if(sum(category[,i]==reference.category[i])==0) {
  #     print(paste0('The reference category ', reference.category[i], ' was not found in the variable category'))
  #   }
  # }
  
  
  
  #Verfier ICI pour plusieurs dimensions
  for(i in 1:Ncategoryclass){
    if(sum(category[,i]==reference.category[i])==0) {
      most.common=  names(sort(summary(as.factor(category[,i])), decreasing=T)[1])
      reference.category[i] =   most.common    # add the most common element as reference
    }
  }
  
    A=apply(category, 2, unique)
  if(typeof(A)=="character"){ # needed in the case there is only one type of category
    B=list()
    B[[1]]=A
    A=B
  }
  
  maxNcategory=0
  V=c()
  for(I in 1:Ncategoryclass){
    
    if(maxNcategory<length(A[[I]])){
      maxNcategory=length(A[[I]])
    }
    
    s =  category[,I]
    v1=rep(0,N)
    
    u = unique(s) 
    # reorder with reference category as first element
    ref =  reference.category[I]
    u=unique(c(ref, u)) # put the reference category as the first element of u
    
    for(i in seq(1,length(u))){
      v1[which(s==u[i])] = i
    }
    V=cbind(V,v1)
  }
  
  # list of all combinations
  l=NULL
  for(I in 1:Ncategoryclass){
    l[I]  = list(unique(V[,I]))
    
  }
  Exp = expand.grid(l)
  
  rowProd <-  function(X){
    return(prod(X==a))
  }
  
  categoryindex=c()
  
  for(i in 1:N){
    a=V[i,]
    apply(Exp,1, FUN = rowProd)
    categoryindex[i]= which(apply(Exp,1, FUN = rowProd)==1)
  }
  
  MatrixCategory= Exp
  Ncategory = dim(Exp)[1]
  
  
  unique.categories  = unique(as.vector(category))
  Ncat.unique =length(unique.categories)
  
  
  L1 <- function(x){ 
    return(length(which(x==1))==Ncategoryclass-1)
  }
  L2 <- function(x){ 
    G=0
    if(L1(x) == TRUE){
      G=which(x!=1)
      
    }
    return(G)
  }
  
  
  index = which(apply(X = Exp, 1, FUN = L1))
  w=(apply(X = Exp, 1, FUN = L2))
  col.index = w[w>0]
  
  category.position.in.table= data.frame(predictor = character(),
                                         relative_to = character(),
                                         index = integer())
  I=0
  for(i in unique(col.index)){
    ref =  reference.category[i]
    U=unique(c(ref,  unique(category[,i]))) # put the reference category as the first element of u
    for(j in 2:(length(U)) ){
      I=I+1
      de<-data.frame(U[j],U[1], index[I])
      category.position.in.table=rbind(category.position.in.table, setNames(de, names(category.position.in.table)))
    }
  }
  
  return(  list(category = category,
                categoryindex=categoryindex,
                MatrixCategory = MatrixCategory,
                Ncategory = Ncategory,
                maxNcategory=maxNcategory,
                Ncategoryclass=Ncategoryclass,
                unique.categories=unique.categories,
                Ncat.unique = Ncat.unique,
                category.position.in.table=category.position.in.table,
                reference.category=reference.category))
  
}
nathoze/Rsero documentation built on Feb. 3, 2024, 9:58 p.m.