R/utilities.R

Defines functions extractAssay IG_cat IG_numeric entropy get_AICtab

get_AICtab<-function(fit){
  
  ########################
  # Flag invalid options #
  ########################
  
  if (!class(fit) %in% c('cpglm', 'zcpglm', 'glmmTMB')){
    stop('Not supported. Valid options are cplm , zcpglm, and glmmTMB')
  }
  
  ######################
  #  Initialize AICtab #
  ######################
  
  AICtab<-rep(NA, 5)
  
  ###########################
  # Case-by-Case Extraction #
  ###########################
  
  if (class(fit)=='cpglm'){
    
    ##########################################
    # Back calculate logLik and BIC from AIC #
    ##########################################
    
    AIC<-fit$aic
    AIC_multiplier<-length(fit$y) - fit$df.residual
    logLik<-(AIC - 2*AIC_multiplier)/2
    BIC_multiplier<-AIC_multiplier*log(length(fit$y))
    BIC<-BIC_multiplier + 2*logLik
    deviance<-fit$deviance
    df.resid<-fit$df.residual
    
    # Coherent output
    AICtab<-c(AIC, BIC, logLik, deviance, df.resid)
  }
  
  if (class(fit)=='zcpglm'){
    
    ##########################################
    # Back calculate AIC and BIC from logLik #
    ##########################################
    
    logLik<--fit$llik
    AIC_multiplier<-length(fit$y) - fit$df.residual
    BIC_multiplier<-AIC_multiplier*log(length(fit$y))
    AIC<-2*AIC_multiplier + 2*logLik
    BIC<-BIC_multiplier + 2*logLik
    deviance<-NA
    df.resid<-fit$df.residual
    
    # Coherent output
    AICtab<-c(AIC, BIC, logLik, deviance, df.resid)
    
  }
  
  if (class(fit)=='glmmTMB'){
    
    #######################################
    # Extract AICtab from glmmTMB objects #
    #######################################
    
    AICtab<-summary(fit)["AICtab"]$AICtab
    
  }
  
  ##########
  # Return #
  ##########
  
  names(AICtab)<-c('AIC', 'BIC', 'logLik', 'deviance', 'df.resid')
  return(AICtab)
}




# Adapted form: https://rstudio-pubs-static.s3.amazonaws.com/455435_30729e265f7a4d049400d03a18e218db.html

#' @export
entropy <- function(target) {
  #if(all(is.na(target)))  0 
  freq <- table(target)/length(target)
  # vectorize
  vec <- as.data.frame(freq)[,2]
  #drop 0 to avoid NaN resulting from log2
  vec<-vec[vec>0]
  #compute entropy
  -sum(vec * log2(vec))
}

IG_numeric<-function(data, feature, target, bins=4) {
  #Strip out rows where feature is NA
  data<-data[!is.na(data[,feature]),]
  #compute entropy for the parent
  e0<-entropy(data[,target])
  
  data$cat<-cut(data[,feature], breaks=bins, labels=c(1:bins))
  
  #use dplyr to compute e and p for each value of the feature
  dd_data <- data %>% group_by(cat) %>% summarise(e=entropy(get(target)), 
                                                  n=length(get(target)),
                                                  min=min(get(feature)),
                                                  max=max(get(feature))
  )
  
  #calculate p for each value of feature
  dd_data$p<-dd_data$n/nrow(data)
  #compute IG
  IG<-e0-sum(dd_data$p*dd_data$e)
  
  return(IG)
}



#returns IG for categorical variables.
IG_cat<-function(data,feature,target){
  #Strip out rows where feature is NA
  data<-data[!is.na(data[,feature]),] 
  #use dplyr to compute e and p for each value of the feature
  dd_data <- data %>% group_by_at(feature) %>% summarise(e=entropy(get(target)), 
                                                         n=length(get(target))
  )
  
  #compute entropy for the parent
  e0<-entropy(data[,target])
  #calculate p for each value of feature
  dd_data$p<-dd_data$n/nrow(data)
  #compute IG
  IG<-e0-sum(dd_data$p*dd_data$e)
  
  return(IG)
}

# entropy (c("A", "A", "A", "A", "A", "B", "B"))
# 0.8631206

#entropy (c("A", "A", "A", "A"))
# 0

#entropy (c("A", "A", "A", "A", "B", "B", "B", "B"))
#1

#entropy (c("C", "A", "A", "A", "B", "B", "B", "B"))
# 1.405639

#entropy (c("C", "A", "D", "A", "B", "B", "B", "B"))
# 1.75

# entropy (c(1, 1, 2, 1, 1, 1, 2, 1))
#0.8112781


# Written by Grace
extractAssay <- function(input, assay_name = "counts") {
  
  # Extract assay name based on the user input
  if ("counts" %in% assayNames(input)) {
    counts_data <- assay(input, assay_name)
    cat("The specified assay has been extracted\n")
    return(as.data.frame(as.matrix(counts_data)))
  } else {
    cat("The specified assay was not found\n")
    return(NULL)
  }
}
himelmallick/tweedieverse documentation built on Aug. 13, 2024, 5:48 a.m.