R/zimfrv.R

Defines functions zimfrv preprocess_phenodata preprocess_genedata cauchyp p_kernel_single p_burden_single U_phi_mu4zinb phi_mu_hat4zinb U_fi_lmd phi_lambda_hat

Documented in cauchyp p_burden_single phi_lambda_hat phi_mu_hat4zinb p_kernel_single preprocess_genedata preprocess_phenodata U_fi_lmd U_phi_mu4zinb zimfrv

#' @title phi_lambda_hat
#' @description
#' Estimation of phi_hat and lambda_hat for ZIP model
#'
#' This function gives the estimation of 2 parameters phi and lambda for each subject
#' under the null hypothesis.
#'
#' @param simud a data frame containing a phenotype named y and covariates
#' @return a list of 2 estimations of parameters for each subject
#' @details
#' This function first fits zero‐inflated Poisson regression of phenotype y
#' on the covariates only to obtain the estimates of regression coefficients
#' and then compute the estimations of phi and lambda.
#' @seealso \code{zeroinfl}
#' @importFrom pscl zeroinfl
#' @importFrom CompQuadForm davies
#' @importFrom stats cov
#' @importFrom stats dbeta
#' @importFrom stats pchisq
#' @export


phi_lambda_hat <- function(simud){
  simudata <- as.data.frame(simud)
  m1 <- zeroinfl(y ~ ., data = simudata)
  a1_hat <- m1$coefficients$zero
  a2_hat <- m1$coefficients$count
  X <- as.matrix(cbind(rep(1, nrow(simudata)), simudata[, -1]))
  hat_fi <- exp(X%*%a1_hat)
  hat_lambda <- exp(X%*%a2_hat)
  hat <- list(hat_fi = hat_fi, hat_lambda = hat_lambda)
  return(hat)
}


#' @title U_fi_lmd
#' @description
#' Compute Score statistics for ZIP model
#'
#' This function takes the estimations of phi and lambda produced by the \code{phi_lambda_hat}
#' and computes the score statistics under the null hypothesis.
#'
#' @param simudata a data frame containing a phenotype named y and covariates
#' @param G_rare a data frame containing data of rare variants with the same subject order as in simudata
#' @return a list of 2 matrice of the score statistics for each variant from each subject
#' @importFrom pscl zeroinfl
#' @importFrom CompQuadForm davies
#' @importFrom stats cov
#' @importFrom stats dbeta
#' @importFrom stats pchisq
#' @export

U_fi_lmd <- function(simudata, G_rare){
  hat_fi <- phi_lambda_hat(simudata)$hat_fi
  hat_lambda <- phi_lambda_hat(simudata)$hat_lambda
  comb <- cbind(simudata$y, hat_fi, hat_lambda)
  colnames(comb)[2:3] <- c( "fi","lambda")
  comb <- as.matrix(comb)
  u_fi <- matrix(nrow = nrow(G_rare), ncol = ncol(G_rare))

  for (i in 1:nrow(G_rare)){

    for (j in 1:ncol(G_rare)){

      if (comb[i,1]==0) {
        fim <- comb[i,2]*G_rare[i,j]/(comb[i,2] + exp(-comb[i,3])) - comb[i,2] * G_rare[i,j]/(1 + comb[i,2])
      }
      else {
        fim <- (- comb[i,2] * G_rare[i,j]/(1 + comb[i,2]))
      }
      u_fi[i,j] <- fim
    }
  }

  u_lambda=matrix(nrow=nrow(G_rare), ncol = ncol(G_rare))

  for (i in 1:nrow(G_rare)){

    for (j in 1:ncol(G_rare)){

      if (comb[i,1]==0) {
        fim= -exp(-comb[i,3]) * comb[i,3] * G_rare[i,j]/(comb[i,2] + exp(-comb[i,3])) }
      else { fim = (comb[i,1] - comb[i,3]) * G_rare[i,j]}

      u_lambda[i,j]=fim
    }
  }

  score=list(u_fi=u_fi, u_lambda=u_lambda)
  return(score)
}


#' @title phi_mu_hat4zinb
#' @description
#' Estimation of phi_hat, mu_hat and alpha_hat for ZINB model
#'
#' This function gives the estimation of three parameters phi, mu and alpha in ZINB model
#' for each subject under the null hypothesis.
#'
#' @param simud a data frame containing a phenotype named y and covariates
#' @return a list of 3 estimations of parameters for each subject
#' @details
#' This function first fits zero‐inflated negative binomial regression of phenotype y
#' on the covariates only to obtain the estimates of regression coefficients
#' and inverse dispersion
#' and then compute the estimations of phi, mu and alpha.
#' @seealso \code{zeroinfl}
#' @importFrom pscl zeroinfl
#' @importFrom CompQuadForm davies
#' @importFrom stats cov
#' @importFrom stats dbeta
#' @importFrom stats pchisq
#' @export


phi_mu_hat4zinb <- function(simud){
  simudata <- as.data.frame(simud)
  m1 <- zeroinfl(y ~ .|., data = simudata, dist = "negbin")
  a1_hat <- m1$coefficients$zero
  a2_hat <- m1$coefficients$count
  hat_alpha <- (m1$theta)^(-1)
  X <- as.matrix(cbind(rep(1, nrow(simudata)), simudata[, -1]))
  hat_phi <- exp(X%*%a1_hat)
  hat_mu <- exp(X%*%a2_hat)
  hat <- list(hat_phi = hat_phi, hat_mu = hat_mu, hat_alpha = hat_alpha)
  return(hat)
}


#' @title U_phi_mu4zinb
#' @description
#' Compute score statistics for ZINB model
#'
#' This function takes the estimations of phi and lambda produced by the \code{phi_lambda_hat4negbin}
#' and computes the score statistics for ZINB model under the null hypothesis.
#'
#' @param simudata a data frame containing a phenotype named y and covariates
#' @param G_rare a data frame containing data of rare variants with the same subject order as in simudata
#' @return a list of 2 matrice of the score statistics for each variant from each subject
#' @importFrom pscl zeroinfl
#' @importFrom CompQuadForm davies
#' @importFrom stats cov
#' @importFrom stats dbeta
#' @importFrom stats pchisq
#' @export

U_phi_mu4zinb <- function(simudata, G_rare){
  estimation <- phi_mu_hat4zinb(simudata)
  hat_phi <- estimation$hat_phi
  hat_mu <- estimation$hat_mu
  hat_alpha <- estimation$hat_alpha
  comb <- cbind(simudata$y, hat_phi, hat_mu)
  colnames(comb)[2:3] <- c( "phi","mu")
  comb <- as.matrix(comb)

  u_phi <- matrix(nrow = nrow(G_rare), ncol = ncol(G_rare))

  for (i in 1:nrow(G_rare)){

    for (j in 1:ncol(G_rare)){

      if (comb[i,1]==0) {
        phi_im <- (1/(comb[i,2]+(1+hat_alpha*comb[i,3])^(-hat_alpha^(-1)))-1/(1+comb[i,2]))*comb[i,2]*G_rare[i,j]
      }
      else {
        phi_im <- -comb[i,2]*G_rare[i,j]/(1+comb[i,2])
      }
      u_phi[i,j] <- phi_im
    }
  }

  u_mu <- matrix(nrow=nrow(G_rare), ncol = ncol(G_rare))

  for (i in 1:nrow(G_rare)){

    for (j in 1:ncol(G_rare)){

      if (comb[i,1]==0) {
        mu_im <- -comb[i,3]*(1+hat_alpha*comb[i,3])^(-1-hat_alpha^(-1))*G_rare[i,j]/(comb[i,2]+(1+hat_alpha*comb[i,3])^(-hat_alpha^(-1)))
      }
      else {
        mu_im <- (comb[i,1]-comb[i,3])*G_rare[i,j]/(1+hat_alpha*comb[i,3])
      }

      u_mu[i,j] <- mu_im
    }
  }

  score=list(u_phi=u_phi, u_mu=u_mu)
  return(score)
}


#' @title p_burden_single
#' @description
#' Compute the p-value for the burden test
#'
#' This function takes a vector of weights, a data frame of rare variants
#' and a matrix of Score statistics produced by \code{U_fi_lmd} for ZIP model
#' or \code{U_phi_mu4zinb} for ZINB model to compute the p-value for the burden test.
#'
#' @param wt a numeric vector containing weights for all variants
#' @param G_rare a data frame containing data of rare variants
#' @param s a matrix of the score statistics for each variant from each subject
#' @return the p-value for the burden test
#' @importFrom pscl zeroinfl
#' @importFrom CompQuadForm davies
#' @importFrom stats cov
#' @importFrom stats dbeta
#' @importFrom stats pchisq
#' @export

p_burden_single = function(wt,G_rare, s){

  sum_var=rep(1,ncol(G_rare))
  for (i in 1:nrow(G_rare)){
    sum_var[i]=sum(wt*s[i,])
  }
  Q=(sum(sum_var)^2)/sum(sum_var^2)
  if (is.nan(Q)) {Q =0 }
  p=1-pchisq(Q,df=1)

  return(p)
}


#' @title p_kernel_single
#' @description
#' Compute the p-value for the kernel test
#'
#' This function takes a diagonal matrix of weights, a data frame of rare variants
#' and a matrix of Score statistics produced by \code{U_fi_lmd} for ZIP model
#' or \code{U_phi_mu4zinb} for ZINB model to compute the p-value for the kernel test.
#'
#' @param wt_matrix2 a diagonal matrix containing the squared weights for all variants
#' @param G_rare a data frame containing data of rare variants
#' @param s a matrix of the score statistics for each variant from each subject
#' @return the p-value for the kernel test (ZIP-k)
#' @importFrom pscl zeroinfl
#' @importFrom CompQuadForm davies
#' @importFrom stats cov
#' @importFrom stats dbeta
#' @importFrom stats pchisq
#' @export

p_kernel_single = function(wt_matrix2,G_rare, s){

  onematrix=as.matrix(rep(1,nrow(G_rare)))
  T=as.vector(t(onematrix)%*%s%*% wt_matrix2 %*% t(s)%*%onematrix)
  V=nrow(G_rare)*cov(s)%*% wt_matrix2
  lambda_V=eigen(V, symmetric = T, only.values = T)$values
  p=davies(T, lambda = lambda_V)$Qq

  return(p)
}


#' @title cauchyp
#' @description
#' Cauchy combination test (Cauchy‐p)
#'
#' This function combines p-values using Cauchy combination test
#' for testing the joint genetic effect.
#'
#' @param x a numeric vector containing p-values
#' @return a combined p-value indicating the joint effect
#' @importFrom pscl zeroinfl
#' @importFrom CompQuadForm davies
#' @importFrom stats cov
#' @importFrom stats dbeta
#' @importFrom stats pchisq
#' @export

cauchyp<-function(x){
  pi=3.141593
  T<-0.5*tan((0.5-x[1])*pi) + 0.5*tan((0.5-x[2])*pi)
  p = 0.5 - (atan(T))/pi
  return(p)
}


#' @title preprocess_genedata
#' @description
#' Preprocess genotype files in PLINK format
#'
#' This function converts PLINK format files into data frames containing genotypes information in proper format
#' for the model fitting and testing.
#'
#' @param gene_name a character string of the name of a gene, e.g."CEPT". The name is case-sensitive.
#' @param region_file a file listing genetic regions where each row contains chromosome, basepairs and the name of genetic region respectively
#' @param dosage_file a dosage file includes dosage information of each variant for all individuals
#' @param fam_file .fam file in PLINK format
#' @returns a data frame containing genotypes for all individuals in the required format for model fitting and testing
#'
#' @examples
#' data(Ex2_fam)
#' data(Ex2_dosage)
#' data(Ex2_region)
#' preprocess_genedata(Ex2_fam,Ex2_dosage,Ex2_region,"r2")
#'
#' @importFrom data.table transpose
#' @export
#'

preprocess_genedata <- function(fam_file,dosage_file,region_file,gene_name){
  locate_gene <- which(region_file[,4]==gene_name)
  chr <- region_file[locate_gene,1]
  start_point <- region_file[locate_gene,2]
  end_point <- region_file[locate_gene,3]
  id_extract <- fam_file[,1:2]
  colnames(id_extract) <- c("FID","IID")
  genotype_select <- which(dosage_file[,4]>=start_point & dosage_file[,4]<=end_point & dosage_file[,1]==chr)
  genotype_extract <- suppressWarnings(lapply(transpose(dosage_file[genotype_select,7:ncol(dosage_file)]),as.numeric))
  genedata <- cbind(id_extract,genotype_extract)
  return(genedata)
}


#' @title preprocess_phenodata
#' @description
#' Preprocess phenotype files in PLINK format
#'
#' This function converts PLINK format files into data frames containing phenotypes and covariates information in proper format
#' for the model fitting and testing.
#'
#' @param pheno_file phenotype file in PLINK format
#' @param cov_file covariate file in PLINK format
#' @returns a data frame containing phenotypes and covariates respectively for all individuals in the required format for model fitting and testing
#'
#' @examples
#' data(Ex2_pheno)
#' data(Ex2_covar)
#' preprocess_phenodata(Ex2_pheno,Ex2_covar)
#'
#' @export
#'

preprocess_phenodata <- function(pheno_file,cov_file){
  pheno_file[pheno_file=='na'] <- NA
  cov_file[cov_file=='na'] <- NA
  colnames(pheno_file)[1:3] <- c("FID","IID","count")
  pheno_file[,3] <- as.numeric(pheno_file[,3])
  colnames(cov_file)[1:2] <- c("FID","IID")
  phenodata <- cbind(pheno_file[,1:3],cov_file[,-(1:2)])
  return(phenodata)
}



#' @title zimfrv
#' @description
#' Gene‐based association tests to model zero-inflated count data
#'
#' This function performs gene‐based association tests
#' between a set of SNPs/genes and zero-inflated count data
#' using ZIP regression or ZINB regression or two-stage SKAT model framework.
#'
#' @param phenodata a data frame containing family and individual IDs for all objects as well as
#' zero-inflated counts as a phenotype and a set of covariates.
#' Each row represents a different individual.
#' The first two columns are Family ID (FID) and Individual ID (IID) respectively.
#' There must be one and only one phenotype in the third column and
#' the phenotype have to be zero-inflated count data which should be non-negative integers, e.g. neuritic plaque counts.
#' Each of the rest of columns represents a different covariate, e.g. age, sex, etc.
#' @param genedata a data frame containing family and individual IDs for all objects as well as numeric genotype data.
#' Each row represents a different individual.
#' The first two columns are Family ID (FID) and Individual ID (IID) respectively.
#' Each of the rest columns represents a seperate gene/SNP marker.
#' The genotype should be coded as 0, 1, 2 and NA for AA, Aa, aa and missing.
#' Both of Family ID (FID) and Individual ID (IID) for each row in the 'genedata'
#' derived from the PLINK formatted files should be in the same order as in the 'phenodata'.
#' The number of rows in 'genedata' should be equal to the number of rows in 'phenodata'.
#' @param genename a character string of the name of a gene, e.g. "CETP". The name is case-sensitive.
#' @param weights a character string of pre-specified variant weighting schemes (default="Equal").
#' "Equal" represents no weight,
#' "MadsenBrowning" represents the Madsen and Browning (2009) weight,
#' "Beta" represents the Beta weight.
#' @param missing_cutoff a cutoff of the missing rates of SNPs (default=0.15).
#' Any SNPs with missing rates higher than the cutoff will be excluded from the analysis.
#' @param max_maf a cutoff of the maximum minor allele frequencies (MAF) (default=1, no cutoff).
#' Any SNPs with MAF > cutoff will be excluded from the analysis.
#' @param model character specification of zero-inflated count model family (default="zip").
#' "zip" represents Zero-Inflated Poisson model,
#' "zinb" represents Zero-Inflated Negative Binomial model,
#' "skat" represents the two-stage Sequence Kernel Association Test method.
#' @returns a list of 10 items including the name of gene,
#' the number of rare variants in the genetic region,
#' the kind of method used for modeling,
#' and individual p-values of gene‐based association tests (burden test and kernel test for both parameters)
#' and combined p-values using Cauchy combination test.
#'
#' \item{GeneName}{the name of gene.}
#' \item{No.Var}{the number of rare variants in the gene.}
#' \item{Method}{the method used to compute the p-values.}
#' \item{p.value_pi_burden}{single p-value for parameter \eqn{\pi} using burden test.}
#' \item{p.value_lambda_burden / p.value_mu_burden}{single p-value for parameter \eqn{\lambda} or \eqn{\mu} using burden test.}
#' \item{p.value_pi_kernel}{single p-value for parameter \eqn{\pi} using kernel test.}
#' \item{p.value_lambda_kernel / p.value_mu_kernel}{single p-value for parameter \eqn{\lambda} or \eqn{\mu} using kernel test.}
#' \item{p.value_pi_combined}{Combined p-value of testing parameter \eqn{\pi} from both burden and kernel test using Cauchy combination test.}
#' \item{p.value_lambda_combined / p.value_mu_combined}{Combined p-value of testing parameter \eqn{\lambda} or \eqn{\mu} from both burden and kernel test using Cauchy combination test.}
#' \item{p.value_overall}{Combined p-value of testing the overall association using Cauchy combination test.}
#'
#' @examples
#' data(Ex1_phenodata)
#' data(Ex1_genedata)
#' zimfrv(Ex1_phenodata,Ex1_genedata,weights = "Beta",max_maf = 0.02,model="zinb")
#'
#' @references Fan, Q., Sun, S., & Li, Y.‐J. (2021). Precisely modeling zero‐inflated count phenotype for rare variants. Genetic Epidemiology, 1–14.
#' @importFrom pscl zeroinfl
#' @importFrom CompQuadForm davies
#' @importFrom stats cov
#' @importFrom stats dbeta
#' @importFrom stats pchisq
#' @importFrom SKAT SKAT_Null_Model
#' @importFrom SKAT SKAT
#' @importFrom RNOmni RankNorm
#' @export

zimfrv <- function(phenodata, genedata, genename = "NA", weights = "Equal", missing_cutoff = 0.15, max_maf = 1, model = "zip"){

  n <- nrow(genedata)
  if(nrow(phenodata) != n){
    stop("phenodata and genedata do not have the same number of rows")
  }
  if((sum(phenodata[,1]==genedata[,1]) != n) | (sum(phenodata[,2]==genedata[,2]) != n)){
    stop("the IDs for each row in phenodata and genedata are not in the same orders")
  }

  phenodata <- phenodata[,-(1:2)] # remove
  genedata <- genedata[,-(1:2)]

  missing_check_ind <- is.na(genedata)
  missing_check <- apply(missing_check_ind, 2, sum)/n
  id_check <- which(missing_check < missing_cutoff)
  genedata_check <- as.matrix(genedata[,id_check])

  sam_maf <- apply(genedata_check, 2, sum, na.rm=T)/(2*n)
  id_rare <- which(sam_maf < max_maf & sam_maf > 0)
  genedata_rare <- as.matrix(genedata_check[,id_rare])
  n_rare <- length(id_rare)
  maf_rare_sam <- sam_maf[id_rare]

  if(weights=="Equal"){
    wt <- rep(1,n_rare)
  }else if(weights=="MadsenBrowning"){
    wt <- 1/sqrt(maf_rare_sam*(1- maf_rare_sam))
  }else if(weights=="Beta"){
    wt <- dbeta(maf_rare_sam,1,25)
  }else{
    stop("invalid weights")
  }
  wt_matrix <- diag(x=wt^2,nrow = n_rare,ncol = n_rare)

  colnames(phenodata)[1] <- "y"

  if(model=="zip"){
    s <- U_fi_lmd(phenodata, genedata_rare)
    s_fi <- s$u_fi
    s_lambda <- s$u_lambda

    s_fi[is.na(s_fi)]=0
    s_lambda[is.na(s_lambda)]=0


    p_fi_burden <- p_burden_single(wt, genedata_rare, s_fi)
    p_lambda_burden <- p_burden_single(wt, genedata_rare, s_lambda)
    p_fi_kernel <- p_kernel_single(wt_matrix, genedata_rare, s_fi)
    p_lambda_kernel <- p_kernel_single(wt_matrix, genedata_rare, s_lambda)

    p_pi_combined <- cauchyp(c(p_fi_burden,p_fi_kernel))
    p_lambda_combined <- cauchyp(c(p_lambda_burden,p_lambda_kernel))

    p_overall <- cauchyp(c(p_pi_combined,p_lambda_combined))

    output <- data.frame(GeneName=genename,No.Var=n_rare,Method="ZIP",
                         p_pi_burden=p_fi_burden,p_lambda_burden=p_lambda_burden,
                         p_pi_kernel=p_fi_kernel,p_lambda_kernel=p_lambda_kernel,
                         p_pi_combined=p_pi_combined,p_lambda_combined=p_lambda_combined,
                         p_overall=p_overall)


  }else if(model=="zinb"){
    s <- U_phi_mu4zinb(phenodata, genedata_rare)
    s_phi <- s$u_phi
    s_mu <- s$u_mu

    s_phi[is.na(s_phi)]=0
    s_mu[is.na(s_mu)]=0


    p_phi_burden <- p_burden_single(wt, genedata_rare, s_phi)
    p_mu_burden <- p_burden_single(wt, genedata_rare, s_mu)
    p_phi_kernel <- p_kernel_single(wt_matrix, genedata_rare, s_phi)
    p_mu_kernel <- p_kernel_single(wt_matrix, genedata_rare, s_mu)

    p_phi_combined <- cauchyp(c(p_phi_burden,p_phi_kernel))
    p_mu_combined <- cauchyp(c(p_mu_burden,p_mu_kernel))

    p_overall <- cauchyp(c(p_phi_combined,p_mu_combined))

    output <- data.frame(GeneName=genename,No.Var=n_rare,Method="ZINB",
                         p_pi_burden=p_phi_burden,p_mu_burden=p_mu_burden,
                         p_pi_kernel=p_phi_kernel,p_mu_kernel=p_mu_kernel,
                         p_pi_combined=p_phi_combined,p_mu_combined=p_mu_combined,
                         p_overall=p_overall)
  }else if(model=="skat"){
    y <- phenodata[,1]
    X <- as.matrix(phenodata[,-1])

    y1 <- ifelse(y>0, 0, 1)
    obj <- SKAT_Null_Model(y1 ~ X, out_type="D")

    id_y_pos <- which(y>0)
    y2 <- y[id_y_pos]
    X2 <- X[id_y_pos,]
    Z <- as.matrix(genedata_rare[id_y_pos,])
    y2t <- RankNorm(y2)
    obj.c <- SKAT_Null_Model(y2t ~ X2, out_type="C")

    p_pi_burden <- SKAT(genedata_rare, obj, weights = wt, r.corr = 1)$p.value
    p_mu_burden <- SKAT(Z, obj.c, weights = wt, r.corr = 1)$p.value
    p_pi_kernel <- SKAT(genedata_rare, obj, weights = wt)$p.value
    p_mu_kernel <- SKAT(Z, obj.c, weights = wt)$p.value

    p_pi_combined <- cauchyp(c(p_pi_burden,p_pi_kernel))
    p_mu_combined <- cauchyp(c(p_mu_burden,p_mu_kernel))

    p_overall <- cauchyp(c(p_pi_combined,p_mu_combined))

    output <- data.frame(GeneName=genename,No.Var=n_rare,Method="SKAT",
                         p_pi_burden=p_pi_burden,p_mu_burden=p_mu_burden,
                         p_pi_kernel=p_pi_kernel,p_mu_kernel=p_mu_kernel,
                         p_pi_combined=p_pi_combined,p_mu_combined=p_mu_combined,
                         p_overall=p_overall)

  }else{
    stop("invalid model family")
  }

  return(output)

}

Try the ZIM4rv package in your browser

Any scripts or data that you put into this service are public.

ZIM4rv documentation built on April 3, 2025, 7:39 p.m.