R/glmm.binped.R

Defines functions glmm.binped

Documented in glmm.binped

### ********************************************** ###
glmm.binped <- function(phenfile,genfile,pedfile,phen,covars=NULL,mafRange=c(0,0.05),chr,snpinfoRdata,sep.ped=",",sep.phe=",",sep.gen=" ",
aggregateBy="SKATgene",maf.file,snp.cor,ssq.beta.wts=c(1,25),singleSNP.outfile=F){  

  ### ********************************************** ###
  read.in.data <- function(phenfile,
                           genfile,
                           pedfile,
                           sep.ped = sep.ped,
                           sep.phe = sep.phe,
                           sep.gen = sep.gen,
                           snpinfo) {
    ### ********************************************** ###
    
    print("Reading in Data")
    
    ped.dat <- read.table(genfile,header=TRUE,na.strings="NA",sep=sep.gen, stringsAsFactors=FALSE, colClasses="integer")
    snp.names <- scan(genfile,nlines=1,sep=sep.gen,what="")[-1]
    snp.names <- snp.names[snp.names%in%snpinfo$Name] 
    colnames(ped.dat)[-1] <- snp.names
    
    pedigree <- read.table(pedfile,header=TRUE,sep=sep.ped,stringsAsFactors=FALSE, colClasses="integer" )
    colnames(pedigree)[colnames(pedigree)=="sex"] <- "sex.in.ped" 
    gntp.all <- merge(pedigree[,c("famid","id","sex.in.ped")],ped.dat,by="id") 
    
    phen.dat <- read.table(phenfile,header=TRUE,sep=sep.phe)  
    phen.name=colnames(phen.dat)[-1]
    
    phensnp.dat<-merge(gntp.all,phen.dat,by=c("id"))
    print("Done reading in data")
    return(list(data=phensnp.dat,snps=snp.names,phen.name=phen.name))
  }
  
  #library(lme4); library(MASS)
  print(paste("phenotype data = ", phenfile))
  print(paste("genotype data = ", genfile))
  print(paste("pedigree data = ", pedfile))
  if (is.null(covars)) print("Covariates = NONE") else print(paste("Covariate(s) =",covars,collapse=", "))
  
  print("Running GLMM")
  loadsnpinfo <- try(load(snpinfoRdata))  
  if (inherits(loadsnpinfo,"try-error")) stop(paste('SNP info Rdata does not exist at ',snpinfoRdata))  
  snpinfo <- snpinfo[snpinfo$Chr==chr,]

  rsnpsingene.cor <- NULL
  loadsnpcor <- try(load(snp.cor))  
  if (inherits(loadsnpcor,"try-error")) stop(paste('SNP correlation matrix Rdata does not exist at ',snp.cor))  

  maf <- read.csv(maf.file, stringsAsFactors=FALSE, colClasses=c("character","numeric") )
  snpinfo <- merge(snpinfo,maf,by="Name",all.x=T) 

  #### read in data
  phensnp.dat <- read.in.data(phenfile=phenfile,
                              genfile=genfile,
                              pedfile=pedfile,
                              sep.ped=sep.ped,
                              sep.phe=sep.phe,
                              sep.gen=sep.gen,
                              snpinfo=snpinfo)
  snplist<-phensnp.dat$snps
  if (!any(snplist%in%snpinfo$Name)) stop(paste('SNP name from genotype file do not match Name in SNP info Rdata!!'))

  test.dat<-phensnp.dat$data

  if (!is.null(covars) & sum(snplist %in% covars)>=1) {
     names(test.dat)[which(names(test.dat) %in% paste(snplist[snplist %in% covars],".x",sep=""))] <- snplist[snplist %in% covars]
     covars[covars %in% snplist] <- paste(covars[covars %in% snplist],".y",sep="")
  }
  test.dat<-test.dat[order(test.dat$famid),]

  #### collinearity checking
  if (!is.null(covars)) {
     covars.dat <- na.omit(test.dat[,covars])
     single.cov <- F
     if (length(covars)==1) single.cov <- var(covars.dat)==0 else {
        single.cov <- any(apply(covars.dat,2,var)==0)
        if (single.cov) stop(paste("Single category in covariates!"))
        for (i in covars){
            cov1 <- covars.dat[,i]
            for (j in covars[covars!=i]){
                cov2 <- covars.dat[,j]
                if (abs(cor(cov1,cov2))>0.99999999) stop(paste("Highly correlated covariates ",i," and ",j,"!!",sep=""))
            }
        }
     }
  }   

  snplist.len <- length(snplist)

  ### single variant analysis
  if (singleSNP.outfile==F) { 
     temp.out <- data.frame(ntotal = vector(length=snplist.len, mode="numeric"),
                           nmiss = vector(length=snplist.len, mode="numeric"),
                           maf_ntotal = vector(length=snplist.len, mode="numeric"),
                           beta = vector(length=snplist.len, mode="numeric"),
                           se = vector(length=snplist.len, mode="numeric"),
                           Z = vector(length=snplist.len, mode="numeric"),
                           remark = vector(length=snplist.len, mode="character"),
                           p = vector(length=snplist.len, mode="numeric"),
                           MAC = vector(length=snplist.len, mode="numeric"),
                           n0 = vector(length=snplist.len, mode="numeric"),
                           n1 = vector(length=snplist.len, mode="numeric"),
                           n2 = vector(length=snplist.len, mode="numeric"),
                           stringsAsFactors=FALSE) 

     if (is.null(covars)){
        for (i in 1:length(snplist)) temp.out[i,] <- glmm.EC(snp=test.dat[,snplist[i]],phen=phen,test.dat=test.dat,chr=chr)
     } else {
        for (i in 1:length(snplist)) temp.out[i,] <- glmm.EC(snp=test.dat[,snplist[i]],phen=phen,test.dat=test.dat,chr=chr,covar=covars)
     }
  
     temp.out$Name <- snplist
     temp.out <- merge(temp.out,snpinfo[,c("Name","maf",aggregateBy)],by="Name",sort=F)
     temp.out <- temp.out[,c(aggregateBy,"Name","maf","ntotal","nmiss","maf_ntotal","beta","se","Z","remark","p","MAC","n0","n1","n2")]
     colnames(temp.out)[1] <- "gene"

     single.outfile <- paste(paste(phen,"_singleSNP_chr_",chr,".txt",sep=""))
     write.table(as.matrix(temp.out),single.outfile,quote=F,row.names=F,sep="\t",na="NA")
  } else {  
     temp.out <- read.table(paste(phen,"_singleSNP_chr_",chr,".txt",sep=""),header=T,sep="\t",as.is=T)  
     temp.out <- temp.out[temp.out$Name%in%snpinfo$Name,]
  }    

  ### Rdata & Burden test
  rdata.outfile <- paste(phen,"_chr_",chr,".RData",sep="")   
  gene.list <- na.omit(unique(snpinfo[snpinfo$Name%in%snplist,aggregateBy]))  

  gene.ssq.out  <- data.frame(gene=vector(mode="character",length=length(gene.list)),
                              SSQ=vector(mode="numeric", length=length(gene.list)),
                              cmafTotal=vector(mode="numeric", length=length(gene.list)),
                              cmafUsed=vector(mode="numeric", length=length(gene.list)),
                              nsnpsTotal=vector(mode="numeric", length=length(gene.list)),
                              nsnpsUsed=vector(mode="numeric", length=length(gene.list)),
                              nmiss=vector(mode="numeric", length=length(gene.list)),
                              df=vector(mode="numeric", length=length(gene.list)),
                              p=vector(mode="numeric", length=length(gene.list)),
                              stringsAsFactors=FALSE)
  
  gene.counts.out  <- data.frame(gene = vector(mode="character",length=length(gene.list)),
                                 beta = vector(mode="numeric", length=length(gene.list)),
                                 se = vector(mode="numeric", length=length(gene.list)),
                                 Z = vector(mode="numeric", length=length(gene.list)),
                                 cmafTotal = vector(mode="numeric", length=length(gene.list)),
                                 cmafUsed = vector(mode="numeric", length=length(gene.list)),
                                 nsnpsTotal = vector(mode="numeric", length=length(gene.list)),
                                 nsnpsUsed = vector(mode="numeric", length=length(gene.list)),
                                 nmiss = vector(mode="numeric", length=length(gene.list)),
                                 remark = vector(mode="numeric", length=length(gene.list)),
                                 p = vector(mode="numeric", length=length(gene.list)),
                                 stringsAsFactors=FALSE)
  
  gene.mbweights.out  <- data.frame(gene = vector(mode="character",length=length(gene.list)),
                                    beta = vector(mode="numeric", length=length(gene.list)),
                                    se = vector(mode="numeric", length=length(gene.list)),
                                    Z = vector(mode="numeric", length=length(gene.list)),
                                    cmafTotal = vector(mode="numeric", length=length(gene.list)),
                                    cmafUsed = vector(mode="numeric", length=length(gene.list)),
                                    nsnpsTotal = vector(mode="numeric", length=length(gene.list)),
                                    nsnpsUsed = vector(mode="numeric", length=length(gene.list)),
                                    nmiss = vector(mode="numeric", length=length(gene.list)),
                                    remark = vector(mode="numeric", length=length(gene.list)),
                                    p = vector(mode="numeric", length=length(gene.list)),
                                    stringsAsFactors=FALSE)
   
  Rdata <- rep(list(NA),length(gene.list))
  names(Rdata) <- gene.list

  is.na.p <- is.na(temp.out$p)

  for (i in 1:length(gene.list)){
      gene <- gene.list[i]; 

      if (singleSNP.outfile==F) { 
         snps.in.temp <- as.character(temp.out[temp.out$gene==gene,"Name"])   
         na.snps.in.temp <- as.character(temp.out[temp.out$gene==gene & is.na.p,"Name"])   
         p.snps.in.temp <- as.character(temp.out[temp.out$gene==gene & !is.na.p,"Name"])   
         scores <- sign(temp.out[temp.out$gene==gene,"beta"]) * sqrt(temp.out[temp.out$gene==gene,"Z"]^2)/(temp.out[temp.out$gene==gene,"se"])   
         scores[is.na(scores)] <- 0   
         icov <- matrix(0,length(scores),length(scores))   
         rownames(icov) <- colnames(icov) <- snps.in.temp
   
         if (sum(temp.out$gene==gene & !is.na.p)>1) {
            diag.temp <- diag(1/temp.out[temp.out$gene==gene & !is.na.p,"se"])
            icov.base <- diag.temp %*% rsnpsingene.cor[[gene]][p.snps.in.temp,p.snps.in.temp] %*% diag.temp
            rownames(icov.base) <- colnames(icov.base) <- p.snps.in.temp   
            icov[rownames(icov.base),colnames(icov.base)] <- icov.base[rownames(icov.base),colnames(icov.base)]
         } else 
         if (sum(temp.out$gene==gene & !is.na.p)==1) {
            icov.base <- matrix(1/(temp.out[temp.out$gene==gene & !is.na.p,"se"]^2),1,1)
            rownames(icov.base) <- colnames(icov.base) <- p.snps.in.temp   
            icov[rownames(icov.base),colnames(icov.base)] <- icov.base[rownames(icov.base),colnames(icov.base)]
         }
         n <- max(temp.out[temp.out$gene==gene,"ntotal"],na.rm=T) 
         af <- temp.out[temp.out$gene==gene,"maf_ntotal"]
         af[is.na(af)] <- 0
         Rdata[[i]] <- list(scores=scores,cov=Matrix(icov,sparse=T),n=n,maf=af,sey=1)
      }  

      snps.total <- snpinfo[snpinfo[,aggregateBy]==gene & snpinfo$Name%in%snplist,"Name"]
      cmafTotal <- sum(temp.out[temp.out$Name%in%snps.total,"maf_ntotal"],na.rm=T)
      nsnpsTotal <- sum(snpinfo[,aggregateBy]==gene & snpinfo$Name%in%snplist)
                
      rsnps.in.gene <- as.character(temp.out[temp.out[,"Name"]%in%snps.total & ((!is.na(temp.out[,"maf_ntotal"])) & temp.out[,"maf_ntotal"]<max(mafRange) & temp.out[,"maf_ntotal"]>min(mafRange)),"Name"])   
      rsnps.maf <- temp.out[temp.out$Name%in%rsnps.in.gene,c("Name","maf_ntotal")]
      cmafUsed <- sum(rsnps.maf$maf_ntotal)   
      nsnpsUsed <- length(rsnps.in.gene)
      nmiss <- sum(temp.out[temp.out$Name %in% rsnps.in.gene,"nmiss"])

      #### ssq test
      if (length(rsnps.in.gene)==0) gene.ssq.out[i,] <- list(gene,NA,NA,NA,NA,NA,NA,NA,NA) else {            
         if (length(rsnps.in.gene)==1) {                     
            ssq.out <- temp.out[temp.out$Name==rsnps.in.gene,]
            gene.ssq.out[i,] <- list(gene, ssq.out[9]^2, cmafTotal, cmafUsed, nsnpsTotal, nsnpsUsed, nmiss, 1,ssq.out[11])
         } else {
             z.1 <- temp.out[temp.out$Name%in%rsnps.in.gene,c("Name","Z")]
             z.1 <- merge(z.1,rsnps.maf,by="Name",sort=F)
             wi <- dbeta(z.1$maf,ssq.beta.wts[1],ssq.beta.wts[2])^2
             ssq <- sum(z.1$Z ^2 * wi)

             rsnps.in.gene.cor <- rsnpsingene.cor[[gene]][rsnps.in.gene,rsnps.in.gene]

             diag.temp<-diag(sqrt(wi))  
             c <- eigen(diag.temp %*% rsnps.in.gene.cor %*% diag.temp)$values  
             a <- sum(c^3)/sum(c^2)
             b <- sum(c)-((sum(c^2))^2)/sum(c^3)
             d <- ((sum(c^2))^3)/((sum(c^3))^2)
             p <- try(pchisq((ssq-b)/a,df=d,lower.tail=FALSE)) 

             if (!"try-error"%in%class(p)) gene.ssq.out[i,] <- list(gene,ssq,cmafTotal,cmafUsed,nsnpsTotal,nsnpsUsed,nmiss,d,p) else 
                gene.ssq.out[i,] <- list(gene,NA,NA,NA,NA,NA,NA,NA,NA)        
         } 
      }

      ### Burden tests      
      if (length(rsnps.in.gene)==0) {

         gene.counts.out[i,] <- list(gene,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA)
         gene.mbweights.out[i,] <- list(gene,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA) 
      } else {
          if (is.null(covars)) {
             rsnps.dat <- test.dat[,c("famid","id","sex.in.ped",rsnps.in.gene,phen)]  

             ### T count test
             if (length(rsnps.in.gene)==1) counts.out <- temp.out[temp.out$Name==rsnps.in.gene,-(1:3)] else {   
                rsnps.dat$counts <- rowSums(rsnps.dat[,rsnps.in.gene],na.rm=TRUE)      
                counts.out <- glmm.EC(snp=rsnps.dat$counts,phen=phen,test.dat=rsnps.dat,chr=chr)
             }   
             gene.counts.out[i,] <- list(gene,
                                    counts.out[4],counts.out[5],counts.out[6],
                                    cmafTotal,cmafUsed,nsnpsTotal,nsnpsUsed,nmiss,counts.out[7],counts.out[8])

             ### MB weight test
             for (rsnp in rsnps.in.gene) rsnps.dat[,rsnp] <- rsnps.dat[,rsnp]/(1-rsnps.maf[rsnps.maf$Name==rsnp,"maf_ntotal"])/rsnps.maf[rsnps.maf$Name==rsnp,"maf_ntotal"]
             if (length(rsnps.in.gene)==1) rsnps.dat$mbweights <- rsnps.dat[,rsnps.in.gene] else rsnps.dat$mbweights <- rowSums(rsnps.dat[,rsnps.in.gene],na.rm=TRUE) 
             mbweights.out <- glmm.EC(snp=rsnps.dat$mbweights,phen=phen,test.dat=rsnps.dat,chr=chr)
             gene.mbweights.out[i,]  <- list(gene,
                                        mbweights.out[4],mbweights.out[5],mbweights.out[6],
                                        cmafTotal,cmafUsed,nsnpsTotal,nsnpsUsed,nmiss,
                                        mbweights.out[7] ,mbweights.out[8] )      
   
          } else {
             rsnps.dat <- test.dat[,c("famid","id","sex.in.ped",rsnps.in.gene,phen,covars)]    
             ### T count test
             if (length(rsnps.in.gene)==1) counts.out <- temp.out[temp.out$Name==rsnps.in.gene,-(1:3)] else {   
                rsnps.dat$counts <- rowSums(rsnps.dat[,rsnps.in.gene], na.rm=TRUE)
                counts.out <- glmm.EC(snp=rsnps.dat$counts,phen=phen,test.dat=rsnps.dat,covar=covars,chr=chr)
             }           
             gene.counts.out[i,] <- list(gene,
                                    counts.out[4],counts.out[5],counts.out[6],
                                    cmafTotal,cmafUsed,nsnpsTotal,nsnpsUsed,nmiss,
                                    counts.out[7],counts.out[8])

             ### MB weight test
             for (rsnp in rsnps.in.gene) rsnps.dat[,rsnp] <- rsnps.dat[,rsnp]/(1-rsnps.maf[rsnps.maf$Name==rsnp,"maf_ntotal"])/rsnps.maf[rsnps.maf$Name==rsnp,"maf_ntotal"]
             if (length(rsnps.in.gene)==1) rsnps.dat$mbweights <- rsnps.dat[,rsnps.in.gene] else rsnps.dat$mbweights <- rowSums(rsnps.dat[,rsnps.in.gene],na.rm=TRUE)
             mbweights.out <- glmm.EC(snp=rsnps.dat$mbweights,phen=phen,test.dat=rsnps.dat,covar=covars,chr=chr)
             gene.mbweights.out[i,] <- list(gene, 
                                       mbweights.out[4],mbweights.out[5],mbweights.out[6],
                                       cmafTotal,cmafUsed,nsnpsTotal,nsnpsUsed,nmiss,
                                       mbweights.out[7], mbweights.out[8])
          }
      }
  }

  Tcounts.outfile <- paste(phen,"_T",max(mafRange),"_chr_",chr,".txt",sep="")  
  MBweights.outfile <- paste(phen,"_MB",max(mafRange),"_chr_",chr,".txt",sep="")  
  SSQ.outfile <- paste(phen,"_SSQ",max(mafRange),"_chr_",chr,".txt",sep="")  
 
  write.table(as.matrix(gene.counts.out),Tcounts.outfile,quote=F,row.names=F,sep="\t",na="NA")
  write.table(as.matrix(gene.mbweights.out),MBweights.outfile,quote=F,row.names=F,sep="\t",na="NA")
  write.table(as.matrix(gene.ssq.out),SSQ.outfile,quote=F,row.names=F,sep="\t",na="NA")
  if (singleSNP.outfile==F) save(Rdata,file=rdata.outfile)  
}

Try the RVFam package in your browser

Any scripts or data that you put into this service are public.

RVFam documentation built on May 2, 2019, 8:26 a.m.