R/WGScan_VCF.R

Defines functions WGScan.VCF.chr WGScan.VCF.Region

Documented in WGScan.VCF.chr

WGScan.VCF.chr<-function(result.prelim,vcf.filename,chr,pos.min=NULL,pos.max=NULL,Gsub.id=NULL,annot.filename=NULL,cell.type=NULL,MAF.weights='beta',test='combined',window.size=c(5000,10000,15000,20000,25000,50000),MAF.threshold=1,impute.method='fixed'){

  time.start<-proc.time()[3]
  region.step<-0.2*10^6 #scan the chromosome in 200kb blocks
  options(scipen = 999) #avoid scientific notation when pasting positions into tabix ranges
  if(length(pos.min)==0){pos.min<-1}
  if(length(pos.max)==0){
    #default to the hg19 chromosome length plus 50kb; fread() is from the data.table package
    hg19.chrom.sizes<-data.frame(fread('http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.chrom.sizes'))
    pos.max<-50000+hg19.chrom.sizes[match(paste0('chr',chr),hg19.chrom.sizes[,1]),2]
  }

  start<-pos.min
  result.summary<-c();M.sum<-0
  while (start<pos.max){
    end<-start+region.step
    print(paste0(percent((start-pos.min)/(pos.max-pos.min)),' finished. Time used: ', round(proc.time()[3]-time.start,digits=1),'s. Scan ',start,'-',end))

    range<-paste0(chr,":",start,"-",end)
    WGScan.Region.fit<-try(WGScan.VCF.Region(result.prelim,vcf.filename,range,Gsub.id=Gsub.id,annot.filename=annot.filename,cell.type=cell.type,MAF.weights=MAF.weights,test=test,window.size=window.size,MAF.threshold=MAF.threshold,impute.method=impute.method,job_id=paste0('WGScan_temp_',chr,':',pos.min,'-',pos.max)),silent=TRUE)
    if ('try-error' %in% class(WGScan.Region.fit) || is.na(WGScan.Region.fit$p.value)){
      WGScan.Region.fit<-list(M=0,p.value=NA,re.minP=NA,window.summary=NULL)
    }

    result.summary<-rbind(result.summary,WGScan.Region.fit$window.summary)
    M.sum<-M.sum+WGScan.Region.fit$M

    start<-end-50000 #overlap consecutive blocks by 50kb so windows spanning block boundaries are not missed
  }
  #windows duplicated by the 50kb block overlap are counted only once
  unique.index<-match(unique(paste0(result.summary[,1],'-',result.summary[,2])),paste0(result.summary[,1],'-',result.summary[,2]))
  result.summary<-cbind(chr,result.summary[unique.index,])

  #the column names below assume the default test='combined' (a dispersion and a burden p-value per weighting)
  if(length(cell.type)==0){
    colnames(result.summary)<-c('chr','start','end','dispersion','burden')
  }else{
    if('all'%in%cell.type){
      cell.type<-c("E001", "E002", "E003", "E004", "E005", "E006", "E007", "E008", "E009", "E010",
                   "E011", "E012", "E013", "E014", "E015", "E016", "E017", "E018", "E019", "E020",
                   "E021", "E022", "E023", "E024", "E025", "E026", "E027", "E028", "E029", "E030",
                   "E031", "E032", "E033", "E034", "E035", "E036", "E037", "E038", "E039", "E040",
                   "E041", "E042", "E043", "E044", "E045", "E046", "E047", "E048", "E049", "E050",
                   "E051", "E052", "E053", "E054", "E055", "E056", "E057", "E058", "E059", "E061",
                   "E062", "E063", "E065", "E066", "E067", "E068", "E069", "E070", "E071", "E072",
                   "E073", "E074", "E075", "E076", "E077", "E078", "E079", "E080", "E081", "E082",
                   "E083", "E084", "E085", "E086", "E087", "E088", "E089", "E090", "E091", "E092",
                   "E093", "E094", "E095", "E096", "E097", "E098", "E099", "E100", "E101", "E102",
                   "E103", "E104", "E105", "E106", "E107", "E108", "E109", "E110", "E111", "E112",
                   "E113", "E114", "E115", "E116", "E117", "E118", "E119", "E120", "E121", "E122",
                   "E123", "E124", "E125", "E126", "E127", "E128", "E129")
    }
    colnames(result.summary)<-c('chr','start','end',
                                c('dispersion',paste0('dispersion-',cell.type)),
                                c('burden',paste0('burden-',cell.type)))
  }

  return(list(window.summary=result.summary,M.sum=M.sum,threshold=0.05/M.sum)) #threshold: 0.05 divided by the estimated number of independent tests
}
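
#Illustrative usage sketch (not part of the original source). It assumes the null model
#is fitted by WGScan.prelim(), which is defined elsewhere in the package; the phenotype
#objects (Y, X, pheno.id) and the VCF file name are placeholders, and the VCF must be
#bgzipped and tabix-indexed so that readVCFToMatrixByRange() can query it by range.
#
#  result.prelim<-WGScan.prelim(Y,X=X,id=pheno.id)
#  fit<-WGScan.VCF.chr(result.prelim,vcf.filename='chr22.vcf.gz',chr=22,
#                      pos.min=16050000,pos.max=17050000,test='combined')
#  head(fit$window.summary) #chr, start, end and p-values for each scanned window
#  fit$threshold            #0.05 divided by the estimated number of independent tests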

WGScan.VCF.Region<-function(result.prelim,vcf.filename,range,Gsub.id=NULL,annot.filename=NULL,cell.type=NULL,MAF.weights='beta',test='combined',window.size=c(5000,10000,15000,20000,25000,50000),MAF.threshold=1,impute.method='fixed',job_id=range){
  #load the genotypes for the requested region from a tabix-indexed VCF;
  #readVCFToMatrixByRange() is from the seqminer package
  chr<-as.numeric(gsub(":.*$","",range))
  G <- t(readVCFToMatrixByRange(vcf.filename, range,annoType='')[[1]])
  #match phenotype id and genotype id
  if(length(Gsub.id)==0){match.index<-match(result.prelim$id,rownames(G))}else{
    match.index<-match(result.prelim$id,Gsub.id)
  }
  if(mean(is.na(match.index))>0){
    msg<-sprintf("Some individuals are not matched with the genotype data; the mismatch rate is %f", mean(is.na(match.index)))
    warning(msg,call.=F)
  }
  G<-G[match.index,]
  if(ncol(G)<=1){
    msg<-'Number of variants in the specified range is <= 1; please try a larger region'
    warning(msg,call.=F)
    p.value<-NA
    return(list(p.value=p.value))
  }
  # missing genotype imputation
  G[G==-9 | G==9]<-NA
  N_MISS<-sum(is.na(G))
  if(N_MISS>0){
    msg<-sprintf("The missing genotype rate is %f. Imputation is applied.", N_MISS/nrow(G)/ncol(G))
    warning(msg,call.=F)
    G<-Impute(G,impute.method)
  }
  #get positions
  pos<-as.numeric(gsub("^.*\\:","",colnames(G)))

  #load preliminary features
  Y<-result.prelim$Y;X0<-result.prelim$X0
  n<-result.prelim$n;id<-result.prelim$id
  nullglm<-result.prelim$nullglm;out_type<-result.prelim$out_type
  mu<-nullglm$fitted.values;Y.res<-Y-mu
  re.Y.res<-result.prelim$re.Y.res
  n<-nrow(Y);p<-ncol(G);X0<-svd(X0)$u

  #genotype processing: flip alleles so the coded allele is the minor allele,
  #then drop monomorphic variants and variants above the MAF threshold
  G<-as.matrix(G);center.G<-t(t(G)-colMeans(G))
  MAF<-colMeans(G)/2;MAF[is.na(MAF)]<-0
  G[,MAF>0.5]<-2-G[,MAF>0.5]
  MAF[MAF>0.5]<-1-MAF[MAF>0.5]
  index<-which(colMeans(center.G^2)!=0 & MAF<MAF.threshold)
  G<-as.matrix(G[,index])
  pos<-pos[index]
  MAF<-MAF[index]
  G<-Matrix(G,sparse=T) #sparse storage; Matrix() is from the Matrix package

  if(length(index)==0){
    msg<-'No variant in the region has variation; returning NA'
    warning(msg,call.=F)
    p.value<-NA
    return(list(p.value=p.value))
  }

  #variant weights: beta(1,25) density of the MAF (up-weights rare variants) or equal weights
  if(MAF.weights=='beta'){weights<-as.matrix(dbeta(MAF,1,25))}
  if(MAF.weights=='equal'){weights<-as.matrix(rep(1,length(MAF)))}
  #get functional annotations: extract the scores for this region into a temporary
  #file with tabix (tabix must be available on the system PATH)
  if(length(annot.filename)!=0 & length(cell.type)!=0){
    dir.string<-paste0('tabix ',annot.filename,' chr',chr,':',min(pos),'-',max(pos),' > ',job_id,'.txt')
    system(dir.string)
    score<-try(data.frame(fread(paste0(job_id,'.txt'))),silent = T)
    if(!('try-error' %in% class(score)) && length(score)!=0){ #guard against tabix returning no rows for this region
      colnames(score)<-c('chr','start','end','name',
                         "E001", "E002", "E003", "E004", "E005", "E006", "E007", "E008", "E009", "E010",
                         "E011", "E012", "E013", "E014", "E015", "E016", "E017", "E018", "E019", "E020",
                         "E021", "E022", "E023", "E024", "E025", "E026", "E027", "E028", "E029", "E030",
                         "E031", "E032", "E033", "E034", "E035", "E036", "E037", "E038", "E039", "E040",
                         "E041", "E042", "E043", "E044", "E045", "E046", "E047", "E048", "E049", "E050",
                         "E051", "E052", "E053", "E054", "E055", "E056", "E057", "E058", "E059", "E061",
                         "E062", "E063", "E065", "E066", "E067", "E068", "E069", "E070", "E071", "E072",
                         "E073", "E074", "E075", "E076", "E077", "E078", "E079", "E080", "E081", "E082",
                         "E083", "E084", "E085", "E086", "E087", "E088", "E089", "E090", "E091", "E092",
                         "E093", "E094", "E095", "E096", "E097", "E098", "E099", "E100", "E101", "E102",
                         "E103", "E104", "E105", "E106", "E107", "E108", "E109", "E110", "E111", "E112",
                         "E113", "E114", "E115", "E116", "E117", "E118", "E119", "E120", "E121", "E122",
                         "E123", "E124", "E125", "E126", "E127", "E128", "E129")
      #the annotation file is assumed to be tabulated in 25bp bins (hence the division by 25)
      if('all'%in%cell.type){Z<-as.matrix(score[floor((pos-score[1,2])/25)+1,-c(1:4)])}else{
        Z<-as.matrix(score[floor((pos-score[1,2])/25)+1,cell.type])
      }
      Z[is.na(Z)]<-0
      Z<-cbind(weights,as.vector(weights)*Z) #first column: MAF weights alone; remaining columns: MAF weights times each annotation
    }else{
      #no annotation rows were retrieved: fall back to MAF weights replicated across cell types
      if('all'%in%cell.type){Z<-matrix(weights,length(weights),128)}else{
        Z<-matrix(weights,length(weights),length(cell.type)+1)
      }
    }
  }else{
    Z<-as.matrix(weights)
  }

  #calculate score statistics and resampling-based score statistics (mu and Y.res were computed above)
  score<-t(G)%*%Y.res
  re.score<-t(t(G)%*%re.Y.res)
  score<-as.matrix(score);re.score<-as.matrix(re.score)

  #generate an indicator matrix specifying which variants fall into each candidate window;
  #for every window size, windows are placed at half-window-size steps (50% overlap)
  window.matrix0<-c()
  for(size in window.size){
    if (size==1){next} #single-variant windows are handled separately
    pos.tag<-seq(min(pos),max(pos),by=size*1/2)
    pos.tag<-sapply(pos.tag,function(x)pos[which.min(abs(x-pos))]) #snap window starts to observed variant positions
    window.matrix0<-cbind(window.matrix0,sapply(pos.tag,function(x)as.numeric(pos>=x & pos<x+size)))
  }
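  #Illustration (not in the original code): with pos=c(100,2600,5100) and a single
  #window size of 5000, the candidate start points are 100, 2600 and 5100, giving
  #indicator columns c(1,1,0), c(0,1,1) and c(0,0,1); duplicate columns are merged
  #below, and single-variant columns are dropped when window.size includes 1.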
  if(length(window.matrix0)!=0){
    #merge identical windows to get actual windows (start and end with variant position)
    window.string<-apply(window.matrix0,2,function(x)paste(as.character(x),collapse = ""))
    window.matrix0<-as.matrix(window.matrix0[,match(unique(window.string),window.string)])

    if(1 %in% window.size){window.matrix0<-as.matrix(window.matrix0[,apply(window.matrix0,2,sum)>1])} #single variants will be added back later
    #incorporate weights (MAF/annotations) into the window indicator matrix
    window.matrix<-c()
    for(i in 1:ncol(Z)){window.matrix<-cbind(window.matrix,Z[,i]*window.matrix0)}
    #calculate scan statistics for all windows
    window.summary<-t(apply(window.matrix0,2,function(x)c(min(pos[which(x==1)]),max(pos[which(x==1)]))))
    if(test=='dispersion'){
      #dispersion statistic: weighted sum of squared single-variant scores in each window
      score.window<-as.vector(t(score^2)%*%window.matrix^2)
      re.score.window<-re.score^2%*%window.matrix^2
      #calculate p-values for all windows
      all.p<-Get.p(rbind(score.window,re.score.window),re.score.window)
      p.window<-all.p[1,]
      window.summary<-cbind(window.summary,matrix(all.p[1,],nrow(window.summary),ncol(Z)))
    }
    if(test=='burden'){
      #burden statistic: squared weighted sum of single-variant scores in each window
      score.window<-as.vector(t(score)%*%window.matrix)^2
      re.score.window<-(re.score%*%window.matrix)^2
      #calculate p-values for all windows
      all.p<-Get.p(rbind(score.window,re.score.window),re.score.window)
      p.window<-all.p[1,]
      window.summary<-cbind(window.summary,matrix(all.p[1,],nrow(window.summary),ncol(Z)))
    }
    if(test=='combined'){
      #combined test: compute both the dispersion and the burden statistic for every window
      score.window<-as.vector(t(score^2)%*%window.matrix^2)
      re.score.window<-re.score^2%*%window.matrix^2
      #calculate p-values for all windows
      all.p<-Get.p(rbind(score.window,re.score.window),re.score.window)

      score.window<-as.vector(t(score)%*%window.matrix)^2
      re.score.window<-(re.score%*%window.matrix)^2
      #calculate p-values for all windows
      all.p<-cbind(all.p,Get.p(rbind(score.window,re.score.window),re.score.window))

      window.summary<-cbind(window.summary,matrix(all.p[1,],nrow(window.summary),ncol(Z)*2))
    }
  }else{
    #no multi-variant window could be formed (e.g. window.size contains only 1)
    warning('No multi-variant window could be formed in this region; returning NA',call.=F)
    return(list(p.value=NA))
  }

  re.p<-as.matrix(all.p[-1,]) # resampled p-values
  temp.p<-all.p[1,] # p-values

  #calculate minimum p-values
  index<-which(!is.na(temp.p))
  minP<-min(temp.p[index]);
  re.minP<-as.matrix(apply(as.matrix(re.p[,index]),1,min,na.rm=T))

  if(length(index)==1){
    msg<-'Only one window has variation; the test reduces to a 1-df chi-squared test'
    warning(msg,call.=F)
    p.value<-temp.p[index]
    return(list(score=score,re.score=re.score,n.marker=p,window.summary=window.summary,minP=minP,re.minP=re.minP,M=1,opt.window=window.summary[1:2],p.value=p.value))
  }else{
    p.value<-Get.Gumbel.p(-log(minP),-log(re.minP)) #moment-matching based p-value: the resampled -log(minP) values are matched to a Gumbel distribution
  }
  #Get number of independent tests
  M<-Get.Gumbel.M(-log(re.minP))

  return(list(minP=minP,re.minP=re.minP,n.marker=ncol(G),window.summary=window.summary,M=M,p.value=p.value))
}
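
A minimal usage sketch for a single region (illustrative, not part of the original source). It assumes a bgzipped, tabix-indexed VCF and a null model fitted beforehand by the package's preliminary-fitting function, written here as WGScan.prelim (defined elsewhere in the package); Y, X, pheno.id and the file names below are placeholders.

library(WGScan)
library(seqminer)    #readVCFToMatrixByRange()
library(data.table)  #fread()
library(Matrix)      #sparse genotype matrices

#phenotype Y, covariates X and sample ids pheno.id are placeholder objects
result.prelim<-WGScan.prelim(Y,X=X,id=pheno.id)
fit<-WGScan.VCF.Region(result.prelim,'example.vcf.gz','1:2000000-2200000',
                       test='combined',window.size=c(5000,10000))
fit$p.value        #region-level p-value from moment matching on the resampled minimum p-values
fit$window.summary #start, end and per-window p-values
fit$M              #estimated number of independent tests in the region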
