R/krlmm.R

Defines functions verifyChrXSNPs verifyChrYSNPs krlmmImputeGender AddBackNoVarianceSNPs computeCallPr calculateMahalDist1Cluster calculateParameters1Cluster calculateOneParams1Cluster calculateMahalDist2Cluster calculateParameters2Cluster calculateOneParams2Cluster calculateMahalDist3Cluster calculateParameters3Cluster calculateOneParams3Cluster hardyweinberg calculateParameters assignCalls assignCallsOneSNP assignCalls1Cluster assignCalls2Cluster assignCalls3Cluster predictKwithVGLM getKrlmmVGLMCoefficients calculateKrlmmCoefficients calculatePriorcentersC calculatePriorValues computeAverageLogIntensity computeLogRatio getNumberOfCores quantilenormalization loessnormalization krlmmnopkg krlmm

krlmm <- function(cnSet, cdfName, gender=NULL, trueCalls=NULL, verbose=TRUE) {
    pkgname <- getCrlmmAnnotationName(cdfName)
    
### pre-processing, output M    
    M = computeLogRatio(cnSet, verbose)
    message("Leaving out non-variant SNPs")
    
    # For SNPs with less than 3 distinct data point, exclude them from downstream analysis
    uniqueCount = apply(M[,], 1, function(x){length(unique(x))})
    SNPtoProcessIndex = uniqueCount >= 3
    noVariantSNPIndex = uniqueCount < 3
    M = M[SNPtoProcessIndex, ]

    numSNP = nrow(M)
    numSample = ncol(M)
    
    calls = oligoClasses::initializeBigMatrix(name="calls", numSNP, numSample, vmode = "integer")
    scores = oligoClasses::initializeBigMatrix(name="scores", numSNP, numSample, vmode = "double")
    open(calls)
    open(scores)

    rownames(calls) = rownames(M)
    rownames(scores) = rownames(M)
    colnames(calls) = colnames(M)
    colnames(scores) = colnames(M)  
    
    priormeans = calculatePriorValues(M, numSNP, verbose)
      
### retrieve or calculate coefficients
    krlmmCoefficients = getKrlmmVGLMCoefficients(pkgname, trueCalls, VGLMparameters, verbose, numSample, colnames(M))
    
### do VGLM fit, to predict k for each SNP
    kPrediction <- predictKwithVGLM(VGLMparameters, krlmmCoefficients, verbose)
    rm(VGLMparameters)
    
### assign calls
    assignCalls(calls, M, kPrediction, priormeans, numSNP, numSample, verbose)
    rm(kPrediction)

### assign confidence scores
    computeCallPr(scores, M, calls, numSNP, numSample, verbose)
    
### add back SNPs excluded before
    AddBackNoVarianceSNPs(cnSet, calls, scores, numSNP, numSample, SNPtoProcessIndex, noVariantSNPIndex)

    loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
    YIndex <- getVarInEnv("YIndex")
    loader("annotation.rda", .crlmmPkgEnv, pkgname)
    annotation <- getVarInEnv("annot")
    
### impute gender if gender information not provided
    if (is.null(gender)) {
        gender = krlmmImputeGender(cnSet, annotation, YIndex, verbose)
    }

### double-check ChrY SNP cluster, impute gender if gender information not provided
    if (!(is.null(gender))) {
        verifyChrYSNPs(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose)
    }

    close(calls)
    close(scores)
    rm(calls)
    rm(scores)
    rm(M)
    rm(annotation)
    rm(YIndex)
    TRUE
}

krlmmnopkg <- function(cnSet, offset, gender=NULL, normalize.method=NULL, annotation=NULL, verbose=TRUE) {
    if(is.null(normalize.method)){
        message("No normalization method specified. Quantile normalization applied")
        M=quantilenormalization(cnSet,verbose,offset=offset)
    }else{
        if(normalize.method=="quantile"){
            M=quantilenormalization(cnSet,verbose,offset=offset)
        }
        if(normalize.method=="loess"){
            M=loessnormalization(cnSet,verbose,offset=offset)
	}
    }

    message("Leaving out non-variant SNPs")
    
    # For SNPs with less than 3 distinct data point, exclude them from downstream analysis
    uniqueCount = apply(M[,], 1, function(x){length(unique(x))})
    SNPtoProcessIndex = uniqueCount >= 3
    noVariantSNPIndex = uniqueCount < 3
    M = M[SNPtoProcessIndex, ]

    numSNP = nrow(M)
    numSample = ncol(M)

    calls = oligoClasses::initializeBigMatrix(name="calls", numSNP, numSample, vmode = "integer")
    scores = oligoClasses::initializeBigMatrix(name="scores", numSNP, numSample, vmode = "double")
    open(calls)
    open(scores)

    rownames(calls) = rownames(M)
    rownames(scores) = rownames(M)
    colnames(calls) = colnames(M)
    colnames(scores) = colnames(M)

    prepriormeans=calculatePriorcentersC(M,numSample)

    trueCalls=matrix(NA,nrow(M),ncol(M))

    for(i in 1:ncol(M)){
      tmp=kmeans(M[,i],centers=prepriormeans,nstart=45)
      trueCalls[,i]=tmp$cluster
    }
    colnames(trueCalls)=colnames(M)
    rownames(trueCalls)=rownames(M)
if(numSample<=30){
  priormeans=prepriormeans
}else{
  priormeans = calculatePriorValues(M, numSNP, verbose)
    if((abs(priormeans[1]-prepriormeans[1])+abs(priormeans[2]-prepriormeans[2])+abs(priormeans[3]-prepriormeans[3]))>=3){
priormeans=prepriormeans
}else{
      priormeans=priormeans
  }
 }
 
    
    VGLMparameters = calculateParameters(M, priormeans, numSNP, verbose)

### retrieve or calculate coefficients

    krlmmCoefficients = getKrlmmVGLMCoefficients(trueCalls=trueCalls,params=VGLMparameters,numSample=ncol(M),samplenames=colnames(M),verbose=T)
### do VGLM fit, to predict k for each SNP
    kPrediction <- predictKwithVGLM(VGLMparameters, krlmmCoefficients, verbose)
    rm(VGLMparameters)

### assign calls
    assignCalls(calls, M, kPrediction, priormeans, numSNP, numSample, verbose=T)
    rm(kPrediction)

### assign confidence scores
    computeCallPr(scores, M, calls, numSNP, numSample, verbose)

   YIndex = seq(1:nrow(cnSet))[chromosome(cnSet)==24 & isSnp(cnSet)]
    XIndex = seq(1:nrow(cnSet))[chromosome(cnSet)==23 & isSnp(cnSet)]
### impute gender if gender information not provided
    if (is.null(gender)) {
       if(sum(is.na(YIndex))==length(YIndex)){
		gender=NULL
       }else{
	gender = krlmmImputeGender(cnSet, annotation, YIndex, verbose, offset=offset)
       }
    }

### double-check ChrY ChrX SNP cluster, impute gender if gender information not provided
    if (!(is.null(gender))) {
        verifyChrYSNPs(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose)
        verifyChrXSNPs(cnSet, M, calls, gender, annotation, XIndex, priormeans, verbose)
    }

#    if (length(YIndex)>0  && !(is.null(gender))) {
#      verifyChrYSNPs(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose)
#    }
### add back SNPs excluded before
   AddBackNoVarianceSNPs(cnSet, calls, scores, numSNP, numSample, SNPtoProcessIndex, noVariantSNPIndex)

    close(calls)
    close(scores)
    rm(calls)
    rm(scores)
    rm(M)
    TRUE
}


#######################################################################################################

loessnormalization<- function(cnSet, verbose, offset=16, blockSize = 300000){
    # compute log-ratio in blocksize of 300,000 by default
    message("Start computing log-ratios")
    A <- A(cnSet)
    B <- B(cnSet)
    open(A)
    open(B)

    numSNP <- nrow(A)
    numSample <- ncol(A)
   

    M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double")
    S <- oligoClasses::initializeBigMatrix(name="S", numSNP, numSample, vmode = "double")

    rownames(M) = rownames(A)
    colnames(M) = colnames(A)

    numBlock = ceiling(numSNP / blockSize)
    for (i in 1:numBlock){
        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
        subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset
        subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset
        subsetM = .Call("krlmmComputeM", subsetA, subsetB, PACKAGE="crlmm")
        M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ]
        subsetS = .Call("krlmmComputeS", subsetA, subsetB, PACKAGE="crlmm")
        S[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetS[1:nrow(subsetS), ]

        rm(subsetA, subsetB, subsetM,subsetS)

    }
    close(A)
    close(B)
    gc()

    isna=is.na(M[,1])
    prepriormeansL=calculatePriorcentersC(M[,][!isna,],numSample)
    F01=abs(prepriormeansL[1])-prepriormeansL[2]   
    F02=abs(prepriormeansL[3])+prepriormeansL[2]

    for(i in 1:ncol(M)) {
        isna=is.na(M[,i])
        Msub = M[!isna,i]
        Mna=M[isna,i]
        Ssub = S[!isna,i]
        Sna=S[isna,i]

        km = kmeans(Msub, prepriormeansL)
        ind1v2 = km$cluster==1
        ind2v2 = km$cluster==2
        ind3v2 = km$cluster==3

        l1v2 = loessFit(Msub[ind1v2], Ssub[ind1v2])
        l2v2 = loessFit(Msub[ind2v2], Ssub[ind2v2])
        l3v2 = loessFit(Msub[ind3v2], Ssub[ind3v2])
        Msub[ind1v2] = l1v2$residuals+F01
        Msub[ind2v2] = l2v2$residuals
        Msub[ind3v2] = l3v2$residuals-F02
        Mnew=rbind(as.matrix(Msub),as.matrix(Mna))
        rownames(Mnew)=c(rownames(as.matrix(Msub)),rownames(as.matrix(Mna)))
        m=match(rownames(M),rownames(Mnew))
        Mnew=Mnew[m]
        M[,i] = Mnew
        rm(Msub, Ssub, ind1v2, ind2v2, ind3v2, l1v2, l2v2, l3v2)
    }
    return(M)
}


#######################################################################################################

quantilenormalization <- function(cnSet, verbose, offset=16, blockSize = 300000){
    A <- A(cnSet)
    B <- B(cnSet)
    open(A)
    open(B)

    numSNP <- nrow(A)
    numSample <- ncol(A)

    M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double")
    X <- oligoClasses::initializeBigMatrix(name="X", numSNP, numSample, vmode = "integer")
    Y <- oligoClasses::initializeBigMatrix(name="Y", numSNP, numSample, vmode = "integer")
    rownames(M) = rownames(X) = rownames(Y) = rownames(A)
    colnames(M) = colnames(X) = colnames(Y) = colnames(A)

    # This step may chew up a lot of memory...
    X =  normalize.quantiles(as.matrix(A[,]))+offset
    Y =  normalize.quantiles(as.matrix(B[,]))+offset

    numBlock = ceiling(numSNP / blockSize)
    for (i in 1:numBlock){
        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
        subsetXqws = as.matrix(X[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
        subsetYqws = as.matrix(Y[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
        subsetM = .Call("krlmmComputeM", subsetXqws, subsetYqws, PACKAGE="crlmm")
        M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ]
        rm(subsetYqws, subsetXqws, subsetM)
    }

   close(A)
   close(B)
   return(M)
   delete(X)
   delete(Y)
   rm(X)
   rm(Y)
   gc()
}


#######################################################################################################

getNumberOfCores <- function(){
    defaultCores = min(detectCores(), 8)
    return(getOption("krlmm.cores", defaultCores))    
}


#######################################################################################################

computeLogRatio <- function(cnSet, verbose=FALSE, offset=0, blockSize = 500000, col=NULL, row=NULL){
    # compute log-ratio in blocksize of 500,000 by default
    if(verbose)
        message("Start computing log-ratios")
    if(!is.null(col) | !is.null(row)) {
      if(is.null(row)) {
       A <- A(cnSet)[,col]
       B <- B(cnSet)[,col]
      }
      if(is.null(col)) {
       A <- A(cnSet)[row,]
       B <- B(cnSet)[row,]
      }
      if(!is.null(col) & !is.null(row)) {
       A <- A(cnSet)[row,col]
       B <- B(cnSet)[row,col]
      }
    } else {
       A <- A(cnSet)
       B <- B(cnSet)
    }
    open(A)
    open(B)
    
    numSNP <- nrow(A)
    numSample <- ncol(A)
    
    M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double")
    rownames(M) = rownames(A)
    colnames(M) = colnames(A)
  
    numBlock = ceiling(numSNP / blockSize)
    for (i in 1:numBlock){
        if(verbose) message(" -- Processing segment ", i, " out of ", numBlock)
        subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset
        subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset              
        subsetM = .Call("krlmmComputeM", subsetA, subsetB, PACKAGE="crlmm")
        M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ]
        rm(subsetA, subsetB, subsetM)
    }
    
    close(A)
    close(B)
    if(verbose)
        message("Done computing log-ratios")
    return(M)    
}

computeAverageLogIntensity <- function(cnSet, verbose=FALSE, offset=0, blockSize = 500000, col=NULL, row=NULL){
    # compute average log intensity in blocksize of 500,000 by default
    if(verbose)
        message("Start computing average log-intensities")
    if(!is.null(col) | !is.null(row)) {
      if(is.null(row)) {
       A <- A(cnSet)[,col]
       B <- B(cnSet)[,col]
      }
      if(is.null(col)) {
       A <- A(cnSet)[row,]
       B <- B(cnSet)[row,]
      }
      if(!is.null(col) & !is.null(row)) {
       A <- A(cnSet)[row,col]
       B <- B(cnSet)[row,col]
      }
    } else {
       A <- A(cnSet)
       B <- B(cnSet)
    }
    open(A)
    open(B)
    
    numSNP <- nrow(A)
    numSample <- ncol(A)
    
    S <- oligoClasses::initializeBigMatrix(name="S", numSNP, numSample, vmode = "double")
    rownames(S) = rownames(A)
    colnames(S) = colnames(A)
      
    numBlock = ceiling(numSNP / blockSize)
    for (i in 1:numBlock){
        if(verbose) message(" -- Processing segment ", i, " out of ", numBlock)
        subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset
        subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset                     
        subsetS = .Call("krlmmComputeS", subsetA, subsetB, PACKAGE="crlmm")
        S[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetS[1:nrow(subsetS), ]
        rm(subsetA, subsetB, subsetS)
    }
    
    close(A)
    close(B)
    if(verbose)
        message("Done computing average log-intensities")
    return(S)  
}


#######################################################################################################

calculatePriorValues <- function(M, numSNP, verbose) {

    calculateOneKmeans <- function(x) {
        tmp = kmeans(x, 3, nstart=45)
        return(sort(tmp$centers, decreasing = T))
    }

    if (verbose) message("Start calculating prior means")
    cl <- makeCluster(getNumberOfCores())
    centers <- parRapply(cl, M, calculateOneKmeans)
    stopCluster(cl) 
    centers <- matrix(centers, numSNP, 3, byrow = TRUE)
    priormeans = apply(centers, 2, FUN="median", na.rm=TRUE)
    if(abs(sum(priormeans))>1) {
      checksymmetric= apply(centers,1,function(x){abs(sum(x))})<1
      priormeans=apply(centers[checksymmetric,],2, FUN="median", na.rm=TRUE)
    }
    if (verbose) message("Done calculating prior means")
    return(priormeans)
}

calculatePriorcentersC <- function(M, numSample) {

    calculateOneKmeans <- function(x) {
        tmp = kmeans(x, 3, nstart=45)
        return(sort(tmp$centers, decreasing = T))
    }

    cl <- makeCluster(getNumberOfCores())
    centers <- parCapply(cl, M, calculateOneKmeans)
    stopCluster(cl) 
    centers <- matrix(centers, numSample, 3, byrow = TRUE)
    prepriormeans = apply(centers, 2, FUN="median", na.rm=TRUE)
    
    return(prepriormeans)
}


#######################################################################################################

calculateKrlmmCoefficients <- function(trueCalls, params, numSample, samplenames){
#    if (!require(VGAM)) {
#        message("VGAM package not found, fall back to use defined platform-specific coefficients")
#        return(NULL)
#    }
    amatch = match(rownames(params), rownames(trueCalls))
    amatch = amatch[!is.na(amatch)]
    trueCalls = trueCalls[amatch,]

    amatch = match(rownames(trueCalls), rownames(params))
    params = params[amatch, ]
    
    amatch = match(samplenames, colnames(trueCalls))
    amatch = amatch[!is.na(amatch)]    
    trueCalls = trueCalls[, amatch]
        
    if ((numSample <= 40) && (ncol(trueCalls) < round(numSample/2))){
        message("Sample size is ", numSample, ", KRLMM requires at least trueCalls of ", round(numSample/2), " samples to calculate coefficients")
        return(NULL)
    }
    if ((numSample > 40) && (ncol(trueCalls) < 20)){
        message("At least trueCalls of 20 samples are required to calculate coefficients")
        return(NULL)
    }
    if (nrow(trueCalls) < 1000){
        message("At lease trueCalls of 1000 SNPs are required to calculate coefficients")
        return(NULL)
    }

    getna = apply(trueCalls, 1, function(x){sum(is.na(x))>=10})
    truek = apply(trueCalls, 1, function(x){length(unique(x[!is.na(x)]))})
   
    params1 = params[!getna,]
    truek1 = truek[!getna]

    xx = data.frame(params1)
    t = as.factor(as.numeric(truek1)) 
    xx = data.frame(xx, t)
    fit = suppressWarnings(vglm(t~., multinomial(refLevel=1), xx))
    coe = coefficients(fit) # VGAM::
    return(coe)    
}

getKrlmmVGLMCoefficients <- function(pkgname, trueCalls, params, verbose, numSample, samplenames){
    if (!is.null(trueCalls)) {
        coe = calculateKrlmmCoefficients(trueCalls, params, numSample, samplenames)
        if (!is.null(coe)) {
            if (verbose) message ("Done calculating platform-specific coefficients to predict number of clusters")
            return(coe)
        }
    }
    if (!is.null(trueCalls)) message("Fall back to use defined platform-specific coefficients")
    if (verbose) message ("Retrieving defined platform-specific coefficients to predict number of clusters")
    loader("krlmmVGLMCoefficients.rda", .crlmmPkgEnv, pkgname)
    return(getVarInEnv("krlmmCoefficients"))      
}


predictKwithVGLM <- function(data, coe, verbose){
    if (verbose) message("Start predicting number of clusters")
    logit1 <- rep(0, nrow(data))
    logit2 <- coe[1]+coe[3]*data[,1]+coe[5]*data[,2]+coe[7]*data[,3]+coe[9]*data[,4]+coe[11]*data[,5]+coe[13]*data[,6]+coe[15]*data[,7]+coe[17]*data[,8]
    logit23 <- coe[2]+coe[4]*data[,1]+coe[6]*data[,2]+coe[8]*data[,3]+coe[10]*data[,4]+coe[12]*data[,5]+coe[14]*data[,6]+coe[16]*data[,7]+coe[18]*data[,8]
   
    logits <- cbind(logit1, logit2, logit23)
    rm(logit1)
    rm(logit2)
    rm(logit23)
    p.unscaled <- exp(logits)
    rm(logits)   
    p <- p.unscaled / rowSums(p.unscaled)
    clusterPrediction = apply(p, 1, function(x){which.max(x)})
    rm(p.unscaled)
    rm(p)
    if (verbose) message("Done predicting number of clusters")
    return(clusterPrediction)
}

#######################################################################################################

assignCalls3Cluster <- function(intensities, priormeans){
    prior31 = c(priormeans[1]/2,priormeans[2],priormeans[3])
    prior32 = c(priormeans[1],priormeans[2],priormeans[3]/2)	

    emp <- rep(NA, length(intensities))
	ansCalls <- emp
    tmp <- try(kmeans(intensities, priormeans, nstart=1),TRUE)
    if(class(tmp) == "try-error") {
        tmp1 <- try(kmeans(intensities, prior31, nstart=1),TRUE)
        if(class(tmp1) == "try-error"){
            tmp2 <- try(kmeans(intensities, prior32, nstart=1),TRUE)
            if(class(tmp2) == "try-error"){
                ansCalls = emp
            }else{
                ansCalls = tmp2$cluster
            }
        }else{
            ansCalls = tmp1$cluster
        }
    }else{
        ansCalls = tmp$cluster
    }
    rm(prior31, prior32)
    return(ansCalls)
 
}

#######################################################################################################

assignCalls2Cluster <- function(intensities, priormeans){
    closest <- rep(NA, 3)	
    for(j in 1:3){
        distance <- as.vector(abs(priormeans[j]-intensities))
        closest[j] <- intensities[which.min(distance)]
    }
    prior2 <- priormeans[priormeans!=priormeans[which.max(abs(closest-priormeans))]]
	
    emp <- rep(NA, length(intensities))
    ansCalls <- emp    
    tmp <- try(kmeans(intensities, prior2, nstart=1), TRUE)
    if(class(tmp) == "try-error") {
        ansCalls <- emp
    }else{
        if(prior2[1]==priormeans[2] && prior2[2]==priormeans[3]){
            mp <- abs(tmp$centers[1]-tmp$centers[2])
            pp <- abs(priormeans[2]-priormeans[3])
            if((mp/pp)<=0.25){
                ansCalls <- emp
            }else{
                ansCalls <- tmp$cluster+1
            }
        }
        if(prior2[1]==priormeans[1] && prior2[2]==priormeans[2]){
            mp=abs(tmp$centers[1]-tmp$centers[2])
            pp=abs(priormeans[1]-priormeans[2])
            if((mp/pp)<=0.25){
                ansCalls = emp
            }else{
                ansCalls <- tmp$cluster
            }
        }
        if(prior2[1]==priormeans[1] && prior2[2]==priormeans[3]){
            mp <- abs(tmp$centers[1]-tmp$centers[2])
            pp <- abs(priormeans[1]-priormeans[3])
            if ((mp/pp) <=0.25){
                ansCalls <- emp
            }else{
                ansCalls[tmp$cluster==1] <- 1
                ansCalls[tmp$cluster==2] <- 3
            }
        }
    }
    rm(tmp)
    return(ansCalls)
}

#######################################################################################################

assignCalls1Cluster <- function(intensities, priormeans){
    closest <- rep(NA, 3)
    for(j in 1:3){
        distance <- as.vector(abs(priormeans[j]-intensities))
        closest[j]=intensities[which.min(distance)]
    }
    clusterindex <- which.min(abs(closest-priormeans))
    return(rep(clusterindex, length(intensities)))
}
  
#######################################################################################################

assignCallsOneSNP <- function(x, priormeans, numSample){
    tolerance = 1e-3
       
    k = x[numSample + 1]
    values = x[1:numSample]  
    
    if (abs(k - 2) <= tolerance) {
        tmp <- try(kmeans(values, priormeans, nstart=1),TRUE)
        if (!(class(tmp)=="try-error")) {
            k <- 3;
        }
    }

    successful <- FALSE;
    if (abs(k - 3) <= tolerance){
        SNPCalls <- assignCalls3Cluster(values, priormeans)
        successful <- !is.na(SNPCalls[1])
        if (!successful) { 
            k <- 2
        }
    }
		
    if ( (abs(k - 2) <= tolerance) && (!successful)){
        SNPCalls <- assignCalls2Cluster(values, priormeans)
        successful <- !is.na(SNPCalls[1])
        if (!successful) { 
            k <- 1
        }			
    }

    if ( (abs(k - 1) <= tolerance) && (!successful)){
        SNPCalls <- assignCalls1Cluster(values, priormeans)
    }
    return(SNPCalls)
}


assignCalls <- function(callsGt, M, a, priormeans, numSNP, numSample, verbose, blockSize=500000){
    # process by block size of 500,000 by default
    message("Start assigning calls")
       
    numBlock = ceiling(numSNP / blockSize)

    for (i in 1:numBlock){
        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
        subsetM = as.matrix(M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
        thisnumSNP = nrow(subsetM)
        subseta = a[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP)]
        subseta = matrix(subseta, thisnumSNP, 1)

        testValues = cbind(subsetM, subseta)

        cl <- makeCluster(getNumberOfCores())
        callAns <- parRapply(cl, testValues, assignCallsOneSNP, priormeans, numSample)
        stopCluster(cl)

        callAnsAllValues = unlist(callAns)

        subsetcalls <- matrix(callAnsAllValues, thisnumSNP, numSample, byrow = TRUE)
        
        callsGt[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetcalls[1:thisnumSNP, ]
        rm(subsetM, subseta, subsetcalls)
    }
    message("Done assigning calls")
}

#######################################################################################################


calculateParameters <- function(M, priormeans, numSNP, verbose) {
    if (verbose) message("Start calculating 3-cluster parameters")
    params3cluster <- calculateParameters3Cluster(M, priormeans, numSNP, verbose);
    if (verbose) message("Done calculating 3-cluster parameters")

    if (verbose) message("Start calculating 2-cluster parameters")
    params2cluster <- calculateParameters2Cluster(M, priormeans, numSNP, verbose);
    if (verbose) message("Done calculating 2-cluster parameters")

    if (verbose) message("Start calculating 1-cluster parameters")
    params1cluster <- calculateParameters1Cluster(M, priormeans, numSNP, verbose);    
    if (verbose) message("Done calculating 1-cluster parameters")

    parameters <- cbind(as.matrix(params1cluster$elemh), as.matrix(params1cluster$eless), as.matrix(params2cluster$elemh), as.matrix(params2cluster$eless),
                        as.matrix(params3cluster$elemh), as.matrix(params3cluster$eless), as.matrix(params2cluster$elehw), as.matrix(params3cluster$elehw));
    
    rownames(parameters) = rownames(M)
    rm(params3cluster)
    rm(params2cluster)
    rm(params1cluster)
    return(parameters)
}

hardyweinberg <- function(x, minN = 8){
   if (length(x) < minN){
       return(NA) 
    } else {
       result = .Call("krlmmHardyweinberg", x)
       return(result)
    }
}

calculateOneParams3Cluster <- function(x, priors, priorDown, priorUp){

    tmp = try(kmeans(x, priors, nstart=1), TRUE)
    if(class(tmp) == "kmeans") {
        flag = 1
     } else {
        tmp = try(kmeans(x, priorDown, nstart=1), TRUE)
        if(class(tmp) == "kmeans") {
            flag = 2
        } else {
            tmp = try(kmeans(x, priorUp, nstart=1), TRUE)
            if (class(tmp) == "kmeans") {
                flag = 3
            } else {
                tmp = kmeans(x, 3, nstart=35)
                flag = 4
            }
        }  
    }
    ss = sum(tmp$withinss)
    hw = hardyweinberg(tmp$cluster) 
    centers = sort(tmp$centers, decreasing = TRUE)

    return(c(centers, ss, hw, flag))
}

calculateParameters3Cluster <- function(M, priormeans, numSNP, verbose) {
    Apriors = priormeans
    ApriorDown = c(Apriors[1]/2, Apriors[2], Apriors[3]) # shift-down
    ApriorUp = c(Apriors[1], Apriors[2], Apriors[3]/2) # shift-up

    cl <- makeCluster(getNumberOfCores())
    clusterEvalQ(cl, library(crlmm))
    parAns <- parRapply(cl, M, calculateOneParams3Cluster, Apriors, ApriorDown, ApriorUp)

    stopCluster(cl)
    parAnsAllValues = unlist(parAns)
    parameters <- matrix(parAnsAllValues, numSNP, 6, byrow = TRUE)
   
    centers <- parameters[, 1:3]
    ss <- parameters[, 4]
    hw <- parameters[, 5]
    flag <- parameters[, 6]

    rm(parAns)
    rm(parameters)
   
    sigma=solve(cov(centers, use="complete.obs"))
    mh = calculateMahalDist3Cluster(centers, sigma, flag, Apriors, ApriorDown, ApriorUp, numSNP)

    rm(sigma)
    rm(centers)

    gc()
    return(list(eless = ss, elemh = mh, elehw = hw))
}


calculateMahalDist3Cluster <- function(centers, sigma, flag, priors, priorDown, priorUp, numSNP){
    mahaldist = rep(NA, numSNP)

    tolerance = 1e-3
    for (i in 1:numSNP) {
        if ((abs(flag[i] - 1) <= tolerance) || (abs(flag[i]- 4) <= tolerance)) difference = centers[i, ] - priors
        if (abs(flag[i] - 2) <= tolerance) difference = centers[i, ] - priorDown
        if (abs(flag[i] - 3) <= tolerance) difference = centers[i, ] - priorUp         
        mahaldist[i] = as.vector(difference)%*%sigma%*%as.vector(difference)
    }
    return(mahaldist)
}


calculateOneParams2Cluster <- function(x, priors){
    aa = rep(NA, 3)   
    for(j in 1:3){
        dist = as.vector(abs(priors[j]-x))
        aa[j]=x[which.min(dist)]
    } 
    prior2 = priors[priors!=priors[which.max(abs(aa-priors))]]
     

    tmp = try(kmeans(x, prior2, nstart=1), TRUE)
    if (class(tmp)=="kmeans") {
        centers = tmp$centers
        hw = hardyweinberg(tmp$cluster)
    }
    rm(tmp)
    tmp = kmeans(x, 2, nstart = 35)
    if (tmp$centers[1] < tmp$centers[2]){
        centers = c(tmp$centers[2], tmp$centers[1])
        hw = hardyweinberg(3 - tmp$cluster)   
    } else {
        centers = tmp$centers
        hw = hardyweinberg(tmp$cluster)
    }
    ss = sum(tmp$withinss)     
    return(c(centers, prior2, ss, hw))
}


calculateParameters2Cluster <- function(M, priormeans, numSNP, verbose) {
    Apriors = priormeans
   
    cl <- makeCluster(getNumberOfCores())
    clusterEvalQ(cl, library(crlmm))
    parAns <- parRapply(cl, M, calculateOneParams2Cluster, Apriors)
    stopCluster(cl)

    parAnsAllValues = unlist(parAns)
    parameters <- matrix(parAnsAllValues, numSNP, 6, byrow = TRUE)

    centers <- parameters[, 1:2]
    priors2cluster <- parameters[, 3:4]
    ss <- parameters[, 5]
    hw <- parameters[, 6]
    
    rm(parAns)
    rm(parameters)
    sigma=solve(cov(centers, use="complete.obs"))

    mh = calculateMahalDist2Cluster(centers, sigma, priors2cluster, numSNP)

    rm(sigma)
    rm(centers)

    gc()
    return(list(eless = ss, elemh = mh, elehw = hw))
}


calculateMahalDist2Cluster <- function(centers, sigma, priors, numSNP){
    mahaldist = rep(NA, numSNP)
    
    for (i in 1:numSNP) {
        difference <- centers[i,] - priors[i,]
        mahaldist[i] = as.vector(difference)%*%sigma%*%as.vector(difference)        
    }
    return(mahaldist)
}

calculateOneParams1Cluster <- function(x, priors){
    center = mean(x)
    diff <- x - center
    diffsquare <- diff^2
    ss = sum(diffsquare)

    closestPrior = priors[which.min(abs(priors - center))]

    return(c(center, closestPrior, ss))
}


calculateParameters1Cluster <- function(M, priormeans, numSNP, verbose) {
    Apriors = priormeans
   
    cl <- makeCluster(getNumberOfCores())
    clusterEvalQ(cl, library(crlmm))
    parAns <- parRapply(cl, M, calculateOneParams1Cluster, Apriors)
    stopCluster(cl)

    parAnsAllValues = unlist(parAns)
    parameters <- matrix(parAnsAllValues, numSNP, 3, byrow = TRUE)

    centers <- matrix(parameters[, 1], numSNP, 1)
    prior1cluster <- matrix(parameters[, 2], numSNP, 1)
    ss <- parameters[, 3]
    
    rm(parAns)
    rm(parameters)
    sigma=solve(cov(centers, use="complete.obs"))

    mh = calculateMahalDist1Cluster(centers, sigma, prior1cluster, numSNP)

    rm(sigma)
    rm(centers)

    gc()
    return(list(eless = ss, elemh = mh))
}


calculateMahalDist1Cluster <- function(centers, sigma, priors, numSNP){
    mahaldist = rep(NA, numSNP)
    
    for(i in 1:numSNP) {
        difference <- as.vector(centers[i, 1] - priors[i, 1])
	mahaldist[i]=difference%*%sigma%*%difference
    }
    return(mahaldist)
}
   
#############################################


computeCallPr <- function(callsPr, M, calls, numSNP, numSample, verbose, blockSize = 500000){
    # compute log-ratio in blocksize of 500,000 by default
    if (verbose) message("Start computing confidence scores")
    
    numBlock = ceiling(numSNP / blockSize)
    for (i in 1:numBlock){
        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
        subsetM = as.matrix(M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
        subsetCalls = as.matrix(calls[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])                     
        subsetCallProb = .Call("krlmmConfidenceScore", subsetM, subsetCalls, PACKAGE="crlmm");
        callsPr[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetCallProb[1:nrow(subsetM), ]
        rm(subsetM, subsetCalls, subsetCallProb)
    }

    if (verbose) message("Done computing confidence scores")
}

#############################################

AddBackNoVarianceSNPs <- function(cnSet, calls, scores, numSNP, numSample, variantSNPIndex, noVariantSNPIndex){
    callsGt = calls(cnSet)
    callsPr = snpCallProbability(cnSet)
    open(callsGt)
    open(callsPr)

    callsGt[variantSNPIndex, ] = calls[,]
    callsPr[variantSNPIndex, ] = scores[,]  
    
    callsGt[noVariantSNPIndex, ] = NA
    callsPr[noVariantSNPIndex, ] = 0     

    close(callsGt)
    close(callsPr)    
}

#############################################

krlmmImputeGender <- function(cnSet, annotation, YIndex, verbose, offset=0){
    if (verbose) message("Start imputing gender")    
    S = computeAverageLogIntensity(cnSet, verbose, offset=offset) 

    # S is calculated and saved in original SNP order. 
    matchy = match(rownames(annotation)[YIndex], rownames(S))
    matchy = matchy[!is.na(matchy)]
    if (length(matchy) <= 10){
        predictedGender = rep(NA, ncol(S))
    }
    Sy = S[matchy,]

    uniqueDataPoint = apply(Sy, 1, function(x){length(unique(x))})
    validYSNPs = uniqueDataPoint >= 2
    SyValid = Sy[validYSNPs, ]
    
    if (nrow(SyValid) < 20){
        message("Not enough ChrY SNPs, skipping gender prediction step");
        predictedGender = rep(NA, ncol(Sy))
    }
   
    rm(S)
    numYChrSNP = nrow(SyValid)

    allS = matrix(NA, numYChrSNP, 2)

    for (i in 1:numYChrSNP) {
        tmp = kmeans(SyValid[i,] ,2, nstart=45)
        allS[i,] = sort(tmp$centers, decreasing=F)
    }
    priorS = apply(allS, 2, FUN="median", na.rm=TRUE)

    if (abs(priorS[1] - priorS[2]) <= 1.6) {
        message("Separation between clusters too small (samples probabaly all the same gender): ");
      
    }
    
    meanmatrix = apply(Sy, 2, median)

    Sy1 = abs(meanmatrix - priorS[1])
    Sy2 = abs(meanmatrix - priorS[2])

    # output male - 1, female - 2, female S-values are smaller than male S-values. 
    test = cbind(Sy2, Sy1)
    predictedGender = apply(test, 1, which.min)

    open(cnSet$gender)
    cnSet$gender[,] = predictedGender
    close(cnSet$gender)
    
    if (verbose) message("Done imputing gender")
    return(predictedGender)   
}


#############################################

verifyChrYSNPs <- function(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose){
    if (verbose) message("Start verifying SNPs on Chromosome Y")
    callsGt = calls(cnSet)
    callsPr = snpCallProbability(cnSet)
    open(callsGt)
    open(callsPr)
       
    matchy = match(rownames(annotation)[YIndex], rownames(M))
    matchy = matchy[!is.na(matchy)]
   
    MChrY = M[matchy,]
    callsChrY = calls[matchy,]
  
    male = gender == 1
    female = gender == 2    
    
    checkK = apply(callsChrY[, male], 1, function(x){ length(unique(x[!is.na(x)])) } )
    
    for(i in 1:nrow(MChrY)){
        # Chromosome Y SNPs, no calls for female, k = 1 or 2 permitted for male samples
        thisChrYSNPorigPosition = match(rownames(callsChrY)[i], rownames(callsGt))
        callsGt[thisChrYSNPorigPosition, female] = NA
        callsPr[thisChrYSNPorigPosition, female] = 0

        # early exit for k = 1 or 2 cases. Only re-assign calls to male samples if we previouly assigned
        # male samples to 3 clusters by mistake. 
        if (checkK[i] < 3) next;
          
        if (class(try(kmeans(MChrY[i, male], c(priormeans[1], priormeans[3]), nstart=1), TRUE)) != "try-error"){
           
            maleSampleCalls = kmeans(MChrY[i, male],c(priormeans[1], priormeans[3]), nstart=45)$cluster
            callsGt[thisChrYSNPorigPosition, male][maleSampleCalls == 1] = 1
            callsGt[thisChrYSNPorigPosition, male][maleSampleCalls == 2] = 3
        } else {
                    
            distanceToPrior1 = mean(abs(MChrY[i, male] - priormeans[1]))
            distanceToPrior3 = mean(abs(MChrY[i, male] - priormeans[3]))                
            callsGt[thisChrYSNPorigPosition, male] = ifelse(distanceToPrior1 < distanceToPrior3, 1, 3)
        }

        MMaleSamples = MChrY[i, male]
        callsMaleSample = callsGt[thisChrYSNPorigPosition, male]
        scoresMaleSample = .Call("krlmmConfidenceScore", t(as.matrix(MMaleSamples)), t(as.matrix(callsMaleSample)), PACKAGE="crlmm");

        callsPr[thisChrYSNPorigPosition, male] = scoresMaleSample       
    }

    close(callsGt)
    close(callsPr)
    
    if (verbose) message("Done verifying SNPs on Chromosome Y")
}




#######################################
verifyChrXSNPs <- function(cnSet, M, calls, gender, annotation, XIndex, priormeans, verbose){
    if (verbose) message("Start verifying SNPs on Chromosome X")
    callsGt = calls(cnSet)
    callsPr = snpCallProbability(cnSet)
    open(callsGt)
    open(callsPr)
      
    matchx = match(rownames(annotation)[XIndex], rownames(M))
    matchx = matchx[!is.na(matchx)]
   
    MChrX= M[matchx,]
    callsChrX = callsGt[matchx,]
  
    male = gender == 1
    female = gender == 2    
    
    checkK = apply(callsChrX[, male], 1, function(x){ length(unique(x[!is.na(x)])) } )
    
    for(i in 1:nrow(MChrX)){
      
        # Chromosome X SNPs for male, k = 1 or 2 permitted for male samples
thisChrXSNPorigPosition = match(rownames(callsChrX)[i], rownames(callsGt))
        # early exit for k = 1 or 2 cases. Only re-assign calls to male samples if we previouly assigned
        # male samples to 3 clusters by mistake. 
        if (checkK[i] < 3) next;
          
        if (class(try(kmeans(MChrX[i, male], c(priormeans[1], priormeans[3]), nstart=1), TRUE)) != "try-error"){
           
            maleSampleCalls = kmeans(MChrX[i, male],c(priormeans[1], priormeans[3]), nstart=45)$cluster
            callsGt[thisChrXSNPorigPosition, male][maleSampleCalls == 1] = 1
            callsGt[thisChrXSNPorigPosition, male][maleSampleCalls == 2] = 3
        } else {
                    
            distanceToPrior1 = mean(abs(MChrX[i, male] - priormeans[1]))
            distanceToPrior3 = mean(abs(MChrX[i, male] - priormeans[3]))                
            callsGt[thisChrXSNPorigPosition, male] = ifelse(distanceToPrior1 < distanceToPrior3, 1, 3)
        }

        MMaleSamples = MChrX[i, male]
        callsMaleSample = callsGt[thisChrXSNPorigPosition, male]
        scoresMaleSample = .Call("krlmmConfidenceScore", t(as.matrix(MMaleSamples)), t(as.matrix(callsMaleSample)), PACKAGE="crlmm");

       callsPr[thisChrXSNPorigPosition, male] = scoresMaleSample       
    }

    close(callsGt)
    close(callsPr)
    
    if (verbose) message("Done verifying SNPs on Chromosome X")
}

Try the crlmm package in your browser

Any scripts or data that you put into this service are public.

crlmm documentation built on Nov. 8, 2020, 4:55 p.m.