R/rnaseq.R


#' Build graph from correlation matrix
#' 
#' @param corrMatrix a square correlation matrix
#' @param minCor only pairs correlated above this threshold become edges
#' @import igraph
#' @export
build.graph <- function(corrMatrix, minCor = 0.5) {
    output = corrMatrix
    for (i in 1:nrow(corrMatrix)) {
        for (j in i:ncol(corrMatrix)) {
            if (i == j || is.na(corrMatrix[i,j]) || (corrMatrix[i,j] <= minCor)) {
                output[i,j] <- 0
                output[j,i] <- 0
            }
        }
    } 
    return(igraph::graph.adjacency(output,'undirected',weighted = TRUE,diag = FALSE))
}
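## Illustrative usage (not run): threshold a gene-gene correlation matrix at 0.7 and
## inspect the resulting graph. `expr.counts` is a hypothetical samples x genes count
## matrix, not part of the package.
# corM <- cor(log10(expr.counts + 1), method = 'pearson')
# g <- build.graph(corM, minCor = 0.7)
# igraph::vcount(g); igraph::ecount(g)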

#' get.references.v1
#' 
#' Select a candidate reference set as the largest clique of the correlation graph,
#' sweeping the correlation threshold from `tmin` to `tmax` in `nsteps` steps.
#' 
#' @param m expression count matrix; columns are treated as genes
#' @param tmin,tmax range of correlation thresholds to sweep
#' @param nsteps number of threshold steps
#' @param js.tol Jaccard-similarity tolerance between consecutive largest cliques below which the reference clique is updated
#' @import igraph
#' @export
get.references.v1 <- function(m,tmin=0.7, tmax=0.95, nsteps=5,js.tol=0.1) {
    if (any(m == 0)) { m = m + 1 } # add a pseudocount so the log-transform is defined
    cor.expcnt = cor(log(m,base = 10),method = 'pearson')
    output = list()
    lc.df = data.frame(list(t=tmin + c(0:nsteps)*((tmax-tmin) / nsteps)))
    message(paste("Correlation thresholds:", paste(lc.df$t, collapse = ", ")))
    lcliques = list()
    for (i in 1:length(lc.df$t)) {
        message(paste("Working on graph built at t =",lc.df[i,'t']))
        t0 = proc.time()
        g = build.graph(cor.expcnt,minCor = lc.df[i,'t'])
        t1 = proc.time()
        lcliques[[i]] = igraph::largest_cliques(g)[[1]]
        t2 = proc.time()
        if (is.null(output$references)) {
            output$references <- lcliques[[i]]
        }
        if (i > 1) {
            lc.df[i,'js'] = jaccard.sim(lcliques[[i-1]], lcliques[[i]])
            if (lc.df[i,'js'] < js.tol) {
                output$references <- lcliques[[i]]
            }
        } 
        lc.df[i,'cliqueSize']= length(lcliques[[i]])
        lc.df[i,'nVertices']= igraph::vcount(g)
        lc.df[i,'nEdges'] = igraph::ecount(g)
        lc.df[i,'time.graph'] = (t1 - t0)['elapsed']
        lc.df[i,'time.clique'] =(t2 - t1)['elapsed']
    }
    output$runs = lc.df
    return(output)
}
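## Illustrative usage (not run): sweep correlation thresholds from 0.7 to 0.95 and keep the
## largest clique at each step. `expr.counts` is a hypothetical samples x genes count matrix
## (columns are correlated as genes).
# refs <- get.references.v1(expr.counts, tmin = 0.7, tmax = 0.95, nsteps = 5)
# refs$runs        # per-threshold clique size, graph size, and timings
# refs$references  # vertices of the selected clique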

get.references.v2 <- function(m, t = 0.7, k = 5) {
    if (any(m == 0)) { m = m + 1 } # add a pseudocount so the log-transform is defined
    cor.expcnt = cor(log(m, base = 10), method = 'pearson')
    g = build.graph(cor.expcnt, minCor = t)
    candidates = igraph::maximal.cliques(g)
    errors = rep(Inf, length(candidates))
    ## Scoring of candidate cliques is not implemented here; return them as-is
    return(list(candidates = candidates, errors = errors))
}

#' Identify the best set of references
#' 
#' @export
get.references.blocks <- function(m,
                                  cor.method='pearson',
                                  n.runs= 3, min.size = 10,
                                  min.corr = 0.75,
                                  min.count = 0,
                                  log.base = 2,
                                  out.file = '',
                                  debug = FALSE) {
 
    if (system.file("python/sbm.py", package = "gbnorm") == "") {
        stop("Python script not found. Cannot run sbm.py")
    }
    if (is.null(rownames(m))) rownames(m) <- 1:nrow(m)
    if (is.null(colnames(m))) colnames(m) <- 1:ncol(m)

    ## Make sure sbm is executable
    sbm.cmd = paste("python3", system.file("python/sbm.py", package = "gbnorm"))
    tryCatch(
        system(sbm.cmd,ignore.stderr = TRUE, ignore.stdout = TRUE),
        error = function(e) {
            stop(e)
        })
    
    ## Reduce the size of graph
    idx.allpresent = which (sapply(1:ncol(m), FUN = function(i) { return(all(m[,i] > min.count)) }))
    m.compressed  = m[,idx.allpresent]
    if (log.base > 0) { m.compressed = log(m.compressed,base = log.base) }
    cor.expcnt = cor(m.compressed,method=cor.method)

    g = build.graph(cor.expcnt, minCor=min.corr)
    
    ## write graph, run block (in python), load results
    gPrefix = paste0('tmp', paste(as.character(sample(9, replace=T)), collapse=''))
    oPrefix = paste0('tmp', paste(as.character(sample(9, replace=T)), collapse=''))
    gFileName  = paste0(gPrefix, '.graphml')
    oLabelFile = paste0(oPrefix, '.tsv')
    oScoreFile = paste0(oPrefix, '.entropy')
    igraph::write.graph(g, file= gFileName, format='graphml')
   
    sbm.cmd <- paste(sbm.cmd, gFileName, oPrefix, n.runs)
    message("Running blocks with the command")
    message(sbm.cmd)
    system(sbm.cmd, wait=TRUE)
    
    blocks.entropy = read.table(oScoreFile)
    bestModel = which.min(blocks.entropy[,1]) 
    blocks.all = read.table(oLabelFile,sep='\t')
    blocks = blocks.all[,c(1,2,bestModel+2)]
    names(blocks) = c('CompressedId', 'Name', 'BlockId')
    blocks[,'CompressedId'] = blocks[,'CompressedId'] + 1 # python 0-based to R 1-based index
    blocks[,'BlockId'] = blocks[,'BlockId'] + 1
    
    ## Map the Id in compressed matrix to original matrix
    blocks[,'Id'] = idx.allpresent[blocks[,'CompressedId']]
  
    blocks.df = data.frame(table(blocks$BlockId))
    names(blocks.df) = c('BlockId', 'size')
    blocks.df = blocks.df[blocks.df$size >= min.size,]
    blocks.df[,'Rank1Residuals'] =  rep(0,nrow(blocks.df))
    blocks.df[,'MinCor'] =  rep(0,nrow(blocks.df))
    # blocks.df[,'MedianCor'] =  rep(0,nrow(blocks.df))
    blocks.members= list()
    blocks.memNames= list()
    for (i in 1:nrow(blocks.df)) {
        j = blocks.df$BlockId[i]
        blocks.members[[i]] = blocks[blocks$BlockId == j, 'Id' ]
        blocks.memNames[[i]] = blocks[blocks$BlockId == j, 'Name' ]
        blocks.df[i,'Rank1Residuals'] = rank1.residuals(m[,blocks.members[[i]] ]) 
        idx = blocks[blocks$BlockId == j, 'CompressedId']
        tmpc= cor.expcnt[idx, idx]
        blocks.df[i,'MinCor'] = min(c(tmpc))
        # blocks.df[i,'MedianCor'] = median(a(tmpc))
    }
    output <- list(
        'Id'= NULL,
        'Name'= NULL,
        'nVertices' = ncol(m),
        'nVertices.compressed' = ncol(m.compressed)
    )
    remains = which(blocks.df$MinCor > min.corr)
    if (length(remains) == 0) {
        message("No block satisfies mininum requirements.")
        if (debug) {
            message(blocks.df)
        }
        return(output)
    }
    bId = remains[which.min(blocks.df[remains,'Rank1Residuals'])]
    ## Clean up and return
    if (!debug) system(paste('rm', gFileName, oLabelFile, oScoreFile))
    if (out.file == '') out.file <- paste0(oPrefix, '.RDS')
    output[['Id']] <- blocks.members[[bId]]
    output[['Name']] <- blocks.memNames[[bId]]
    saveRDS(output, file = out.file)
    return(output)
}
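## Illustrative usage (not run): block-model-based reference detection. This requires the
## bundled inst/python/sbm.py to be runnable with python3 (and whatever it depends on,
## e.g. a stochastic-block-model library). `expr.counts` is a hypothetical samples x genes
## count matrix; the selected reference genes are returned as column indices and names.
# refs <- get.references.blocks(expr.counts, min.corr = 0.75, min.size = 10, n.runs = 3)
# refs$Id
# refs$Name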

#' get.references.apcluster
#' 
#' Graph-based identification of the set of references, using affinity propagation as the
#' underlying clustering method
#' 
#' @param m expression matrix, genes x samples (which will be transposed internally)
#' @param cor.method [="pearson"] the correlation method used to construct edges between genes. Acceptable values are the same as those of the R base function `cor()`.
#' @param min.size minimum size of the candidate reference clusters
#' @param min.corr correlations below this value are set to 0 before clustering
#' @param min.count minimum count of a gene across all samples, for it to be retained in constructing the graph
#' @param med.quantile the quantile above which a gene should be expressed in all samples, for it to be retained in constructing the graph
#' @param log.base the base to log-transform the expression matrix
#' @param debug whether to print additional information when running
#' @param verbose.output whether to include all clusters (in addition to the reference one) in the output
#' @param ... additional arguments passed to `apcluster::apcluster()`
#' @importFrom apcluster apcluster
#' @export
get.references.apcluster <- function(m, 
                                     cor.method='pearson',
                                     min.size = 10,
                                     min.corr = 0.75,
                                     min.count = 0,
                                     med.quantile = 0.5,
                                     log.base = 2,
                                     debug = FALSE,
                                     verbose.output = FALSE, ...) {
    ## Transpose the input count matrix so that genes are columns
    mt = t(m)
    ## Reduce the size of graph
    isUniversal = sapply(1:ncol(mt), FUN = function(i) { return(all(mt[,i] > min.count)) })
    medium.threshold = sapply(1:nrow(mt), function(i) {
        (mt[i,] > 0) %>%
            which() %>%
            `[`(mt, i, .) %>%
            quantile(probs = med.quantile) %>%
            return()
    })
    isHighEnough = sapply(1:ncol(mt), function(i) {
        ((mt[,i] - medium.threshold) > 0) %>%
            all() %>% return()
    })
    
    idx.candidates = (isUniversal & isHighEnough) %>% which()
    mt.compressed  = mt[,idx.candidates]
    if (log.base > 0) { mt.compressed = log(mt.compressed,base = log.base) }
    if (debug) {message(sprintf("Building compressed graph with %d vertices", length(idx.candidates)))}
    startT = proc.time()
    cor.expcnt = cor(mt.compressed,method=cor.method)
    dt1 = proc.time() - startT
    cor.expcnt[cor.expcnt < min.corr] <- 0
    if (debug) {message(sprintf("Calculate correlation in %f sec.", dt1['elapsed']))}
    # Affinity propagation
    startT = proc.time()
    apclust = apcluster::apcluster(s= cor.expcnt, ...)
    dt2 = proc.time() - startT
    if (debug) {message(sprintf("Run AP clustering in %f sec.", dt2['elapsed']))}
    ## Mapping compressed id to original id
    ## ClusterId: -1 indicates those removed from the graph, 0 those not belonging to any cluster, and number greater than 0 indicate cluster index
    cl.assignment = data.frame('Id' = 1:ncol(mt),
                               'Name' = colnames(mt),
                               'ClusterId' = rep(-1, ncol(mt)),
                               stringsAsFactors = FALSE)
    clSizes = sapply(apclust@clusters, length)
    cl.assignment[idx.candidates,'ClusterId'] = 0
    for (cl in 1:length(apclust@clusters)) {
        cl.assignment[idx.candidates,][apclust@clusters[[cl]], 'ClusterId'] = cl
    }
    
    cl.members = list()
    cl.memNames = list()
    cl.df = data.frame('ClusterId' = 1:length(apclust@clusters),
                       'size' = clSizes)
    for (i in 1:nrow(cl.df)) {
        cl.members[[i]] = cl.assignment[cl.assignment$ClusterId == i, 'Id' ]
        cl.memNames[[i]] = cl.assignment[cl.assignment$ClusterId == i, 'Name' ]
        cl.raw_expr = mt[,cl.members[[i]],drop=FALSE]
        if (length(cl.members[[i]]) >= 2) {
            cl.df[i,'Rank1Residuals'] = rank1.residuals(cl.raw_expr)
            cl.df[i,'CVMean'] = mean(apply(cl.raw_expr, 1, function(x) {return(sd(x) / mean(x))}))
            cl.df[i,'CVLogNormMean'] = mean(apply(cl.raw_expr, 1, function(x) {return(cv.lognorm(x, pseudocount = 1))}))
        } else {
            cl.df[i,'Rank1Residuals'] = NA
        }
        # cl.df[i,'size'] =  length(cl.members[[i]])
    }
    
    ## largest cluster with the smallest rank1 residuals
    # largest = which(cl.df$size == max(cl.df$size))
    # cId = which.min(cl.df[largest, 'Rank1Residuals'])
    ## best cluster scored by both rank1residuals and size, but weighted slighly more by rank1residuals
    # cId = best.cluster(cl.df, features = c('Rank1Residuals', 'size'), weights = c(-0.7, 0.3))
   
    filtered_cl.idx = which(cl.df$size >= min.size) 
    ## alternatively, best cluster scored by coefficient of variation of the members, computed on un-normalized counts
    # cId = best.cluster(cl.df[cl.df$size >= min.size,], features = c('cv.median'), weights = -1)
    cId = filtered_cl.idx[best.cluster(cl.df[filtered_cl.idx,], features = c('CVLogNormMean', 'Rank1Residuals'), weights = c(-0.7, -0.3))]
    
    
    if (debug) {print(cl.df)}
    if (verbose.output) {
        return(list(
            'references' = list('Id'= cl.members[[cId]],
                                'Name'= cl.memNames[[cId]]),
            'graph' = list('nVertices' = ncol(mt),
                           'nVertices.compressed' = ncol(mt.compressed),
                           'nEdges' = length(which(cor.expcnt[upper.tri(cor.expcnt)] > 0))),
            'clusters' = cl.assignment)
        )
    } else {
        return(list('Id'= cl.members[[cId]],
                    'Name'= cl.memNames[[cId]]))
    }
}
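## Illustrative usage (not run): affinity-propagation-based reference detection on a
## hypothetical genes x samples count matrix `counts.gxs` (transposed internally).
# refs <- get.references.apcluster(counts.gxs, min.corr = 0.75, min.size = 10,
#                                  verbose.output = TRUE)
# refs$references$Name   # names of the selected reference genes
# refs$graph             # vertex/edge counts of the compressed correlation graph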

#' get.references.dbscan
#' 
#' Graph-based identification of the set of references, using DBSCAN as the underlying clustering method
#' @importFrom dbscan dbscan
#' @export
get.references.dbscan <- function(m, 
                                cor.method='pearson',
                                min.size = 3,
                                min.count = 0,
                                med.quantile = 0.5,
                                log.base = 2,
                                eps = 0.1,
                                debug = FALSE,...) {
    mt = t(m)
    ## Reduce the size of graph
    isUniversal = sapply(1:ncol(mt), FUN = function(i) { return(all(mt[,i] > min.count)) })
    medium.threshold = sapply(1:nrow(mt), function(i) {
        (mt[i,] > 0) %>%
            which() %>%
            `[`(mt, i, .) %>%
            quantile(probs = med.quantile) %>%
            return()
    })
    isHighEnough = sapply(1:ncol(mt), function(i) {
        ((mt[,i] - medium.threshold) > 0) %>%
            all() %>% return()
    })
    
    idx.candidates = (isUniversal & isHighEnough) %>% which()
    mt.compressed  = mt[,idx.candidates]
    if (log.base > 0) { mt.compressed = log(mt.compressed,base = log.base) }
    if (debug) {message(sprintf("Building compressed graph with %d vertices", length(idx.candidates)))}
    startT = proc.time()
    cor.expcnt = cor(mt.compressed,method=cor.method, use = 'pairwise.complete')
    dt1 = proc.time() - startT
    if (debug) {message(sprintf("Calculate correlation in %f sec.", dt1['elapsed']))}
    
    ## DBSCAN requires distance matrix as input
    startT = proc.time()
    d.compressed = cor.expcnt %>%
        `[<-`(cor.expcnt< 0, 0.001) %>%
        log10() %>%
        `-`(0, .)
    ### scaling the distance from 0 to 1
    d.compressed = (d.compressed - min(d.compressed)) / (max(d.compressed) - min(d.compressed))
    clust = dbscan::dbscan(as.dist(d.compressed), eps = eps, minPts = min.size)
    dt2 = proc.time() - startT
   
    if (debug) {print(str(d.compressed))} 
    if (debug) {message(sprintf("Run DBSCAN in %f sec.", dt2['elapsed']))}
    if (debug) {print(clust)}
    ## Mapping compressed id to original id
    cl.assignment = data.frame('Id' = 1:ncol(mt),
                               'Name' = colnames(mt),
                               'ClusterId' = rep(0, ncol(mt)))
    cl.assignment[idx.candidates, 'ClusterId'] = clust$cluster
    
    cl.members = list()
    cl.memNames = list()
    cl.df = data.frame('ClusterId' = c(1:length(table( clust$cluster[clust$cluster !=0]))))
    for (i in 1:nrow(cl.df)) {
        cl.members[[i]] = cl.assignment[cl.assignment$ClusterId == i, 'Id' ]
        cl.memNames[[i]] = cl.assignment[cl.assignment$ClusterId == i, 'Name' ]
        if (length(cl.members[[i]]) >= min.size) {
            cl.df[i,'Rank1Residuals'] = rank1.residuals(mt[,cl.members[[i]] ])
        } else {
            cl.df[i,'Rank1Residuals'] = NA
        }
        cl.df[i,'size'] =  length(cl.members[[i]])
    }
    
    ## largest cluster with the smallest rank1 residuals
    # largest = which(cl.df$size == max(cl.df$size))
    # cId = which.min(cl.df[largest, 'Rank1Residuals'])
    ## best cluster scored by both rank1residuals and size
    if (debug) {print(cl.df)}
    cId = best.cluster(cl.df, features = c('Rank1Residuals', 'size'), weights = c(-0.5, 0.5))
    if (debug) {message(sprintf("selected cluster: %s", cId))}
    return(list('Id'= cl.members[[cId]],
                'Name'= cl.memNames[[cId]],
                'nVertices' = ncol(mt),
                'nVertices.compressed' = ncol(mt.compressed)))
    
}

#' get.references.hclust
#' 
#' @import dynamicTreeCut
#' @export
get.references.hclust <- function(m, 
                                  cor.method='pearson',
                                  min.corr = 0.75,
                                  min.size = 3,
                                  min.count = 0,
                                  med.quantile = 0.5,
                                  log.base = 2,
                                  debug = FALSE) {
    mt = t(m)
    ## Reduce the size of graph
    isUniversal = sapply(1:ncol(mt), FUN = function(i) { return(all(mt[,i] > min.count)) })
    medium.threshold = sapply(1:nrow(mt), function(i) {
        (mt[i,] > 0) %>%
            which() %>%
            `[`(mt, i, .) %>%
            quantile(probs = med.quantile) %>%
            return()
    })
    isHighEnough = sapply(1:ncol(mt), function(i) {
        ((mt[,i] - medium.threshold) > 0) %>%
            all() %>% return()
    })
    
    idx.candidates = (isUniversal & isHighEnough) %>% which()
    mt.compressed  = mt[,idx.candidates]
    if (log.base > 0) { mt.compressed = log(mt.compressed,base = log.base) }
    if (debug) {message(sprintf("Building compressed graph with %d vertices", length(idx.candidates)))}
    startT = proc.time()
    corM = cor(mt.compressed,method=cor.method)
    dt1 = proc.time() - startT
    corM[corM < min.corr ] <- 0
    if (debug) {message(sprintf("Calculate correlation in %f sec.", dt1['elapsed']))}
    
    ## hclust requires distance matrix as input
    startT = proc.time()
    d.compressed = corM %>%
        `[<-`(corM < min.corr, 0.001) %>%
        log10() %>%
        `-`(0, .)
    ### scaling the distance from 0 to 1
    d.compressed = (d.compressed - min(d.compressed)) / (max(d.compressed) - min(d.compressed))
    clust = hclust(as.dist(d.compressed), method = 'average') %>% dynamicTreeCut::cutreeDynamic(distM = d.compressed)
    # cluster ids start at 0; shift them up by 1 so that 0 can be reserved for non-clustered points
    clust = clust +1
    dt2 = proc.time() - startT
    if (debug) {message(sprintf("Run hierarchical clustering in %f sec.", dt2['elapsed']))}
    if (debug) { message(str(clust)) } 
    
    ## Mapping compressed id to original id
    cl.assignment = data.frame('Id' = 1:ncol(mt), 'Name' = colnames(mt), 'ClusterId' = rep(0, ncol(mt)))
    cl.assignment[idx.candidates, 'ClusterId'] = clust
   
    cl.members = list()
    cl.memNames = list()
    cl.df = data.frame('ClusterId' = as.numeric(names(table(clust))))
    for (i in 1:nrow(cl.df)) {
        cl.members[[i]] = cl.assignment[cl.assignment$ClusterId == i, 'Id' ]
        cl.memNames[[i]] = cl.assignment[cl.assignment$ClusterId == i, 'Name' ]
        cl.raw_expr = mt[,cl.members[[i]] ] # extract raw counts only after the members are known
        cl.df[i,'nReferences'] = length(grep('ERCC',x = cl.memNames[[i]]))
        cl.df[i,'size'] =  length(cl.members[[i]])
        if (cl.df[i,'size'] < min.size) {
            cl.df[i,'Rank1Residuals'] = NA
        } else {
            cl.df[i,'Rank1Residuals'] = rank1.residuals(cl.raw_expr) 
        }
    }
    
    
    ## largest cluster with the smallest rank1 residuals
    # largest = which(cl.df$size == max(cl.df$size))
    # cId = which.min(cl.df[largest, 'Rank1Residuals'])
    ## best cluster scored by both rank1residuals and size
    cId = best.cluster(cl.df, features = c('Rank1Residuals', 'size'), weights = c(-0.7, 0.3))
    # if (debug) {print(cl.df)}
    return(list('Id'= cl.members[[cId]],
                'Name'= cl.memNames[[cId]],
                'nVertices' = ncol(mt),
                'nVertices.compressed' = ncol(mt.compressed)))
}

## Score clusters by a weighted linear combination of the given features and return the
## index of the highest-scoring cluster (NA scores are ignored by which.max)
best.cluster <- function(clusters.df, features = c('Rank1Residuals', 'size'), weights = c(-0.5, 0.5)) {
    # cl.df <- standardize(clusters.df[,features,drop=FALSE], na.rm = TRUE)
    cl.df = clusters.df[,features,drop=FALSE]
    score <- as.matrix(cl.df) %*% weights
    # print(data.frame(clusters.df, score = score))
    return(which.max(score))
}
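## Illustrative sketch (not run) of the linear scoring above: with the default features
## c('Rank1Residuals', 'size') and weights c(-0.5, 0.5), each cluster scores
## 0.5*size - 0.5*Rank1Residuals and the highest score wins. Features are used on their
## original scales, since the standardize() call is commented out.
# toy <- data.frame(Rank1Residuals = c(0.2, 0.05, NA), size = c(50, 12, 3))
# as.matrix(toy) %*% c(-0.5, 0.5)  # scores: 24.9, 5.975, NA
# best.cluster(toy)                # returns 1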

#' Normalizing (global-scaling) using references 
#'
#' Normalize a read count matrix given the set of reference genes identified by id
#' @param X a read-count matrix of the form genes x samples
#' @param ref.idx an integer vector specifying the row indices of X (the reference genes) used to compute normalization factors
#' @param scale whether to scale the normalization factors by their geometric mean
#' @export
normalize.by.refs <- function(X, ref.idx, scale = TRUE) {
    if (length(ref.idx) == 1) { # needs special handling: a single-row subset drops to a vector (dim is NULL), which breaks downstream column operations
        Xref = matrix(X[ref.idx,], nrow=1)
    } else {
        Xref = X[ref.idx,]
    }
    normFactors = colSums(Xref) # apply(Xref,MARGIN = 2,FUN = sum)
    if (scale) {
        normFactors = normFactors / geom.mean(normFactors)
    }
    # sanity check
    idx.zero = which(normFactors == 0)
    if (length(idx.zero) > 0) {
        message(paste0("All reference transcripts are zero in the sample ", idx.zero, ". Please remove.\n"))
        return(NULL)
    }
    
    X.norm = sweep(X, MARGIN = 2, STATS = normFactors, FUN = '/')
    colnames(X.norm) = colnames(X)
    rownames(X.norm) = rownames(X)
    return(X.norm)
}
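## Illustrative usage (not run): divide each sample (column) by the total count of the
## reference genes in that sample, rescaled by the geometric mean across samples.
## `counts.gxs` is a hypothetical genes x samples matrix and `ref.rows` the row indices
## of the reference genes.
# counts.norm <- normalize.by.refs(counts.gxs, ref.idx = ref.rows, scale = TRUE)
# colSums(counts.norm[ref.rows, , drop = FALSE])  # constant across samples after normalization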

#' normalize.by.scaleFactors
#'
#' @param X count matrix in the form genes x samples
#' @param scaleFactors a vector of scaling factors; must have the same length as the number of samples
#' @export
normalize.by.scaleFactors <- function(X, scaleFactors) {
    if (length(scaleFactors) != ncol(X)) {
        stop("scaleFactors should have the same length as number of samples in the input matrix.")
    }
    X.normed = sapply(1:length(scaleFactors), FUN = function(j) {
        return(X[,j]/scaleFactors[j])
    })
    colnames(X.normed) <- colnames(X)
    return(X.normed)
}
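## Illustrative usage (not run): the same reference-based normalization expressed through
## explicit scale factors; `counts.gxs` and `ref.rows` are hypothetical as above.
# sf <- colSums(counts.gxs[ref.rows, , drop = FALSE])
# sf <- sf / geom.mean(sf)
# counts.norm2 <- normalize.by.scaleFactors(counts.gxs, sf)  # matches normalize.by.refs()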


#' Normalize by Upper Quantile
#' 
#' @import edgeR
#' @export
normalize.by.uq <- function(X, ...) {
    effLibSizes = colSums(X) * edgeR::calcNormFactors(X, method = 'upperquartile', ...) # effective library sizes
    sweep(X, 2, effLibSizes, "/") %>%
        return()
}

#' Call calcNormFactors by edgeR
#' 
#' @import edgeR
#' @export
normalize.by.tmm <- function(X,...) {
    effLibSizes = colSums(X) * edgeR::calcNormFactors(X, method = 'TMM', ...) # effective library sizes
    sweep(X, 2, effLibSizes, "/") %>%
        return()
}

#' Call estimateSizeFactorsForMatrix by DESeq2
#' 
#' @import DESeq2
#' @param X read count matrix with genes in rows and samples in columns
#' @export
normalize.by.deseq <- function(X, ...) {
    DESeq2::estimateSizeFactorsForMatrix(X) %>%
        sweep(X, 2, ., "/")  %>%
        return()
}

#' Normalize by PoissonSeq
#' 
#' @import PoissonSeq
#' @export
normalize.by.poissonseq <- function(X, ...) {
    PoissonSeq::PS.Est.Depth(X, ...) %>%
        sweep(X, 2, ., "/") %>%
        return()
}

#' Normalize by DEGES
#' 
#' @import TCC
#' @export
normalize.by.deges <- function(X, group, norm.method = 'tmm', test.method = 'edger', iteration = 1) {
   TCC::TCC(X, group) %>%
        TCC::calcNormFactors(norm.method = norm.method, test.method = test.method, iteration = iteration) %>%
        TCC::getNormalizedData() %>%
        return()
}

#' Normalize by RUVr
#' 
#' @import RUVSeq
#' @export
normalize.by.ruv_r <- function(X, group, ...) {
    if (!is.factor(group)) group <- factor(group)
    design <- model.matrix(~group)
    
    y <- edgeR::DGEList(counts=X, group=group)
    y <- edgeR::calcNormFactors(y, method="upperquartile")
    y <- edgeR::estimateGLMCommonDisp(y, design)
    y <- edgeR::estimateGLMTagwiseDisp(y, design)
    
    fit <- edgeR::glmFit(y, design)
    res <- residuals(fit, type="deviance")
    if (is.null(res)) {
        message(str(y))
    }
    X.normed <- RUVSeq::RUVr(X, 1:nrow(X), k=1, res, round = FALSE)$normalizedCounts
    return(X.normed)
}

#' generic function for cross-validation
#' 
#' @param X read count matrix
#' @param ref.idx the indices of features to use as references
#' @param diff.func the function that calculates the difference between two normalized matrices
#' @param k [=5] the number of folds to partition the references into
crossval <- function(X,ref.idx,diff.func, k=5) {
    if (k > length(ref.idx)) {
        k <- length(ref.idx)
        warning("crossval: k > number of references. k set to length(ref.idx)")
    }
    parts = partitions(length(ref.idx),k=k)
    diffs = list() # matrix(rep(0,k*4),ncol=4)
    # colnames(diffs) = c('U', 'S', 'V','delta')
    for (i in 1:k) {
        set1 = ref.idx[parts[[i]]]
        set2 = setdiff(ref.idx,set1)
        m1 = normalize.by.refs(X,set1)
        m2 = normalize.by.refs(X,set2)
        diffs[[i]] = diff.func(m1, m2)
    }
    return(diffs)
}
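## Illustrative usage (not run): 5-fold cross-validation of a reference set, measuring the
## disagreement between the two normalized matrices with srms() defined below.
## `counts.gxs` and `ref.rows` are hypothetical as above.
# cv.diffs <- crossval(counts.gxs, ref.idx = ref.rows, diff.func = srms, k = 5)
# unlist(cv.diffs)  # one standardized RMS difference per fold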

crossval.refs <- function(X,ref.idx,k=5) {
    parts = partitions(length(ref.idx),k=k)
    diffs = list() # matrix(rep(0,k*4),ncol=4)
    # colnames(diffs) = c('U', 'S', 'V','delta')
    for (i in 1:k) {
        set1 = ref.idx[parts[[i]]]
        set2 = setdiff(ref.idx,set1)
        m1 = normalize.by.refs(X,set1)
        m2 = normalize.by.refs(X,set2)
        m1.svd = svd(m1)
        m2.svd = svd(m2)
        diffs[[i]]$U = m1.svd$u - m2.svd$u
        diffs[[i]]$S = normalize.vec(m1.svd$d) - normalize.vec(m2.svd$d)
        diffs[[i]]$V = m1.svd$v - m2.svd$v
        diffs[[i]]$delta = (m1 / norm(m1,type='F')) - (m2 / norm(m2, type='F'))
    }
    return(diffs)
}

crossval.rank <- function(X,ref.idx,k=5,tol=1e-12) {
    parts = partitions(length(ref.idx),k=k)
    diffs = matrix(rep(0,k),ncol=1)
    colnames(diffs) = c('numericalRank')
    
    for (i in 1:k) {
        set1 = ref.idx[parts[[i]]]
        set2 = setdiff(ref.idx,set1)
        m1 = normalize.by.refs(X,set1)
        m2 = normalize.by.refs(X,set2)
        m.svd = svd(m1 / norm(m1,type='F') - m2 / norm(m2, type='F'))
        # diffs[i,'U'] = norm(m1.svd$u - m2.svd$u,type='F')
        diffs[i,'numericalRank'] = sum(m.svd$d > tol)
        # diffs[i,'V'] = norm(m1.svd$v - m2.svd$v,type='F')
    }
    return(diffs)
}

standardize <- function(X, na.rm = FALSE) {
    m = matrix(rep(apply(X,2,mean, na.rm = na.rm),nrow(X)),nrow=nrow(X), byrow = TRUE)
    s = matrix(rep(apply(X,2,sd, na.rm = na.rm),nrow(X)),nrow=nrow(X), byrow = TRUE)
    return((X - m)/(ifelse(s == 0,1,s)))
}

centering <- function(X) {
    centroid = matrix(rep(colSums(X) / nrow(X),nrow(X)),nrow=nrow(X),byrow = TRUE) 
    return(X - centroid)
}

rms <- function(X) {
    return(sqrt(mean(c(X^2))))
}

#' Standardized root mean squared deviation between two matrices
#' 
#' @export
srms <- function(X1, X2) {
    return(rms(standardize(X1) - standardize(X2)))
}
#' Least root mean square distance between two sets of points in d dimensions
#' 
#' Find and apply the rigid transformation (translation + rotation) that best aligns P and Q,
#' then return the root mean square of the remaining differences.
#' 
#' 
#' @export
rmsd <- function (P, Q, transform=FALSE) {
    # sanity check
    if (any(dim(P) != dim(Q))) {
        stop("Incompatible comparison. P and Q must have same dimensions.")
    }
    d = ncol(P)
    Pc = centering(P)
    Qc = centering(Q)
    # This scaling is not rigorously tested
    Ps = Pc / norm(Pc,'F')
    Qs = Qc / norm(Qc,'F')
   
    if (transform) {
   
        A = t(Ps) %*% Qs
        A.svd = svd(A)
        S = diag(1,nrow=nrow(A.svd$u))
        S[d,d] = det(A.svd$u %*% t(A.svd$v))
        Rot = A.svd$u %*% S %*% t(A.svd$v)
        diff = Ps%*% Rot - Qs
    } else {
        diff = Pc - Qc
    }
    return(rms(diff))
}
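## Illustrative sketch (not run): compare two normalizations of the same data by the RMSD of
## their sample embeddings. `pca1` and `pca2` are hypothetical samples x k score matrices
## (e.g., the first k principal components of two normalized matrices).
# rmsd(pca1, pca2)                    # centered, unrotated comparison
# rmsd(pca1, pca2, transform = TRUE)  # after an optimal (Kabsch-style) rotation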