ignore/code/analysis/mousehybrid/complexity.R

###Assess complexity under varying parameters
###NB: The paths are relative to the root of the scphaser git repo
###
###SYNOPSIS
###library('devtools')
###devtools::load_all()
###source("ignore/code/analysis/mousehybrid/complexity.R")


##Dirs
out_data_dir = './ignore/res/mousehybrid/complexity/data'
out_pdf_dir = './ignore/res/mousehybrid/complexity/pdf'

##Files
timing_rds = file.path(out_data_dir, 'timing.rds')
timing_rds = file.path(out_data_dir, 'timing.rand.rds')

##Libs
source('./ignore/code/analysis/performance.R')
library('BiocParallel')
library('dplyr')
library('tidyr')

main <- function(){


    ##*###
    ##Read and prep data
    ##*###
    load('../nogit/data/mousehybrid/mousehybrid.rda')

    ##create acset
    acset = new_acset(mousehybrid$featdata, mousehybrid$refcount, mousehybrid$altcount, mousehybrid$phenodata)
    lapply(acset, dim) #167443    336
    length(unique(acset[['featdata']][, 'feat'])) #11590
    feat2nvars = table(acset[['featdata']][, 'feat'])
    table(feat2nvars) #16: 333

    
    ##*###
    ##Parameters
    ##*###

    ##*###
    ##Test run
    ##*###
    verbosity = 0
    
    ##sampling iterations
    perm_iter = 1
    
    ##phasing method params
    input = c('gt')
    method = c('exhaust', 'cluster')
    weigh = c(FALSE)
    
    ##"minimal" filtering
    min_acount = 3
    fc = 3
    nmincells = 3
    nminvar = 2
    
    ##filter on n.vars
    n.vars = c(2)

    ##subsample n.sel feats
    n.feats = c(25)
    
    ##subsample n.cells
    n.cells = c(50)    

    ##specify paramset as all possible combinations of the params
    paramset = expand.grid(perm_iter, min_acount, fc, nmincells, input, weigh, method, n.vars, n.feats, n.cells, stringsAsFactors = FALSE)
    colnames(paramset) = c('perm_iter', 'min_acount', 'fc', 'nmincells', 'input', 'weigh', 'method', 'n.vars', 'n.feats', 'n.cells')
    dim(paramset) #2
    
    ##parallelization
    ncores = 2
    bp_param = BiocParallel::MulticoreParam(workers = ncores)

    
    ##*###
    ##Full run
    ##*###
    verbosity = 0
    
    ##sampling iterations
    perm_iter = 1
    
    ##phasing method params
    input = c('gt', 'ac')
    method = c('exhaust', 'cluster')
    weigh = c(TRUE, FALSE)
    
    ##"minimal" filtering
    min_acount = 3
    fc = 3
    nmincells = 3
    nminvar = 2
    
    ##filter on n.vars
    n.vars = c(2, 4, 6, 8, 10)

    ##subsample n.sel feats
    n.feats = c(25, 50, 100)
    
    ##subsample n.cells
    n.cells = c(50, 100, 200, 300)    

    ##specify paramset as all possible combinations of the params
    paramset = expand.grid(perm_iter, min_acount, fc, nmincells, input, weigh, method, n.vars, n.feats, n.cells, stringsAsFactors = FALSE)
    colnames(paramset) = c('perm_iter', 'min_acount', 'fc', 'nmincells', 'input', 'weigh', 'method', 'n.vars', 'n.feats', 'n.cells')
    dim(paramset) #480
    
    ##parallelization
    ncores = 80
    bp_param = BiocParallel::MulticoreParam(workers = ncores)


    ##*###
    ##Filter
    ##*###

    
    ##Call gt
    acset = call_gt(acset, min_acount, fc)

    ##randomly flip nfracflip vars
    nfracflip = 0.5
    acset_rand = rand_flip(acset, nfracflip)
    
    ##Filter
    acset_rand = filter_acset(acset_rand, nmincells, nminvar)
    lapply(acset_rand, dim) #167443 -> 152110
    length(unique(acset_rand[['featdata']][, 'feat'])) #11590
    
    feat2nvars = table(acset_rand[['featdata']][, 'feat'])
    table(feat2nvars) #10: 536
        
    
    ##*###
    ##phase (parallelize on the paramset)
    ##*###
    ##time each phasing

    ##loop param set
    nparamset = nrow(paramset)
    res_list = BiocParallel::bplapply(1:nparamset, complexity_par, BPPARAM = bp_param, paramset = paramset, acset = acset_rand, verbosity = verbosity)
    res_df = do.call(rbind, res_list)
    res_df = as.data.frame(res_df)

    ##convert datatypes
    char.cols = c('input', 'weigh', 'method')
    num.cols = c('user.self', 'sys.self', 'elapsed', 'user.child', 'sys.child')
    int.cols = setdiff(colnames(res_df), c(char.cols, num.cols))
    res_df[, char.cols] = lapply(res_df[, char.cols], as.character)
    res_df[, num.cols] = lapply(res_df[, num.cols], as.numeric)
    res_df[, int.cols] = lapply(res_df[, int.cols], as.integer)
    
    ##add combo column
    res_df = tidyr::unite(res_df, in.meth.w, input, method, weigh, remove = FALSE, sep = '.')

    ##Dump
    saveRDS(res_df, file = timing_rds)
    ##status: fin (50min; 384 rows, 80 cores)
    
    
    ##*###
    ##Post-phase analysis (plotting)
    ##*###
    library('ggplot2')

    res_df = readRDS(timing_rds)

    ##Normalize by setting smallest time to one
    res_by_ncells = res_df %>% group_by(in.meth.w, n.cells) %>% summarise(mean.t = mean(elapsed))
    tmin_ncells = min(res_by_ncells[, 'mean.t'])
    res_by_nfeats = res_df %>% group_by(in.meth.w, n.feats) %>% summarise(mean.t = mean(elapsed))
    tmin_nfeats = min(res_by_nfeats[, 'mean.t'])
    res_by_nvars = res_df %>% group_by(in.meth.w, n.vars) %>% summarise(mean.t = mean(elapsed))
    tmin_nvars = min(res_by_nvars[, 'mean.t'])
    tmin = c(tmin_ncells, tmin_nfeats, tmin_nvars)
    names(tmin) = c('n.cells', 'n.feats', 'n.vars')

    ##add numeric weigh col
    weigh.num = as.integer(as.logical(res_df[, 'weigh']))
    res_df = cbind(res_df, weigh.num)
    
    x.str = 'n.cells'
    y.str = 'elapsed'
    group.col = 'in.meth.w'
    colour.col = 'method'
    lt.col = 'input'
    size.col = 'weigh.num'

    j_res = res_df
    j_res[, 'elapsed'] = j_res[, 'elapsed'] / tmin[x.str]

    if(0){
        gg = ggplot(j_res, aes_string(x = x.str, y = y.str, group = group.col, colour = colour.col, linetype = lt.col, size = size.col))
        gg = gg + geom_line()
        gg = gg + scale_size(range = c(0.5, 1), breaks = c(0, 1), labels = c('FALSE', 'TRUE'))    
        gg = gg + facet_grid(n.vars ~ n.feats, scales = 'free_y')
        plot(gg)
    }
    
    ##time2ncells
    x.str = 'n.cells'
    j.pdf = file.path(out_pdf_dir, 'time2ncells.scaled.pdf')
    gg = ggplot(j_res, aes_string(x = x.str, y = y.str, group = group.col, colour = colour.col, linetype = lt.col, size = size.col))
    gg = gg + stat_summary(fun.y = 'mean', geom = 'line')
    gg = gg + scale_size(range = c(0.5, 1), breaks = c(0, 1), labels = c('FALSE', 'TRUE'))
    gg = gg + facet_wrap(~ n.vars, scales = 'free', ncol = 2)

    ##Background
    gg = gg + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + theme(panel.background = element_blank())
    gg = gg + theme(axis.text = element_text(colour="black"), axis.ticks = element_line(colour = 'black'))

    ##plot
    pdf(j.pdf)
    plot(gg)
    dev.off()

    
    ##time2nfeats
    x.str = 'n.feats'
    j_res = res_df
    j_res[, 'elapsed'] = j_res[, 'elapsed'] / tmin[x.str]
    j.pdf = file.path(out_pdf_dir, 'time2ngenes.scaled.pdf')
    gg = ggplot(j_res, aes_string(x = x.str, y = y.str, group = group.col, colour = colour.col, linetype = lt.col, size = size.col))
    gg = gg + stat_summary(fun.y = 'mean', geom = 'line')
    gg = gg + scale_size(range = c(0.5, 1), breaks = c(0, 1), labels = c('FALSE', 'TRUE'))
    gg = gg + facet_wrap(~ n.vars, scales = 'free', ncol = 2)

    ##Background
    gg = gg + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + theme(panel.background = element_blank())
    gg = gg + theme(axis.text = element_text(colour="black"), axis.ticks = element_line(colour = 'black'))

    ##Plot
    pdf(j.pdf)
    plot(gg)
    dev.off()

    
    ##time2nvars
    x.str = 'n.vars'
    j_res = res_df
    j_res[, 'elapsed'] = j_res[, 'elapsed'] / tmin[x.str]

    res_df_filt = filter(j_res, n.vars != 2)

    ##rm
    pdf.h = 2.3
    pdf.w = 4.2
    j.pdf = file.path(out_pdf_dir, 'time2nvars.scaled.pdf')
    gg = ggplot(res_df_filt, aes_string(x = x.str, y = y.str, group = group.col, colour = colour.col, linetype = lt.col, size = size.col))
    gg = gg + stat_summary(fun.y = 'mean', geom = 'line')
    gg = gg + scale_size(range = c(0.5, 1), breaks = c(0, 1), labels = c('FALSE', 'TRUE'))
    gg = gg + ylab('Time')
    
    ##Background
    gg = gg + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + theme(panel.background = element_blank())
    gg = gg + theme(axis.text = element_text(colour="black"), axis.ticks = element_line(colour = 'black'))

    pdf(j.pdf, height = pdf.h, width = pdf.w)
    plot(gg)
    dev.off()
    
}

obsolete <- function(){

    ##test filtering
    feat2nvars = as.data.frame(feat2nvars, stringsAsFactors = FALSE)
    colnames(feat2nvars) = c('feat', 'n.vars')    
    feat2nvars = merge(feat2nvars, as.matrix(sel.nvars), by.x = 'n.vars', by.y = 1)
    lapply(feat2nvars, class)
    feat.subset = feat2nvars %>% group_by(n.vars) %>% sample_n(n.sel)

}
edsgard/scphaser documentation built on May 15, 2019, 11:02 p.m.