R/threshold.R


#' Generates a `lookupTable` for filtering raw data after gene-centric assembly
#'
#' Returns three tables: 1. gene-centric assembly readOrigin identities, 2. homology-search assignments, 3. the two superimposed
#'
#' @param genesOfInterest the KOs of interest; defaults to `scg`; must be in the ko:KXXXXX format
#' @param annotations path to the annotation table (csv) listing the KO annotations for each taxID
#' @param rs processed readStatuses from newbler output
#' @importFrom stats complete.cases
#' @importFrom dplyr rowwise case_when desc everything
#' @examples
#' \dontrun{
#' anno = "~/simulation_fr_the_beginning/out/template.csv"
#' rs = lapply(dynList, function(x) x$rs %>% mutate(ko = x$ko)) %>%
#'    do.call(rbind,.)
#' output = lookupTable(genesOfInterest = scg,
#'    annotations = anno, rs)
#' }
#' @export
lookupTable = function(genesOfInterest = scg, annotations = "~/simulation_fr_the_beginning/out/template.csv", rs)
{
    . ='shutup'
    taxid = NULL
    matched = NULL
    Freq = NULL
    ko = NULL
    total = NULL
    threshold = NULL
    spanning = NULL
    status = NULL
    readOrigin = NULL
    makeLookup = NULL
    abu = NULL
    name = NULL
    # in fact there's 2 lookup tables
    # 1. for homology search - the homology search + LCA has correctly identified the contigs to the correct ID
    # 2. for assembly - the assembly process has successfully put together reads to form the correct contig
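    # the returned value is a list of three lookup tables (a sketch of the shape):
    #   homology     - wide tbl: taxid, name, abu, then one 0/1 column per KO (homology search + LCA vs truth)
    #   assembly     - same layout, based on the assembly read origins vs truth
    #   superImposed - 0/1 agreement of the two lookups, spread by KO, with abundance attached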

    genesOfInterest %<>% gsub("^(ko:)*","ko:",.)

    message("Simulated taxa")
    suppressMessages({
        simTaxa = dbquery("match ()-[:SIMULATED]-(g:Taxon) return distinct g.taxid as taxid") %$%
        as.character(taxid)
    })
    message(sprintf("%s taxa simulated", length(simTaxa)))

    message("Calculating their abundances")
    suppressMessages({
        abuDF = lapply(simTaxa, function(x){
            dbquery("match (t:Taxon{taxid:{taxid}}) return t.abundance as abu, t.taxid as taxid", list(taxid = x))
        }) %>% do.call(rbind,.)
    })
    ##################################################
    # Homology search
    message("Querying for Homology search data")
    ##################################################
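    # findGenus: for each KO, pull the contigs assigned to it that are MDR,
    # spanning and above the read-number threshold, then walk up the taxonomy
    # (childof*) to report the genus-level taxid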

    findGenus="
    MATCH
        (k:ko{ko:{ko}})<-[:assignment]-(c:contigs)-[:taxomapped]->(t:Taxon)
    WHERE
        c.mdr=1 AND c.spanning=1 AND toInt(c.readnum) > k.threshold
    WITH
        t
    OPTIONAL MATCH
        (t)-[:childof*]->(t2:genus)
    RETURN
        CASE
        WHEN t:genus THEN t.taxid
        ELSE t2.taxid
        END AS taxid
    "
    homologyAssignments = genesOfInterest %>% lapply(function(koi){
        message(sprintf("Downloading Homology search data for %s", koi))
        suppressMessages({
            assignedDF = dbquery(findGenus, list(ko = koi))
            tryCatch({
                if(is.data.frame(assignedDF) && nrow(assignedDF) > 0){
                    assignedDF %>% make.data.frame %$% table(taxid) %>% data.frame %>%
                        setNames(c("taxid", "Freq")) %>% arrange(desc(as.integer(Freq))) %>%
                        tbl_df %>% mutate(ko = koi)
                }else{
                    NULL    # skip KOs with no assignments so the rbind below still works
                }
            }, error = function(e){ warning(sprintf("%s has problem", koi)); NULL })
        })
    }) %>% do.call(rbind,.)
    homologyAssignments$taxid %<>% as.factor
    homologyAssignments$Freq %<>% as.integer
    homologyAssignments = homologyAssignments[complete.cases(homologyAssignments),]
    wideDF = homologyAssignments  %>% group_by(taxid) %>% mutate(tot = sum(Freq)) %>% ungroup %>% spread(ko, Freq, fill=0) %>% tbl_df

    #summaryDF = homologyAssignments %>% group_by(taxid) %>% 
        #summarise(ave = mean(Freq), sd = sd(Freq)) %>% 
        #merge(abuDF, by="taxid", all=T) %>% rowwise %>% mutate(name = taxnam.sql(taxid)$name) %>%
        #arrange(desc(abu)) %>% tbl_df

    #summaryDF %<>% merge(wideDF, by="taxid") %>% arrange(desc(abu)) %>% tbl_df
    #colnames(summaryDF) = make.names(colnames(summaryDF))

    ##################################################
    # Assembly
    message("Querying for Assembly information")
    ##################################################
    message("Reading newbler assembly readStatus; only considering pure contigs")
    suppressMessages({
        thresholds = genesOfInterest %>%
            lapply(function(ko){dbquery("match (k:ko{ko:{ko}}) return k.ko as ko, k.threshold as threshold", list(ko = ko))}) %>%
            do.call(rbind,.)
    })
    thresholds$ko %<>% gsub("ko:", "", .)
    # thresholds must be available before merging it into the read-status table
    rs = rs %>% filter(ko %in% gsub("ko:", "", genesOfInterest)) %>% merge(thresholds, ., all=T)


    ass = rs %>% filter(total > threshold, spanning == 1, status == 'pure') %>% mutate(ko = gsub("^", "ko:", ko)) %>%
        group_by(ko, readOrigin) %>% summarise(n=n()) %>%
        setNames(c("ko", "taxid", "Freq"))
    ass_wide = ass %>% spread(ko, Freq, fill = 0)

    ##################################################
    # Truth
    message("Who Matched")
    ##################################################
    df = data.table::fread(annotations) %>% as.data.frame
    annotationDF = df %>% filter(ko %in% genesOfInterest) %>% group_by(seqnames, ko) %>% summarise(Freq=n()) %>% setNames(c("taxid", "ko", "Freq")) %>% ungroup
    wideDF_truth = df %>% filter(ko %in% genesOfInterest) %>% group_by(ko, seqnames) %>% summarise(n=n()) %>% setNames(c("taxid", "ko", "Freq")) %>% spread(taxid, Freq, fill = 0)

    makeLookup = function(input, annotationDF, abuDF, type=c("wide", "long"))
    {
        comp = merge(annotationDF, input,by=c("taxid", "ko"), suffix=c(".truth", ".assigned"), all=T) %>% tbl_df
        comp[is.na(comp)] = 0
        if(type == 'wide'){
            wide = comp %>% rowwise %>%
                mutate(matched= case_when(
                    Freq.truth == 0 && Freq.assigned == 0 ~ 0,
                    Freq.truth == Freq.assigned ~ 1,
                    TRUE ~ 0
                )) %>%
                #mutate(matched = Freq.truth + Freq.assigned)  %>%
                select(taxid, ko, matched) %>% spread(ko, matched)
            #if missing should be
            wide[is.na(wide)] = 0
            #Add abundanceInfo
            wide %<>% merge(abuDF, by="taxid", all.y=T) %>% rowwise %>% mutate(name = taxnam.sql(taxid)$name) %>%
            arrange(desc(abu)) %>% tbl_df %>% select(taxid, name, abu, everything())
            colnames(wide) = make.names(colnames(wide))
            wide
        }else if(type=='long'){
            comp %>%
                rowwise %>%
                mutate(matched= case_when(
                    Freq.truth == 0 && Freq.assigned == 0 ~ 0,
                    Freq.truth == Freq.assigned ~ 1,
                    TRUE ~ 0
                )) %>%
                select(taxid,ko, matched)
        }else{
            warning("unknown type")
        }
    }

    wideDF_comparison_assembly = makeLookup(ass, annotationDF, abuDF, "wide")
    wideDF_comparison_homology = makeLookup(homologyAssignments, annotationDF, abuDF, "wide")

    comparison_assembly = makeLookup(ass, annotationDF, abuDF, "long")
    comparison_homology = makeLookup(homologyAssignments, annotationDF, abuDF, "long")

# need to change

    twoTables = merge(comparison_assembly , comparison_homology , by=c("taxid", "ko"), suffix=c(".assembly", ".homology")) %>%  rowwise %>%
       mutate(matched= case_when(
           matched.assembly == 0 && matched.homology == 0 ~ 0,
           matched.assembly == matched.homology ~ 1,
           TRUE ~ 0
       ))  %>% select(taxid, ko, matched)

    #by filling with zero i'm saying dun look at this one
    superImposed = twoTables %>% spread(ko, matched, fill=0)
    superImposed %<>% merge(abuDF, by="taxid") %>% arrange(desc(abu)) #%>% head(n=1)

    allKOs = list(
            homology    = wideDF_comparison_homology,
            assembly    = wideDF_comparison_assembly,
            superImposed= superImposed
    )
    allKOs
}


#' dynPlots Diagnostic plots
#'
#' @param dynList list of outputs from dynamicThreshold, one element per KO
#' @param indivPlots logical; if TRUE, also print the per-KO diagnostic plots
#'
#' @import ggplot2
#' @importFrom tidyr gather
#' @importFrom dplyr filter mutate select group_by do ungroup summarise
#' @importFrom gridExtra grid.arrange
#' @importFrom magrittr "%<>%"
#' @importFrom forcats fct_rev fct_relevel
#' @keywords internal
#' @examples
#' \dontrun{
#'     connect("yourDomain", password="yourPassword")
#'     newblerDir= "~/simulation_fr_the_beginning/reAssemble/everybodyelse/data/newbler/"
#'     dynList = scg  %>% gsub("ko:", "", .) %>% 
#'        mclapply(dynamicThreshold, root=newblerDir, mc.cores=20)
#'     dynList = scg  %>% gsub("ko:", "", .) %>% head(n=1) %>%
#'        lapply(dynamicThreshold, root=newblerDir)
#'     pdf("thresholdPlots.pdf", width=10)
#'     plotDF = dynPlots(dynList, F)
#'     dev.off()
#'     pdf("abundance.pdf", width=10)
#'     lapply(plotDF$details,"[[", 1)
#'     dev.off()
#'     plotDF$p %>% ggsave(file="summaryPlot.pdf", w=10)
#'     plotDF$violin %>% ggsave(file="violin.pdf", w=10, h=4)
#' }
#' @export
dynPlots = function(dynList, indivPlots=T){
    .='shutup'
    MDR = NULL
    readnum = NULL
    threshold = NULL
    ko = NULL
    remaining = NULL
    cutoff = NULL
    perc = NULL
    remaining.cutoff = NULL
    spanning = NULL
    total = NULL
    name = NULL
    cumsum.perc = NULL
    taxid = NULL
    inSimulation = NULL
    outsideSimulation = NULL
    newtype = NULL
    annotation =NULL
    g.taxid = NULL
    abu = NULL
    type = NULL
    value = NULL
    bin = NULL
    status = NULL
    repeats = function(df, oneSCG, cutoff, indivPlots=TRUE, spanning=TRUE){
        message(sprintf("Processing KO of interest: %s", oneSCG))
        #recovered = rs %>% filter(origin == 'inSimulation') %>% nrow
        contigs = contigInfo(ko=oneSCG) %>% filter(MDR == '1')
        if(spanning){
            contigs %<>% filter(spanning==1)
        }
        simulatedDF = simulated(oneSCG)
        df2 = df %>% mutate(
            ko               = oneSCG,
            cutoff           = cutoff,
            simulated        = sum(simulatedDF$Freq),
            remaining = sapply(df$threshold, function(x) nrow(filter(contigs, readnum > x))),
            remaining.cutoff = nrow(filter(contigs, readnum > cutoff))
        )
        # shows the number of repeats
        p1 = ggplot(df2)                                      +
            geom_line(aes(threshold, repeats, group=ko))      + # this repeats is based on num. of genomes being repeated
            geom_vline(aes(xintercept = cutoff), color="red") +
            ggtitle(koname(oneSCG)$ko.definition)             +
            ylab("Repeats")  +
            ggtitle("Repeats")

        p2 = ggplot(df2)                                                                                 +
            geom_line(aes(threshold, remaining, group=ko))                                               +
            geom_hline(aes(yintercept=simulated))                                                        +
            geom_hline(aes(yintercept = remaining.cutoff), color="red")                                  +
            geom_text(aes(0, simulated, label = "Simulated", vjust = -1))                                +
            geom_text(aes(50, remaining.cutoff, label = "After thresholding", vjust = -1), color="Red")  +
            ggtitle(koname(oneSCG)$ko.definition)                                                        +
            ylab("#genes (total)")

        # shows the number of simulated & not simulated upon thresholding
        p3 = df %>% gather(type, value, -threshold) %>% filter(type != 'repeats') %>%
            mutate(type = fct_rev(as.factor(type))) %>%
            ggplot() +
                geom_line(aes(x=threshold, y=value, color=type, group=type)) +
                facet_wrap(~type, scales="free", nrow=3) +
                #scale_color_manual(values=c("#e41a1c", "#377eb8", "#4daf4a"))
                theme(axis.text.x = element_text(angle=90, vjust=1, hjust=1))
        message("Lost value")

        if(indivPlots){
            message("Grid")
            grid.arrange(p1, p2) %>% print
            print(p3)
        }
        df2
    }

    kos = lapply(dynList, function(x) x$koi) %>% gsub("ko:", "", .)

    repeatDF = mapply(repeats,
        df        =  lapply(dynList, function(x) x$repeats),# %>% .[3],# %>% head(n=2),
        oneSCG    =  lapply(dynList, function(x) x$koi) %>% gsub("ko:", "", .),# %>% .[3],#%>% head(n=2),
        cutoff    =  lapply(dynList, function(x) x$thres),
        MoreArgs  =  list(indivPlots = indivPlots),
        SIMPLIFY  =  FALSE
    ) %>% do.call(rbind,.)

    repeatDF.thresholdOnly = mapply(repeats,
        df        =  lapply(dynList, function(x) x$repeats.noSpan),# %>% .[3],# %>% head(n=2),
        oneSCG    =  lapply(dynList, function(x) x$koi) %>% gsub("ko:", "", .),# %>% .[3],#%>% head(n=2),
        cutoff    =  lapply(dynList, function(x) x$thres.noSpan),
        MoreArgs  =  list(indivPlots = indivPlots, spanning=FALSE),
        SIMPLIFY  =  FALSE
    ) %>% do.call(rbind,.)

    message("Summary")
    #Naive
    errorTable.vanilla = lapply(dynList, function(x) x$koi) %>% gsub("ko:", "", .) %>%
        lapply(contigInfo) %>% do.call(rbind,.) %>% filter(MDR==1) %>%
        group_by(bin) %>% summarise(n=n()) %>%
        mutate(simulated = sapply(bin, function(ko) nrow(simulated(ko)))) %>%
        mutate(diff = n - simulated)
    mae.vanilla = signif(sum(errorTable.vanilla$diff)/nrow(errorTable.vanilla), 4)

    #Spanning only 26.65
    errorTable.b4 = repeatDF %>% filter(threshold == 1) %>% select(ko, simulated, remaining)  %>% mutate(diff = abs(remaining - simulated))
    mae.b4      = signif(sum(errorTable.b4$diff)/nrow(errorTable.b4), 4)

    #Threshold only (incl spanning and non-spanning)
    errorTable.thresholdOnly = repeatDF.thresholdOnly %>% select(ko, simulated, remaining.cutoff) %>% unique %>% mutate(diff = abs(remaining.cutoff - simulated))
    mae.threshold = signif(sum(errorTable.thresholdOnly$diff)/nrow(errorTable.thresholdOnly), 4)

    #Remove spanning, then apply threshold mae 21.84
    errorTable.after = repeatDF %>% select(ko, simulated, remaining.cutoff) %>% unique %>% mutate(diff = abs(remaining.cutoff - simulated))
    mae.after   = signif(sum(errorTable.after$diff)/nrow(errorTable.after), 4)

    #apply threshold, then remove spanning (worse) mae 24.45
    thresholdThenSpan = repeatDF.thresholdOnly %>% select(ko, cutoff) %>% unique
    koOrder = sapply(dynList, `[[`, "koi")
    thresholdThenSpan$remaining.cutoff = thresholdThenSpan %>% apply(1, function(row){ nrow(filter(dynList[[which(koOrder %in% row['ko'])]]$rs , spanning == 1, total > as.integer(row['cutoff']))) })
    thresholdThenSpan$simulated =  thresholdThenSpan %>% apply(1, function(row){nrow(simulated(unname(row['ko'])))})
    thresholdThenSpan %<>% mutate(diff = abs(remaining.cutoff - simulated))
    mae.after2 = signif(sum(thresholdThenSpan$diff)/nrow(thresholdThenSpan), 4)

    mae.change = sprintf("Mean Absolute Error (MAE): %s (before threshold: %s | spanning-only: %s | threshold-only %s)", mae.after, mae.vanilla, mae.b4, mae.threshold)
    message(mae.change)

    uniqueGenomes = scg %>% lapply(simulated) %>% do.call(rbind,.) %$% unique(taxid) %>% length

    message(sprintf("Number of simulated genomes: %s", uniqueGenomes))
    estimate     = repeatDF %>% select(ko, remaining.cutoff) %>% unique %$% mean(remaining.cutoff)
    simulated    = repeatDF %>% select(ko, simulated) %>% unique %$% mean(simulated)
    prethreshold = repeatDF %>% filter(threshold == 1) %>% select(ko, remaining) %>% unique %$% mean(remaining)

    #hardcoded
    maxGenes = repeatDF$remaining %>% max + 50

    message("Preparing summary PlotDF")
    #naive
    plotdf = repeatDF.thresholdOnly %>% filter(threshold == 1) %>% select(ko, simulated, remaining, inSimulation:outsideSimulation) %>% unique %>% gather(type, value, -ko) %>% mutate(threshold = "naive")
    #with post-processing
    plotdf = repeatDF %>% group_by(ko) %>% filter(threshold == cutoff) %>% ungroup %>% select(inSimulation:ko, remaining.cutoff) %>% gather(type, value, -ko) %>% mutate(threshold="cutoff") %>% rbind(plotdf)

    plotdf$threshold[plotdf$type == 'simulated']= "truth"
    plotdf %<>% mutate(newtype = paste(type, threshold, sep="_"))

    groupType = function(x){
        if(unique(x$type) == 'inSimulation'){
            x %>% mutate(annotation = "in")
        }else if (unique(x$type) == 'outsideSimulation'){
            x %>% mutate(annotation = "out")
        }else{
            x %>% mutate(annotation = "norm")
        }
    }

    plotdf %<>% group_by(newtype) %>% do(groupType(.))
    message("summaryplot")
    p = ggplot(plotdf, aes(x=ko, y=value, group=newtype, 
                           color=fct_relevel(newtype, "remaining_naive", "inSimulation_naive", "outsideSimulation_naive", "remaining.cutoff_cutoff", "simulated_truth", "inSimulation_cutoff", "outsideSimulation_cutoff")))                                                          +
    #regression lines
    geom_line(stat="smooth", method="lm", alpha=0.05, linetype="dashed")                                                                                                                                                 +
    #are there SCGs which are not found in a given genome (yes most definitely; not checked though)
    geom_hline(aes(yintercept = uniqueGenomes), color="grey")+
    geom_line(data = plotdf %>% filter(newtype %in% c("remaining_naive", "simulated_truth", "remaining.cutoff_cutoff")), aes(x=ko, y=value, group=newtype), stat="smooth", method="lm", alpha=0.2, linetype="dashed")                                                                                                                                                 +
    geom_line(alpha=0.5, aes(linetype=fct_relevel(annotation, "norm", "in", "out")))                                                                                                                           +
    geom_point(alpha=0.8)                                                                                                                                                                                               +
    scale_color_manual("Processing",
        values=c("#de2d26", "#fc9272", "#fc9272", "#3182bd","grey", "#9ecae1", "#9ecae1"),
        labels=c(sprintf("Naive (MAE: %s)", mae.vanilla), sprintf("With processing (MAE: %s)", mae.after)),
        breaks = c("remaining_naive", "remaining.cutoff_cutoff"))                                                                                                                                                                    +
    scale_linetype_manual("In Genome Annotation", values=c("solid", "dotted", "dotdash"), label=c("Yes", "No"), breaks=c("in", "out")) +
    scale_y_continuous(breaks=c(0, 100, 200, 300, uniqueGenomes, 400), labels=c(0, 100, 200, 300, sprintf("Genomes: %s",uniqueGenomes), 400))+
    #ylim(c(0, maxGenes))                                                                                                                                                                                                +
    #aesthetics
    ylab(expression("N"[contigs]*" in MDR"))                                                                                                                                                                                             +
    xlab("KEGG Orthology Group")                                                                                                                                                                                        +
    theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1), legend.position="top")                                                                                                                                                       +
    guides(colour   = guide_legend(order = 1, ncol = 1, byrow = TRUE),
           linetype = guide_legend(order = 2, ncol = 1, byrow = TRUE))
    #overallPlot
    message("Boxplot")
    simTaxa = scg %>% lapply(function(oneSCG) dbquery("match (k:ko{ko:{ko}})-[:SIMULATED]-(g:Taxon) return k.ko, g.taxid", list(ko=oneSCG))) %>% do.call(rbind,.) %$% unique(g.taxid) %>% as.character()
    abuDF = simTaxa %>% lapply(function(x) dbquery("match (t:Taxon{taxid:{taxid}}) return t.abundance as abu, t.taxid as taxid", list(taxid = x))) %>% do.call(rbind,.)
    abuDF$taxid %<>% as.character
    simulatedInfo = simTaxa %>% lapply(taxnam.sql) %>% do.call(rbind,.)
    abuDF %<>% merge(simulatedInfo, by="taxid")
    abuDF %<>% arrange(desc(abu)) %>% mutate(rank = 1:nrow(abuDF))
    abuDF$taxid %<>% as.factor
    abuDF %<>% mutate(cumsum.perc = cumsum(abu) / sum(abu))

    missingFunc <- function(oneDynList, abuDF)
    {
        df = oneDynList$rs
        oneSCG  = oneDynList$koi %>% gsub("ko:", "", .)
        cutoff = oneDynList$thres

        naive = abuDF$taxid %in% df$readOrigin
        thres = abuDF$taxid %in% filter(df, total > cutoff)$readOrigin

        abuDF$status = "kept"
        abuDF$status[!thres] = "lostAfterthres"
        abuDF$status[!naive] = "lostNaive"
        thresColor= "#3182bd"
        naiveColor= "#de2d26"
        #browser()
        p1 = ggplot(abuDF, aes(x=rank, y=abu, fill=as.factor(status)))                             +
            geom_bar(stat="identity")                            +
            geom_text(data = abuDF %>% filter(status!='kept'), aes(x=rank,y=abu, label=name, color=as.factor(status)), hjust=0, angle=45) +
            scale_color_manual(values=c(thresColor, naiveColor), guide=FALSE)                             +
            scale_fill_manual("Recovery Status", values=c("#00000020",thresColor, naiveColor), 
            labels=c("Recovered", "Not recovered (With Processing)", "Not recovered (Naive)"))             +
            scale_x_continuous(breaks = abuDF %>% filter(status!='kept') %$% rank)   +
            theme(axis.text.x=element_text(angle=90))                           +
            xlab("Abundance Rank of Genome") + ylab("Abundance")+
            ggtitle(sprintf("Gene recovery of %s: %s", oneSCG, koname(oneSCG)$ko.definition))
        da = data.frame(
            perc          = seq(0.1, 1, 0.1),
            num.naive     = seq(0.1, 1, 0.1) %>% sapply(function(percentile) nrow(filter(abuDF, cumsum.perc < percentile, status == 'lostNaive'))),
            num.threshold = seq(0.1, 1, 0.1) %>% sapply(function(percentile) nrow(filter(abuDF, cumsum.perc < percentile, status != 'kept'))),
            ko            = oneSCG
        )
        list(plot=p1, data=da)
    }
    remainder = dynList %>% lapply(missingFunc, abuDF = abuDF)

    #browser()
    percDF = lapply(remainder, '[[', 2) %>% do.call(rbind,.) %>% gather(type, value, -perc, -ko)
    percDF$perc %<>% as.factor

    #summary(lm(data=percDF, value ~ perc))

    # Rohan's comments
    # need to show what happens as I continue to increase the thresholding
    # Why is there such variation in the genes (which can be solved by the shannon entropy)

    #summaryPlot - violin
    message("Final")
    violinPlot = percDF  %>% 
        ggplot(aes(x=perc, y=value, fill=as.factor(type), color=as.factor(type)))                    +
            geom_boxplot(position=position_dodge(1), alpha=0.5)                                                 +
            geom_point(position=position_jitterdodge(dodge.width=0.9))                                          +
            #scale_y_log10()+
            scale_color_manual("Processing", values=c("#de2d26", "#3182bd"), label=c("Naive","With processing")) +
            scale_fill_manual( "Processing", values=c("#de2d26", "#3182bd"), label=c("Naive","With processing")) +
            scale_x_discrete(labels=seq(10, 100, 10))+
            xlab("Top Genera (%)")+ylab(c(expression("N"[Genes]*' not recovered')))+
    theme(legend.position="top")+
    geom_vline(xintercept = seq(1.5, 9.5, by = 1), linetype="dashed")

    list(df = repeatDF, mae = list(spanning=mae.b4, bothSpanningNThreshold=mae.after, naive=mae.vanilla), plot=p, details = remainder, violin = violinPlot)
}

#' dynamicThreshold tries to identify the lower-bound coverage value needed to remove low-quality contigs.
#' The number of contigs tends to stabilize once reads below a certain readnum are removed, and we try to identify the minimum reads required to reach that region of stability.
#'
#' @param koi KO of interest
#' @param root root folder containing the newbler output directories
#' @importFrom dplyr filter mutate select group_by do
#' @importFrom zoo rollapply
#' @keywords internal
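#' @examples
#' \dontrun{
#'     # A minimal sketch: newblerDir is a placeholder path and an active
#'     # graph-DB connection (see connect()) is assumed.
#'     newblerDir = "~/simulation_fr_the_beginning/reAssemble/everybodyelse/data/newbler/"
#'     dyn = dynamicThreshold("K00927", root = newblerDir)
#'     dyn$thres          # chosen read-number cutoff (spanning contigs only)
#'     dyn$thres.noSpan   # cutoff when the spanning requirement is dropped
#' }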
#' @export
dynamicThreshold <- function(koi, root){
    spanning = NULL
    message(sprintf("Processing %s", koi))
    rs = readStatusReader(root, koi, mdr=TRUE)

    repeatsDF.spanning =  contigsSurvive.repeats.readNum(filter(rs,spanning == 1), koi=koi)
    repeatsDF.noSpan =  contigsSurvive.repeats.readNum(rs, koi=koi)

    list(
        repeats        = repeatsDF.spanning,
        thres          = simpleThres(repeatsDF.spanning) %>% ceiling,
        repeats.noSpan = repeatsDF.noSpan,
        thres.noSpan   = simpleThres(repeatsDF.noSpan) %>% ceiling,
        koi            = koi,
        rs             = rs
    )
}

#' rolling A dynamic threshold for removing redundant contigs
#'
#' Outputs a data.frame with 2 columns: 1.) rollingMean 2.) meanDepth
#' @param keptDF data.frame of the number of repeats left after each threshold; column names: threshold, repeats
#' @param wind window size
#' @param interval interval size
#' @keywords internal
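#' @examples
#' \dontrun{
#'     # Toy input (a sketch): 50 thresholds with repeat counts that level off
#'     keptDF = data.frame(threshold = 1:50,
#'                         repeats   = round(100 * exp(-(1:50) / 10)))
#'     rolling(keptDF, wind = 15, interval = 10)
#' }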
rolling <- function(keptDF, wind=15, interval = 10){
        repeats = NULL
        threshold = NULL
        data.frame(
            #should rename to rolling diff basically the averaged gradient change
            #in the number of repeats
            rollingMean = keptDF %$% rollapply(repeats, width=2, by=1, FUN = function(e){ abs(e[2] - e[1]) }) %>% rollapply(width = wind, by = interval, FUN = mean), 
            #the mean no. of #reads / coverage to keep in that 10 base window
            meanDepth   = keptDF %$% rollapply(threshold[-1], width = wind,  by = interval, FUN= mean)
        )
}


#' simpleThres
#'
#' Picks a read-number threshold from the longest stable run of the repeats curve (internal helper)
#'
#' @param keptDF data.frame with columns threshold and repeats
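#' @examples
#' \dontrun{
#'     # Toy input (a sketch): repeats fall sharply then plateau, so the
#'     # longest run of constant change determines the threshold
#'     keptDF = data.frame(threshold = 1:50,
#'                         repeats   = c(40, 30, 22, 16, 12, rep(10, 45)))
#'     simpleThres(keptDF)
#' }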
simpleThres = function(keptDF){
    #browser()
    threshold = NULL
    repeats = NULL
    . = 'shutup'
    myRLE = keptDF %>% filter(threshold <=50) %$% rollapply(repeats, width=2, by=1, FUN = function(e){ abs(e[2] - e[1]) }) %>% rle %>% .$length
    ceiling(mean(which(rep(myRLE == max(myRLE), myRLE))))
}

#' contigsSurvive.repeats.readNum
#'
#' @param rs data.frame read.status, contig, readOrigin (taxid)
#' @param thresholds the read-number cutoffs to scan
#' @param koi the KO of interest
#' @importFrom tidyr spread
#' @keywords internal
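#' @examples
#' \dontrun{
#'     # A sketch assuming an active graph-DB connection and an rs table from
#'     # readStatusReader(); the newbler path is a placeholder
#'     newblerDir = "~/simulation_fr_the_beginning/reAssemble/everybodyelse/data/newbler/"
#'     rs = readStatusReader(newblerDir, "K00927", mdr = TRUE)
#'     contigsSurvive.repeats.readNum(rs, thresholds = seq(1, 50, 1), koi = "K00927")
#' }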
contigsSurvive.repeats.readNum <- function(rs, thresholds = seq(1, 50, 1), koi){
    count = NULL
    readOrigin = NULL
    repeats = NULL
    perc = NULL
    X5..Contig = NULL
    origin = NULL
    types = NULL
    .='shutup'
    sim = simulated(koi)
    thresholds %>% lapply(function(readDepth){
        df1 = rs %>% filter(count > readDepth) %>% group_by(readOrigin) %>% 
        summarise(repeats = n())  %>% filter(repeats > 1) %$% sum(repeats) %>% 
        data.frame(threshold = readDepth, repeats = .)
        topHIT = function(x) { x %>% arrange(desc(perc)) %>% head(n=1) }
        df2 = rs %>% group_by(X5..Contig) %>% do(topHIT(.)) %>% ungroup %>%
        filter(count > readDepth) %>% group_by(origin) %>% 
        summarise(types = n()) %>% mutate(threshold = readDepth) %>% spread(origin, types)
        merge(df1, df2, by="threshold")
    }) %>% do.call(rbind,.)
}

#' finds all simulated genera for that particular KO
#'
#' `simulated` pull all simulated genomes
#'
#' @param ko the KO of interest (with or without the ko: prefix)
#' @export
#' @keywords internal
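#' @examples
#' \dontrun{
#'     # A sketch assuming an active graph-DB connection (see connect())
#'     simDF = simulated("K00927")
#'     head(simDF)   # columns: ko, taxid, Freq
#' }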
simulated <- function(ko){
    message(sprintf("Universal truth: How many genes were simulated for %s", ko))
    simulated = dbquery("
        MATCH
            (k:ko{ko:{ko}})-[sim:SIMULATED]-(g:genus)
        RETURN
            k.ko as ko,
            g.taxid as taxid, 
            sim.genes as Freq", 
    list(ko=gsub("^(ko:)*", "ko:", ko))) %>% make.data.frame
    simulated$Freq %<>% as.character %>% as.integer
    message(sprintf("%s gene simulated from %s genera", sum(simulated$Freq), nrow(simulated)))
    simulated
}

#' contigsSurvive.repeats.rpk
#'
#' @param rs data.frame read.status, contig, readOrigin (taxid)
#' @param rpk the reads-per-kilobase cutoffs to scan
#' @param koi the KO of interest
#' @keywords internal
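#' @examples
#' \dontrun{
#'     # A sketch assuming an active graph-DB connection and an rs table from
#'     # readStatusReader(); K00927 is the default KO of interest
#'     newblerDir = "~/simulation_fr_the_beginning/reAssemble/everybodyelse/data/newbler/"
#'     rs = readStatusReader(newblerDir, "K00927", mdr = TRUE)
#'     contigsSurvive.repeats.rpk(rs, rpk = seq(0, 2, 0.01), koi = "K00927")
#' }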
contigsSurvive.repeats.rpk <- function(rs, rpk = seq(0, 2, 0.01), koi='K00927'){
    . = 'shutup'
    readOrigin = NULL
    repeats = NULL
    node = NULL
    query = "
        MATCH
            (c:contigs{bin:{koi}})
        WHERE
            c.mdr = 1
        RETURN
            c.contig as ID,
            toFloat(c.readnum) / toFloat(c.length) as rpk
    "
    coverageDF = dbquery(query, params=list(koi = gsub("^(ko:)*", "ko:", koi)))
    coverageDF$X5..Contig = as.character(coverageDF$ID) %>% strsplit(':') %>% lapply('[[', 2) %>% do.call(c,.)
    coverageDF = merge(coverageDF, rs, by="X5..Contig",all.y=T)

    rpk %>% lapply(function(threshold) {
        coverageDF %>% filter(rpk > threshold) %>%
        #true repeats not paralogs!!!!
        group_by(readOrigin) %>% summarise(repeats = n())  %>% filter(repeats > 1) %$% data.frame(threshold, repeats = sum(repeats))
    }) %>% do.call(rbind,.)
}


#' readStatusReader stores read assignment details from the simulation and assigns each contig a genome of origin
#'
#' Reads /rootDir/ko/454ReadStatus.txt produced by newbler.
#' Stray reads from other taxa are removed if they account for 10% or less of the contig; the output
#' data.frame has a column `spanning` set to 0 or 1 to indicate whether the contig is spanning.
#'
#' @param root the root directory
#' @param koi the ko directory
#' @param mdr logical; if TRUE, keep only MDR contigs and flag spanning contigs
#' @importFrom data.table as.data.table
#' @export
#' @keywords internal
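#' @examples
#' \dontrun{
#'     # A sketch: the root directory is a placeholder and must contain
#'     # <root>/<koi>/454ReadStatus.txt from a newbler assembly
#'     newblerDir = "~/simulation_fr_the_beginning/reAssemble/everybodyelse/data/newbler/"
#'     rs = readStatusReader(newblerDir, "K00927", mdr = TRUE)
#'     head(rs)
#' }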
readStatusReader <- function(root, koi, mdr=FALSE)
{
    Accno = NULL
    Read.Status = NULL
    X5..Contig = NULL
    MDR = NULL
    spanning = NULL
    contig = NULL
    readOrigin = NULL
    count = NULL
    total = NULL
    perc = NULL
    .='shutup'
    message("Reading in read assignment data from NEWBLER output")
    readStatus = as.data.table(read.csv(sprintf("%s/%s/454ReadStatus.txt", root, koi), sep="\t"))
    message(sprintf("%s reads read", nrow(readStatus)))
    readStatus$Accno %<>% as.character
    regexstr = "^\\d+\\|(\\d+)"
    readAlloc = readStatus %$% regmatches(Accno, regexec(regexstr,Accno)) %>% sapply(`[`,2 )
    rs = readStatus %>%
        select(Read.Status, X5..Contig) %>%
        mutate(readOrigin = readAlloc)  %>%
        filter(Read.Status == 'Assembled')
    message(sprintf("%s reads Assembled", nrow(readStatus)))
    #removes readOrigins which account for 10% of the contig
    #there's contigs with same genome (readOrigin) but diff geneloc
    if(mdr){
        df = contigInfo(ko=koi) %>% make.data.frame %>% filter(MDR == '1')
        mdrContigs = df$contig %>% as.character %>% strsplit(':') %>% lapply(function(i) i[[2]]) %>% do.call(rbind,.)
        mdrSpanningContigs = df %>% filter(spanning == '1') %$% contig %>% as.character %>% strsplit(':') %>% lapply(function(i) i[[2]]) %>% do.call(rbind,.)
        rs = rs %>% filter(X5..Contig %in% mdrContigs) %>%
            group_by(X5..Contig, readOrigin) %>% summarise(count=n()) %>% ungroup %>% group_by(X5..Contig) %>%
            mutate( total = sum(count),
                    spanning = ifelse(X5..Contig %in% mdrSpanningContigs, 1, 0)) %>%
            mutate(perc = count/total) %>% filter(perc > 0.1) %>% categorize(koi)
    }else{
        rs = rs %>% group_by(X5..Contig, readOrigin) %>% summarise(count=n()) %>% ungroup %>% group_by(X5..Contig) %>%
            mutate(total = sum(count)) %>% mutate(perc = count/total) %>% filter(perc > 0.1) %>%
            categorize(koi)
    }
    message("Finished reading")
    rs   # return the categorized read-status table
}


#' categorize
#'
#' Adds 2 more columns:
#' 1. whether the contig is chimeric or pure, based on the read origins
#' 2. whether the readOrigin matches the simulated annotations
#'
#' @param rs output from readStatusReader
#' @param ko the ko of interest
#' @keywords internal
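#' @examples
#' \dontrun{
#'     # A sketch: toy read-status table; simulated() still needs an active
#'     # graph-DB connection to label the origins
#'     rs = data.frame(X5..Contig = c("contig00001", "contig00001", "contig00002"),
#'                     readOrigin = c("562", "1280", "562"),
#'                     stringsAsFactors = FALSE)
#'     categorize(rs, "K00927")
#' }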
categorize <- function(rs, ko){
    Freq = NULL
    X5..Contig = NULL
    .="shutup"
    chimeric = rs$X5..Contig %>% table %>% as.data.frame %>% arrange(desc(Freq)) %>% filter(Freq > 1) %$% . %>% as.character
    rs$status = "pure"
    rs[rs$X5..Contig %in% chimeric,]$status="chimeric"
    rs$origin = "unknown"
    pure = rs %>% filter(!X5..Contig %in% chimeric)
    simulatedDF = simulated(ko)
    rs[which(rs$readOrigin %in% simulatedDF$taxid),]$origin= "inSimulation"
    rs[which(!rs$readOrigin %in% simulatedDF$taxid),]$origin= "outsideSimulation"
    rs
}
