R/convertPQtree.R

Defines functions convertPQtree

Documented in convertPQtree

## ------------------------------------------------------------------------
## read *PQTREE* from ANGES output (do not use *PQRTREE* and do not use
## *DOUBLED*) and convert it into a table-like format with columns $marker,
## $orientation, $car, and at least two additional columns with node $type*
## and $elem*
## ------------------------------------------------------------------------

#' Convert linearly encoded \emph{PQ-trees} into a two-dimensional
#' \emph{PQ-structure}
#'
#' Convert linearly encoded \emph{PQ-trees} of an ancestral genome
#' reconstruction, for example as output by the software ANGES (Jones \emph{et
#' al.} 2012), into a two-dimensional \emph{PQ-structure} that can be used as
#' \code{compgenome} input for the functions \code{\link{computeRearrs}},
#' \code{\link{summarizeBlocks}}, and \code{\link{genomeRearrPlot}}
#'
#' @param rawtree Data frame with a single column that encodes the structure of
#'   \emph{PQ-trees} as strings. See Details.
#'
#' @details
#'
#'   \code{rawtree} is a data frame with a single column encoding an ancestral
#'   genome reconstruction as strings, for example as generated by the software
#'   ANGES (Chauve & Tannier 2008; Jones \emph{et al.} 2012). The first row
#'   gives the name of the ancestor (preceded by \code{>}), followed by two rows
#'   for each ancestral genome segment (CAR). The first row of each set gives
#'   the CAR ID (preceded by \code{#CAR}), and the second row provides the
#'   \emph{PQ-tree} structure of that CAR. The children of each node in the
#'   \emph{PQ-tree} are enclosed by \code{_P} and \code{P_} markups for
#'   \emph{P-nodes}, or by \code{_Q} and \code{Q_} markups for \emph{Q-nodes}.
#'   Markers (i.e., ortholog IDs) that belong to a particular node are located
#'   between its corresponding markups. Markers with reversed orientation are
#'   preceded by a \code{-} sign. The opening (\code{_P} or \code{_Q}) and
#'   closing (\code{P_} or \code{Q_}) markups can be nested to allow the
#'   representation of the hierarchical structure of the \emph{PQ-tree}. For
#'   details on \emph{PQ-trees} see Booth & Lueker 1976, Chauve & Tannier 2008,
#'   or the package vignette.
#'
#'   Example of a \code{rawtree} object, corresponding to
#'   \code{\link{TOY24_compgenome}}:\cr
#'
#'   \code{>TOY24}\cr
#'   \code{#CAR1}\cr
#'   \code{_Q 1 2 3 4 5 -6 7 8 Q_}\cr
#'   \code{#CAR2}\cr
#'   \code{_Q 9 _Q 10 Q_ _Q 11 12 13 Q_ Q_}\cr
#'   \code{#CAR3}\cr
#'   \code{_Q 14 Q_}\cr
#'   \code{#CAR4}\cr
#'   \code{_Q _P 15 16 17 18 P_ _Q _Q -19 20 21 Q_ _Q -22 -23 24 Q_ Q_ Q_}
#'
#'   \strong{Important:} The function was designed to work with the ANGES
#'   \code{*PQTREE*} output file. It was not desiged to work with the
#'   ancillary \code{*PQRTREE*} or the \code{*DOUBLED*} output files.
#'
#' @section References:
#'
#'   Booth, K.S. & Lueker, G.S. (1976). Testing for the consecutive ones
#'   property, interval graphs, and graph planarity using \emph{PQ}-Tree
#'   algorithms. \emph{Journal of Computer and System Sciences}, \strong{13},
#'   335--379. doi:
#'   \href{https://doi.org/10.1016/S0022-0000(76)80045-1}{10.1016/S0022-0000(76)80045-1}.
#'
#'   Chauve, C. & Tannier, E. (2008). A methodological framework for the
#'   reconstruction of contiguous regions of ancestral genomes and its
#'   application to mammalian genomes. \emph{PLOS Computational Biology},
#'   \strong{4}, e1000234. doi:
#'   \href{https://doi.org/10.1371/journal.pcbi.1000234}{10.1371/journal.pcbi.1000234}.
#'
#'   Jones, B. R. \emph{et al.} (2012). ANGES: reconstructing ANcestral GEnomeS
#'   maps. \emph{Bioinformatics}, \strong{28}, 2388--2390. doi:
#'   \href{https://doi.org/10.1093/bioinformatics/bts457}{10.1093/bioinformatics/bts457}
#'
#' @return A data frame encoding \emph{PQ-trees} of an ancestral genome
#'   reconstruction as a two-dimensional \emph{PQ-structure}.
#'
#'   See the description of the \code{"compgenome"} class in the Details section
#'   of the \code{\link{checkInfile}} function for more information on the
#'   output format.
#'
#' @seealso \code{\link{checkInfile}}, \code{\link{computeRearrs}},
#'   \code{\link{summarizeBlocks}}, \code{\link{genomeRearrPlot}}.
#'
#' @examples
#' \dontrun{
#'
#' ## Read a text file with an ancestral genome reconstruction
#' ## from the software ANGES into R:
#' MSSYE_PQTREE_HEUR <- read.table("PATH/TO/MSSYE_PQTREE_HEUR",
#'                                 sep = ",", comment.char = "",
#'                                 as.is = TRUE)
#'
#' ## Convert linearly encoded PQ-trees into PQ-structure:
#' MSSYE_compgenome <- convertPQtree(MSSYE_PQTREE_HEUR)
#' }
#'
#' @export
#' @importFrom utils tail


convertPQtree<-function(rawtree){

    tree<-character()
    car<-NA
    elem<-NA
    ori<-NA
    nodestack<-character()
    countstack<-integer()
    maxnodes<-0

    converted<-character()

    for(i in 1:nrow(rawtree)){

        if(grepl("^>",rawtree[i,])){
            cat(paste0("\nconverting data for ",sub(">","",rawtree[i,])))
        }else if(grepl("^#CAR",rawtree[i,])){
            car<-as.integer(sub("#CAR","",rawtree[i,]))
        }else{
            tree<-unlist(strsplit(rawtree[i,]," +"))
            if(length(tree) == 0){
                stop(paste0("CAR",car," does not match the required format"))
            }
            for(j in 1:length(tree)){
                elem<-tree[j]
                if(length(countstack)>0){
                    ## already node(s)
                    ## -> increment node counter
                    countstack[length(countstack)]<-countstack[length(countstack)]+1
                }
                if(is.element(elem,c("_P","_Q"))){
                    ## node opens
                    elem<-sub("_","",elem)
                    nodestack<-c(nodestack,elem)
                    countstack<-c(countstack,0)
                }else if(is.element(elem,c("P_","Q_"))){
                    ## node closes
                    elem<-sub("_","",elem)
                    if(length(nodestack) == 0){
                        stop(paste0("something in the hierarchy of CAR",car," is wrong"))
                    }else if(tail(nodestack,n=1L) != elem){
                        stop(paste0("something in the hierarchy of CAR",car," is wrong"))
                    }else{
                        nodestack<-nodestack[-length(nodestack)]
                        countstack<-countstack[-length(countstack)]
                    }
                }else{
                    ## marker
                    if(grepl("^-",elem)){
                        ori<-"-"
                        elem<-sub("^-","",elem)
                    }else{
                        ori<-"+"
                    }
                    tmp<-c(elem,ori,car)
                    if(length(nodestack) == 0){
                        stop(paste0("something in the hierarchy of CAR",car," is wrong"))
                    }else if(length(nodestack) != length(countstack)){
                        stop(paste0("something in the hierarchy of CAR",car," is wrong"))
                    }
                    for(k in 1:length(nodestack)){
                        tmp<-c(tmp,nodestack[k],countstack[k])
                    }
                    maxnodes<-max(maxnodes,length(nodestack))
                    converted<-c(converted,paste(tmp,collapse=" "))
                }
            }
        }
    }


    if(is.na(car)){
        cat("\n... no CARs found ...\n\n")
        return(NA)
    }else if(car == 1){
        cat(paste("\n... processed",car,"CAR ...\n\n"))
    }else{
        cat(paste("\n... processed",car,"CARs ...\n\n"))
    }


    ## convert into data frame
    convtree<-matrix(NA,nrow=length(converted),ncol=maxnodes*2+3)
    for(i in 1:length(converted)){
        tmp<-unlist(strsplit(converted[i]," "))
        convtree[i,1:length(tmp)]<-tmp
    }
    convtree<-as.data.frame(convtree,stringsAsFactors=FALSE)
    for(i in seq(1,ncol(convtree),2)){
        convtree[,i]<-as.integer(convtree[,i])
    }
    colnames(convtree)<-c("marker","orientation","car",
                          paste0(rep(c("type","elem"),
                                     times=(ncol(convtree)-3)/2),
                                 rep(1:((ncol(convtree)-3)/2),each=2)))

    return(convtree)
}

## ------------------------------------------------------------------------
dorolin/rearrvisr documentation built on Aug. 6, 2020, 1:32 a.m.