R/Apoderoides.R

Defines functions calc.Score is.monophyleticByRank findUpperNode getAllCentroids getAllRankNames autoDeletion deleteAnomaly deleteSubFunc get.upperRank getRankMRCA getAllMRCAs

Documented in autoDeletion calc.Score deleteAnomaly deleteSubFunc findUpperNode getAllCentroids getAllMRCAs getAllRankNames getRankMRCA get.upperRank is.monophyleticByRank

#get MRCAs for all upper ranks in a tree.
#if a rank is monophyletic, return the tip node number instead of NULL.
getAllMRCAs<-function(tree,OTUrankData=NULL)
{
  allRankNames<-getAllRankNames(tree,OTUrankData)
  allTipUpperRank<-get.upperRank(tree$tip,OTUrankData)
  MRCAlist<-vector("list",length(allRankNames))
  for(i in 1:length(MRCAlist))
  {
    #use regular expressions to conduct exact matching
    rankTips<-grep(paste0("^",allRankNames[i],"$"),allTipUpperRank)
    temp<-getMRCA(tree,rankTips)
    if(is.null(temp))
    {
      MRCAlist[[i]]<-rankTips
    }
    else
    {
      MRCAlist[[i]]<-temp
    }
  }
  return(MRCAlist)
}

#get MRCA of rankName in tree
getRankMRCA<-function(rankName,tree,dropIndex,rankList)
{
  #use regular expressions to conduct exact matching
  rankTips<-grep(paste0("^",rankName,"$"),rankList)
  rankTips<-setdiff(rankTips,dropIndex)
  MRCA<-getMRCA(tree,rankTips)
  if(is.null(MRCA))
  {
    MRCA<-rankTips
  }
  return(MRCA)
}

#get upperRank from OTU name. List is necessary if getting rank upper than genus.
#this function is vectorized for the lowerRank.
get.upperRank<-function(data,OTUrankData=NULL)
{
  #when getting genus from species name
  if(is.null(OTUrankData))
  {
    match<-regexpr("_",data)
    result<-substr(data,1,match-1)
  }
  else
    #when getting upper rank from lower rank
  {
    match<-match(data,OTUrankData[[1]])
    result<-OTUrankData[[2]][match]
  }
  return(result)
}
#load a centroid or MRCA score and return dropping OTU(s)
deleteSubFunc<-function(tree,score,OTUrankData=NULL)
{
  topOTUScore<-score[1,2][[1]]
  #find index whose clade score is identical to the top clade score
  topScoreOTU<-score[topOTUScore==score[,2],1]
  if(length(topScoreOTU)==1)
  {
    #check if the topScoreOTU is monotypic
    topRank<-get.upperRank(topScoreOTU,OTUrankData)
    if(is.null(OTUrankData))
    {
      OTUrankData<-vector("list",2)
      OTUrankData[[2]]<-get.upperRank(tree$tip)
    }
    rankOTU<-extractOTUbyRankName_C(topRank,tree$tip,OTUrankData[[2]])
    #when the dropping OTU rank has one or more than two OTUs, drop the OTU with the highest score
    if(length(rankOTU)==1||length(rankOTU)>2)
    {
      return(score[1,1][[1]])
    }
    #when the dropping OTU rank has two OTUs, compare thier intruder score and drop one with the higher score
    else
    {
      int1<-score[rankOTU[1]==score[,1],4]
      int2<-score[rankOTU[2]==score[,1],4]
      if(int1>int2)
      {
        return(rankOTU[1])
      }
      else if(int2>int1)
      {
        return(rankOTU[2])
      }
      else
      {
        return(rankOTU)
      }
    }
  }
  #when there are multiple OTUs with the highest score
  else
  {
    #find OTUs whose clade number has the smallest number of OTUs
    topScoreCladeNumber<-score[topOTUScore==score[,2],6]
    minCladeNumber<-Inf
    minCladeIndex<-0
    for(i in 1:length(topScoreCladeNumber))
    {
      #count the same clade number OTUs
      temp<-sum(topScoreCladeNumber[i]==topScoreCladeNumber)
      if(minCladeNumber>temp)
      {
        minCladeIndex<-i
        minCladeNumber<-temp
      }
      else if(minCladeNumber==temp)
      {
        minCladeIndex<-c(minCladeIndex,i)
      }
    }
    #cladeNumber with the smallest number of OTUs
    candidateCladeNumber<-as.integer(topScoreCladeNumber[minCladeIndex])
    #choose ones with the smallest clade number
    candidateCladeNumber<-candidateCladeNumber[candidateCladeNumber==min(candidateCladeNumber)]
    candidateCladeNumber<-candidateCladeNumber[1]
    deletingOTU<-score[score[,6]==candidateCladeNumber,1]
    return(deletingOTU)
  }
}

deleteAnomaly<-function(tree,scores,OTUrankData=NULL,drop=FALSE,prior="MRCA")
{
  droppingIndex<-list()
  isCentroidScore<-FALSE
  isMRCAScore<-FALSE
  if(length(scores[[1]])>0)
  {
    isCentroidScore<-TRUE
  }
  if(length(scores)>1)
  {
    if(length(scores[[2]])>0)
    {
      isMRCAScore<-TRUE
    }
  }
  if(isCentroidScore)
  {
    scores[[1]]<-scores[[1]][order(as.numeric(scores[[1]][,2]),decreasing=T),]
    topScore1<-as.numeric(scores[[1]][1,2][[1]])
  }
  if(isMRCAScore)
  {
    scores[[2]]<-scores[[2]][order(as.numeric(scores[[2]][,2]),decreasing=T),]
    topScore2<-as.numeric(scores[[2]][1,2][[1]])
  }
  if(isMRCAScore&&isCentroidScore)
  {
    topOTUScore<-max(topScore1,topScore2)
  }
  else if(!isMRCAScore)
  {
    topOTUScore<-scores[[1]][1,2][[1]]
    topScore2<--1
  }
  else
  {
    topOTUScore<-scores[[2]][1,2][[1]]
    topScore1<--1
  }
  #when the top OTU score is 0, return
  if(topOTUScore==0)
  {
    return("No anomaly OTU in the tree.")
  }
  #when centroidScore > MRCAScore, use centroid score
  if(topScore1>topScore2)
  {
    dropOTU<-deleteSubFunc(tree,scores[[1]],OTUrankData)
  }
  #when centroidScore < MRCAScore, use MRCA score
  else if(topScore1<topScore2)
  {
    dropOTU<-deleteSubFunc(tree,scores[[2]],OTUrankData)
  }
  #when centroidScore == MRCAScore, compare the number of dropping OTUs and use fewer score
  else
  {
    dropOTUcand1<-deleteSubFunc(tree,scores[[1]],OTUrankData)
    dropOTUcand2<-deleteSubFunc(tree,scores[[2]],OTUrankData)
    if(length(dropOTUcand1)<length(dropOTUcand2))
    {
      dropOTU<-dropOTUcand1
    }
    else if(length(dropOTUcand1)>length(dropOTUcand2))
    {
      dropOTU<-dropOTUcand2
    }
    #when the number of dropping OTUs are equal, use one defined by the prior argument
    else
    {
      if(prior=="centroid")
      {
        dropOTU<-dropOTUcand1
      }
      else if (prior=="MRCA")
      {
        dropOTU<-dropOTUcand2
      }
      else
      {
        print("'prior' argument must be 'MRCA' or 'centroid'.")
        return()
      }
    }
  }
  if(drop)
  {
    return(list(dropOTU,drop.tip(tree,dropOTU)))
  }
  else
  {
    return(list(dropOTU,tree))
  }
}

autoDeletion<-function(tree,OTUrankData=NULL,show_progress=TRUE,num_threads=1,prior="MRCA",criteria="composite")
{
  if(criteria!="composite")
  {
    if(criteria!="MRCA")
    {
      if(criteria!="centroid")
      {
        if(criteria!="both")
        {
          print("'criteria' argument must be 'composite', 'both', 'MRCA' or 'centroid'.")
          return()
        }
        else
        {
          if(prior!="MRCA"&&prior!="centroid")
          {
            print("'prior' argument must be 'MRCA' or 'centroid'.")
            return()
          }
        }
      }
    }
  }
  if(length(tree$tip)<=3)
  {
    return("The tree includes only three or less OTUs.")
  }
  if(show_progress)
  {
    calcTime<-proc.time()
  }
  totalCentroidScore<-list()
  totalMRCAScore<-list()
  totalScore<-list()
  droppedOTUs<-character()
  dropIndex<-integer()
  allRankNames<-getAllRankNames(tree,OTUrankData)
  if(!is.null(OTUrankData))
  {
    rankList<-OTUrankData[[2]]
  }
  else
  {
    rankList<-get.upperRank(tree$tip)
  }
  allCentroids<-NULL
  if(criteria!="MRCA")
  {
    allCentroids<-getAllCentroids(tree,OTUrankData,show_progress,num_threads)
  }
  allMRCAs<-NULL
  if(criteria!="centroid")
  {
    allMRCAs<-getAllMRCAs(tree,OTUrankData)
  }
  counter<-1
  progress = 0
  firstPositiveScoreOTU = -1
  currentPositiveScoreOTU = 0
  scores<-vector("list",2)
  while(TRUE)
  {
    if(length(tree$tip)<=3)
    {
      break
    }
    scores<-calc.Score(tree,OTUrankData,allRankNames,allCentroids,allMRCAs,dropIndex,show_progress=show_progress,num_threads=num_threads,criteria=criteria)

    #check the score reached 0
    #somewhat if(scores[[1]][1,2][[1]]==0||scores[[2]][1,2][[1]]==0) gives an error
    if(scores[[1]][1,2][[1]]==0)
    {
      break
    }
    if(criteria=="both")
    {
      if(scores[[2]][1,2][[1]]==0)
      {
        break
      }
    }
    if(firstPositiveScoreOTU==-1)
    {
      if(criteria=="both")
      {
        firstPositiveScoreOTU<-max(sum(scores[[1]][,2]>0),sum(scores[[2]][,2]>0))
      }
      else
      {
        firstPositiveScoreOTU<-sum(scores[[1]][,2]>0)
      }
    }
    if(criteria=="both")
    {
      currentPositiveScoreOTU<-max(sum(scores[[1]][,2]>0),sum(scores[[2]][,2]>0))
    }
    else
    {
      currentPositiveScoreOTU<-sum(scores[[1]][,2]>0)
    }
    if(show_progress)
    {
      print(paste0("auto-deletion loop",counter))
      if(firstPositiveScoreOTU == -1)
      {
        print(paste0("progress: 0%"))
      }
      else if(firstPositiveScoreOTU == 0)
      {
        print(paste0("progress: 100%"))
      }
      else
      {
        print(paste0("progress: " ,100*(firstPositiveScoreOTU-currentPositiveScoreOTU)/firstPositiveScoreOTU,"%"))
      }
      counter<-counter+1
    }
    if(criteria=="both")
    {
      totalCentroidScore<-append(totalCentroidScore,list(scores[[1]]))
      totalMRCAScore<-append(totalMRCAScore,list(scores[[2]]))
    }
    else
    {
      totalScore<-append(totalScore,list(scores[[1]]))
    }
    temp<-deleteAnomaly(tree,scores,OTUrankData)
    droppedOTUs<-append(droppedOTUs,as.vector(temp[[1]]))
    #index of dropped OTU
    index<-match(temp[[1]],tree$tip)
    dropIndex<-c(dropIndex,index)
    lastDropRank<-unique(get.upperRank(tree$tip[index[1]],OTUrankData))
    lastDropRankIndex<-match(lastDropRank,allRankNames)
    #renew centroids
    if(show_progress)
    {
      print("renewing a centroid and/or MRCA")
      centroidTime<-proc.time()
    }
    if(criteria=="both"||criteria=="composite"||criteria=="centroid")
    {
      allCentroids[[lastDropRankIndex]]<-getRankCentroid_C(lastDropRank,dropIndex,tree$tip,tree[[1]][,1],tree[[1]][,2],rankList,show_progress,num_threads)
    }
    if(criteria=="both"||criteria=="composite"||criteria=="MRCA")
    {
      allMRCAs[[lastDropRankIndex]]<-getRankMRCA(lastDropRank,tree,dropIndex,rankList)
    }
    if(show_progress)
    {
      print(proc.time()-centroidTime)
    }
  }
  tree<-drop.tip(tree,dropIndex)
  if(show_progress)
  {
    print("total time")
    print(proc.time()-calcTime)
  }
  if(criteria=="both")
  {
    return(list(resultantTree=tree,droppedOTU=droppedOTUs,scoreTransition1=totalCentroidScore,scoreTransition2=totalMRCAScore))
  }
  else
  {
    return(list(resultantTree=tree,droppedOTU=droppedOTUs,scoreTransition=totalScore))
  }
}

getAllRankNames<-function(tree,OTUrankData=NULL)
{
  dataRank<-get.upperRank(tree$tip,OTUrankData)
  return(unique(dataRank))
}

getAllCentroids<-function(tree,OTUrankData=NULL,show_progress=FALSE,num_threads=1)
{
  allRankNames<-getAllRankNames(tree,OTUrankData)
  allCentroid<-vector("list",length(allRankNames))
  if(show_progress)
  {
    print("calculating all centroids")
    calcTime<-proc.time()
  }
  if(is.null(OTUrankData))
  {
    OTUrankData<-list()
    OTUrankData[[2]]<-get.upperRank(tree$tip)
  }
  allCentroid<-getAllCentroids_C(tree$tip,allRankNames,tree[[1]][,1],tree[[1]][,2],OTUrankData[[2]],show_progress,num_threads)
  if(show_progress)
  {
    print(proc.time()-calcTime)
  }
  return(allCentroid)
}

#obtain upper node of a given node/tip
findUpperNode<-function(tree,node)
{
  treeMatrix<-tree[[1]]
  return(treeMatrix[treeMatrix[,2]==node,1])
}

#return TRUE if all the given nodes belong to the same rank
is.monophyleticByRank<-function(tree,nodeIndex,OTUrankData)
{
  nodes<-tree$tip[nodeIndex]
  rank<-OTUrankData[[2]][nodeIndex]
  return(all(rank[1]==rank))
}

calc.Score<-function(
    tree,OTUrankData=NULL,allRankNames=NULL,allCentroids=NULL,allMRCAs=NULL,dropIndex=NULL,sort=TRUE,
    show_progress=TRUE,num_threads=1,criteria="composite")
{
  if(criteria!="composite")
  {
    if(criteria!="MRCA")
    {
      if(criteria!="centroid")
      {
        if(criteria!="both")
        {
          print("'criteria' argument must be 'composite', 'both', 'MRCA' or 'centroid'.")
          return()
        }
      }
    }
  }
  OTUList<-tree$tip
  intScore<-numeric(length(OTUList))
  outScore<-numeric(length(OTUList))
  OTUScore<-numeric(length(OTUList))
  #numeric group which has the identical score and is monophyletic
  cladeNumber<-numeric(length(OTUList))
  #total score for the monophyletic group
  cladeScore<-numeric(length(OTUList))
  #cladeScore/number of OTUs in the clade
  perCladeScore<-numeric(length(OTUList))
  scores<-vector("list",2)
  if(criteria!="both")
  {
    #make list size 1
    scores[[2]]<-NULL
  }
  if(is.null(allCentroids)&&criteria!="MRCA")
  {
    allCentroids<-getAllCentroids(tree,OTUrankData,show_progress,num_threads)
  }
  if(is.null(allMRCAs)&&criteria!="centroid")
  {
    allMRCAs<-getAllMRCAs(tree,OTUrankData)
  }
  if(is.null(dropIndex))
  {
    dropIndex<-integer(0)
  }
  if(is.null(allRankNames))
  {
    allRankNames<-getAllRankNames(tree,OTUrankData)
  }
  if(is.null(OTUrankData))
  {
    OTUrankData<-list()
    OTUrankData[[1]]<-tree$tip
    OTUrankData[[2]]<-get.upperRank(tree$tip)
  }
  range<-1:length(OTUList)
  range<-setdiff(range,dropIndex)
  for(j in 1:length(scores))
  {
    isScored<-logical(length(OTUList))
    scoreCounter<-1
    if(show_progress)
    {
      #for new line
      print("calculating score")
      calcTime<-proc.time()
      pb<-txtProgressBar(style=3)
    }
    for(i in range)
    {
      if(show_progress)
      {
        setTxtProgressBar(pb,i/length(range))
      }
      if(isScored[i])
      {
        next
      }
      isScored[i]<-TRUE
      cladeIndex<-i
      nextCladeIndex<-i
      checkingNode<-i
      while(is.monophyleticByRank(tree,nextCladeIndex,OTUrankData))
      {
        cladeIndex<-nextCladeIndex
        checkingNode<-findUpperNode(tree,checkingNode)
        #when the checking node is the root node
        if(length(checkingNode)==0)
        {
          break
        }
        nextCladeIndex<-findSubTips_C(tree$tip,tree[[1]][,1],tree[[1]][,2],checkingNode)
      }
      intCore<-NULL
      outCore<-NULL
      if(criteria=="centroid"||(criteria=="both"&&j==1))
      {
        intCore<-allCentroids
        outCore<-allCentroids
      }
      if(criteria=="MRCA"||(criteria=="both"&&j==2))
      {
        intCore<-allMRCAs
        outCore<-allMRCAs
      }
      if(criteria=="composite")
      {
        intCore<-allMRCAs
        outCore<-allCentroids
      }
      intscore<-calcIntScore_C(tree$tip,tree[[1]][,1],tree[[1]][,2],OTUList[i],intCore,allRankNames,OTUrankData[[2]])
      outscore<-calcOutScore_C(tree$tip,tree[[1]][,1],tree[[1]][,2],OTUList[i],outCore,allRankNames,OTUrankData[[2]],dropIndex=dropIndex)

      intScore[cladeIndex]<-intscore
      outScore[cladeIndex]<-outscore
      OTUScore[cladeIndex]<-intscore+outscore
      perCladeScore[cladeIndex]<-(intscore+outscore)/length(cladeIndex)
      isScored[cladeIndex]<-TRUE
      cladeNumber[cladeIndex]<-scoreCounter
      scoreCounter<-scoreCounter+1
    }
    if(show_progress)
    {
      close(pb)
      print(proc.time()-calcTime)
    }
    scores[[j]]<-array(c(OTUList,perCladeScore,OTUScore,intScore,outScore,cladeNumber),dim=c(length(OTUList),6))
    scores[[j]]<-scores[[j]][order(as.numeric(scores[[j]][,6])),]
    colnames(scores[[j]])<-c("OTU","perCladeOTUScore","sum","intruder","outlier","#clade")
    if(sort)
    {
      scores[[j]]<-scores[[j]][order(as.numeric(scores[[j]][,2]),decreasing=T),]
    }
  }
  return(scores)
}

Try the Apoderoides package in your browser

Any scripts or data that you put into this service are public.

Apoderoides documentation built on June 8, 2025, 10:31 a.m.