R/domainArchitectureSimilarityRedis.R

Defines functions initializeDomainWeights initializeDomainAnnotations pairwiseVectorSpaceModel generateDomainArchitectureSpaceVectorRedis pairwiseDistanceKey sortedPairName pairwiseDomainArchitectureDistanceRedis partialDomainArchitectureDistancesRedis pairwiseSequenceDistanceRedis partialSequenceDistancesRedis

initializeDomainWeights <- function( domain.weights.table ) {
  # Iterates over all rows of argument domain.weights.table, and stores in the
  # current redis server domain.weights.table[[ current.row, 1 ]].
  #
  # Args:
  #  domain.weights.table : A single column table of the domain weights, with
  #                         rownames being the domain IDs. 
  #
  # Returns: NULL
  #   
  lapply( rownames( domain.weights.table ), function( domain.id ) {
    redisSet( domain.id, domain.weights.table[[ domain.id, 1 ]] )
  })
  NULL
}

initializeDomainAnnotations <- function( domain.annotations,
  annotation.type = "InterPro" ) {
  # Each protein's set of annotated domains is stored as a Redis Set. Key is
  # the protein's accession and value the set of domain IDs of the argument
  # type 'annotation.type'.
  #
  # Args:
  #  domain.annotations : The table of domain annotations as generated by
  #                       function 'retrieveAnnotationsBiomart'.
  #  annotation.type : The row of argument 'domain.annotations' to read out.
  #
  # Returns: NULL
  #   
  lapply( colnames( domain.annotations ), function( protein.accession ) {
    prot.dom.annos <- domain.annotations[[ annotation.type, protein.accession ]]
    for( domain.id in prot.dom.annos ) {
      redisSAdd( protein.accession,
        charToRaw( as.character( domain.id ) )
      )
    }
  })
  NULL
}

pairwiseVectorSpaceModel <- function( protein.accession.a,
  protein.accession.b ) {
  # Returns the _unsorted_ set union of the domain annotations for both
  # argument proteins. Retrieves the two annotation sets from the current redis
  # connection. See function 'initializeDomainAnnotations' on how to initialize
  # this.
  #
  # Args:
  #  protein.accession.a : Accession of first protein
  #  protein.accession.b : Accession of second protein
  #
  # Returns: The set _unsorted_ set union of the two argument protein domain
  # annotation sets.
  #   
  redisSUnion( protein.accession.a, protein.accession.b )
}

generateDomainArchitectureSpaceVectorRedis <- function( protein.accession,
  vector.space.model ) {
  # Constructs a numeric vector where each element 'i' is either the domain
  # weight of the ith domain in the argument 'vector.space.model' or 0.0 if the
  # argument protein is not annotated with domain i. Note: Domains missing a
  # domain weight always will have a domain weight of 0.0!
  #
  # Args:
  #  protein.accession : The accession of the protein to generate the domain
  #                      architecture space vector for.
  #  vector.space.model : The set of domain IDs the vector space is composed
  #                       of.
  #
  # Returns: The numeric vector with the retrieved domain weights or 0.0, if
  # the argument vector is not annotated with domain i.
  #   
  as.numeric(
    lapply( vector.space.model, function( domain.id ) {
      dom.weight <- redisGet( domain.id )
      if( redisSIsMember( protein.accession, domain.id ) &&
          ! is.null(dom.weight) 
        ) {
        dom.weight
      } else {
        0.0
      }
    })
  )
}

pairwiseDistanceKey <- function( protein.accession.a,
  protein.accession.b, distance.type="das_dist" ) {
  # Generates unique key to store the pairwise DAS distance under.
  #
  # Args:
  #  protein.accession.a : Accession of first protein.
  #  protein.accession.b : Accession of second protein.
  #
  # Returns: Alphabetically sorted arguments separated by underscore.
  #   
  paste.funk <- function( ... ) paste( ..., sep="_" ) 
  do.call( 'paste.funk',
    as.list( 
      c(
        sort( c(protein.accession.a, protein.accession.b) ),
        distance.type
      )
    )
  )
}

sortedPairName <- function( protein.accession.a,
  protein.accession.b ) {
  # Generates unique key to store the pairwise DAS distance under.
  #
  # Args:
  #  protein.accession.a : Accession of first protein.
  #  protein.accession.b : Accession of second protein.
  #
  # Returns: Alphabetically sorted arguments separated by underscore.
  #   
  paste.funk <- function( ... ) paste( ..., sep="_" ) 
  do.call( 'paste.funk',
    as.list( 
      c(
        sort( c(protein.accession.a, protein.accession.b) )
      )
    )
  )
}

pairwiseDomainArchitectureDistanceRedis <- function( protein.accession.a,
  protein.accession.b, save.result=TRUE ) {
  # Computes the pairwise distance in the Domain Architecture Space (DAS) for
  # the two argument proteins. Retrieves required values from current redis
  # connection.
  #
  # Args:
  #  protein.accession.a : Accession of first protein.
  #  protein.accession.b : Accession of second protein.
  #  save.result : Store the computed distance in the DAS in the current redis
  #                connection using result of function 'pairwiseDistanceKey(
  #                protein.accession.a, protein.accession.b )'.
  #
  # Returns: The distance in DAS as computed by function
  # 'pairwiseDomainArchitectureDistance'.
  #   
  vsm <- pairwiseVectorSpaceModel( protein.accession.a, protein.accession.b )
  das.a <- generateDomainArchitectureSpaceVectorRedis( protein.accession.a, vsm )
  das.b <- generateDomainArchitectureSpaceVectorRedis( protein.accession.b, vsm )
  # compute 1.0 minus the cosine of the angle between the two vectors:
  das.dist <- pairwiseDomainArchitectureDistance( das.a, das.b )
  # Save result in redis, if requested:
  if( save.result ) {
    redisSet( 
      pairwiseDistanceKey( protein.accession.a, protein.accession.b ),
      das.dist
    )
  }
  # return
  das.dist
}

partialDomainArchitectureDistancesRedis <- function(
  all.accessions, partial.accessions, lapply.funk=lapply, 
  init.thread.funk=NULL, close.thread.funk=NULL, ... ) {
  # Computes all pairwise distances in domain architecture space between
  # accessions in argument 'partial.accessions' and 'all.accessions'. Only
  # computes distances if they are not yet stored in the current REDIS instance
  # and stores it in this case.
  #
  # Args:
  #  all.accessions     : All amino acid accessions.
  #  partial.accessions : Accessions to generate all pairs with all.accessions
  #                       for.
  #  lapply.funk        : Set to 'mclapply' if computation in parallel is
  #                       wanted.
  #  init.thread.funk   : If not NULL, this function is invoked at the very
  #                       start of each loop inside (mc)lapply.
  #  close.thread.funk  : If not null, this function is invoked at the very 
  #                       end of each loop inside (mc)lapply.
  #  ...                : Additional arguments to be passed to 'lapply.funk'
  #
  # Returns: NULL
  #   
  lapply.funk( partial.accessions, function( acc.1 ) {
    if ( ! is.null(init.thread.funk) ) 
      init.thread.funk()
    for ( acc.2 in all.accessions ) {
      if ( is.null( redisGet( pairwiseDistanceKey( acc.1, acc.2 ) ) ) ) {
        pairwiseDomainArchitectureDistanceRedis(
          acc.1, acc.2
        )
      }
    }
    if ( ! is.null(close.thread.funk) )
      close.thread.funk()
  }, ... )
  # return
  NULL
}

pairwiseSequenceDistanceRedis <- function( aa.seq.pattern,
  aa.seq.subject, pattern.accession, subject.accession,
  sub.matrix="PAM250", gap.open.pnlty=-10,
  gap.extension.pnlty=-0.1, distance.model="Dayhoff", 
  distance.key.suffix='seq_dist' ) {
  # Computes the sequence distance between argument amino acid sequences
  # 'aa.seq.pattern' and 'aa.seq.subject' using function
  # 'pairwiseSequenceDistance(...)'. Saves the computed sequence distance in
  # the current redis instance using the key as returned by function
  # pairwiseDistanceKey with suffix as in argument 'distance.key.suffix'.
  #
  # Args:
  #  aa.seq.pattern, aa.seq.subject : argument amino acid sequences
  #  sub.matrix                     : substitution matrix to use in the global
  #                                   alignment
  #  gap.open.pnlty                 : score penalty to apply for opening a gap
  #                                   in the global alignment
  #  gap.extension.pnlty            : score penalty to apply for extending a
  #                                   gap in the global alignment
  #  distance.model                 : model to base the sequence distance
  #                                   computation on
  #  pattern.accession              : The accession of argument AA-Sequence
  #                                   'aa.seq.pattern'. 
  #  subject.accession              : The accession of argument AA-Sequence
  #                                   'aa.seq.subject'. 
  #  distance.key.suffix            : The suffix to append to the redis key
  #                                   under which the computed sequence
  #                                   distance is stored. See function
  #                                   pairwiseDistanceKey(...) for details.
  #
  # Returns: The computed pairwise sequence distance as result of function
  # 'pairwiseSequenceDistance(...)'.
  #   
  
  # Compute Sequence Distance:
  seq.dist <- pairwiseSequenceDistance(
    as.character( aa.seq.pattern ),
    as.character( aa.seq.subject ),
    sub.matrix, gap.open.pnlty,
    gap.extension.pnlty, distance.model
  )
  # Save result in current redis connection:
  redisSet(  
    pairwiseDistanceKey( pattern.accession,
      subject.accession, distance.type=distance.key.suffix
    ),
    seq.dist
  )
  # Return computed distance:
  seq.dist
}

partialSequenceDistancesRedis <- function( aa.sequences,
  partial.accessions, 
  distance.key.suffix='seq_dist', lapply.funk=lapply,
  init.thread.funk=NULL, close.thread.funk=NULL, ... ) {
  # Computes pairwise amino acid sequence distances for all pairs of entries in
  # argument 'partial.accessions' and argument 'all.accessions'. Distances are
  # only computed, if not already stored in the current REDIS instance and are
  # stored in this case. 
  #
  # Args:
  #  aa.sequences : A named list of amino acid sequences.
  #  partial.accessions : Accessions in argument 'aa.sequences' to compute the
  #                       pairwise distances for.
  #  distance.key.suffix : The suffix of the key the distance is stored under.
  #                        See function 'pairwiseDistanceKey(...)' for details.
  #  lapply.funk : Function to use to iterate over all sequences and compute
  #                distances. Set to mclapply, if parallel computation is requested.
  #  init.thread.funk   : If not NULL, this function is invoked at the very
  #                       start of each loop inside (mc)lapply.
  #  close.thread.funk  : If not null, this function is invoked at the very 
  #                       end of each loop inside (mc)lapply.
  #  ...                : Additional arguments to be passed to 'lapply.funk'
  #
  # Returns: NULL
  #   
  all.accessions <- names( aa.sequences )
  lapply.funk ( partial.accessions, function( acc.pattern ) {
    if ( ! is.null(init.thread.funk) ) 
      init.thread.funk()
    aa.seq.pattern <- aa.sequences[ acc.pattern ]
    for ( acc.subject in all.accessions ) {
      if ( is.null( redisGet( pairwiseDistanceKey( acc.pattern,
              acc.subject, distance.type=distance.key.suffix ) ) )
        ) {
        pairwiseSequenceDistanceRedis( aa.seq.pattern, aa.sequences[ acc.subject ],
          acc.pattern, acc.subject
        )
      }
    }
    if ( ! is.null(close.thread.funk) )
      close.thread.funk()
  }, ... )
  # return
  NULL
}
groupschoof/PhyloFun documentation built on May 17, 2019, 8:38 a.m.