R/scorePaths.R

Defines functions scorePaths

Documented in scorePaths

# scorePaths, the main function in the TaSTY package
# REMOVE THE GENOMICS DATATYPES FROM THE SCORING

scorePaths <- function(pdb, tdb, # input databases
                       pro,sty, #input data
                       c_pro_rec, c_pro_int, c_pro_tf,
                       c_sty_rec, c_sty_int, c_sty_tf,
                       c_tdb,# input constants
                       detection_cutoff, # filter for the proportion of a pathway detected
                       de_cutoff, # differential expression cutoff
                       nPerms) {  # number of permutations to perform

  # Filter the pdb based on cut-off for PathwayDetectionProportion
  # Filter out the paths where the receptor wasn't detection
  proteinsDetected <- unique(pro[!is.na(pro$norm), ]$gene)
  pSitesDifferentiallyExpressed <- unique(sty[!is.na(sty$norm) & abs(sty$norm) >= de_cutoff, ]$pSite)

  # Immediately cuts out the database and probably throws off the p-value
  pdb <- pdb[pdb$detectedPercent > detection_cutoff &
               pdb$Node1 %in% proteinsDetected &
               pdb$TF %in% proteinsDetected &
               pdb$pTF %in% pSitesDifferentiallyExpressed, ]

  # Add a column to pdb for max phosphorylation event
  sty2 <- separate(sty, col="pSite", sep="_", into=c('gene','residue'), remove=FALSE)
  sty3 <- ddply(sty2, .(gene), summarize, max.norm=max(norm), min.norm=min(norm))
  sty3$norm <- sty3$max.norm
  sty3[abs(sty3$min.norm) > sty3$max.norm, ]$norm <- sty3[abs(sty3$min.norm) > sty3$max.norm, ]$min.norm
  sty3 <- sty3[ ,c('gene', 'norm')]
  sty3 <- as.data.table(sty3)
  setkey(sty3, gene)


  ## Add columns to pdb to store the results of path scores
  # DT[i, (colvector) := val] basic syntax
  colvector = c('proN1', 'proG2', 'proG3','proG4','proG5','proG6',
                'styN1', 'styN2', 'styN3','styN4','styN5','styN6',
                'TFscore') # create the vector of columns to add
  val = numeric(nrow(pdb)) # initialize with zeros
  pdb[ ,(colvector) := val]

    # # CHANGE WEIGHTINGS TO APPLY TO NODES
  # Add weights to each datatype
  # pro[ ,2] <- c_pro * pro[ ,2]
  # sty[ ,2] <- c_sty * sty[ ,2]
  # tdb[ ,2] <- c_tdb * tdb[ ,2]

  pdb[ ,'proN1'] <- pro[pdb[ ,c('Node1')], ]$norm
  pdb[ ,'proG2'] <- pro[pdb[ ,c('Gene2')], ]$norm
  pdb[ ,'proG3'] <- pro[pdb[ ,c('Gene3')], ]$norm
  pdb[ ,'proG4'] <- pro[pdb[ ,c('Gene4')], ]$norm
  pdb[ ,'proG5'] <- pro[pdb[ ,c('Gene5')], ]$norm
  pdb[ ,'proG6'] <- pro[pdb[ ,c('Gene6')], ]$norm

  # Modify the matrix according to weights
  pdb[ ,'proN1'] <- c_pro_rec * pro[pdb[ ,c('Node1')], ]$norm
  pdb[ ,'proG2'] <- c_pro_int * pro[pdb[ ,c('Gene2')], ]$norm # node 2 is always an intermediate

  # Node 3 cases
  pdb[pdb$pathLength %in% c(4,5,6), ]$proG3 <- c_pro_int * pro[pdb[pdb$pathLength %in% c(4,5,6), c('Gene3')], ]$norm
  pdb[pdb$pathLength == 3, ]$proG3 <- c_pro_tf * pro[pdb[pdb$pathLength == 3,c('Gene3')], ]$norm

  # Node 4 cases
  pdb[pdb$pathLength %in% c(5,6), ]$proG4 <- c_pro_int * pro[pdb[pdb$pathLength %in% c(5,6), c('Gene4')], ]$norm
  pdb[pdb$pathLength == 4, ]$proG4 <- c_pro_tf * pro[pdb[pdb$pathLength == 4, c('Gene4')], ]$norm

  # Node 5 cases
  pdb[pdb$pathLength %in% 5, ]$proG5 <- c_pro_int * pro[pdb[pdb$pathLength %in% 5, c('Gene5')], ]$norm
  pdb[pdb$pathLength == 5, ]$proG5 <- c_pro_tf * pro[pdb[pdb$pathLength == 5, c('Gene5')], ]$norm


  #pdb[ ,'styN1'] <- sty[pdb[ ,c('Node1')], ]$norm  ## THIS IS THE PROBLEMATIC ONE...GET NO MATCHES
  pdb[ ,'styN1'] <- sty3[pdb[ ,c('Node1')], ]$norm
  pdb[ ,'styN2'] <- sty[pdb[ ,c('Node2')], ]$norm
  pdb[ ,'styN3'] <- sty[pdb[ ,c('Node3')], ]$norm
  pdb[ ,'styN4'] <- sty[pdb[ ,c('Node4')], ]$norm
  pdb[ ,'styN5'] <- sty[pdb[ ,c('Node5')], ]$norm
  pdb[ ,'styN6'] <- sty[pdb[ ,c('Node6')], ]$norm


  # Modify the matrix according to weights
  # Node 1 is always the receptor
  pdb[ ,'styN1'] <- c_sty_rec * sty3[pdb[ ,c('Node1')], ]$norm

  # node 2 is always an intermediate
  pdb[ ,'styN2'] <- c_sty_int * sty[pdb[ ,c('Node2')], ]$norm

  # Node 3 cases
  pdb[pdb$pathLength %in% c(4,5,6), ]$styN3 <- c_sty_int * sty[pdb[pdb$pathLength %in% c(4,5,6) ,c('Node3')], ]$norm
  pdb[pdb$pathLength == 3, ]$styN3 <- c_sty_tf * sty[pdb[pdb$pathLength == 3 ,c('Node3')], ]$norm

  # Node 4 cases
  pdb[pdb$pathLength %in% c(5,6), ]$styN4 <- c_sty_int * sty[pdb[pdb$pathLength %in% c(5,6) ,c('Node4')], ]$norm
  pdb[pdb$pathLength == 4, ]$styN4 <- c_sty_tf * sty[pdb[pdb$pathLength == 4 ,c('Node4')], ]$norm

  # Node 5 cases
  pdb[pdb$pathLength %in% c(6), ]$styN5 <- c_sty_int * sty[pdb[pdb$pathLength %in% c(6) ,c('Node5')], ]$norm
  pdb[pdb$pathLength == 5, ]$styN5 <- c_sty_tf * sty[pdb[pdb$pathLength == 5 ,c('Node5')], ]$norm





  pdb[ ,'TFscore'] <- c_tdb*tdb[pdb[ ,c('TF')], ]$TFscore
  pdb[ ,'ReceptorProteinExpression'] <- pro[pdb[ ,c('Node1')], ]$norm

  # Compute total scores by path
  #tmp <- rowSums(pdb[ ,c('cnaN1', 'cnaG2', 'cnaG3','cnaG4','cnaG5','cnaG6',
    #              'rnaN1', 'rnaG2', 'rnaG3','rnaG4','rnaG5','rnaG6',
     #             'mutN1', 'mutG2', 'mutG3','mutG4','mutG5','mutG6',
      #            'proN1', 'proG2', 'proG3','proG4','proG5','proG6',
       #           'styN1', 'styN2', 'styN3','styN4','styN5','styN6',
        #          'TFscore')], na.rm=TRUE)

  #pdb[,('sumScore'):=tmp]
  pdb$intNodeScore <- 0
  pdb[pdb$pathLength %in% 6, ]$intNodeScore <- rowSums(pdb[pdb$pathLength %in% 6, c('proG2', 'proG3','proG4','proG5',
                                  'styN2', 'styN3','styN4','styN5')],
                              na.rm=TRUE) / (pdb[pdb$pathLength %in% 6, ]$pathLength-2)

  pdb[pdb$pathLength %in% 5, ]$intNodeScore <- rowSums(pdb[pdb$pathLength %in% 5, c('proG2', 'proG3','proG4',
                                                                                    'styN2', 'styN3','styN4')],
                                                       na.rm=TRUE) / (pdb[pdb$pathLength %in% 5, ]$pathLength-2)

  pdb[pdb$pathLength %in% 4, ]$intNodeScore <- rowSums(pdb[pdb$pathLength %in% 4, c('proG2', 'proG3',
                                                                                     'styN2', 'styN3')],
                                                       na.rm=TRUE) / (pdb[pdb$pathLength %in% 4, ]$pathLength-2)

  pdb[pdb$pathLength %in% 3, ]$intNodeScore <- rowSums(pdb[pdb$pathLength %in% 3, c('proG2',
                                                                                    'styN2')],
                                                       na.rm=TRUE) / (pdb[pdb$pathLength %in% 3, ]$pathLength-2)

  # Apply the cases to the overall score
  pdb$sumScore <- 0

  pdb[pdb$pathLength %in% 6, ]$sumScore <- rowSums(pdb[pdb$pathLength %in% 6 ,c('proN1','intNodeScore' ,'proG6',
                         'styN1', 'intNodeScore','styN6',
                         'TFscore')], na.rm=TRUE)
  pdb[pdb$pathLength %in% 5, ]$sumScore <- rowSums(pdb[pdb$pathLength %in% 5 ,c('proN1','intNodeScore' ,'proG5',
                                                                                'styN1', 'intNodeScore','styN5',
                                                                                'TFscore')], na.rm=TRUE)

  pdb[pdb$pathLength %in% 4, ]$sumScore <- rowSums(pdb[pdb$pathLength %in% 4 ,c('proN1','intNodeScore' ,'proG4',
                                                                                'styN1', 'intNodeScore','styN4',
                                                                                'TFscore')], na.rm=TRUE)
  pdb[pdb$pathLength %in% 3, ]$sumScore <- rowSums(pdb[pdb$pathLength %in% 3 ,c('proN1','intNodeScore' ,'proG3',
                                                                                'styN1', 'intNodeScore','styN3',
                                                                                'TFscore')], na.rm=TRUE)
  summary(pdb$sumScore)

  # Perform a large number of permutation tests
  # randomly swap rows in the data and calculate path scores
  # Create a matrix of null scores that is initialized with 0
  # if the null matrix isn't initialized with 0 then NAs will dominate
  # The presence of NAs will throw off the p-value calculation
  nullScores <- matrix(0, nrow = nrow(pdb), ncol=nPerms)
  pdb_tmp <- pdb
  for (i in 1 : nPerms){
    print(i)
    pro[ ,2] <- pro[sample(1:nrow(pro),nrow(pro), replace=FALSE), 2]
    sty[ ,2] <- sty[sample(1:nrow(sty),nrow(sty), replace=FALSE), 2]
    tdb[ ,2] <- tdb[sample(1:nrow(tdb), nrow(tdb), replace=FALSE), 2]

    pdb_tmp[ ,'proN1'] <- pro[pdb_tmp[ ,c('Node1')], ]$norm
    pdb_tmp[ ,'proG2'] <- pro[pdb_tmp[ ,c('Gene2')], ]$norm
    pdb_tmp[ ,'proG3'] <- pro[pdb_tmp[ ,c('Gene3')], ]$norm
    pdb_tmp[ ,'proG4'] <- pro[pdb_tmp[ ,c('Gene4')], ]$norm
    pdb_tmp[ ,'proG5'] <- pro[pdb_tmp[ ,c('Gene5')], ]$norm
    pdb_tmp[ ,'proG6'] <- pro[pdb_tmp[ ,c('Gene6')], ]$norm

    # Modify the matrix according to weights
    pdb_tmp[ ,'proN1'] <- c_pro_rec * pro[pdb_tmp[ ,c('Node1')], ]$norm
    pdb_tmp[ ,'proG2'] <- c_pro_int * pro[pdb_tmp[ ,c('Gene2')], ]$norm # node 2 is always an intermediate

    # Node 3 cases
    pdb_tmp[pdb_tmp$pathLength %in% c(4,5,6), ]$proG3 <- c_pro_int * pro[pdb_tmp[pdb_tmp$pathLength %in% c(4,5,6), c('Gene3')], ]$norm
    pdb_tmp[pdb_tmp$pathLength == 3, ]$proG3 <- c_pro_tf * pro[pdb_tmp[pdb_tmp$pathLength == 3,c('Gene3')], ]$norm

    # Node 4 cases
    pdb_tmp[pdb_tmp$pathLength %in% c(5,6), ]$proG4 <- c_pro_int * pro[pdb_tmp[pdb_tmp$pathLength %in% c(5,6), c('Gene4')], ]$norm
    pdb_tmp[pdb_tmp$pathLength == 4, ]$proG4 <- c_pro_tf * pro[pdb_tmp[pdb_tmp$pathLength == 4, c('Gene4')], ]$norm

    # Node 5 cases
    pdb_tmp[pdb_tmp$pathLength %in% 5, ]$proG5 <- c_pro_int * pro[pdb_tmp[pdb_tmp$pathLength %in% 5, c('Gene5')], ]$norm
    pdb_tmp[pdb_tmp$pathLength == 5, ]$proG5 <- c_pro_tf * pro[pdb_tmp[pdb_tmp$pathLength == 5, c('Gene5')], ]$norm


    #pdb_tmp[ ,'styN1'] <- sty[pdb_tmp[ ,c('Node1')], ]$norm  ## THIS IS THE PROBLEMATIC ONE...GET NO MATCHES
    pdb_tmp[ ,'styN1'] <- sty3[pdb_tmp[ ,c('Node1')], ]$norm
    pdb_tmp[ ,'styN2'] <- sty[pdb_tmp[ ,c('Node2')], ]$norm
    pdb_tmp[ ,'styN3'] <- sty[pdb_tmp[ ,c('Node3')], ]$norm
    pdb_tmp[ ,'styN4'] <- sty[pdb_tmp[ ,c('Node4')], ]$norm
    pdb_tmp[ ,'styN5'] <- sty[pdb_tmp[ ,c('Node5')], ]$norm
    pdb_tmp[ ,'styN6'] <- sty[pdb_tmp[ ,c('Node6')], ]$norm


    # Modify the matrix according to weights
    # Node 1 is always the receptor
    pdb_tmp[ ,'styN1'] <- c_sty_rec * sty3[pdb_tmp[ ,c('Node1')], ]$norm

    # node 2 is always an intermediate
    pdb_tmp[ ,'styN2'] <- c_sty_int * sty[pdb_tmp[ ,c('Node2')], ]$norm

    # Node 3 cases
    pdb_tmp[pdb_tmp$pathLength %in% c(4,5,6), ]$styN3 <- c_sty_int * sty[pdb_tmp[pdb_tmp$pathLength %in% c(4,5,6) ,c('Node3')], ]$norm
    pdb_tmp[pdb_tmp$pathLength == 3, ]$styN3 <- c_sty_tf * sty[pdb_tmp[pdb_tmp$pathLength == 3 ,c('Node3')], ]$norm

    # Node 4 cases
    pdb_tmp[pdb_tmp$pathLength %in% c(5,6), ]$styN4 <- c_sty_int * sty[pdb_tmp[pdb_tmp$pathLength %in% c(5,6) ,c('Node4')], ]$norm
    pdb_tmp[pdb_tmp$pathLength == 4, ]$styN4 <- c_sty_tf * sty[pdb_tmp[pdb_tmp$pathLength == 4 ,c('Node4')], ]$norm

    # Node 5 cases
    pdb_tmp[pdb_tmp$pathLength %in% c(6), ]$styN5 <- c_sty_int * sty[pdb_tmp[pdb_tmp$pathLength %in% c(6) ,c('Node5')], ]$norm
    pdb_tmp[pdb_tmp$pathLength == 5, ]$styN5 <- c_sty_tf * sty[pdb_tmp[pdb_tmp$pathLength == 5 ,c('Node5')], ]$norm





    pdb_tmp[ ,'TFscore'] <- c_tdb*tdb[pdb_tmp[ ,c('TF')], ]$TFscore
    pdb_tmp[ ,'ReceptorProteinExpression'] <- pro[pdb_tmp[ ,c('Node1')], ]$norm

    # Compute total scores by path
    #tmp <- rowSums(pdb_tmp[ ,c('cnaN1', 'cnaG2', 'cnaG3','cnaG4','cnaG5','cnaG6',
    #              'rnaN1', 'rnaG2', 'rnaG3','rnaG4','rnaG5','rnaG6',
    #             'mutN1', 'mutG2', 'mutG3','mutG4','mutG5','mutG6',
    #            'proN1', 'proG2', 'proG3','proG4','proG5','proG6',
    #           'styN1', 'styN2', 'styN3','styN4','styN5','styN6',
    #          'TFscore')], na.rm=TRUE)

    #pdb_tmp[,('sumScore'):=tmp]
    pdb_tmp$intNodeScore <- 0
    pdb_tmp[pdb_tmp$pathLength %in% 6, ]$intNodeScore <- rowSums(pdb_tmp[pdb_tmp$pathLength %in% 6, c('proG2', 'proG3','proG4','proG5',
                                                                                      'styN2', 'styN3','styN4','styN5')],
                                                         na.rm=TRUE) / (pdb_tmp[pdb_tmp$pathLength %in% 6, ]$pathLength-2)

    pdb_tmp[pdb_tmp$pathLength %in% 5, ]$intNodeScore <- rowSums(pdb_tmp[pdb_tmp$pathLength %in% 5, c('proG2', 'proG3','proG4',
                                                                                      'styN2', 'styN3','styN4')],
                                                         na.rm=TRUE) / (pdb_tmp[pdb_tmp$pathLength %in% 5, ]$pathLength-2)

    pdb_tmp[pdb_tmp$pathLength %in% 4, ]$intNodeScore <- rowSums(pdb_tmp[pdb_tmp$pathLength %in% 4, c('proG2', 'proG3',
                                                                                      'styN2', 'styN3')],
                                                         na.rm=TRUE) / (pdb_tmp[pdb_tmp$pathLength %in% 4, ]$pathLength-2)

    pdb_tmp[pdb_tmp$pathLength %in% 3, ]$intNodeScore <- rowSums(pdb_tmp[pdb_tmp$pathLength %in% 3, c('proG2',
                                                                                      'styN2')],
                                                         na.rm=TRUE) / (pdb_tmp[pdb_tmp$pathLength %in% 3, ]$pathLength-2)

    # Apply the cases to the overall score
    pdb_tmp$sumScore <- 0

    pdb_tmp[pdb_tmp$pathLength %in% 6, ]$sumScore <- rowSums(pdb_tmp[pdb_tmp$pathLength %in% 6 ,c('proN1','intNodeScore' ,'proG6',
                                                                                  'styN1', 'intNodeScore','styN6',
                                                                                  'TFscore')], na.rm=TRUE)
    pdb_tmp[pdb_tmp$pathLength %in% 5, ]$sumScore <- rowSums(pdb_tmp[pdb_tmp$pathLength %in% 5 ,c('proN1','intNodeScore' ,'proG5',
                                                                                  'styN1', 'intNodeScore','styN5',
                                                                                  'TFscore')], na.rm=TRUE)

    pdb_tmp[pdb_tmp$pathLength %in% 4, ]$sumScore <- rowSums(pdb_tmp[pdb_tmp$pathLength %in% 4 ,c('proN1','intNodeScore' ,'proG4',
                                                                                  'styN1', 'intNodeScore','styN4',
                                                                                  'TFscore')], na.rm=TRUE)
    pdb_tmp[pdb_tmp$pathLength %in% 3, ]$sumScore <- rowSums(pdb_tmp[pdb_tmp$pathLength %in% 3 ,c('proN1','intNodeScore' ,'proG3',
                                                                                  'styN1', 'intNodeScore','styN3',

                                                                                                                                                                'TFscore')], na.rm=TRUE)
    # Compute total scores by path
    summary(pdb_tmp$sumScore)
    nullScores[ ,i] <- pdb_tmp$sumScore
    #nullScores[ ,i] <- rowSums(pdb_tmp[ ,c('proN1', 'proG2', 'proG3','proG4','proG5','proG6',
      #                     'styN1', 'styN2', 'styN3','styN4','styN5','styN6',
        #                   'TFscore')], na.rm=TRUE) / pdb_tmp$pathLength
  }
  # APPLY VARIOUS CUTOFFS TO GET THE SCORE SIGNIFICANCE LEVELS
  cut95 <- apply(nullScores, 1, quantile, 0.95)
  cut99 <- apply(nullScores, 1, quantile, 0.99)
  cut999 <- apply(nullScores, 1, quantile, 0.999)
  cut9999 <- apply(nullScores, 1, quantile, 0.9999)

  cut05 <- apply(nullScores, 1, quantile, 0.05)
  cut01 <- apply(nullScores, 1, quantile, 0.01)
  cut001 <- apply(nullScores, 1, quantile, 0.001)
  cut0001 <- apply(nullScores, 1, quantile, 0.0001)
  #testfun <- apply(nullScores, 1, function(x) ecdf(x))

  # Setup the right and left tails
  pdb$RT <- as.numeric(pdb$RT)
  pdb$LT <- as.numeric(pdb$LT)


  # Apply the basic formula for the p-value
  # pval = (1+sum(s >= s0))/(N+1)

  for (i in 1:nrow(pdb)){
    pdb[i, ]$RT <- (1+sum(as.numeric(pdb[i, ]$sumScore >= nullScores[i, ]))) / (nPerms + 1)
    pdb[i, ]$LT <- (1+sum(as.numeric(pdb[i, ]$sumScore <= nullScores[i, ]))) / (nPerms + 1)
  }
  pdb$pval2 <- 1 - pdb$RT
  pdb[pdb$LT > pdb$RT, ]$pval2 <- 1 - pdb[pdb$LT > pdb$RT, ]$LT
  summary(pdb$pval2)
  pdb$pval2adj <- p.adjust(pdb$pval2, method="BH")
  summary(pdb$pval2adj)


  pdb[ ,('pVal') := numeric(nrow(pdb))]
  pdb[ ,pVal := pVal + 1]
  pdb[pdb$sumScore > cut95 | pdb$sumScore < cut05, ]$pVal <- 0.05
  pdb[pdb$sumScore > cut99 | pdb$sumScore < cut01, ]$pVal <- 0.01
  pdb[pdb$sumScore > cut999 | pdb$sumScore < cut001, ]$pVal <- 0.001
  pdb[pdb$sumScore > cut9999 | pdb$sumScore < cut0001, ]$pVal <- 0.0001
  #for (i in 1 : length(testfun)) {
   # tmpecdf <- testfun[[i]]
  #  pdb[i, ]$pVal <- tmpecdf(pdb[i, ]$sumScore)
  #}
  pdb[ ,('padj') := p.adjust(pdb$pVal, method='BH')]

  # Useful to return the nullScores for debugging
  #fncout <- list(nullScores, pdb)
  #return(fncout)
  #return(pdb[pdb$pVal <= 0.05, ])
  return(pdb)
}
tastymethod/TaSTY documentation built on Nov. 20, 2019, 12:05 a.m.