R/Utils.R

# Randomize columns of data
getRandomizedDatCols <- function(ncol_data, strataF, nCases, nControls, stratagroups, stratanumbers){

  randomized_data_indices <- NA
  if(length(strataF) == 1 & is.na(strataF)){
    randomized_data_indices <- sample(1:ncol_data, replace = F)
    }else{
      randomized_data_indices <- rep(0, ncol_data)
      for(i in stratagroups){
        stratainds <- which(stratanumbers == i)
        samplestratainds <- sample(stratainds, size = length(stratainds), replace = F)
        randomized_data_indices[stratainds] <- samplestratainds
      }
    }
    return(randomized_data_indices)

  }



# Get the matrix whose rows are randomized columns of the data
getRandIndicesMat <- function(ncol_data, iterations, strataF, nCases, nControls, stratagroups, stratanumbers){

  RandIndicesMat <- matrix(0, nrow = iterations, ncol = ncol_data)

  if(length(strataF) == 1 & is.na(strataF)){

    for(i in 1:iterations){
      randomized_cols <- getRandomizedDatCols(ncol_data, strataF, nCases, nControls, NA, NA)
      randomized_cols <- randomized_cols - 1
      RandIndicesMat[i,] <- randomized_cols
    }

    }else{

      for(i in 1:iterations){
        randomized_cols <- getRandomizedDatCols(ncol_data, strataF, nCases, nControls, stratagroups, stratanumbers)
        randomized_cols <- randomized_cols - 1
        RandIndicesMat[i,] <- randomized_cols
      }

    }

    return(RandIndicesMat)

  }


# Convert uids to protein symbols in paths
Uid2Symbol <- function(Ents, UserKpaths){

  splitted <- strsplit(as.character(UserKpaths), split = "->")
  symbpaths <- rep("", length(splitted))

  for(i in 1:length(splitted)){

    uids <- as.integer(splitted[[i]])
    inds <- match(uids, Ents$uid)
    symbs <- Ents$symbol[inds]
    symbpaths[i] <- paste(symbs, collapse = " -> ")

  }

  return(symbpaths)

}



# Convert signed uids to signed protein symbols in paths
SignUid2SignSymbol <- function(Ents, UserKsignpaths){

  splitted <- strsplit(as.character(UserKsignpaths), split = " -> ")
  symbpaths <- rep("", length(splitted))

  for(i in 1:length(splitted)){

    uids <- c()
    signs <- c()
    for(j in splitted[[i]]){ # j has the form: uid (sign)
      words <- strsplit(j, split = " ")[[1]]
      uids <- c(uids, words[1])
      signs <- c(signs, words[2])
    }

    uids <- as.integer(uids)
    inds <- match(uids, Ents$uid)
    symbs <- Ents$symbol[inds]
    symbpaths[i] <- paste(paste(symbs, signs, sep = " "), collapse = " -> ")

  }

  return(symbpaths)

}


# Prepare data to be in the order in which it will be joined to its corresponding path.
# trgs_toB_joined is a vector of uids whose data will be joined respectively
# to some path.
matchData <- function(genes_data, trgs_toB_joined){

  genes_data <- genes_data[match(trgs_toB_joined, genes_data$uid),]
  data <- as.matrix(genes_data[,3:ncol(genes_data)])
  return(data)

}


matchDataIndices <- function(genes_data, trgs_toB_joined){

  matched_indices <- match(trgs_toB_joined, genes_data$uid)
  return(matched_indices)

}



# Get a list of empty paths
getListEmptyPaths <- function(len_counts){

  lst <- list()
  lst[["counts_pos"]] <- rep(0, len_counts)
  lst[["counts_neg"]] <- rep(0, len_counts)
  lst[["counts_conflict"]] <- rep(0, len_counts)
  lst[["cols_pos"]] <- vector(mode="numeric", length=0)
  lst[["cols_neg"]] <- vector(mode="numeric", length=0)
  lst[["cols_conflict"]] <- vector(mode="numeric", length=0)
  lst[["locations_pos"]] <- rep(0, len_counts)
  lst[["locations_neg"]] <- rep(0, len_counts)
  lst[["locations_conflict"]] <- rep(0, len_counts)
  return(lst)

}

# This function computes the values table
getValuesTable <- function(nCases, nControls){

  probTable <- matrix(NA, nCases + 1, nCases + nControls + 1)
  valuesTable <- matrix(NA, nCases + 1, nControls + 1)
  n <- nCases + nControls
  for (i in 0:n) {
    i_vector = (max(0, (i - nControls)):min(i, nCases))
    probTable[i_vector + 1, i + 1] = dhyper(i_vector, nCases, nControls, i)
  }

  ## Now compute the p-values
  for (i in 0:n) {
    i_min = max(0, (i - nControls))
    i_max = min(i, nCases)
    i_vector =  i_min:i_max
    prob_dist = probTable[(i_min:i_max + 1), i+1]
    two_sided_pvalues = sapply(prob_dist, function(x){sum(prob_dist[prob_dist <= x])})
    valuesTable[cbind(i_vector + 1, i - i_vector + 1)] = -log(two_sided_pvalues)
  }
  valuesTable[is.infinite(valuesTable)] = max(valuesTable[is.finite(valuesTable)]) + 1;
  return(valuesTable)

}



# Preprocess phenotype data
PreprocessTable <- function(dataset, threshold_percent, nCases, nControls){

  df <- read.table(dataset, header = T, stringsAsFactors = F)

  if(names(df)[1] != "symbols"){
    stop("First column must be named symbols and must contain the gene symbols!")
  }

  df <- df[!is.na(df$symbols),]
  df <- df[!duplicated(df$symbols),]
  df[df == 2] <- 1

  if((nCases + nControls) != (ncol(df)-1)){
    stop("The number of patients in the dataset must be equal to nCases + nControls!")
  }

  if(any(df[,2:ncol(df)] != 0 & df[,2:ncol(df)] != 1)){
    stop("The patient columns must consist of only 0,1 or 2 entries!")
  }

  # Keep genes with frequencies less than threshold_percent
  freqs <- rowSums(df[,2:ncol(df)], na.rm = T)
  targetFreq <- threshold_percent * ncol(df)
  inds <- which(freqs <= targetFreq)
  df <- df[inds,]

  genes <- as.vector(df[,1])
  patients <- colnames(df[,2:ncol(df)])
  data <- as.matrix(df[,2:ncol(df)])
  rm(df)
  results <- list(data = data, genes = genes, patients = patients)
  return(results)

}



# Check the input parameters
check_input <- function(nCases, nControls, method, threshold_percent, K, pathLength, iterations, nthreads){

  if(!is.numeric(nCases) | length(nCases) != 1 | floor(nCases) != nCases | nCases < 2){
    stop("nCases must be an integer >= 2!")
  }

  if(!is.numeric(nControls) | length(nControls) != 1 | floor(nControls) != nControls | nControls < 2){
    stop("nControls must be an integer >= 2!")
  }

  if(!is.numeric(K) | length(K) != 1 | floor(K) != K | K < 1 | K > 10000){
    stop("K must be an integer >= 1 and <= 10000!")
  }

  # methods can have additional tags to specify an implementation (eg 'method2-lomem') --dmitri
  if(!grepl('^method[12]', method)) {
    stop("method must be one of: 'method1', 'method2'!")
  }

  if(!is.numeric(threshold_percent) | length(threshold_percent) != 1 | threshold_percent > 1 | threshold_percent <= 0){
    stop("threshold_percent must be a real number > 0 and <= 1!")
  }

  if(!is.numeric(pathLength) | length(pathLength) != 1 | floor(pathLength) != pathLength | pathLength > 5 | pathLength < 1){
    stop("pathLength must be an integer >= 1 and <= 5!")
  }

  # disabling permutations (==0) is useful for testing -- dmitri
  if(!is.numeric(iterations) | length(iterations) != 1 | floor(iterations) != iterations | iterations < 0){
    stop("iterations must be an integer greater than or equal to 0!")
  }

  if((nCases + nControls) > 65536) stop("Package cannot process data set with more than 65536 columns!")

  if(length(nthreads) != 1 | (!is.na(nthreads) & (!is.numeric(nthreads) | nthreads != floor(nthreads)))) stop("nthreads must be set to a postive integer, otherwise should be set to NA!")

  # if(!dir.exists(path)) stop("Please provide a valid path that the package can use to write and read data!")

}



# For columns <= nCases will be set to 1 if they are permuted and remain <= nCases.
# For columns > nControls will be set to 1 if they are permuted and remain > nCases.
getCaseORControl <- function(RandIndicesMat, nCases, nControls){

  CaseORControl <- matrix(0, nrow = nrow(RandIndicesMat), ncol = ncol(RandIndicesMat))

  for(i in 1:nrow(CaseORControl)){
    for(j in 1:nCases){
      CaseORControl[i,j] = ifelse(RandIndicesMat[i,j] < nCases, 1, 0)
    }
    for(j in (nCases+1):(nCases+nControls)){
      CaseORControl[i,j] = ifelse(RandIndicesMat[i,j] >= nCases, 1, 0)
    }

  }

  return(CaseORControl)

}
carltonyfakhry/geneticsCRE documentation built on May 13, 2019, 12:49 p.m.