R/data_maplist_utils.R

Defines functions ReadListFile .parseListInput MapMultiListIds MapListIds

##################################################
## R script for ExpressAnalyst
## Description: functions for mapping gene list ids (single or multiple)
##
## Authors: 
## Jeff Xia, jeff.xia@mcgill.ca
## Guangyan Zhou, guangyan.zhou@mail.mcgill.ca
###################################################


#'Mapping user supplied IDs
#'@description Mapping user supplied IDs to internal IDs (in most cases Entrez)
#'@param dataSetObj Input the name of the created datasetObj (see Init.Data).
#'@param listNm File name of the list
#'@param geneIDs Raw string of the gene list
#'@param org Three letters annotation of organism (i.e hsa, mmu)
#'@param idType The orginal ID type
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
#'
MapListIds <- function(listNm, geneIDs, org, idType){
  #geneIDs is text one string, need to make to vector
  paramSet <- readSet(paramSet, "paramSet");
  msgSet <- readSet(msgSet, "msgSet");
  dataSet <- list();
  dataSet$name <- listNm;
  dataSet$orig <- geneIDs;
  current.msg <- NULL;
  paramSet$data.org <- org;
  paramSet$data.idType <- idType;
  listNms <- vector();
  res <- .parseListInput(geneIDs, paramSet, msgSet);
  dataList <- res[[1]];
  paramSet <- res[[2]];
  msgSet <- res[[3]];
  paramSet$numOfLists <- length(dataList)

  all.prot.mat <- list(); 
  inx <- 0;
  for(i in 1:length(dataList)){
    dataSet$name = paste0("datalist", i);
    listNms[i] = dataSet$name;
    gene.mat <- prot.mat <- dataList[[i]];
    GeneAnotDB <-.doGeneIDMapping(rownames(gene.mat), idType, paramSet$data.org, "table");
    na.inx <- is.na(GeneAnotDB[,1]) | is.na(GeneAnotDB[,2]);
    if(sum(!na.inx) < 2){
      msgSet$current.msg <- "Less than two hits found in uniprot database. "; 
    }
    rownames(prot.mat) <- GeneAnotDB[,2];
    prot.mat <- prot.mat[!na.inx, , drop=F];
    
    # now merge duplicates
    res <- RemoveDuplicates(prot.mat, "mean", quiet=T, paramSet, msgSet, inx); 
    prot.mat <- res[[1]];
    msgSet <- res[[2]];
    dataSet$listInx <- paste0("datalist", inx);
    if(is.null(dim(prot.mat))){
      prot.mat <- matrix(prot.mat);
    }
    
    seed.proteins <- rownames(prot.mat);
    dataSet$GeneAnotDB <- GeneAnotDB;
    dataSet$sig.mat <- gene.mat;
    dataSet$prot.mat <- prot.mat;
    dataSet$seeds.proteins <- seed.proteins;
    if(i == 1){
      all.prot.mat <- prot.mat;
      totalseed.proteins <- seed.proteins;
    }else{
      totalseed.proteins <- c(totalseed.proteins, seed.proteins);
      all.prot.mat <- rbind(all.prot.mat, prot.mat)
    }
    inx <- inx + 1;
    RegisterData(dataSet); 
  }
  paramSet$all.ent.mat <- all.prot.mat;
  rownames(all.prot.mat) = doEntrez2SymbolMapping(rownames(all.prot.mat), paramSet$data.org, paramSet$data.idType);
  all.prot.mat <- data.frame(as.numeric(all.prot.mat[,1]), rownames(all.prot.mat));
  paramSet$all.prot.mat <- all.prot.mat;
  paramSet$listNms <- listNms;
  mdata.all <- list();
  for(i in 1:length(listNms)){
    mdata.all[i] <- 1;
  }
  names(mdata.all) <- listNms;
  paramSet$partialToBeSaved <- c( paramSet$partialToBeSaved, listNms);
  saveSet(paramSet, "paramSet");
  paramSet$mdata.all <- mdata.all
  paramSet$anal.type <- "genelist";

  saveSet(paramSet, "paramSet");
  saveSet(msgSet, "msgSet");
  return(RegisterData(dataSet));
}


MapMultiListIds <- function(listNm, org, geneIDs, type){
  paramSet <- readSet(paramSet, "paramSet");
  msgSet <- readSet(msgSet, "msgSet");
  dataSet <- list();
  dataSet$orig <- "";
  msgSet$current.msg <- NULL;
  paramSet$data.org <- org;
  paramSet$data.idType <- type;

  listNms <- multiFileNamesU; # multiFilesNamesU assigned from java
  paramSet$numOfLists <-length(multiFileNamesU);
  notOk = 0
  for(i in 1:length(listNms)){
    dataSet = readDataset(listNms[i])
    dataSet$name <- listNms[i];
    gene.mat <- prot.mat <- dataSet$prot.mat;
    GeneAnotDB <-.doGeneIDMapping(rownames(gene.mat), idType, paramSet$data.org, "table");
    
    na.inx <- is.na(GeneAnotDB[,1]) | is.na(GeneAnotDB[,2]);
    if(sum(!na.inx) < 2){
      msgSet$current.msg <- paste0("Less than two hits found in database for ", listNms[i]);
      saveSet(msgSet, "msgSet");
      return(0);
    }
    rownames(prot.mat) <- GeneAnotDB[,2];
    prot.mat <- prot.mat[!na.inx, , drop=F];
    
    # now merge duplicates
    dataSet$listInx <- listNms[i];
    res <- RemoveDuplicates(prot.mat, "mean", quiet=F, paramSet, msgSet); 
    prot.mat <- res[[1]];
    msgSet <- res[[2]];
    if(is.null(dim(prot.mat))){
      prot.mat <- matrix(prot.mat);
    }
    
    seed.proteins <- rownames(prot.mat);
    dataSet$GeneAnotDB <- GeneAnotDB;
    dataSet$sig.mat <- gene.mat;
    dataSet$prot.mat <- prot.mat;
    dataSet$seeds.proteins <- seed.proteins;
    if(i == 1){
      all.prot.mat <- prot.mat;
      totalseed.proteins = seed.proteins
    }else{
      totalseed.proteins  = c(totalseed.proteins, seed.proteins);
      all.prot.mat <- rbind(all.prot.mat, prot.mat)
    }
    RegisterData(dataSet); 
  }
  paramSet$all.ent.mat <- all.prot.mat
  rownames(all.prot.mat) = doEntrez2SymbolMapping(rownames(all.prot.mat), paramSet$data.org, paramSet$data.idType)
  all.prot.mat <- data.frame(as.numeric(all.prot.mat[,1]), rownames(all.prot.mat));
  paramSet$all.prot.mat <- all.prot.mat
  paramSet$listNms <- listNms
  mdata.all <- list();
  for(i in 1:length(listNms)){
    mdata.all[i] <- 1;
  }
  names(mdata.all) <- listNms;
  paramSet$mdata.all <- mdata.all;

  paramSet$anal.type <- "genelist";
  saveSet(paramSet, "paramSet");

  return(RegisterData(dataSet));
}


#########################################
##########################################
############# private utility methods #### 
##########################################
##########################################
# given a text from input area: one or two-col entries (geneID and logFC)
# parse out return the a matrix containing the logFc, with rows named by gene ID
# if no 2nd col (logFC), the value will be padded by 0s
.parseListInput <- function(geneIDs, paramSet, msgSet){
  spl <- unlist(strsplit(geneIDs, "\\//")[1]);
  spl <- spl[unlist(lapply(spl,function(x){!x %in% ""}))]
  spl <- lapply(spl,function(x){gsub("\\/", "",x)})
  dataList <- list();
  inxU <- 0;
  for (i in 1:length(spl)){
    lines <- unlist(strsplit(spl[[i]], "\r|\n|\r\n")[1]);
    # remove the beginning & trailing space 
    lines <- sub("^[[:space:]]*(.*?)[[:space:]]*$", "\\1", lines, perl=TRUE);
    if(substring(lines[1],1,1)=="#"){
      lines <- lines[-1];
    }
    gene.lists <- strsplit(lines, "\\s+");
    gene.mat <- do.call(rbind, gene.lists);
    
    if(dim(gene.mat)[2] == 1){ # add 0
      gene.mat <- cbind(gene.mat, rep(0, nrow(gene.mat)));
      msgSet$current.msg <- "Only one column found in the list. Abundance will be all zeros. ";
    }else if(dim(gene.mat)[2] > 2){
      gene.mat <- gene.mat[,1:2];
      msgSet$current.msg <- "More than two columns found in the list. Only first two columns will be used. ";
    }

    rownames(gene.mat) <- gene.mat[,1];
    gene.mat <- gene.mat[,-1, drop=F];
    res <- RemoveDuplicates(gene.mat, "mean", quiet=F, paramSet, msgSet); 
    gene.mat <- res[[1]];
    msgSet <- res[[2]];
    good.inx <- !is.na(gene.mat[,1]);
    gene.mat <- gene.mat[good.inx, , drop=F];
    dataList[[i]] <- gene.mat;
  }
  return(list(dataList, paramSet, msgSet));
}

#'Read gene list
#'@description read gene list, Fold Change (optional) and store the data in dataSet list object;
#'@param fileName file name of the data, .txt format
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
#'
ReadListFile <- function(fileName) {
  paramSet <- readSet(paramSet, "paramSet");
  dat1 <- data.table::fread(fileName, header=FALSE, check.names=FALSE, data.table=FALSE);
  dataSet$name <- fileName
  rowNms <- dat1[,1]
  if(length(dat1) == 1){
    dat1[,1] <- 0
  }else{
    dat1[,1] <- dat1[,2]
    dat1 <- dat1[,-2];
  }
  dataSet$prot.mat <- as.matrix(dat1)
  rownames(dataSet$prot.mat) <- rowNms;

  paramSet$anal.type <- "genelist";
  saveSet(paramSet, "paramSet");

  saveDataset(dataSet);# keep original copy, not in mem
  return(1)
}
xia-lab/NetworkAnalystR documentation built on Jan. 10, 2023, 4:47 a.m.