R/readData.R
In iCheck: QC Pipeline and Data Analysis Tools for High-Dimensional Illumina mRNA Expression Data

Documented in sortExpressionSet

###########
# created on June 10, 2010
#  (1) read in data
#
###########

########################################
# subset an ExpressionSet
extractSamples<-function(es,
  GCflag = TRUE,
  GCIDsymbols=c("128115", "Hela", "Brain"), 
  hybName = "Hybridization_Name", 
  verbose=TRUE)
{
  tmp<-extractIDs3(pDat=pData(es), mySet=GCIDsymbols, 
		   hybName=hybName,  
		   verbose=verbose)
  if(is.null(tmp))
  {
    pos<-numeric(0)
  } else {
    pos<-tmp$pos
  }

  if(GCflag)
  {
    if(length(pos))
    {
      es2<-es[, pos]
    } else {
      cat("In function 'extractSamples', no GC arrays in the ExpressionSet! NULL will be returned\n")
      return(NULL)
    }
  } else {
    if(length(pos))
    {
      es2<-es[, -pos]
    } else {
      es2<-es
    }
  }
  return(es2)
}
# obtain Subject ID for all arrays
# and for GC arrays.
# The first part of 'sampleIDs' must be Subject ID
# The first part of 'sampleIDs' and other parts of 'sampleIDs'
#  must be separated by '_'
extractIDs<-function(sampleIDs, 
  mySet=c("128115", "Hela", "Brain"), verbose=TRUE)

{
  hn<-as.character(sampleIDs)
  if(is.null(hn))
  {
    cat("The function argument 'sampleIDs' is empty!\n")
    return(NULL)
  }
  IDs<-sapply(hn, function(x) {
      x<-as.character(x)
      tt<-strsplit(x, split="_")
      res<-tt[[1]][1]
      return(res)
    }
  )
  rm(hn)

  sampleIDs.sub<-numeric(0) 
  pos<-numeric(0)
  if(length(mySet))
  { pos<-which(toupper(IDs) %in% toupper(mySet))
    n.pos<-length(pos)
    if(n.pos)
    {
      sampleIDs.sub<-sampleIDs[pos]
    }   
  } else {
    cat("The function argument 'mySet' is empty!\n")
    return(NULL)
  } 
  res<-list(pos=pos, sampleIDs.sub=sampleIDs.sub, IDs=IDs)
  return(res)
}

extractIDs3<-function(pDat, 
  mySet=c("128115", "Hela", "Brain"), 
  hybName = "Hybridization_Name", 
  verbose = TRUE)

{
  sm<-as.character(pDat[, c(hybName)])

  res<-extractIDs(sampleIDs=sm, mySet=mySet, verbose=FALSE)
  ttpos2<-res$pos

  pos<-unique(ttpos2)

  if(length(pos))
  {
    sampleIDs.sub<-sm[pos]
  } else {
    sampleIDs.sub<-NULL
  }

  hn<-as.character(sm)
  IDs<-sapply(hn, function(x) {
      x<-as.character(x)
      tt<-strsplit(x, split="_")
      res<-tt[[1]][1]
      return(res)
    }
  )
  rm(hn)

  res<-list(pos=pos, sampleIDs.sub=sampleIDs.sub, IDs=IDs)
  return(res)
}



########################################

# chipNames -- a vector of chip names 
# chipDir -- directory of arrays
# metaFile -- file name for meta file
#             meta file should be comma delimited with header line
#             and na.strings="NA"
# metaDir -- directory of meta file
# pDat -- a pheno data frame (rows are subjects and columns are variables)
#         (must contain columns: Hybridization_Name)
# mFileName -- file name including full directory path of manifest file
#         containing manifest data (rows are probes and columns are subjects) 
#          (must contain columns: Array_Address_Id
#          Symbol, Chromosome, and ILMN_Gene)
# GConly -- logical. Indicates if only Genetic Control Samples will be output.
#
# outObjFilePrefix - output object file name without file extension
#
#manifestDir="/proj/reilms/reilm00/GX/Manifests/",
readData<-function(
  path, 
  pDat, 
  pattern="SampleProbeProfile.txt",
  QCpattern="QCProbeProfile.txt",
  GCid = c("128115", "Hela", "Brain"),
  GConly = FALSE, 
  manifestDir="Manifests/",
  nuIDFlag = FALSE,
  combineRule = c("union", "intersection"),
  hybName = "Hybridization_Name",
  chipManifest = "Chip_Manifest",
  probeID = "ProbeID",
  arrayAddressID = "Array_Address_Id", 
  probeSeq = "Probe_Sequence",
  reporterGroupName = "Reporter_Group_Name",
  saveFlag = TRUE, 
  outObjFileDir=NULL,
  outObjFilePrefix="es",
  outObjFilePrefixQC="esQC",
  verbose=TRUE
  )
{
  mywd = getwd()

  # read sample probe profile
  cat("\n*** read sample probe profile>>>\n")
  es.raw<-readData.default(
    path=path, 
    pDat=pDat, 
    pattern=pattern,
    sampleProbeFlag = TRUE,
    GCid = GCid, 
    GConly = GConly, 
    manifestDir=manifestDir,
    nuIDFlag = nuIDFlag,
    combineRule = combineRule,
    hybName = hybName,
    chipManifest = chipManifest,
    probeID = probeID,
    arrayAddressID = arrayAddressID, 
    probeSeq = probeSeq,
    saveFlag = FALSE,
    outObjFileDir=outObjFileDir,
    outObjFilePrefix=outObjFilePrefix,
    verbose=verbose
    )

  # read QC probe profile
  cat("\n*** read QC probe profile>>>\n")
  esQC.raw<-readData.default(
    path=path, 
    pDat=pDat, 
    pattern=QCpattern,
    sampleProbeFlag = FALSE,
    GCid = GCid, 
    GConly = GConly, 
    manifestDir=manifestDir,
    nuIDFlag = nuIDFlag,
    combineRule = combineRule,
    hybName = hybName,
    chipManifest = chipManifest,
    probeID = probeID,
    arrayAddressID = arrayAddressID, 
    probeSeq = probeSeq,
    saveFlag = FALSE,
    outObjFileDir=outObjFileDir,
    outObjFilePrefix=outObjFilePrefixQC,
    verbose=verbose
    )

  sn<-sampleNames(es.raw)
  sn.QC<-sampleNames(esQC.raw)

  pos<-match(sn, sn.QC)
  #if(length(sn) != length(sn.QC) || any(is.na(pos)==TRUE))
  if(!identical(sort(sn), sort(sn.QC)))
  {
    sn.diff<-setdiff(sn, sn.QC)
    snQC.diff<-setdiff(sn.QC, sn)
    if(length(sn.diff))
    {
      cat("\n*************************\n")
      cat("The following arrays which are in sample probe profile file\n")
      cat("but are not in QC probe profile file!\n")
      print(sn.diff) 
      cat("\n")
    }
    if(length(snQC.diff))
    {
      cat("\n*************************\n")
      cat("The following arrays which are in QC probe profile file\n")
      cat("but are not in sample probe profile file!\n")
      print(snQC.diff) 
      cat("\n")
    }

    setwd(mywd)
    stop("array names in sample probe files do not match those in QC probe files")
  } 
  esQC.raw<-esQC.raw[,pos]

  ########
  # added controlData slot to es.raw, and esQC.raw.
  # without controlData slot, we can not do bgAdjust.
  ########
  cat("\n**** adding controlData slot>>\n")
  dat.QC<-exprs(esQC.raw)
  rn<-rownames(dat.QC)
  mat.QC<-as.matrix(dat.QC)

  fDat.QC<-fData(esQC.raw)
  if(!identical(rn, as.character(fDat.QC[, c(probeSeq)])))
  {
    setwd(mywd)
    stop(paste("rownames(dat.QC) != fDat.QC$", probeSeq, sep=""))
  }
  rownames(mat.QC)<-as.character(fDat.QC[, c(reporterGroupName)])
  control<-as.data.frame(mat.QC)
  
  es.raw@controlData<-control
  esQC.raw@controlData<-control

  if(saveFlag)
  {
    if(verbose)
    { cat("\n****************************\n") 
      cat("\nSaving raw data>>>>>>>\n")
    }
  
    if(!is.null(outObjFileDir))
    { 
      setwd(outObjFileDir)
    } else {
      outObjFileDir = getwd()
    }
    outFile.u<-paste(outObjFilePrefix, ".raw.rda", sep="")
    outFileQC.u<-paste(outObjFilePrefixQC, ".raw.rda", sep="")
    if(verbose)
    { cat("output file dir is>>", outObjFileDir, "\n")
      cat("output file is>>", outFile.u, "\n")
      cat("output QC file is>>", outFileQC.u, "\n")
    }
    
    save(es.raw, file=outFile.u)
    save(esQC.raw, file=outFileQC.u)
  
  }
  setwd(mywd)

  res<-list(es.raw=es.raw, esQC.raw=esQC.raw)
  invisible(res)
}

# pDat must contain a column 'Hybridization_Name'
# to uniquely define each array
readData.default<-function(
  path, 
  pDat, 
  pattern="SampleProbeProfile.txt",
  sampleProbeFlag = TRUE,
  GCid = c("128115", "Hela", "Brain"),
  GConly = FALSE, 
  manifestDir="Manifests/",
  nuIDFlag = FALSE,
  combineRule = c("union", "intersection"),
  hybName = "Hybridization_Name",
  chipManifest = "Chip_Manifest",
  probeID = "ProbeID",
  arrayAddressID = "Array_Address_Id", 
  probeSeq = "Probe_Sequence",
  saveFlag = TRUE,
  outObjFileDir=NULL,
  outObjFilePrefix="esGC",
  verbose=TRUE
  )
{
  combineRule = match.arg(arg=combineRule, 
    choices=c("union", "intersection"))

  if(is.null(pDat))
  {
    stop("pDat is NULL!\n")
  }

  # check if the directory specified by
  # 'path' exists
  mywd = getwd()
  setwd(path)
  setwd(mywd)

  # check if the directory specified by
  # 'manifestDir' exists
  setwd(manifestDir)
  setwd(mywd)

  # check if the directory specified by
  # 'outObjFileDir' exists
  if(!is.null(outObjFileDir))
  { 
    setwd(outObjFileDir)
    setwd(mywd)
  } else {
    outObjFileDir <- getwd()
  }

  flag<-requireNamespace("lumi")
  if(!flag)
  { 
    setwd(mywd)
    stop("library 'lumi' is required to do vsn normalization!\n") 
  }
  
  hn<-as.character(pDat[, c(hybName)])
  if(is.null(hn))
  {
    setwd(mywd)
    stop(paste("phenotype data 'pDat' should contain the column titled ", hybName, sep=""))
  }
  if(length(unique(hn)) != length(hn))
  {
    setwd(mywd)
    stop(paste("The value of ", hybName, " must be unique!", sep=""))
  }
  tmpres<-extractIDs3(pDat=pDat, mySet=GCid, 
		      hybName=hybName,  
		      verbose=verbose)
  pos<-tmpres$pos
  n.pos<-length(pos)
  if(verbose)
  {
    cat("\n\n************************\n")
    cat("No. of arrays in pDat=", nrow(pDat), "\n")
    cat("No. of GC arrays in pDat=", n.pos, "\n")
    cat("\n\n************************\n")
  }
  if(GConly)
  {
    if(n.pos)
    {
      pDat<-pDat[pos, ,drop=FALSE] 
    } else {
      setwd(mywd)
      stop("No Genetic Controls samples in the pDat data set!\n")
    }
  }

  setwd(path)

  targs = list.files(path, pattern=pattern, 
    recursive=TRUE, full.names=TRUE)

  nFiles<-length(targs)
  if(verbose)
  {
    cat("\n************\n")
    cat("Number of", pattern, "  files = ", nFiles, "\n")
    cat("targs>>>\n"); print(targs); cat("\n");
    cat("\n************\n")
    if(nFiles==0)
    {
      setwd(mywd)
      stop("No files with naming containing the phrase ", pattern, " are found!\n")
    }

    cat("\n**************************\n")
    cat("Reading ", pattern, " files>>>>\n")
  }
  count <- 0
  for (i in 1:nFiles) 
  {
    if(verbose)
    {
      cat("\n ******* reading files .....\n")
      cat("\ni=", i, " targs[i]>>", targs[i], "\n")
    }
    ########
    # read SampleProbeProfile
    ########
    tmp = lumiR(targs[i], QC=FALSE)

    sn<-sampleNames(tmp)

    # since lumiR will automatically added suffix to duplicated arrays
    # so the following condition should be always be FALSE.
    # If not, then there must be something wrong. We stop the program then
    # to check what's wrong.
    tt<-table(sn)
    posi<-which(tt>1)
    n.posi<-length(posi)
    if(n.posi)
    {
      sn.sub<-names(tt[posi])
      tmpframe<-data.frame(sampleName=sn.sub, fileName=rep(targs[i], n.posi))
      cat("\n************************\n")
      cat("length(unique(sampleNames(tmp))) =", length(unique(sn)) , "< length((sampleNames(tmp))) =", length((sn)) , "\n")
      cat("The following sample ids have duplicates\n")
      print(tmpframe)
      cat("\n************************\n")
      setwd(mywd) 
      stop("sampleNames in a LumiBatch object should be unique!")

    }

    noGC = FALSE

    if(GConly)
    {
      tmpres<-extractIDs(sampleIDs=sampleNames(tmp), mySet=GCid, verbose=verbose)
      pos<-tmpres$pos
      if(length(pos)==0)
      { 
        noGC <- TRUE
      } else {
        tmp<-tmp[,pos]
      } 
    }

    if(!noGC || (!GConly)) # contains GC samples
    {
      count <- count + 1
      if(verbose)
      { cat("dim(tmp)>>\n"); print(dim(tmp)); cat("\n"); 
        if(count > 1)
        {
          cat("\nCombining LumiBatch objects>>>\n")
        }
      }

      # get probe ids
      probeName.i<-as.numeric(as.character(featureNames(tmp)))
  
      # some arrays have different orders of rows
      # so we have to sorted rows for each array
      # sort probe IDs
      tmpmat.i<-cbind(probeName.i, 1:length(probeName.i))
      tmpmat.i2<-tmpmat.i[order(tmpmat.i[,1]),]
      rm(tmpmat.i)
     
      # sort the rows of mat.i by probe IDs
      # the reason to do this is to avoid
      # the cases in which some chips have different orders
      # to put probe IDs (i.e. row names are different by orders)
      x.lumi.i<-tmp[tmpmat.i2[,2],]
      rm(tmp)
      rm(tmpmat.i2)

      # add feature data  
      # match the samples in expression file with those in pData file
      # so that we can know which chip version they from
      # Hence, we can read in feature data file and extract 
      # sequence info for merging purpose
      sampleIDs.i<-sampleNames(x.lumi.i)
      aa<-match(sampleIDs.i, as.character(pDat[, c(hybName)]))
      tmppos<-which(is.na(aa)==FALSE)
      if(length(tmppos))
      { 
        chipMani<-as.character(pDat[, c(chipManifest)])
        chipver<-as.character(chipMani[aa[tmppos]])
        if(any(is.na(chipver)==TRUE))
        {
          setwd(mywd)
          #stop("There are empty chip version in pDat!")
          cat("\nWarning: There are empty chip version in pDat!")
        }
        if(length(table(chipver))!=1)
        { setwd(mywd) 
          print(table(chipver, useNA="ifany"))
          #stop("The arrays in the same SampleProbeProfile are from different chip version!\n")
          cat("\nWarning: The arrays in the same SampleProbeProfile are from different chip version!\n")
        }
        setwd(manifestDir)
        manifestFileVec = list.files(manifestDir, pattern=unique(chipver), 
          recursive=TRUE, full.names=TRUE)
        setwd(mywd)
        if(length(manifestFileVec)!=1)
        { cat("chip version>>\n"); print(chipver); cat("\n"); 
          setwd(mywd)
          #stop("There is no manifest file match the chip version!\n")
          cat("\nWarning: There is no manifest file match the chip version!\n")
        }
      
        manifestFile.i<-manifestFileVec[1] 
        if(count ==1 )
        { manifestFile = manifestFile.i }
        if(count==1 || !identical(manifestFile, manifestFile.i))
        {
          # read feature data file 
          tmp.bgx<-readbgx(filename=manifestFile.i, sep="\t",
            quote="", header=TRUE, probeStart="[Probes]",
            controlStart="[Controls]") 
          if(sampleProbeFlag)
          { 
            probeAnno.i<-tmp.bgx$probeAnno
          } else {
            probeAnno.i<-tmp.bgx$controlAnno
          }
        } 
        if(verbose)
        { cat("Adding manfiest data>>>>\n") }
        tt<-addfDat(x.lumi.i, probeAnno.i, 
          featureID.es=probeID, featureID.fDat=arrayAddressID,
          probeSeq = probeSeq, 
          verbose=verbose)
        if(!is.null(tt))
        {
          x.lumi.i<-tt
        } else {
          setwd(mywd)
          stop("Error occurs in function 'addfDat'!")
        }
      } else {
        setwd(mywd)
        cat("\n********************\n")
        cat("\nSamples in the expression files>>>\n")
        print(sampleIDs.i)
        cat("\n********************\n")
        stop("All samples in expression sets are not in pDat!\n")
      }
      setwd(mywd) 


      # use Probe_Sequence as featureNames
      # but Probe_Sequence might not be unique
      # that is, some Probe_Sequence correspond to multiple ProbeIDs
      # merge duplicates and replace feature names
      #  by featureID.es.
      tt<-mergeDuplicates(x.lumi.i, featureID.es=probeSeq)
      if(!is.null(tt))
      {
        x.lumi.i<-tt
      } else {
        setwd(mywd)
        stop("Error occurred in function 'mergeDuplicates'!")
      }

      manifestFile <- manifestFile.i
      if(count>1)
      { 
        tti<-myCombine(x=x.lumi, y=x.lumi.i, combineRule=combineRule, suffix=paste("_", i, sep=""))
        if(is.null(tti))
        {
          setwd(mywd)
          stop("Error in myCombine!\n")
        }
        x.lumi<-tti
      } else {
        x.lumi <- x.lumi.i
      }
      rm(x.lumi.i)
    } else if(GConly && noGC) {
      if(verbose)
      { cat("\n***********\n")
        setwd(mywd)
        stop("GConly=TRUE, but no GC samples!\n")
        cat("\n***********\n")
      }
    }
  }
  setwd(mywd)
  if(count==0)
  {
    setwd(mywd)
    stop("No required expression data avaiable!\n")
  }

  if(verbose)
  { cat("\n****************************\n") 
    cat("merging expression data with phenotype data>>>\n")
  }

  newname.mRNA=as.character(sampleNames(x.lumi))
  subj.pDat<-as.character(pDat[, c(hybName)])

  tt<-match(as.character(newname.mRNA), as.character(subj.pDat))
  pos<-which(is.na(tt)==FALSE)
  if(length(pos)==0)
  {
    setwd(mywd)
    stop("subjects in pDat do not match those in expression data!\n")
  }
  if(length(pos)<length(newname.mRNA))
  {
    if(verbose)
    { cat("\n*********\n")
      cat("\nWarnings: some subjects in expression data do not match those in pDat!\n")
    }
    pos.na<-which(is.na(tt)==TRUE)
    if(verbose)
    {
      cat("These subjects are>>\n")
      print(newname.mRNA[pos.na])
      cat("\n")
      cat("These arrays will be dropped!\n")
      cat("\n*********\n")
    }
  }
  x.lumi<-x.lumi[,pos]
  newname.mRNA<-sampleNames(x.lumi)
  pDat<-pDat[tt[pos],]
  subj.pDat<-as.character(pDat[, c(hybName)])

  flag<-identical(subj.pDat, newname.mRNA)
  if(!flag)
  { 
    setwd(mywd)
    stop("subjIDs in SampleProbe are different from those in phenoData!\n") 
  }

  rownames(pDat)=as.character(pDat[,c(hybName)])
  Biobase::sampleNames(x.lumi)=as.character(pDat[,c(hybName)])

  Biobase::pData(x.lumi)<-pDat
  es<-x.lumi
  rm(x.lumi)
  rm(pDat)

  pDat<-pData(es)

  # chip version
  if(verbose)
  { cat("\n************\n")
    cat("Frequencies of Chip versions>>\n")
    print(table(as.character(pDat[, c(chipManifest)]), useNA="ifany"))
    cat("\n")
    cat("\n************\n")
  }

#  # sort the arrays according
#  #   Batch_Run_Date
#  #   chip barcode (Chip_Barcode)
#  #   array position (Chip_Address)
#  if(verbose)
#  { cat("\n****************************\n") 
#    cat("\n Sort arrays according to Batch_Run_Date, Chip_Barcode, and Chip_Address>>>\n")
#  }
#  es.raw<-sortExpressionSet(es)

  es.raw<-es
  rm(es)

  # get nuIDs
  if(nuIDFlag)
  {
    fDat<-fData(es.raw)
    nuIDs<-seq2id(as.character(fDat[, c(probeSeq)]))
    fDat$nuIDs<-nuIDs
    Biobase::fData(es.raw)<-fDat
    #featureNames(es.raw)<-nuIDs
  }

  if(verbose)
  { cat("dim(es.raw)>>\n"); print(dim(es.raw)); cat("\n"); }

  if(verbose)
  { cat("\n**************************\n")
    cat("Dimension of raw LumiBatch>>>>\n")
    print(dim(es.raw))
    cat("\n")
  }
  if(saveFlag)
  { 

    if(verbose)
    { cat("\n****************************\n") 
      cat("\nSaving raw data>>>>>>>\n")
    }

    if(!is.null(outObjFileDir))
    { 
      setwd(outObjFileDir)
    } else {
      outObjFileDir = getwd()
    }
    outFile.u<-paste(outObjFilePrefix, ".raw.rda", sep="")
    if(verbose)
    { cat("output file dir is>>", outObjFileDir, "\n")
      cat("output file is>>", outFile.u, "\n")
    }
  
    save(es.raw, file=outFile.u)

  }
  setwd(mywd)

  invisible(es.raw)
}

# 
# fixed a bug on Jan 18, 2013
# fDat must have column: Array_Address_Id
# fDat must match the version of chip creating es 
addfDat<-function(es, fDat, featureID.es="ProbeID",
  featureID.fDat="Array_Address_Id", probeSeq = "Probe_Sequence", 
  verbose=TRUE)
{
  if(is.null(fDat[, c(featureID.fDat)]))
  {
    cat("Error: fDat does not have column titled", featureID.fDat, "!\n")
    return(NULL) 
  }
  if(!identical(featureNames(es), fData(es)[, c(featureID.es)]))
  {
    cat("Error: featureNames(es) not equal to ", featureID.es, "!\n") 
    return(NULL)
  }
  if(verbose)
  { cat("\n****************************\n") 
    cat("Adding feature data >>>>\n")
  }
  # update feature data
  ff<-featureNames(es)
  ff.fDat<-as.character(fDat[, c(featureID.fDat)])

  aa<-intersect(ff, ff.fDat) 
  if(length(aa))
  {
    fDat.es<-Biobase::fData(es)
    # check if there are any overlapping column names
    # except featureID.es and featureD.fDat
    int<-intersect(colnames(fDat.es), colnames(fDat))
    len<-length(int)
    if(len)
    {
      int2<-setdiff(int, c(featureID.es, featureID.fDat))
      if(length(int2))
      {
        fDat<-fDat[, -which(colnames(fDat) %in% int2), drop=FALSE]
      }
    }
    #pos.sel<-match(ff, ff.fDat)
    #fDat2<-fDat[pos.sel[which(is.na(pos.sel)==FALSE)],]
    bb2<-merge(x=fDat.es, y=fDat, by.x=featureID.es, 
        by.y=featureID.fDat, all.x=TRUE, sort=FALSE)

    rm(fDat.es)
    #rm(fDat)

    # update the feature data of 'es'
    pos<-match(ff, as.character(bb2[, c(featureID.es)]))
    bb3<-bb2[pos,]
    #ff2<-ff[pos]
    rm(bb2)

    if(!identical(as.character(bb3[, c(featureID.es)]), ff))
    {
      cat("Error: bb3[, ", featureID.es, "] != ff!\n")
      return(NULL)
    }
    
    #rownames(bb3)<-ff2
    rownames(bb3)<-ff
    Biobase::fData(es)<-bb3

    pos.del<-which(is.na(as.character(bb3[, c(probeSeq)]))==TRUE)
    if(length(pos.del))
    {
      cat("Warning: the following probes have missing", probeSeq, " and will be deleted>>>\n")
      print(fData(es)[pos.del,])
      cat("\n")
      es<-es[-pos.del,]
    }
    invisible(es)
  
  } else {
    cat("Error: No overlapping between featureNames(es) and ", featureID.fDat,"!\n")
    return(NULL)
  }
}



# Sort arrays according to Batch_Run_Date, Chip_Barcode, and Chip_Address
sortExpressionSet<-function(es, 
  varSort = c("Batch_Run_Date", "Chip_Barcode", "Chip_Address"), 
  timeFormat = c("%m/%d/%Y", NA, NA)
)
{
  pDat<-pData(es)
  cn<-colnames(pDat)
 
  len<-length(varSort)
  if(len!=length(timeFormat))
  { stop("varSort should have the same length as timeFormt!") }

  pos.nonNA<-which(is.na(timeFormat)==FALSE)
  len.t<-length(pos.nonNA)

  if(len.t)
  {
    for(i in 1:len.t)
    {
      ttpos<-which(cn==varSort[pos.nonNA[i]])
      tt<-attributes(pDat[, ttpos])$class[1]
      tt2<-substr(tt, start=1, stop=4)
      tt3<-as.character(pDat[, ttpos])
      if(!is.null(tt) && tt2=="POSI")
      {
  
      } else {
        # %Y not %y
        # %Y Year with century
        # %y Year without century (00-99)
        tt4<-strptime(tt3, format=timeFormat[pos.nonNA[i]])
        pDat[, ttpos]<-as.character(tt4)
      }
    }  
  }  

  pDat$pos<-1:nrow(pDat)

  if(len>1)
  { 
    pDat2<-pDat[do.call(order, pDat[, c(varSort)]),]
  } else {
    pDat2<-pDat[order(pDat[, c(varSort)]),]
  }

  pos.s<-as.numeric(as.character(pDat2$pos))
  es<-es[,pos.s]
  rm(pDat2)
  invisible(es)
}


# merge duplicates and replace feature names
#  by featureID.es.
# copy and revised from function lumi::lumiR
# and from method Biobase::setReplaceMethod
# (1) To simplify the process, we use sample average for duplicated probes
# for 'exprs', 'se.exprs', 'detection', etc.
# 'lumiR' uses more complicated formula to merge duplicated probes
#  for 'se.exprs', 'detection', etc.  
# (2) 'lumiR' uses 'probeID' as id. But some time, a Probe_Sequence 
#     can have 2 different probeIDs
mergeDuplicates<-function(x.lumi.i, featureID.es="Probe_Sequence")
{
  try(id<-as.character(fData(x.lumi.i)[, featureID.es]), silent=TRUE)
  if(length(id)==0)
  {
    cat("Error:", featureID.es, " is not in fData(x.lumi.i)!\n")
    return(NULL)
  }
  dupId <- unique(id[duplicated(id)])

  obj<-x.lumi.i
  if (length(dupId) > 0) {
      cat("Duplicated Probe_Sequence found and were merged!\n")

      storage.mode <- Biobase::storageMode(obj)
      switch(storage.mode,
             "lockedEnvironment" = {
                 aData <- Biobase::copyEnv(obj@assayData)

                 eltVec<-Biobase::assayDataElementNames(obj)
                 for(elt in eltVec)
                 {
                    rmInd <- NULL
                    for (dupId.i in dupId) {
                      selInd.i <- which(id == dupId.i)
                      aData[[elt]][selInd.i[1], ] <- colMeans(aData[[elt]][selInd.i, , drop = FALSE], na.rm=TRUE)
                      rmInd <- c(rmInd, selInd.i[-1])
                    }
                    if(length(rmInd))
                    { 
                      aData[[elt]]<-aData[[elt]][-rmInd,, drop=FALSE] 
                    }
                 }
               #  Biobase:::assayDataEnvLock(aData)
                 obj@assayData <- aData
             },
             "environment" = {
                 aData <- obj@assayData
                 eltVec<-Biobase::assayDataElementNames(obj)
                 for(elt in eltVec)
                 {
                    rmInd <- NULL
                    for (dupId.i in dupId) {
                      selInd.i <- which(id == dupId.i)
                      aData[[elt]][selInd.i[1], ] <- colMeans(aData[[elt]][selInd.i, , drop = FALSE], na.rm=TRUE)
                      rmInd <- c(rmInd, selInd.i[-1])
                    }
                    if(length(rmInd))
                    {
                      aData[[elt]]<-aData[[elt]][-rmInd,, drop=FALSE]
                    }
                 }
                 obj@assayData <- aData
             }
      )
      #obj
      if (!is.null(fData(obj)) && length(rmInd)) 
      {
        Biobase::fData(obj)<-fData(obj)[-rmInd,,drop=FALSE]
      }
  }
  Biobase::featureNames(obj)<-as.character(fData(obj)[, c(featureID.es)])

  invisible(obj)
}
Any scripts or data that you put into this service are public.
iCheck documentation built on Nov. 8, 2020, 11:09 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
iCheck
QC Pipeline and Data Analysis Tools for High-Dimensional Illumina mRNA Expression Data

R/readData.R
In iCheck: QC Pipeline and Data Analysis Tools for High-Dimensional Illumina mRNA Expression Data

Defines functions mergeDuplicates sortExpressionSet addfDat readData.default readData extractIDs3 extractIDs extractSamples

Documented in sortExpressionSet

Try the iCheck package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

iCheck QC Pipeline and Data Analysis Tools for High-Dimensional Illumina mRNA Expression Data

R/readData.R In iCheck: QC Pipeline and Data Analysis Tools for High-Dimensional Illumina mRNA Expression Data

Defines functions mergeDuplicates sortExpressionSet addfDat readData.default readData extractIDs3 extractIDs extractSamples

Documented in sortExpressionSet

Try the iCheck package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

iCheck
QC Pipeline and Data Analysis Tools for High-Dimensional Illumina mRNA Expression Data

R/readData.R
In iCheck: QC Pipeline and Data Analysis Tools for High-Dimensional Illumina mRNA Expression Data