R/dataPrePro.R

Defines functions lazyImputationClnc lazyImputeDFClnc lazyImputeDFClncFULL lazyImputeDFClncOslo CreateMetageneSpace CreateGeneSpace expandClnc expandClncFULL expandClncOslo getTags removeTaggedFeatures checkIntegrity

lazyImputationClnc <- function(c){ # so i'm lazy, bite me!
  for(i in 1:ncol(c)){
    idx = which(is.na(c[,i]))
    if(length(idx) > 0){
      if(class(c[,i])=="numeric") c[idx,i] = mean(c[,i], na.rm=TRUE)
      if(class(c[,i])=="factor") c[idx,i] = 0
    }
  }
  return (c)
}

lazyImputeDFClnc = function(c){
  idx = grep("NOT_IN_OSLOVAL",colnames(c))
  if(length(idx) > 0) c = c[,-idx]
  idx = grep("HER2_IHC", colnames(c))
  if(length(idx) > 0) c = c[,-idx]
  for(i in 1:ncol(c)){
    idx = which(is.na(c[,i]))
    if(length(idx) > 0){
      if(class(c[,i]) == "numeric") c[idx, i] = mean(c[-idx,i])
      if(class(c[,i]) == "factor" ){ 
        cc = as.vector(c[,i])
	cc[idx] = "NA"
	c[, i] = factor(cc)
	}
    }
  }
  return (c)
}
lazyImputeDFClncFULL = function(c){
  c$NOT_IN_OSLOVAL_stage <- factor(c$NOT_IN_OSLOVAL_stage)
  class(c$HER2_IHC_status) = "factor"
  killIdx = NULL
  for(i in 1:ncol(c)){
    mis = sum(is.na(c[,i]))
    if(mis > (nrow(c) * 0.3) ) killIdx = c(killIdx , i)
  }
  c = c[,-killIdx]
  idx = grep(".Expr", colnames(c))
  if(length(idx) > 0) c = c[,-idx]

  for(i in 1:ncol(c)){
    idx = which(is.na(c[,i]))
    if(length(idx) > 0){
      if(class(c[,i]) == "numeric") c[idx, i] = mean(c[-idx,i])
      if(class(c[,i]) == "factor" ){ 
        cc = as.vector(c[,i])
	cc[idx] = "NA"
	c[, i] = factor(cc)
	}
    }
  }
  return (c)
}
lazyImputeDFClncOslo = function(c){
  idx = grep("NOT_IN_OSLOVAL",colnames(c))
  if(length(idx) > 0) c = c[,-idx]
  idx = grep("IHC", colnames(c))
  if(length(idx) > 0) c = c[,-idx]
  for(i in 1:ncol(c)){
    idx = which(is.na(c[,i]))
    if(length(idx) > 0){
      if(class(c[,i]) == "numeric") c[idx, i] = mean(c[-idx,i])
      if(class(c[,i]) == "factor" ){ 
        cc = as.vector(c[,i])
	cc[idx] = "NA"
	c[, i] = factor(cc)
	}
    }
  }
  return (c)
}
CreateMetageneSpace <- function(ge, attractome, map, chosenProbes = NULL, maxGenes = NULL){
  if(is.null(chosenProbes)) {
  nMeta = length(attractome)
  metaSpace = matrix(NA, nrow=nMeta, ncol=ncol(ge))
  dimnames(metaSpace) = list(names(attractome), colnames(ge))
  pbs = list()
  mappedGenes = rep(NA, nrow(ge))
  names(mappedGenes) = rownames(ge)
  idx = intersect(rownames(map) , rownames(ge))
  mappedGenes[idx] = map[idx,1]
  for (i in 1:nMeta){
    #cat(i, "\n")
    #flush.console()
    a = attractome[[i]]
    if(!is.null(maxGenes)){
      if(nrow(a) > maxGenes){
        genes = a[1:maxGenes,1]
      }else{
        genes = a[, 1]
      }
    }else{
	genes = a[, 1]
    }
    il = lapply(genes, function(g){which(mappedGenes == g)})
    ill = sapply(il, length)
    goodIdx = sapply(il, function(i){ if(length(i) == 1) i})
    goodIdx = goodIdx[sapply(goodIdx, function(x){!is.null(x)})]
    numGood = sum(ill == 1)
    goodMat = NULL
    if(numGood > 0){
      goodMat = ge[unlist(goodIdx),]
    }
    badIdx = il[sapply(il, function(i){length(i) > 1 })]

    numBad = length(badIdx)
    
    badMat = NULL
    chosenIdx = NULL
    if(numBad > 0) {
      geneSum = apply(ge[unlist(il), ],2,sum)
      chosenIdx = lapply(badIdx, function(idcs){
	mis = sapply(idcs, function(i){getMI(geneSum, ge[i,])} )
	idcs[which(mis > 0.5)]
      })
      chosenIdx = chosenIdx[sapply(chosenIdx, function(x){length(x)>0})]
	#badMat = ge[chosenIdx,]
	badMat = t(sapply(chosenIdx, function(idcs){
		if(length(idcs) > 1){
			apply(ge[idcs,], 2, function(x){mean(x, na.rm=TRUE)})
		}else if(length(idcs) == 1){
			ge[idcs,]
		}else{
			rep(NA, ncol(ge))
		}
	}) )
	if(length(chosenIdx) == 0) {chosenIdx = NULL; badMat = NULL}
    }    
    pbs[[i]] = sapply(c(goodIdx, chosenIdx), names)
    metaSpace[i,] = (apply(rbind(goodMat, badMat), 2, function(x){mean(x, na.rm=TRUE)}))

    #o = sapply(genes, 
    #          function(g, ge, mappedGenes){
    #           idx = which(mappedGenes == g)
    #          if (length(idx)==1) return (ge[idx,])
    #             if (length(idx)==0) return (rep(NA, ncol(ge)))
    #             return (apply(ge[idx,], 2, function(x) mean(x, na.rm=T)))
    #           },
    #           ge = ge,
    #           mappedGenes = map[rownames(ge), "Gene.Symbol"]
    #           )
    #if(length(genes)==1){metaSpace[i,] = o}
    #else {metaSpace[i,] = apply(o, 1, function(x) mean(x, na.rm=T))}
  }
  names(pbs) = names(attractome)
  o = list(metaSpace = metaSpace, pbs = pbs)
  return (o)
  }else{
	metaSpace = t(sapply(chosenProbes, function(pb){
		pb = sapply(pb, function(p){intersect(p, rownames(ge))})
		gmat = sapply(pb, function(p, ge){
			if(length(p) > 1){
				apply(ge[p,], 2, mean)
			}else{
				ge[p,]
			}
		}, ge = ge)
		if(!is.null(dim(gmat))) {apply(gmat, 1, mean)}
		else{gmat}
	}) )
	return(metaSpace)

  }
}
CreateGeneSpace <- function(ge, oncogenes, map){
  ng = length(oncogenes)
  gSpace = matrix(NA, nrow=ng, ncol=ncol(ge))
  dimnames(gSpace) = list(oncogenes, colnames(ge))
  mappedGenes = rep(NA, nrow(ge))
  names(mappedGenes) = rownames(ge)
  idx = intersect(rownames(ge) , rownames(map))
  mappedGenes[idx] = map[idx,1]
  for(i in 1:ng){
    g = oncogenes[i]
    idx = which(mappedGenes == g)
    if(length(idx)==1){gSpace[i,] = ge[idx,]}
    else{gSpace[i,] = apply(ge[idx,], 2, mean)}
  }
  return (gSpace)
}

expandClnc = function(c){
  h.IDC =as.numeric(c$histological_type=="IDC")
  h.ILC =as.numeric(c$histological_type=="ILC")
  h.IDCpILC =as.numeric(c$histological_type=="IDC+ILC")
  h.IDCnMED =as.numeric(c$histological_type=="IDC-MED")
  h.IDCnMUC =as.numeric(c$histological_type=="IDC-MUC")
  h.IDCnTUB =as.numeric(c$histological_type=="IDC-TUB")
  #h.MIXED =as.numeric(c$histological_type=="MIXED NST AND A SPECIAL TYPE")
  #h.OTHER =as.numeric(c$histological_type=="OTHER")
  #h.OTHERINV =as.numeric(c$histological_type=="OTHER INVASIVE")
  #h.INVTUMOR =as.numeric(c$histological_type=="INVASIVE TUMOR")
  #h.other = as.numeric(c$histological_type=="MIXED NST AND A SPECIAL TYPE" | c$histological_type=="OTHER" | c$histological_type=="OTHER INVASIVE" | c$histological_type=="INVASIVE TUMOR" | c$histological_type=="PHYL")

  er.P=as.numeric(c$ER.Expr=="+")
  er.N=as.numeric(c$ER.Expr=="-")
  
#  her2.P=as.numeric(c$Her2.Expr=="+")
#  her2.N=as.numeric(c$Her2.Expr=="-")

#  pr.P=as.numeric(c$PR.Expr=="+")
#  pr.N=as.numeric(c$PR.Expr=="-")

  tr.CT = as.numeric((c$Treatment == "CT") | (c$Treatment == "CT/HT") | (c$Treatment == "CT/HT/RT") | (c$Treatment == "CT/RT"))
  tr.HT = as.numeric((c$Treatment == "HT") | (c$Treatment == "CT/HT") | (c$Treatment == "HT/RT") | (c$Treatment =="CT/HT/RT"))
  tr.RT = as.numeric((c$Treatment == "RT") | (c$Treatment == "CT/HT/RT") | (c$Treatment == "CT/RT") | (c$Treatment == "HT/RT"))
  tr.NONE = as.numeric(c$Treatment == "NONE")

  gd.1 = as.numeric(c$grade==1)
  gd.2 = as.numeric(c$grade==2)
  gd.3 = as.numeric(c$grade==3)

  #her2.1 = as.numeric(c$HER2_IHC_status==1)
  #her2.2 = as.numeric(c$HER2_IHC_status==2)
  #her2.3 = as.numeric(c$HER2_IHC_status==3)
  
  her2.snp6.gain = as.numeric(c$HER2_SNP6_state=="GAIN")
  her2.snp6.loss = as.numeric(c$HER2_SNP6_state=="LOSS")
  her2.snp6.neut = as.numeric(c$HER2_SNP6_state=="NEUT")
  
  cmat<-data.frame(c[, c(1:3)], gd.1, gd.2, gd.3, h.IDC, h.ILC, h.IDCpILC, h.IDCnMED, h.IDCnMUC, h.IDCnTUB, #h.other,# h.MIXED, h.OTHER, h.OTHERINV, h.INVTUMOR,
                   er.N, er.P, tr.CT, tr.HT, tr.RT, tr.NONE, her2.snp6.gain, her2.snp6.loss, her2.snp6.neut)#, p.LumA, p.LumB, p.Her2, p.Normal, p.Basal, her2.1, her2.2, her2.3)
  for(i in 4:ncol(cmat)){
    cmat[,i] = factor(cmat[,i])
  }
  return(cmat)
}

expandClncFULL = function(c){
  h.IDC =as.numeric(c$histological_type=="IDC")
  h.ILC =as.numeric(c$histological_type=="ILC")
  h.IDCpILC =as.numeric(c$histological_type=="IDC+ILC")
  h.IDCnMED =as.numeric(c$histological_type=="IDC-MED")
  h.IDCnMUC =as.numeric(c$histological_type=="IDC-MUC")
  h.IDCnTUB =as.numeric(c$histological_type=="IDC-TUB")
  #h.MIXED =as.numeric(c$histological_type=="MIXED NST AND A SPECIAL TYPE")
  #h.OTHER =as.numeric(c$histological_type=="OTHER")
  #h.OTHERINV =as.numeric(c$histological_type=="OTHER INVASIVE")
  #h.INVTUMOR =as.numeric(c$histological_type=="INVASIVE TUMOR")
  h.other = as.numeric(c$histological_type=="MIXED NST AND A SPECIAL TYPE" | c$histological_type=="OTHER" | c$histological_type=="OTHER INVASIVE" | c$histological_type=="INVASIVE TUMOR" | c$histological_type=="PHYL")

  er.P=as.numeric(c$ER_IHC_status=="pos")
  er.N=as.numeric(c$ER_IHC_status=="neg")
  
  tr.CT = as.numeric((c$Treatment == "CT") | (c$Treatment == "CT/HT") | (c$Treatment == "CT/HT/RT") | (c$Treatment == "CT/RT"))
  tr.HT = as.numeric((c$Treatment == "HT") | (c$Treatment == "CT/HT") | (c$Treatment == "HT/RT") | (c$Treatment =="CT/HT/RT"))
  tr.RT = as.numeric((c$Treatment == "RT") | (c$Treatment == "CT/HT/RT") | (c$Treatment == "CT/RT") | (c$Treatment == "HT/RT"))

  gd.1 = as.numeric(c$grade==1)
  gd.2 = as.numeric(c$grade==2)
  gd.3 = as.numeric(c$grade==3)

  grp.1 = as.numeric(c$NOT_IN_OSLOVAL_group==1)
  grp.2 = as.numeric(c$NOT_IN_OSLOVAL_group==2)
  grp.3 = as.numeric(c$NOT_IN_OSLOVAL_group==3)
  grp.4 = as.numeric(c$NOT_IN_OSLOVAL_group==4)

  stg.0 = as.numeric(c$NOT_IN_OSLOVAL_stage==0)
  stg.1 = as.numeric(c$NOT_IN_OSLOVAL_stage==1)
  stg.2 = as.numeric(c$NOT_IN_OSLOVAL_stage==2)
  stg.3 = as.numeric(c$NOT_IN_OSLOVAL_stage==3 | c$NOT_IN_OSLOVAL_stage==4)

#  her2.1 = as.numeric(c$HER2_IHC_status==1)
#  her2.2 = as.numeric(c$HER2_IHC_status==2)
#  her2.3 = as.numeric(c$HER2_IHC_status==3)
  
  her2.snp6.gain = as.numeric(c$HER2_SNP6_state=="GAIN")
  her2.snp6.loss = as.numeric(c$HER2_SNP6_state=="LOSS")
  her2.snp6.neut = as.numeric(c$HER2_SNP6_state=="NEUT")
  
  st.1 = as.numeric(c$NOT_IN_OSLOVAL_Site==1)
  st.2 = as.numeric(c$NOT_IN_OSLOVAL_Site==2)
  st.3 = as.numeric(c$NOT_IN_OSLOVAL_Site==3)
  st.4 = as.numeric(c$NOT_IN_OSLOVAL_Site==4)
  st.5 = as.numeric(c$NOT_IN_OSLOVAL_Site==5)

  mnpsl.pre = as.numeric(c$NOT_IN_OSLOVAL_menopausal_status_inferred=="pre" )
  mnpsl.post = as.numeric(c$NOT_IN_OSLOVAL_menopausal_status_inferred=="post" )

#  cell.hi = as.numeric(c$NOT_IN_OSLOVAL_cellularity == "high")
#  cell.mod = as.numeric(c$NOT_IN_OSLOVAL_cellularity == "moderate")
#  cell.lo = as.numeric(c$NOT_IN_OSLOVAL_cellularity == "low")

  p.LumA = as.numeric(c$NOT_IN_OSLOVAL_Pam50Subtype == "LumA")
  p.LumB = as.numeric(c$NOT_IN_OSLOVAL_Pam50Subtype == "LumB")
  p.Basal = as.numeric(c$NOT_IN_OSLOVAL_Pam50Subtype == "Basal")
  p.Her2 = as.numeric(c$NOT_IN_OSLOVAL_Pam50Subtype == "Her2")
  p.Normal = as.numeric(c$NOT_IN_OSLOVAL_Pam50Subtype == "Normal")

#  p53.mut = as.numeric(c$NOT_IN_OSLOVAL_P53_mutation_status == "MUT")
#  p53.wt = as.numeric(c$NOT_IN_OSLOVAL_P53_mutation_status == "WT")

#  ic.1 = as.numeric(c$NOT_IN_OSLOVAL_IntClustMemb==1)
#  ic.2 = as.numeric(c$NOT_IN_OSLOVAL_IntClustMemb==2)
#  ic.3 = as.numeric(c$NOT_IN_OSLOVAL_IntClustMemb==3)
#  ic.4 = as.numeric(c$NOT_IN_OSLOVAL_IntClustMemb==4)
#  ic.5 = as.numeric(c$NOT_IN_OSLOVAL_IntClustMemb==5)
#  ic.6 = as.numeric(c$NOT_IN_OSLOVAL_IntClustMemb==6)
#  ic.7 = as.numeric(c$NOT_IN_OSLOVAL_IntClustMemb==7)
#  ic.8 = as.numeric(c$NOT_IN_OSLOVAL_IntClustMemb==8)
#  ic.9 = as.numeric(c$NOT_IN_OSLOVAL_IntClustMemb==9)
#  ic.10 = as.numeric(c$NOT_IN_OSLOVAL_IntClustMemb==10)

#  gf.nn = as.numeric(c$NOT_IN_OSLOVAL_Genefu == "ER-/HER2-")
#  gf.pnhi = as.numeric(c$NOT_IN_OSLOVAL_Genefu == "ER+/HER2- High Prolif")
#  gf.pnlo = as.numeric(c$NOT_IN_OSLOVAL_Genefu == "ER+/HER2- Low Prolif")
#  gf.p = as.numeric(c$NOT_IN_OSLOVAL_Genefu == "HER2+")


  cmat<-data.frame(c[, c(1:3, 12:13)], gd.1, gd.2, gd.3, h.IDC, h.ILC, h.IDCpILC, h.IDCnMED, h.IDCnMUC, h.IDCnTUB, h.other,# h.MIXED, h.OTHER, h.OTHERINV, h.INVTUMOR,
                   er.N, er.P, tr.CT, tr.HT, tr.RT, her2.snp6.gain, her2.snp6.loss, her2.snp6.neut, 
		grp.1, grp.2, grp.3, grp.4, stg.0, stg.1, stg.2, stg.3, st.1, st.2, st.3, st.4, st.5, p.LumA, p.Basal, p.LumB, p.Normal, p.Her2,
		mnpsl.pre, mnpsl.post)
		#, cell.hi, cell.mod, cell.lo)
		#, p53.mut, p53.wt, her2.1, her2.2, her2.3)
		#, gf.nn, gf.pnhi, gf.pnlo, gf.p)
		#, ic.1, ic.2, ic.3, ic.4, ic.5, ic.6, ic.7, ic.8, ic.9, ic.10)
  for(i in 6:ncol(cmat)){
    cmat[,i] = factor(cmat[,i])
  }
  return(cmat)
}

expandClncOslo = function(c){
  h.IDC =as.numeric(c$histological_type=="IDC")
  h.ILC =as.numeric(c$histological_type=="ILC")
  h.IDCpILC =as.numeric(c$histological_type=="IDC+ILC")
  h.IDCnMED =as.numeric(c$histological_type=="IDC-MED")
  h.IDCnMUC =as.numeric(c$histological_type=="IDC-MUC")
  h.IDCnTUB =as.numeric(c$histological_type=="IDC-TUB")

  er.P=as.numeric(c$ER.Expr=="+")
  er.N=as.numeric(c$ER.Expr=="-")
  
  tr.CT = as.numeric((c$Treatment == "CT") | (c$Treatment == "CT/HT") | (c$Treatment == "CT/HT/RT") | (c$Treatment == "CT/RT"))
  tr.HT = as.numeric((c$Treatment == "HT") | (c$Treatment == "CT/HT") | (c$Treatment == "HT/RT") | (c$Treatment =="CT/HT/RT"))
  tr.RT = as.numeric((c$Treatment == "RT") | (c$Treatment == "CT/HT/RT") | (c$Treatment == "CT/RT") | (c$Treatment == "HT/RT"))

  gd.1 = as.numeric(c$grade==1)
  gd.2 = as.numeric(c$grade==2)
  gd.3 = as.numeric(c$grade==3)

  cmat<-data.frame(c[, c(1:3)], gd.1, gd.2, gd.3, h.IDC, h.ILC, h.IDCpILC, h.IDCnMED, h.IDCnMUC, h.IDCnTUB,  
                   er.N, er.P, tr.CT, tr.HT, tr.RT)
  for(i in 4:ncol(cmat)){
    cmat[,i] = factor(cmat[,i])
  }
  return(cmat)
}

getTags = function(ft){
	taglist = strsplit(ft, "\\.")
	tags = sapply(taglist, function(x){x[1]})
	return (tags)
}

removeTaggedFeatures = function(colName, ft){
	t = getTags(colName)
	tags = getTags(ft)
	killIdx = which(t %in% tags)
	out = colName[-killIdx]
	return (out)
}

checkIntegrity = function(c){
	killft = NULL
	for(i in 1:ncol(c)){
		if(length(table(c[,i]))<2) killft = c(killft, i)
	}
	colnames(c)[killft]
}
weiyi-bitw/DreamBox7 documentation built on May 4, 2019, 4:18 a.m.