R/fetch.expression.data.R

#' Fetch expression data from GEO / or given folder
#'
#' @param geo.id GEO id
#' @param sample.mapping.column The column in the sample annotation that contains phenotype 
#' info (default: "characteristics_ch1")
#' @param do.log2 Apply log2 transformation to the expression values. If NULL (default), 
#' deduced from the data (code from GEO2R at NCBI)
#' @param probe.conversion Convert probe level expression to gene level using provided 
#' annotation label (uses platform specific annotations downloaded from GEO)
#' Defaults to NULL (no conversion). In case of multiple probes, probe with absolute max
#' value is chosen
#' @param conversion.mapping Mapping of platform specific ids to user provided ids
#' @param conversion.mapping.function Function to process the mapped name such that it matches
#' with the ids provided in conversion.map
#' @param output.dir Directory where all files will be written (defaults to current working dir)
#' @param geo.id.sub GEO id for the sub-set (e.g., specific to the platform)
#' @return A list containing 3 data frames: expression matrix, sample mapping, gene mapping
#' @export
#' @examples
#' gds.data = fetch.expression.data("GDS4966", do.log2=F, probe.conversion="Gene ID")
#' expr = gds.data$expr
#' sample.mapping = gds.data$sample.mapping
fetch.expression.data<-function(geo.id, sample.mapping.column="characteristics_ch1", do.log2=NULL, probe.conversion=NULL, conversion.mapping=NULL, conversion.mapping.function=NULL, output.dir=getwd(), geo.id.sub=NULL, reprocess=NULL) {
    output.dir = file.path(output.dir, geo.id)
    if (!file.exists(output.dir)){
	dir.create(output.dir)
    } 
    out.file = file.path(output.dir, "expr.dat")
    out.file.2 = file.path(output.dir, "mapping.dat")
    out.file.3 = file.path(output.dir, "annotation.dat")
    # Load expression file if exists
    if(!all(file.exists(out.file), file.exists(out.file.2), file.exists(out.file.3))) {
	# Get data set
	data.set = get.data.set(geo.id, output.dir)
	# Using Biobase + GEOquery methods
	if(substr(geo.id,1,3) == "GDS") {
	    eset = GEOquery::GDS2eSet(data.set) #, do.log2=do.log2)
	    # Get sample - phenotype mapping
	    idx = which(colnames(Biobase::pData(eset)) == sample.mapping.column)
	    if(length(idx) == 0) { # sample.mapping.column == "characteristics_ch1") {
		idx = 2 
	    } 
	    sample.mapping = Biobase::pData(eset)
	    write.table(sample.mapping, paste0(out.file.2, ".full"), sep="\t", quote=F, row.names=F)
	    sample.mapping = Biobase::pData(eset)[,c(1, idx)] #[,c(1:2)]
	    colnames(sample.mapping) = c("sample", "type")
	} else { # GSE or ArrayExpress
	    if (length(data.set) > 1) {
		if(is.null(geo.id.sub)) {
		    idx = grep(geo.id, attr(data.set, "names")) 
		} else {
		    idx = grep(geo.id.sub, attr(data.set, "names")) 
		}
	    } else {
		idx = 1 
	    }
	    eset = data.set[[idx]]
	    sample.mapping = Biobase::pData(eset)
	    sample.mapping$sample = rownames(Biobase::pData(eset))
	    write.table(sample.mapping, paste0(out.file.2, ".full"), sep="\t", quote=F, row.names=F)
	    sample.mapping = data.frame(sample=rownames(Biobase::pData(eset)), type=Biobase::pData(eset)[, sample.mapping.column])
	} 
	write.table(sample.mapping, out.file.2, sep="\t", quote=F, row.names=F)
	# Get expression data
	if(!is.null(reprocess)) {
	    if(substr(geo.id,1,3) == "GSE") { # GSE
		output.dir.raw = file.path(output.dir, "raw", "")
		if (!file.exists(output.dir.raw)){
		    file.paths = GEOquery::getGEOSuppFiles(geo.id, baseDir = file.path(output.dir, "..", "")) 
		    untar(file.path(output.dir, paste0(geo.id, "_RAW.tar")), exdir=output.dir.raw)
		}
	    }
	    if(substr(reprocess, 1, nchar("affy")) == "affy") {
		if (!requireNamespace("affy", quietly=TRUE)) {
		    stop("Reprocessing of Affymetrix arrays requires affy package to be installed")
		}
		arguments = unlist(strsplit(reprocess, "\\|"))
		if(length(arguments) == 1) {
		    if(substr(geo.id,1,3) == "GSE") { # GSE
			d = affy::ReadAffy(celfile.path = output.dir.raw) 
			Biobase::sampleNames(d) <- sub("(_|\\.).*CEL\\.gz","",  Biobase::sampleNames(d))
		    } else { # ArrayExpress
			d = ArrayExpress::ae2bioc(mageFiles = eset)
		    }
		    eset.raw = affy::rma(d) 
		    expr = Biobase::exprs(eset.raw)
		} else {
		    if (!requireNamespace("makecdfenv", quietly=TRUE)) {
		    	stop("Reprocessing of Affymetrix arrays with custom annotation requires makecdfenv package to be installed")
		    }
		    # CDF environment is not recognized in rma (potentially due to namespace issues)
		    # Load manually created expr instead
		    if(file.exists(paste0(out.file, ".probe"))) {
			expr = read.table(paste0(out.file, ".probe"), sep="\t", header=T, check.names=F) 
		    } else {
			cdf = makecdfenv::make.cdf.env(arguments[2], cdf.path = output.dir.raw, compress=T) 
			environment(cdf) = asNamespace('affy')
			d = affy::ReadAffy(celfile.path = output.dir.raw, cdfname="cdf")
			Biobase::sampleNames(d) <- sub("(_|\\.).*CEL\\.gz","",  Biobase::sampleNames(d))
			eset.raw = affy::rma(d) 
			expr = Biobase::exprs(eset.raw)
			#write.table(expr, paste0(out.file, ".probe"), sep="\t", quote=F)
		    }
		}
	    } else if(substr(reprocess, 1, nchar("illumina")) == "illumina") {
		#if (!requireNamespace("beadarray", quietly=TRUE)) {
		#    stop("Reprocessing of Illumina arrays requires beadarray package to be installed")
		#}
		if (!requireNamespace("limma", quietly=TRUE) | !requireNamespace("preprocessCore", quietly=TRUE)) {
		    stop("Reprocessing of Illumina arrays requires limma and preprocessCore packages to be installed")
		}
		arguments = unlist(strsplit(reprocess, "\\|"))
		d = read.csv(paste0(output.dir, "filelist.txt"), sep="\t")
		data.set = limma::read.ilmn(d[d$Type=="TXT","Name"], path=output.dir.raw, probeid = arguments[2], expr = arguments[3]) # "ProbeSet_name" "Beadstudio_intensity"
		expr = apply(data.set$E, 2, as.numeric)
		expr = preprocessCore::normalize.quantiles(expr)
		rownames(expr) = rownames(data.set$E)
		colnames(expr) = sub("(_|\\.).*txt\\.gz","", d[d$Type=="TXT","Name"])
		# ctrlfiles = "AsuragenMAQC -controls.txt", annotation = "TargetID", other.columns = c("Detection  Pval") 
		#d = beadarray::readBeadSummaryData(paste0(output.dir.raw, geo.id, "_RAW.tar"), skip=0, ProbeID = "ProbeSet_name", columns=list(exprs = "Beadstudio_intensity")) # AVG_Signal
		#d = readIllumina(output.dir.raw, useImages=FALSE, illuminaAnnotation = "Humanv3")
		#expr = Biobase::exprs(d)
		#eset = beadarray::normaliseIllumina(expr, method="quantile", transform="log2")
	    } else {
		stop("Unrecognized reprocessing type")
	    }
	} else {
	    expr = Biobase::exprs(eset)
	}
	if(is.null(do.log2)) {
	    # Check whether data is already log transformed (from GEO2R)
	    qx = as.numeric(quantile(expr, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
	    LogC = (qx[5] > 100) || (qx[6]-qx[1] > 50 && qx[2] > 0) || (qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)
	    if (LogC) { 
		do.log2 = T
	    } else {
		do.log2 = F
	    }
	}
	if(do.log2) {
	    message("Expression will be log2 transformed")
	    # To avoid leaving out probes with negative values
	    #expr[which(expr <= 0)] = NA
	    if(min(expr, na.rm=T) < 0) {
		expr = expr - min(expr, na.rm=T)
	    }
	    expr = log2(expr+1)
	}
	# This was below before causing to ignore genes with NA probes (i.e. results for joerg)
	expr = na.omit(expr)
	# Get probe - gene mapping
	if(!is.null(probe.conversion)) {
	    if(is.null(nrow(Biobase::fData(eset))) | ncol(Biobase::fData(eset)) == 0) {
		if(substr(geo.id,1,3) == "GDS") {
		    geo.id.platform = GEOquery::Meta(data.set)$platform
		} else { # GSE or ArrayExpress
		    geo.id.platform = Biobase::annotation(eset)
		}
		gene.mapping = get.platform.annotation(geo.id.platform, probe.conversion, output.dir)
	    } else {
		gene.mapping = data.frame(Probe = Biobase::fData(eset)[,"ID"], Gene = Biobase::fData(eset)[,probe.conversion]) 
	    }
	    #print(head(gene.mapping, 20))
	    if(!is.null(conversion.mapping.function)) {
		gene.mapping$Gene = unlist(lapply(as.character(gene.mapping$Gene), conversion.mapping.function))
	    }
	    #print(head(gene.mapping, 20)) 
	    if(!is.null(conversion.mapping)) {
		# Likely to convert accession numbers to geneids - get rid of version (trailing dots and digits)
		gene.mapping = data.frame(Probe = gene.mapping$Probe, Gene = as.character(conversion.mapping[sub("\\.[0-9]+", "", gene.mapping$Gene)]))
		gene.mapping$Gene = factor(gene.mapping$Gene, levels=c(levels(gene.mapping$Gene), ""))
		gene.mapping[gene.mapping$Gene %in% c("NULL", "---", "-", " "), "Gene"] = ""
		#gene.mapping[gene.mapping$Gene == "NULL", "Gene"] = NA
		#gene.mapping = na.omit(gene.mapping) 
	    }
	    #print(head(gene.mapping, 20)) 
	    if(length(levels(factor(gene.mapping$Gene))) == 1) {
		stop("Problem with gene mapping!")
	    }
	    expr = convert.probe.to.gene.expression(expr, gene.mapping) 
	} else {
	    gene.mapping = data.frame(Probe=rownames(expr), Gene=rownames(expr))
	}
	write.table(gene.mapping, out.file.3, sep="\t", quote=F, row.names=F)
	#expr = na.omit(expr) 
	write.table(expr, out.file, sep="\t", quote=F)
    }
    expr = read.table(out.file, sep="\t", header=T, check.names=F) 
    sample.mapping = read.table(out.file.2, sep="\t", header=T, quote='"') 
    gene.mapping = read.table(out.file.3, sep="\t", header=T, quote='"', check.names=F) #, row.names=1) 
    return(list(expr=expr, sample.mapping=sample.mapping, gene.mapping=gene.mapping))
}
emreg00/pepper documentation built on May 16, 2019, 5:10 a.m.