setOldClass('ff_matrix') # Convert ff_matrix into an S4 class
setClassUnion('geno',c('dMatrix','matrix','ff_matrix'))
#' An S4 class to represent GWAS data.
#'
#' @slot pheno A \code{\link{data.frame}} that contains phenotypes.
#' @slot map A \code{\link{data.frame}} that contains a genetic map.
#' @slot geno A \code{geno} object (\code{dMatrix}, \code{ff_matrix}, or
#' \code{\link{matrix}}) that contains genotypes.
#' @export genData
#' @exportClass genData
genData<-setClass('genData',slots=c(pheno='data.frame',map='data.frame',geno='geno'))
#' @export
setMethod('initialize','genData',function(.Object,geno,pheno,map){
if(!is(geno,'geno')){
stop("Only dMatrix, ff_matrix, or regular matrix objects are allowed for geno.")
}
if(is.null(colnames(geno))){
colnames(geno)<-paste0('mrk_',1:ncol(geno))
}
if(is.null(rownames(geno))){
rownames(geno)<-paste0('id_',1:nrow(geno))
}
if(missing(pheno)){
pheno<-data.frame(IID=rownames(geno))
rownames(pheno)<-rownames(geno)
}
if(missing(map)){
map<-data.frame(mrk=colnames(geno))
rownames(map)<-colnames(geno)
}
.Object@geno<-geno
.Object@pheno<-pheno
.Object@map<-map
return(.Object)
})
#' Creates a \code{\linkS4class{genData}} object from a plaintext file.
#'
#' \code{setGenData} assumes that the plaintext file (\code{fileIn}) contains
#' records of individuals in rows, and phenotypes, covariates and markers in
#' columns. The columns included in columns \code{1:nColSkip} are used to
#' populate the slot \code{\code{@@pheno}} of a \code{\linkS4class{genData}}
#' object, and the remaining columns are used to fill the slot
#' \code{\code{@@geno}}. If the first row contains a header
#' (\code{header=TRUE}), data in this row is used to determine variables names
#' for \code{@@pheno} and marker names for \code{@@map} and \code{@@geno}.
#' Genotypes are stored in a distributed matrix (\code{dMatrix}). By default a
#' column-distributed (\code{\linkS4class{cDMatrix}}) is used for \code{@@geno},
#' but the user can modify this using the \code{distributed.by} argument. The
#' number of chunks is either specified by the user (use \code{nChunks} when
#' calling \code{setGenData}) or determined internally so that each
#' \code{ff_matrix} object has a number of cells that is smaller than
#' \code{.Machine$integer.max/1.2}. \code{setGenData} creates a folder
#' (\code{folderOut}) that contains the binary flat files (\code{geno_*.bin})
#' and the \code{\linkS4class{genData}} object (typically named
#' \code{genData.RData}. Optionally (if \code{returnData} is TRUE) it returns
#' the \code{\linkS4class{genData}} object to the environment. The filename of
#' the \code{ff_matrix} objects are saved as relative names. Therefore, to be
#' able to access the content of the data included in \code{@@geno} the working
#' directory must either be the folder where these files are saved
#' (\code{folderOut}) or the object must be loaded using the \code{loadGenData}
#' function included in the package.
#'
#' @param fileIn The path to the plaintext file.
#' @param header If TRUE, the file contains a header.
#' @param dataType The coding of genotypes. Use 'character' for A/C/G/T or
#' 'integer' for numeric coding.
#' @param distributed.by If columns a column-distributed matrix
#' (\code{\linkS4class{cDMatrix}}) is created, if rows a row-distributed
#' matrix (\code{\linkS4class{rDMatrix}}).
#' @param n The number of individuals.
#' @param p The number of markers.
#' @param folderOut The path to the folder where to save the binary files.
#' @param returnData If TRUE, the function returns a
#' \code{\linkS4class{genData}} object.
#' @param na.strings The character string use to denote missing value.
#' @param nColSkip The number of columns to be skipped to reach the genotype
#' information in the file.
#' @param idCol The index of the ID column.
#' @param verbose If TRUE, progress updates will be posted.
#' @param nChunks The number of chunks to create.
#' @param dimorder The physical layout of the chunks.
#' @return If \code{returnData} is TRUE, a \code{\linkS4class{genData}} object
#' is returned.
#' @export
setGenData<-function(fileIn,header,dataType,distributed.by='columns',n=NULL,p=NULL,
folderOut=paste('genData_',sub("\\.[[:alnum:]]+$","",basename(fileIn)),sep=''),
returnData=TRUE,na.strings='NA',nColSkip=6,idCol=2,verbose=FALSE,nChunks=NULL,
dimorder=if(distributed.by=='rows') 2:1 else 1:2){
if(file.exists(folderOut)){
stop(paste('Output folder',folderOut,'already exists. Please move it or pick a different one.'))
}
if(!dataType%in%c('character','integer','numeric')){
stop('dataType must be either character, integer or numeric')
}
if(!distributed.by%in%c('columns','rows')){
stop('distributed.by must be either columns or rows')
}
vMode<-ifelse(dataType%in%c('character','integer'),'byte','double')
if(is.null(n)){
# gzfile and readLines throw some warnings, but since it works, let's
# disable warnings for this block
warnLevel<-unlist(options('warn'))
options(warn=-1)
detN<-gzfile(fileIn,open='r')
n <- 0
while(length(readLines(detN,n=1))>0){
n<-n+1
}
if(header){
n<-n-1
}
close(detN)
# restore previous warning level
options(warn=warnLevel)
}
if(header){
pedFile<-gzfile(fileIn,open='r')
tmp<-scan(pedFile,nlines=1,what=character(),quiet=TRUE)
p<-length(tmp)-nColSkip
phtNames<-tmp[1:nColSkip]
mrkNames<-tmp[-(1:nColSkip)]
}else{
if(is.null(p)){
detP<-gzfile(fileIn,open='r')
tmp<-scan(detP,nlines=1,what=character(),quiet=TRUE)
p<-length(tmp)-nColSkip
close(detP)
}
pedFile<-gzfile(fileIn,open='r')
phtNames<-paste('v_',1:nColSkip,sep='')
mrkNames<-paste('mrk_',1:p,sep='')
}
IDs<-rep(NA,n)
pheno<-matrix(nrow=n,ncol=nColSkip)
colnames(pheno)<-phtNames
geno<-new(ifelse(distributed.by=='columns','cDMatrix','rDMatrix'),nrow=n,ncol=p,vmode=vMode,folderOut=folderOut,nChunks=nChunks,dimorder=dimorder)
colnames(geno)<-mrkNames
for(i in 1:n){
time<-proc.time()
xSkip<-scan(pedFile,n=nColSkip,what=character(),na.strings=na.strings,quiet=TRUE)
x<-scan(pedFile,n=p,what=dataType,na.strings=na.strings,quiet=TRUE)
pheno[i,]<-xSkip
IDs[i]<-xSkip[idCol]
geno[i,]<-x
if(verbose){
cat('Subject',i,' ',round(proc.time()[3]-time[3],3),'sec/subject.','\n')
}
}
close(pedFile)
# Adding names
rownames(geno)<-IDs
rownames(pheno)<-IDs
pheno<-as.data.frame(pheno,stringsAsFactors=FALSE)
pheno[]<-lapply(pheno,type.convert,as.is=TRUE)
genData<-new('genData',geno=geno,pheno=pheno)
attr(genData,'origFile')<-list(path=fileIn,dataType=dataType)
attr(genData,'dateCreated')<-date()
save(genData,file=paste(folderOut,'/genData.RData',sep=''))
if(returnData){ return(genData) }
}
#' @export
loadGenData<-function(path,envir=.GlobalEnv){
##
# Use: to load a genData object using the name of the folder where the meta-data and data are stored.
# path: the name of the folder where the data and meta data are stored.
# envir: the name of the environment where the object is returned.
# See also: load2() and setGenData()
##
if('genData'%in%ls(envir=envir)){
stop('There is already an object called genData in the environment. Please move it.')
}
if(!file.exists(paste0(path,'/genData.RData'))){
stop(paste('Could not find a genData object in path',path))
}
cwd<-getwd()
setwd(path)
load('genData.RData',envir)
cat('Loaded genData object into environment under name genData')
# Open all chunks for reading (we do not store absolute paths to ff files,
# so this has to happen in the same working directory)
chunks<-chunks(genData@geno)
for(i in 1:nrow(chunks)){
open(genData@geno[[i]])
}
# Restore working directory
setwd(cwd)
}
#' @export
load2<-function(file,envir=parent.frame(),verbose=TRUE){
##
# Function to load genData or dMatrix objects
# file: the name of the .RData file to be loaded (and possibly a path)
# envir: the environment where to load the data
# verbose: TRUE/FALSE
# See also: loadGenData()
##
# determining the object name
lsOLD<-ls();
load(file=file)
lsNEW<-ls();
objectName<-lsNEW[(!lsNEW%in%lsOLD)&(lsNEW!='lsOLD')];
# determining path and filename
path<-dirname(file)
fname<-basename(file)
# stores current working directiory and sets working directory to path
cwd<-getwd()
setwd(path)
# determining object class
objectClass<-class(eval(parse(text=objectName)))
if(verbose){
cat(' Meta data (',fname,') and its data were stored at folder ',path,'.\n',sep='')
cat(' Object Name: ',objectName,'\n',sep='')
cat(' Object Class: ',objectClass,'\n',sep='')
}
if(!(objectClass%in%c('genData','rDMatrix','cDMatrix'))){ stop( ' Object class must be either genData, cDMatrix or rDMatrix')}
# Determining number of chunks
if(objectClass=='genData'){
tmpChunks<-chunks(eval(parse(text=paste0(objectName,'@geno'))))
}else{
tmpChunks<-chunks(eval(parse(text=objectName)))
}
# opening files
for(i in 1:nrow(tmpChunks)){
if(verbose){ cat(' Opening flat file ', i,'\n') }
if(objectClass=='genData'){
open(eval(parse(text=paste0(objectName,'@geno[[',i,']]'))))
}else{
open(eval(parse(text=paste0(objectName,'[[',i,']]'))))
}
}
# sending the object to envir
assign(objectName,get(objectName), envir=envir)
# restoring the working directory
setwd(cwd)
if(verbose){ cat(' Original directory (',getwd(),') restored \n',sep='')}
}
#' Conducts an association study (GWAS) using a \code{\linkS4class{genData}}
#' object.
#'
#' This function conducts an association test using the formula provided by the
#' user (\code{formula}) plus one column of \code{@@geno}, one column at a time.
#' The data from the association tests is obtained from a
#' \code{\linkS4class{genData}} object.
#'
#' @param formula A formula (e.g. weight~sex+age) with the response on the
#' left-hand side and predictors (all the covariates except the markers) on
#' the right-hand side. The variables included in the formula must be in the
#' \code{@@pheno} object of the \code{\linkS4class{genData}}.
#' @param data A \code{\linkS4class{genData}} object.
#' @param method The regression method to be used. Currently, the following
#' methods are implemented: \code{\link{lm}}, \code{\link{lm.fit}},
#' \code{\link{lsfit}}, \code{\link{glm}} and \code{\link[lme4]{lmer}}.
#' @param plot If TRUE a Manhattan plot is produced and filled with points as
#' the association tests are run.
#' @param verbose If TRUE more messages are printed.
#' @param min.pValue Numeric, the minimum p-value expected, used to determine
#' the limits of the vertical axis of the Manhattan plot.
#' @param chunkSize Represents the number of columns of \code{@@geno} that are
#' brought into RAM for processing (10 by default).
#' @return Returns a matrix with estimates, SE, p-value, etc.
#' @export
GWAS<-function(formula,data,method,plot=FALSE,verbose=FALSE,min.pValue=1e-10,chunkSize=10,...){
if(class(data)!='genData'){ stop('data must genData')}
if(!method%in%c('lm','lm.fit','lsfit','glm','lmer','SKAT')){
stop('Only lm, glm, lmer and SKAT have been implemented so far.')
}
## We can have 'specialized methods, for instance for OLS it is better to use lsfit that is what GWAS.ols do
if(method%in%c('lm','lm.fit','lsfit','SKAT')){
if(method%in%c('lm','lm.fit','lsfit')){
OUT<-GWAS.ols(formula=formula,data=data,plot=plot,verbose=verbose,min.pValue=min.pValue,chunkSize=chunkSize,...)
}
if(method=='SKAT'){
OUT<-GWAS.SKAT(formula=formula,data=data,plot=plot,verbose=verbose,min.pValue=min.pValue,...)
}
}else{
if(method=='lmer'&&!requireNamespace("lme4",quietly=TRUE)){
stop("lme4 needed for this function to work. Please install it.",call.=FALSE)
}
if(method=='lmer'){
FUN<-lme4::lmer
}else{
FUN<-match.fun(method)
}
# could subset based on NAs so that subsetting does not take place in each iteration of the GWAS loop
pheno<-data@pheno
fm<-FUN(formula,data=pheno,...)
tmp<-getCoefficients(fm)
p<-ncol(data@geno)
OUT<-matrix(nrow=p,ncol=length(tmp),NA)
rownames(OUT)<-colnames(data@geno)
colnames(OUT)<-colnames(tmp)
GWAS.model<-update(as.formula(formula),'.~z+.')
if(plot){
tmp<-paste(as.character(GWAS.model[2]),as.character(GWAS.model[3]),sep='~')
plot(numeric()~numeric(),xlim=c(0,p),ylim=c(0,-log(min.pValue,base=10)),ylab='-log(p-value)',xlab='Marker',main=tmp)
}
nChunks<-ceiling(p/chunkSize)
end<-0
tmpRow<-0
for(i in 1:nChunks){
time.in<-proc.time()[3]
ini<-end+1
end<-min(ini+chunkSize-1,p)
Z<-data@geno[,ini:end,drop=FALSE]
for(j in 1:(end-ini+1)){
pheno$z<-Z[,j]
fm<-FUN(GWAS.model,data=pheno,...)
tmp<-getCoefficients(fm)
tmpRow<-tmpRow+1
OUT[tmpRow,]<-tmp
if(plot){
x=c(tmpRow-1,tmpRow)
y=-log(OUT[c(tmpRow-1,tmpRow),4],base=10)
if(tmpRow>1){ lines(x=x,y=y,col=8,lwd=.5) }
points(y=-log(tmp[4],base=10),col=2,cex=.5,x=tmpRow)
}
}
if(verbose){ cat(sep='','Chunk ',i,' of ', nChunks,' (',round(proc.time()[3]-time.in,2),' seconds/chunk, ',round(i/nChunks*100,3),'% done )\n') }
}
}
return(OUT)
}
getCoefficients<-function(x){
UseMethod('getCoefficients')
}
getCoefficients.lm<-function(x){
summary(x)$coef[2,]
}
getCoefficients.glm<-function(x){
summary(x)$coef[2,]
}
getCoefficients.lmerMod<-function(x){
ans<-summary(x)$coef[2,]
ans<-c(ans,c(1-pnorm(ans[3])))
return(ans)
}
## GWAS 'Ordinary least squares' (e.g., lsfit lm.fit lm)
GWAS.ols<-function(formula,data,plot=FALSE,verbose=FALSE,min.pValue=1e-10,chunkSize=10,...){
##
# formula: the formula for the GWAS model without including the marker, e.g., y~1 or y~factor(sex)+age
# all the variables in the formula must be in data@pheno
# data (genData) containing slots @pheno and @geno
##
X <- model.matrix(formula,data@pheno)
X <- X[match(rownames(data@pheno),rownames(X)),]
y<-data@pheno[,as.character(terms(formula)[[2]])]
p<-ncol(data@geno)
tmp<-ls.print(lsfit(x=X,y=y,intercept=FALSE),print=FALSE)$coef.table[[1]]
OUT<-matrix(nrow=p,ncol=ncol(tmp),NA)
colnames(OUT)<-colnames(tmp)
rownames(OUT)<-colnames(data@geno)
X<-cbind(0,X)
if(plot){
tmp<-paste(as.character(formula[2]),as.character(formula[3]),sep='~')
plot(numeric()~numeric(),xlim=c(0,p),ylim=c(0,-log(min.pValue,base=10)),ylab='-log(p-value)',xlab='Marker',main=tmp)
}
nChunks<-ceiling(p/chunkSize)
end<-0
tmpRow<-0
for(i in 1:nChunks){
time.in<-proc.time()[3]
ini<-end+1
end<-min(ini+chunkSize-1,p)
Z<-data@geno[,ini:end,drop=FALSE]
for(j in 1:(end-ini+1)){
X[,1]<-Z[,j]
tmpRow<-tmpRow+1
fm<-lsfit(x=X,y=y,intercept=FALSE)
tmp<-ls.print(fm,print=FALSE)$coef.table[[1]][1,]
OUT[tmpRow,]<-tmp
if(plot){
tmp.x=c(tmpRow-1,tmpRow)
tmp.y=-log(OUT[c(tmpRow-1,tmpRow),4],base=10)
if(tmpRow>1){ lines(x=tmp.x,y=tmp.y,col=8,lwd=.5) }
points(y=-log(tmp[4],base=10),col=2,cex=.5,x=tmpRow)
}
}
if(verbose){ cat(sep='','Chunk ',i,' of ', nChunks,' (',round(proc.time()[3]-time.in,2),' seconds/chunk, ',round(i/nChunks*100,3),'% done )\n') }
}
return(OUT)
}
GWAS.SKAT<-function(formula,data,groups,plot=FALSE,verbose=FALSE,min.pValue=1e-10,...){
##
# formula: the formula for the GWAS model without including the markers, e.g., y~1 or y~factor(sex)+age
# all the variables in the formula must be in data@pheno
# data (genData) containing slots @pheno and @geno
# groups: a vector mapping markers into groups (can be integer, character or factor).
##
if(!requireNamespace("SKAT",quietly=TRUE)){
stop("SKAT needed for this function to work. Please install it.",call.=FALSE)
}
p<-length(unique(groups))
OUT<-matrix(nrow=p,ncol=2,NA)
colnames(OUT)<-c('nMrk','p-value')
levels<-unique(groups)
rownames(OUT)<-levels
H0<-SKAT::SKAT_Null_Model(formula,data=data@pheno,...)
if(plot){
tmp<-paste(as.character(formula[2]),as.character(formula[3]),sep='~')
plot(numeric()~numeric(),xlim=c(0,p),ylim=c(0,-log(min.pValue,base=10)),ylab='-log(p-value)',xlab='Marker',main=tmp)
}
for(i in 1:p){
Z<-data@geno[,groups==levels[i],drop=FALSE]
fm<-SKAT::SKAT(Z=Z,obj=H0,...)
OUT[i,]<-c(ncol(Z),fm$p.value)
if(plot){
tmp.x=c(i-1,i)
tmp.y=-log(OUT[tmp.x,2],base=10)
if(i>1){ lines(x=tmp.x,y=tmp.y,col=8,lwd=.5) }
points(y=tmp.y[2],col=2,cex=.5,x=i)
}
}
if(verbose){
cat(sep='','Group ',i,' of ', p,' (',round(proc.time()[3]-time.in,2),' seconds/chunk, ',round(i/p*100,3),'% done )\n')
}
return(OUT)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.