#' Implementation of the SNP and LD data analysis using IRanges
#'
#' @description This function wraps the work-flow of a SNP analysis. SNPs are assigned to genes using findOverlaps. Then, gene-specific scores are computed as combined scores from the SNP-pvalues.
#'
#'
#' @param snpdata.url character: A URL or file to read the SNPs in tab separated format.
#' @param genome.url character: A URL or file to read the Genome in tab separated format.
#' @param ld.data.hdf.url character: A URL or file to read the optional LD-data as a HDF5 file.
#' @param include.ld.data logical: Should LD data be included in scoring? Takes more time and memory (default: FALSE)
#' @param population Choose a population code, character vector of length one. The population code corresponds to the population codes in the LD data file. (see Details) (default: "EUR")
#' @param use.position logical: Should markers be matched by genomic postions or rs ids?
#' @param full.match Do a full crossmatching of LD data, only if include.ld.data == TRUE. See Details. (default: TRUE)
#' @param ld.rho.cutoff numeric: Cutoff value for the LD correlation read from HDF file (Default: 0.8)
#' @param comparator Binary comparator (as string) for ld cutoff, used with the LD score in the ld file. one of ">","<",">=","<=","==", "!=" (default: ">=")
#' @param flank.genes.left integer: Expand gene regions by this on the left (default: 1000)
#' @param flank.genes.right integer: Expand gene regions by this on the right (default: 1000)
#' @param multiple.hits logical: Allow multiple overlaps (Default: TRUE)
#' @param scoring.function character: the name of the scoring function to use. One of: "max", "min", "mean", "median", "product", "sum", "count", "p.ratio", "snp.ratio.score", "saccone", "sidak", "bonferroni", "get.snps", "w.fisher", "brown", "forge", "vegas"
#' @param correction.type character: the name of the correction function to use. One of: "Nyholt", "Moskvina", "Gao"
#' @param genome.name character: optional genome.name
#' @param outfile.path character: A file path to save the output to. Default "./LDsnpRout.txt" If NULL, no file is generated
#' @param add.call.header logical: if TRUE a comment header is added to the output file containing the function call parameters
#' @param generate.plink.set logical: Generate output file in Plik set format (Default: FALSE) If TRUE, scoring.function is ignored
#' @param ld.structure logical: If TRUE, LD stucture from hdf5 file is loaded. If FALSE, LD structure is estimated based on p-values (Default: FALSE)
#' @param ... Additional parameters passed to the scoring function. (e.g. pMax for the p.ratio)
#' @export
#' @return gene.score RangedData: A hidden object of type RangedData containing the genomic ranges and a score column with the gene scores resulting from the scoring function. The result is writen to a tab separated txt file unless the output file parameter is NULL.
#' @keywords GWAS, SNP, LD
#' @references Fisher R. (1925), Statistical methods for research workers. London: Oliver and Lloyd.
#' @references Peirce JL, Broman KW, Lu L, Williams RW. (2007) A simple method for combining genetic mapping data from multiple crosses and experimental designs. PLoS One. 2(10):e1036. PMID: 17940600
#' @references Saccone SF, Hinrichs AL, Saccone NL, Chase GA, Konvicka K, et al. (2007) Cholinergic nicotinic receptor genes implicated in a nicotine dependence association study targeting 348 candidate genes with 3713 SNPs. Hum Mol Genet 16(1). PMID: 17135278
#' @exportClass H5IdComponent
#' @exportMethod show
#' @details This function wraps the work-flow of a SNP analysis. SNPs are assigned to genes using findOverlaps. Then, gene-specific scores are computed as combined scores from the SNP-pvalues. If LD data is provided (not NULL), the scores of SNPs in high LD are added to the gene-specific list of scoring genes. Due to the large data-volume, the LD-data must be stored in a hdf5 file in a defined format, grouped by chromosome with 3 or 5 datasets (1-dimensional) per group: snp.id.one(the rs-identifier, first column), snp.id.two (second column), value.set (numeric score value). For full crossmatching the additional data-sets marker.pos.one, marker.pos.two are required. Missing chromosomes are ignored. Group names in the hdf5 must match chromosome names exactly, otherwise the data will not be loaded. See the example data file for the structure. The rs-identifiers must be stored as integers without the rs prefix.
#' @import IRanges
#' @import rhdf5
#' @import fastmatch
snp.ld.analysis <- function (snpdata.url = stop("missing snpdata file"),
genome.url = stop("missing genome file"),
ld.data.hdf.url = NULL,
include.ld.data = !is.null( ld.data.hdf.url ),
population="CEU",
full.match=TRUE,
ld.rho.cutoff=0.8,
comparator=c(">=","<=",">","<","==", "!="),
flank.genes.left=10000,
flank.genes.right=10000,
multiple.hits=TRUE,
scoring.function=c("max",
"min",
"mean",
"median",
"product",
"sum",
"count",
"p.ratio",
"snp.ratio.score",
"fisher",
"saccone",
"sidak",
"bonferroni",
"slowscore",# "simes", # "vegas", "forge",
"get.snps",
"get.scores"
),
correction.type=c( "Moskvina",
"none"),
ld.structure = FALSE,
use.position=(scoring.function != "get.snps"),
genome.name="unknown genome",
outfile.path="./LDsnpRout.txt",
add.call.header = TRUE,
generate.plink.set=FALSE,
...
){
population <- match.arg(population, c("ASW",
"CEU",
"CHB",
"CHD",
"EUR",
"GIH",
"JPT",
"LWK",
"MEX",
"MKK",
"TSI",
"YRI"))
scoring.function <- match.arg(scoring.function)
correction.type<-match.arg(correction.type)
comparator <- match.arg(comparator)
comparator <- getMethod(comparator)
flank.genes.right <- as.integer(flank.genes.right)
flank.genes.left <- as.integer(flank.genes.left)
if (! include.ld.data )
ld.data.hdf.url <- NULL
cat("loading snps\n")
snps <- tab2RangedData(snpdata.url, genome.name=genome.name, name.col=1, start.col=3, chrom.col=2, score.col=4, constwidth=1, header=TRUE, remove.duplicates=FALSE, sep="\t", use.position=use.position)
cat ("loading genome\n")
genes <- parse.cds(genome.url, filetype="delim", flank.at.start=flank.genes.left, flank.at.end=flank.genes.right )
cat("finding Overlaps\n")
ol <- findOverlaps(query=snps, subject=genes, select="all") # removed obsolete argument multiple =TRUE
fun <- if (generate.plink.set)
get.snps2
else
switch(scoring.function,
max = max,
min = min,
mean = mean,
median = median,
sum = sum,
product = prod,
count = function(x, ...) sum(!is.na(x)),
p.ratio = ratio.score,
snp.ratio.score = snp.ratio.score,
fisher = fisher.combined.p,
saccone = saccone,
sidak = sidak,
slowscore = slowscore,
bonferroni = bonferroni, #vegas = vegas, #forge = forge,
get.snps = get.snps,
get.scores = get.scores
)
cat ("computing weights\n")
comb.weights = compute.weights(matched.ranges=ol, snps=snps, genes=genes, ld.hdf.file=ld.data.hdf.url , ld.cutoff=ld.rho.cutoff, population=population, fun=fun, full.match=full.match, comparator=comparator, use.position=use.position, ld.structure = ld.structure, correction.type = correction.type, ...)
# reformat the data a bit:
if (! is.null(outfile.path)) {
if (add.call.header)
writeLines(paste("#", deparse(match.call())), con=outfile.path)
if (generate.plink.set) {
write.plink.set(comb.weights, outfile.path, append=add.call.header)
} else {
comb.weights.frame = as.data.frame(comb.weights)
if ('gpos' %in% colnames(comb.weights.frame))
comb.weights.frame = comb.weights.frame[,-grep('gpos',colnames(comb.weights.frame))]
print(paste("writing output to ", outfile.path))
suppressWarnings(write.table(file=outfile.path, comb.weights.frame, quote=FALSE, sep="\t", append=add.call.header))
}
}
print ("done")
attr(comb.weights, "call") <- match.call()
invisible(comb.weights)
}
# dependencies: IRanges, rhdf5
#uire(IRanges)
#require(rhdf5)
## get the names
get.snps <- function(x, sep=";", ...) {
12398274783298775;
return(paste(x, sep=sep, collapse=sep))
}
get.scores <- function(x, sep=";", ...) {
return(paste(as.numeric(x), sep=sep, collapse=sep))
}
get.snps2 <- function(x, sep="\n", ...) {
paste(x, sep=sep, collapse=sep)
}
annotate.snps <- function(x, ...) {
return(list(as.factor(x)))
}
# define some gene scoring functions according to Andrea Christoforou's notes
## (1) Minimum P-value
## > minP = min(gene1_pvals) #already implemented in the package.
## proportion of significant scores
## (2) SNP ratio – the percentage of the SNPs in the bin that meet a user-defined P-value threshold.
ratio.score <- function(pvals, maxP=0.05, ...) {
sum (pvals <= maxP, na.rm=TRUE) / sum(!is.na(pvals))
}
## (2a) SNP ratio score ^2 - ratio weighted by the number of total snps assigned
snp.ratio.score <- function(pvals, maxP=0.05, ...) {
rs <- ratio.score(pvals, maxP=0.05, ...)
len <- sum(!is.na(pvals))
rs^2 * len
}
## (3) Fisher’s combination test P-value
## > fishers_stat=(-2)*(sum(log(gene1_pvals),na.rm=TRUE))
## > fishers_pval=pchisq(fishers_stat,2*(length(gene1_pvals)),lower.tail=FALSE)
fisher.combined.p <- function (pvals, ...) {
fishers.stat= -2 * (sum(log(pvals), na.rm=TRUE))
return (pchisq(fishers.stat, 2*(sum(!is.na(pvals))), lower.tail=FALSE))
}
# Sidak_Saccone2007 = 1-((1-minP)^((N+1)/2))
saccone <- function(pvals, ...) {
N <- sum(!is.na(pvals))
minP <- min(pvals, na.rm=T)
1-((1-minP)^((N+1)/2))
}
sidak <- function(pvals, cor.array=NULL, correction.type = "none", ...){
if (correction.type!="none"){
fun2 <- switch(correction.type,
Nyholt = Nyholt,
Moskvina = Moskvina,
Gao = Gao
)
N<-fun2(cor.array)
}else{N <- sum(!is.na(pvals))}
minP <- min(pvals, na.rm=T)
1-(1-minP)^(N)
}
bonferroni <- function(pvals, ...){
N <- sum(!is.na(pvals))
minP <- min(pvals, na.rm=T)
minP*N
}
simes <- function(pvals, ...){
N <- sum(!is.na(pvals))
r<-rep((1/N), N)
index<-sort(pvals[!is.na(pvals)], index.return=TRUE)
N_e<- c(1:N)
min((N*index$x)/N_e)
}
###################################################################################################
###correction types
Moskvina<-function(cor.array)
{
if (is.matrix(cor.array)){
r_M<-rep(0,nrow(cor.array)-1)
k_M<-rep(0,nrow(cor.array)-1)
for (k1 in 1:(nrow(cor.array)-1))
{
r_M[k1]<-max(sqrt(cor.array[1:k1,k1+1]))
k_M[k1]<-sqrt(1-(r_M[k1])^(-1.31*log10(0.01)))
}
K_Mosk<-1+sum(k_M)
}else{K_Mosk<-1}
return(K_Mosk)
}
#######combination approarch
slowscore<-function(pvals, cor.array, ...)
{
T_0 <- -2 * (sum(log(pvals), na.rm=TRUE))
sigma2<-4*length(pvals) + 2*sum(cor.array*(3.25 + 0.75*cor.array))
c <- sigma2/(4*length(pvals))
pchisq((T_0/c), (2*length(pvals)/c), lower.tail=FALSE)
}
write.plink.set <- function(res, file=stdout(), append) {
df= as.data.frame(res[!is.na(res$'score.data'),])[,c('names','score.data')]
cat(paste(df$names, df$score.data, "END\n\n", sep="\n"), sep="", file=file, append=append)
}
parse.cds <- function (cdsfile, filetype=NULL, flank.at.start=NULL, flank.at.end=NULL) {
if (missing(filetype)) {
gr <- import(cdsfile)
return(gr)
} else if (filetype=="gff") {
gr <- import.gff(cdsfile, asRangedData=TRUE)
return (gr)
} else if(filetype=="delim") {
cdstable = read.delim(cdsfile)
if (ncol(cdstable) < 4 )
stop("need at least four columns")
if (! setequal (tolower(colnames(cdstable)[1:4]), c("names","chr", "start", "end")))
colnames(cdstable)[1:4] <- c("names","chr", "start", "end")
genenames <- make.names(cdstable$names, unique=TRUE)
startstop = t ( apply(cbind(as.integer(cdstable$start), as.integer(cdstable$end)), 1, sort) )
if (! is.null(flank.at.start))
startstop[,1] = startstop[,1] - flank.at.start
if (! is.null(flank.at.end))
startstop[,2] = startstop[,2] + flank.at.end
if (any(duplicated(genenames)))
stop(genenames[(duplicated(genenames))])
#if (! is.null(flank.at.start))
ranges = IRanges(start=startstop[,1], end=startstop[,2], names=genenames)
if (ncol(cdstable) > 4 ) {
meta <- cdstable[,-c(1:4)]
#row.names(meta) <- genenames
rd = RangedData(ranges, space=cdstable$"chr", meta)
} else {
rd = RangedData(ranges, space=cdstable$"chr")
}
return(rd)
} else {
gr <- import(cdsfile, type=type)
return(gr)
}
}
# utility function to load input data into RangedData object, not to be used externally
tab2RangedData <- function(filename, genome.name=NULL, name.col=1, start.col=2, end.col=3, chrom.col=4, score.col=NULL, flank.at.start=NULL, flank.at.end=NULL, constwidth=NULL, remove.duplicates=FALSE, header=TRUE, na.strings=c("", "NA","na"), sep="\t", use.position, ...) {
genes = read.table(filename, sep=sep, na.strings=na.strings, nrows=4, ...)
what = as.list(rep(NULL, ncol(genes)))
what[[name.col]] = character(0)
what[[chrom.col]] = character(0)
what[[start.col]] = integer(0)
if (! is.null(end.col) )
what[[end.col]] = integer(0)
if (! is.null(score.col) )
what[[score.col]] = double(0)
genes = scan (filename, what=what, sep=sep, na.strings=na.strings, skip=(as.numeric(header) ))
## remove possibly incomplete records
## genes = genes[complete.cases(genes),]
if (remove.duplicates) {
# TODO: not neccessary
# genes = genes[! duplicated(genes[[name.col]])]
}
## if (any(duplicated(genes[[name.col]]))) {
## stop (genes[duplicated(genes[[name.col]])])
## }
## make new marker names based on possition: like [chr.pos] e.g. 22.55100
## or keep the marker names as they are
range.names = if (use.position)
as.character(paste(genes[[chrom.col]], genes[[start.col]], sep="."))
else
make.names(as.character(genes[[name.col]]))
print (head(range.names))
if (is.null(constwidth)) {
stpos = genes[[start.col]]
endpos = genes[[end.col]]
if (! is.null(flank.at.start))
stpos = stpos - flank.at.start
if (! is.null(flank.at.end))
endpos = endpos + flank.at.end
ir = IRanges(start=stpos, end=endpos, names=range.names)
genome = RangedData(ranges=ir, space=genes[[chrom.col]], universe=genome.name)
}
else {
genome <- if (is.null (score.col)) {
RangedData(ranges =
IRanges(start=genes[[start.col]], width=constwidth, names=range.names),
space=genes[[chrom.col]], universe=genome.name) }
else {
RangedData(ranges =
IRanges(start=genes[[start.col]], width=constwidth, names=range.names),
space=genes[[chrom.col]], score=genes[[score.col]], universe=genome.name)
}
}
#rownames(genome) <- genome[,"names"]
return (genome)
}
# simulate some SNP data if you don't have them
simulate.snpdata <- function (n, genome.size, nchr=1){
ir = IRanges(start=as.integer(runif(n, min=1, max=genome.size)), width=1)
rd = RangedData(ir , p.value = pt(rt(n,df=100), df=100), space=rep(paste( 1:nchr, sep=""), length=n ), universe="Mygenome")
row
rownames(rd) = paste ("rs",sprintf(paste("%0",ceiling(log10(n))+1,".0f", sep=""),1:n), sep="")
return(rd)
}
# the LD data for a whole genome are too big to load into memory
# the LD data is loaded from a hdf5 file with a certain layout:
# top level groups - the population code
# the chromosomes must be saved on the second as groups
# the group name must match the chromosome name
# on the third level, there is a lists with elements: "rs1" "rs2" "score"
## example /CEU/1/rs1
# cutoff: cutoff value for the LD list of score. The list is reduced immediately
# to shrink it as soon as possible to save memory
# the content is returned as a list with elements $rs1 $rs2 $score
# if full.match == TRUE then 'pos1' and 'pos2' are added containing
# the snp positions in the LD files
# if sim=TRUE, simulated snp data of max. length max.pairs is returned
load.ld <- function(chrom.name=NULL, population=population, snps=NULL, hdf5.file=NULL, max.pairs=1E5, cutoff=0.8, comparator=get(">="), rsprefix="rs", sim=FALSE, full.match=FALSE, use.position) {
if (sim) {
# generate a random snp list of size max.pairs
nams = rownames(snps)
max.pairs = min(max.pairs, length(nams))
if (is.null(nams)) stop ("need all rownames to simulate ld data")
print (paste("sampling random ld space data of size", max.pairs))
return (list(rs1= sample(nams, size=max.pairs), rs2= sample(nams, size=max.pairs) ))
}
# this is supported now
if (is.na(hdf5.file)) {
print("NA hdf5 file")
return (NULL)
}
dsnams = as.vector(h5ls(hdf5.file)[[1]]) ##hdf5dir(file=hdf5.file, paste("/", population, sep=""))
if (!(paste("/", population,"/", chrom.name, sep="") %in% dsnams) ) {
cat(file=stderr(),paste("Chromosome", chrom.name, "in population", population, "not matched\n"))
return(NULL)
}
chr <- chrom.name # save the original chrom name
chrom.name <- paste("/", population, "/", chrom.name , sep="")
# get the data with low memory profile
print ( paste("loading ld data for ", chrom.name))
tmp0 = h5read(file=hdf5.file, chrom.name)
global.tmp <<- tmp0
global.ls.slice.name <<- chrom.name
ret = tmp0$value.set ##(hdf5load(file=hdf5.file, dataset="value.set", group=chrom.name,load=FALSE, verb=2))
#print ("gc()")
#gc() # free the mem as soon as possible
# make the filter:
print ("make filter")
fint = ( comparator(ret[[1]], cutoff) )
stopifnot ( is.logical(fint) )
print (paste("got", sum(fint), "entries"))
print("loading the data") # filter the list:
if (!use.position) {
ret = list(
rs1= tmp0[[3]], ###(hdf5load(file=hdf5.file, dataset="snp.id.one", group=chrom.name,load=FALSE, verb=2))[[1]][fint],
rs2= tmp0[[4]] ###(hdf5load(file=hdf5.file, dataset="snp.id.two", group=chrom.name,load=FALSE, verb=2))[[1]][fint]
)
#gc()
## add the prefix again, not so nice
ret[[1]] = paste(rsprefix, ret[[1]], sep="")
ret[[2]] = paste(rsprefix, ret[[2]], sep="")
} else {
## make marker names based on possition: like [chr.pos] e.g. 22.55100
ret = list(
rs1= tmp0[[1]], ### (hdf5load(file=hdf5.file, dataset="marker.pos.one", group=chrom.name,load=FALSE, verb=2)[[1]][fint]),
rs2= tmp0[[2]] ### (hdf5load(file=hdf5.file, dataset="marker.pos.two", group=chrom.name,load=FALSE, verb=2))[[1]][fint]
)
ret[[1]] = paste(chr, ret[[1]], sep=".")
ret[[2]] = paste(chr, ret[[2]], sep=".")
stopifnot(all(!is.na(ret)))
}
if (full.match) {
# load the other contents of the data
ret = append (ret, list(
loc1=tmp0[[1]][fint], ### (hdf5load(file=hdf5.file, dataset="marker.pos.one", group=chrom.name,load=FALSE, verb=2))[[1]][fint],
loc2=tmp0[[2]][fint] ###(hdf5load(file=hdf5.file, dataset="marker.pos.two", group=chrom.name,load=FALSE, verb=2))[[1]][fint]
)
)
}
rm(tmp0)
return(ret)
}
match.inbound <- function(snp.pos, names, snpnames, filter, genes, space) {
# match the =right= column of snp pos, using the names from the left
# why? because:
# snppos.1 snppos.2 snpid.1 snpid.2
# 10 1000 rs1(on chip) rs2(not on chip)
# rs1 ~ rs2, but rs2 is not on the chip, take the name of rs2 to get its p-value
# snps must be unique but the names are not, because they replace snps with
# on in linkage disequilibrium
# if there is nothing match then we can get off here
if (length(filter) < 1 || sum(filter) < 1)
return ( NULL )
# we to keep the snps in line in the names vector
names <- names[filter]
# for reasons of debugging make unique names
snpsadd = RangedData(
IRanges(start=snp.pos[filter], width=1, names=make.names(names, unique=TRUE) ),
space=space,
nu.names=names
)
#print (snpsadd)
# the scoring function expects matches per chromosome, even if there is only
# one space in the matched list
# the indices of the genes should be ok then
ol1 <- as.matrix (findOverlaps(query=snpsadd, subject=genes, select="all")[[space]])
# add these matches to the initial match matrix,
# therefore, we have to match the snpnames to the positions in the
# chip snps, the genes are the same as before so we can leave them
# 1. get the names of the snps:
rn <- names[ol1[,"queryHits"]]
# 2. match the names with the global snps
# and replace them, there cannot be any missing because we tested for them before
ol1[,"queryHits"] <- match(rn, snpnames)
# ol1 now must be made unique again
ol1 <- unique(ol1)
cat(paste("number of inbound overlaps unique:",nrow(ol1),"\n"))
return (ol1)
}
# compute the combined weights,
# internal function
compute.weights <- function(matched.ranges, snps, genes, ld.hdf.file=NULL, population=NULL, ld.cutoff=0.8, fun, full.match=FALSE, comparator = get(">="), use.position, ld.structure = ld.structure, correction.type = correction.type, ...) {
# the datasets in the ld data list must have the same names as the spaces
if (! is.null(ld.hdf.file)) {
# add a global position vector
genes$"gpos" <- 1:nrow(genes)
gscore = rep(NA, nrow(genes))
for (m in 1:length(matched.ranges)) {
# cannot load all ld data at once, so we have to go space by space
space = names(matched.ranges[m])
print (paste("processing space",space))
if (! space %in% names(genes)) {
cat (paste("space", space, "not present in gene space, skipping\n"))
next
}
snp.space = snps[space]
gene.space = genes[space]
matchmat = as.matrix(matched.ranges[[m]])
print(paste("got", dim(matchmat), "matches"))
snpnames = as.character(rownames(snp.space)) # a bit faster not to load the names all the time
if (is.null(snpnames))
stop ("All SNPs need valid rs-snp ids to run LD analysis")
ldmatch = load.ld(chrom.name=space, population=population, snp.space, hdf5.file=ld.hdf.file, max.pairs=1E6, cutoff=ld.cutoff, sim=FALSE, full.match=full.match, comparator=comparator, use.position=use.position )
# ldmatch=NULL
if (!is.null(ldmatch)) {
# something is found
# exchange the identifiers of the ld data with the numeric match indices
print ("matching ids on chip:")
print (length(ldmatch[[1]]))
# print(head(ldmatch[[1]]))
# print(head(ldmatch[[2]]))
print(head(snpnames))
rs1 = match(as.character(ldmatch[[1]]), snpnames)
rs2 = match(as.character(ldmatch[[2]]), snpnames)
# print(cbind(rs1,rs2))
# remove NA from non-matching snps,
# they have to be matched back by crossmatching
tmp = ! (is.na(rs1) | is.na(rs2))
if ( sum((tmp)) <= 0 ) warning("Nothing matched! Incompatible SNP coordinates?")
rs1 = rs1[tmp]
rs2 = rs2[tmp]
print("reduced to")
print(length(rs1))
# boil down:
# remove those, which are not in "query"
print("boiling down to:")
query = as.numeric(matchmat[,"queryHits"])
ind = ( ( rs1 %in% query ) | (rs2 %in% query) )
print (length(ind))
rs1 = as.numeric(rs1[ind])
rs2 = as.numeric(rs2[ind])
print("making unique pairs...")
crossmat = unique( rbind (cbind(rs1,rs2), cbind(rs2,rs1) ) )
print (dim(crossmat))
print("added value...")
## make two crossmatching lists for fast query:
addedv = split(f=crossmat[,1], crossmat[,2])
print("value, not NA:")
## extend the match matrix: this needs some dirty tricks:
addedv2 = addedv[match(as.character(query), names(addedv))]
# need to remove data later
fint = which( ! is.na(names(addedv2)))
print(length(fint))
# list has length of 'query' and the same order as the matchmatrix
# with some missing values, but the
# names are still wrong, so replace them with the gene names
genenams = as.character(matchmat[,"subjectHits"])
names(addedv2) <- genenams
# now we can remove the empty entries,
# the list now associates genes and added snps
addedv3 = addedv2[fint]
print (paste("length of added value: ", length(addedv3)))
# the list names are geneids and not unique
print ("double split")
spl2 = split(addedv3, names(addedv3)) # split on listnames to get these into the list, BAD TRICK!
# make a match gene matrix by repeating the gene
print ("extend match matrix")
ret = lapply(spl2, function(x) {
as.matrix( cbind(unlist(x), rep(names(x)[1], length(unlist(x))) ) )
}
)
# add crossmatching, therefore we need to overlap backwards again
# This step is equivalent to simple crossmatch, if and only if:
# All LD snps in the LD data are also on the chip
if (full.match) {
cat("doing inbound matching\n")
# we need both the matches to filter
match.back.1 <- ldmatch[[1]] %in% snpnames
match.back.2 <- ldmatch[[2]] %in% snpnames
# we seek the pairs where exactly one partner is on the chip,
# if both are on the chip, we have already matched outbound:
# Done: if only inbound matching is applied, then
# crossmatch has to be OR because otherwise we would loose these cases
# Done: implement inbound/outbound strategy separatly
crossmatch = xor(match.back.1, match.back.2)
# this gives us a reduced overlap index
# reduce the data further:
match.back.1 <- match.back.1 & crossmatch
match.back.2 <- match.back.2 & crossmatch
ol1 <- match.inbound(ldmatch[[4]], ldmatch[[1]], snpnames, match.back.1, genes, space)
ol2 <- match.inbound(ldmatch[[3]], ldmatch[[2]], snpnames, match.back.2, genes, space)
# this can be added to the match.matrix:
matchmat <- rbind(matchmat,ol1, ol2)
}
print ("extend step2")
#lapply(ret, function(x) { #matchmat <<- rbind(matchmat, x);})
# add all results to the match matrix
matchmat <- rbind(matchmat, do.call(rbind, ret))
print(paste("after extension got", nrow(matchmat), "matches"))
#
matchmat <- unique(matchmat)
print (paste("after uniquifying got", nrow(matchmat), "matches"))
}
print("add to scoring")
gscore = do.score2(matchmat, genes, gene.space, snp.space, fun, gscore, correction.type, hdf5.file=ld.hdf.file, ld.structure, ...)
}
# assign score vector to genome:
genes$"score.data" <- unlist(gscore)
return(genes)
} else {
# print(matched.ranges)
tmp = as.matrix(matched.ranges)
# print(dim(tmp))
return (do.score(tmp, genes, snps, fun, correction.type, hdf5.file=ld.hdf.file, ld.structure, ...))
}
}
# rbinds a list of matrices together, all matrices must have
# the same number of columns
# this is faster than using an intermediate dataframe
# a simple test:
# bla = list(a=(matrix(1:55,nrow=1E4,ncol=2,by=T)), b=(matrix(1:7,nrow=1E4,ncol=2,by=T)))
# all (list.rbind(bla) == rbind(bla[[1]],bla[[2]]))
#should be true
list.rbind <- function(matlist) {
ncol = ncol(matlist[[1]])
matlist = lapply(matlist,t)
return ( t(matrix(unlist(matlist), nrow=ncol, byrow=F)) )
}
# compute combined scores given a whole match matrix object
# for the non-LD case
do.score <- function (match.mat, genes, snps, fun, correction.type, hdf5.file, ld.structure, ...) {
## claculate the combined score
## get the p-values
print ("get scores....")
scorevect = rep(NA, nrow(genes))
match.mat= as.matrix(match.mat)
nu.nams = rownames(snps)
##add chr and bp
chr.nams = as.numeric(as.character(snps$space))
bp.nams = data.frame(snps$ranges)$start
#tmp = data.frame(match.mat, score=score( snps[as.numeric(match.mat[,"queryHits"]),]), snp.ids=( rownames( snps[as.numeric(match.mat[,"queryHits"]),]) ) )
#####12.11.2014. include chr and bp
tmp = data.frame(match.mat, score=score( snps[as.numeric(match.mat[,"queryHits"]),]),
snp.ids = nu.nams[as.numeric(match.mat[,"queryHits"])], snp.chr = chr.nams[as.numeric(match.mat[,"queryHits"])], snp.bp = bp.nams[as.numeric(match.mat[,"queryHits"])], row.names=NULL )
## split p-values by gene (subject)
print("split...")
if ( ( ! is.null (body(fun)))
&& ( length (body(fun)) == length(body(get.snps)) )
&& all(body(fun) == body(get.snps)))
{
print ("using snp names instead of score")
splitti = split(tmp$snp.ids, f=tmp$subjectHits)
} else {
splitti = split(tmp$score, f=tmp$subjectHits)
splitti.chr = split(tmp$snp.chr, f=tmp$subjectHits)
splitti.bp = split(tmp$snp.bp, f=tmp$subjectHits)
}
rm (tmp)
## apply the combination function
print("before sapply")
if (is.na(pmatch("cor.array",formalArgs(fun))))
{
tmp2 = lapply(splitti, fun, ...)
}
else{
##splitti.chr and splitti.bp to function func.r2
##create r2 matrix and merge list of pvals with the list of matrixes
##
## print("merge coordinates...")
coor<-cbind(splitti.chr, splitti.bp)
cor.array<-list()
if (ld.structure){
cor.array<-mapply(func_r2, splitti.chr, splitti.bp, hdf5.file)
}else{
cor.array<-mapply(func_r2_est, splitti, splitti.bp)##mapply(func_r2, splitti.chr, splitti.bp)
##i use this option because the exact ld loading is very long.
##
}
tmp2<-mapply(fun, splitti, cor.array, correction.type)#list()
}
rm(splitti)
print("getting index")
## assign it to the right positions in the scorevector
ind = as.numeric(names(tmp2))
print("add to scorevect")
scorevect[ind] <- tmp2
stopifnot(length(ind) == length(tmp2))
genes$"score.data" <- unlist(scorevect)
## the scorevector contains all gene scores and can be attached to the genes
return (genes)
}
## do the combined score and return a score vector
## applied for each chromosome/space and a match matrix made
## from ranges lists
## the gene space needs a gpos field denoting the absolute position
## of the Range
## takes and returns a scorevector
do.score2 <- function (match.mat, global.genes, local.genes, local.snps, fun, score.vector, correction.type, hdf5.file=ld.hdf.file, ld.structure, ld.hdf.file,...) {
# claculate the combined score
# get the p-values
print ("get scores 2....")
stopifnot( nrow(global.genes) == length(score.vector))
if (any (is.na(local.genes$gpos)) || any (is.null(local.genes$gpos)))
stop("all ranges need a global position attribute(gpos)")
match.mat = as.matrix(match.mat)
nu.nams = rownames(local.snps)
##add chr and bp
chr.nams = as.numeric(as.character(local.snps$space))
bp.nams = data.frame(local.snps$ranges)$start
#tmp = data.frame(match.mat, score=score( local.snps[as.numeric(match.mat[,"queryHits"]),]),
# snp.ids = nu.nams[as.numeric(match.mat[,"queryHits"])], row.names=NULL )
# # split p-values by gene (subject)
#####12.11.2014. include chr and bp
tmp = data.frame(match.mat, score=score( local.snps[as.numeric(match.mat[,"queryHits"]),]),
snp.ids = nu.nams[as.numeric(match.mat[,"queryHits"])], snp.chr = chr.nams[as.numeric(match.mat[,"queryHits"])], snp.bp = bp.nams[as.numeric(match.mat[,"queryHits"])], row.names=NULL )
# split p-values by gene (subject)
print("split...")
if ( ( ! is.null (body(fun)))
&& ( length (body(fun)) == length(body(get.snps)) )
&& all(body(fun) == body(get.snps))) {
print ("using snp names instead of score")
splitti = split(tmp$snp.ids, f=tmp$subjectHits)
} else {
splitti = split(tmp$score, f=tmp$subjectHits)
splitti.chr = split(tmp$snp.chr, f=tmp$subjectHits)
splitti.bp = split(tmp$snp.bp, f=tmp$subjectHits)
}
#rm (tmp)
# apply the combination function
# print("before sapply")
if (is.na(pmatch("cor.array",formalArgs(fun))))
{
tmp2 = lapply(splitti, fun, ...)
} else{
##splitti.chr and splitti.bp to function func.r2
##create r2 matrix and merge list of pvals with the list of matrixes
##
coor<-cbind(splitti.chr, splitti.bp)
if (ld.structure){
print ('retreiving ld structure for all genes')
### SLOW, this is the point that needs to be improved
print(system.time(cor.array<-mapply(func_r2, splitti.chr, splitti.bp, hdf5.file)))
}else{
cor.array<-mapply(func_r2_est, splitti, splitti.bp)##mapply(func_r2, splitti.chr, splitti.bp)
}
tmp2<-mapply(fun, splitti, cor.array, correction.type)
}
#rm(splitti)
print("getting index")
# assign it to the right positions in the scorevector
ind = as.integer(names(tmp2))
# the index is the names of the local.genes in chromosome
# as split might have reordered completely, we have to re-order it
global.ind = local.genes[ind,]$gpos
if (length(global.ind) != length(tmp2))
stop (paste("different length in scores: gind: ",length(global.ind),"tmp ", length(tmp2) ))
print("add to scorevect")
score.vector[global.ind] <- tmp2
# genes$"score.data" <- scorevect
# the scorevector contains all gene scores and can be attached to the genes
return (score.vector)
}
#######################################################################################################################################
func_r2<-function(chr, bp, hdf5.file){
chr<-as.numeric(chr)
bp<-as.numeric(bp)
if (length(bp)>1){
print(bp[1])
mat<-load.ld.matrix.2(hdf5.file=hdf5.file,chrom.name=chr[1], positions=bp)
# index<-match(bp, row.names(matrix)) This is already done by the load ld matrix function
# cor.array<-matrix[index,index]
}else{
mat<-matrix(1)
}
return(mat)
}
#####################################################################################################
load.ld.matrix.bak <- function(chrom.name=NULL, population="EUR", hdf5.file=NA, positions=NA, verbose=F, rsprefix="rs") {
positions <- sort(unique(positions)) # just in case
print("tmp1...")
if (is.na(hdf5.file)) { print("NA hdf5 file"); return (NULL) }
#if (anyNA(positions)) { print("NA positions"); return(NULL) }
dsnams = as.vector(h5ls(hdf5.file)[[1]]) ##hdf5dir(file=hdf5.file, paste("/", population, sep=""))
if (!(paste("/", population,"/", chrom.name, sep="") %in% dsnams) ) {
cat(file=stderr(),paste("Chromosome", chrom.name, "in population", population, "not matched\n"))
return(NULL)
}
ld.slice.name <- paste("/", population, "/", chrom.name , sep="")
## try to avoid loading same ld data over and over again
if (ld.slice.name != global.ls.slice.name) {
rm (global.tmp)
rm (global.ls.slice.name)
global.ld.slice.name <<- ld.slice.name
## get the data with low memory profile
print ( paste("Loading ld data positions for", ld.slice.name))
tmp <- h5read(file=hdf5.file, ld.slice.name)
global.tmp <<- tmp
} else {
print ("Not loading, using global object!")
tmp <- global.tmp
}
snp.pairs = list(
pos1 = tmp[[1]],
pos2 = tmp[[2]]
)
print ( paste("Selecting associations for",length(positions),'snps on chromosome',chrom.name) )
### SLOW:
### making the selection is slow
selection <- (( snp.pairs[[1]] %in% positions & snp.pairs[[2]] %in% positions ))
### if selection is empty, we can return immediately
if (sum (selection) < 1) {
return(diag(1,nrow=length(positions), ncol=length(positions) ))
}
### SLOW: change this into some vectorized operation
print (" make matrix of indexes, this is slow" )
pos1.sel = snp.pairs[[1]][selection]
pos2.sel = snp.pairs[[2]][selection]
#### This is better written as:
#m<-matrix(rep(NA,length(positions)**2),ncol=length(positions))
m<-matrix(NA, nrow = length(positions), ncol=length(positions))
for (i in 1:length(positions)) {
### SLOW: if the matrix is symmetric, we don't need to start at
### j for each i, instead we could start at i+1 ????
for (j in 1:length(positions)) {
if (verbose) {
#cat(length(new.row),'.',fill=F,sep='')
cat('.',fill=F,sep='')
}
### SLOW: this is the diagonale,isnt it set to 1 anyway?
### This will no longer happen and the matrix was already initialized as such
if (i == j) {
m[i,j] <- NA
next
}
if ( sum(positions[i] == pos1.sel & positions[j] == pos2.sel) >= 1 ) {
if (sum(positions[i] == pos1.sel & positions[j] == pos2.sel) > 1) { print("Multiple associations!!!")
tmp0 <- which(selection)[which(pos1.sel == positions[i] & pos2.sel == positions[j])]
m[i,j]<-tmp0[1]}else{
m[i,j] <- which(selection)[which(pos1.sel == positions[i] & pos2.sel == positions[j])]}
m[j,i]<-m[i,j]
}
}
}
print ("...done")
#cross matrix.selection with assoc
assoc = tmp$value.set[as.vector(m)]
# rm(tmp)
if (length(pos1.sel)>0){
ret = matrix(assoc, nrow=length(positions), ncol=length(positions))
}else{ret<-mat.or.vec(length(positions),length(positions))}
# make result matrix
ret[is.na(ret)]<-0
rownames(ret)<-positions
colnames(ret)<-positions
# set the diagonal
diag(ret) <- 1.0
return(ret)
}
load.ld.matrix.2 <- function(chrom.name=NULL, population="EUR", hdf5.file=NA, positions=NA, verbose=F, rsprefix="rs") {
positions <- sort(unique(as.integer(positions)))
mti <- rep(positions, 1000) # don't ask :D
# mti <- positions
# just in case, we don't need anything twice
#print("tmp1...")
if (is.na(hdf5.file)) { print("NA hdf5 file"); return (NULL) }
#if (anyNA(positions)) { print("NA positions"); return(NULL) }
dsnams = as.vector(h5ls(hdf5.file)[[1]]) ##hdf5dir(file=hdf5.file, paste("/", population, sep=""))
if (!(paste("/", population,"/", chrom.name, sep="") %in% dsnams) ) {
cat(file=stderr(),paste("Chromosome", chrom.name, "in population", population, "not matched\n"))
return(NULL)
}
ld.slice.name <- paste("/", population, "/", chrom.name , sep="")
## try to avoid loading same ld data over and over again
if (ld.slice.name != global.ls.slice.name) {
rm (global.tmp)
rm (global.ls.slice.name)
global.ld.slice.name <<- ld.slice.name
## get the data with low memory profile
print ( paste("Loading ld data positions for", ld.slice.name))
tmp <- h5read(file=hdf5.file, ld.slice.name)
global.tmp <<- tmp
} else {
print ("Not loading, using global object!")
tmp <- global.tmp
}
## redundant assignment
#snp.pairs <- NULL
## check what the fastest processing would be
print ( paste("Selecting associations for",length(positions),'snps on chromosome',chrom.name) )
### SLOW:
### making the selection is slow, and incorrect, because we wanted all snps contained in genes, right?
### select all snps where at least one is inside the gene:
# selection <- ( tmp[[1]] %in% positions | tmp[[2]] %in% positions )
### select all snps where both snps are inside the gene:
## leaving it like this for timing comparison
selection <- intersect(
which(fmatch(tmp[[1]], mti, nomatch = 0 ) != 0),
which(fmatch(tmp[[2]], mti, nomatch = 0 ) != 0)
)
#browser()
#### selection <- ( tmp[[1]] %in% positions) & (tmp[[2]] %in% positions )
### if selection is empty, we can return immediately,
### Question, what should be return then????
### Make the return matrix, it's a diagonale matrix with all other entries 0
### dimension is the same as length positions
mat <- diag(1,nrow=length(positions), ncol=length(positions) )
### assing proper dimnames
dimnames(mat) <- list(positions, positions)
if (length (selection) < 1) {
return(mat)
}
### SLOW: change this into some vectorized operation
print ("reduce to selected entries only" )
pos1.sel <- tmp$marker.pos.one[selection]
pos2.sel <- tmp$marker.pos.two[selection]
val <- tmp$value.set[selection] # the r2 values
#### This is better written as:
#m<-matrix(rep(NA,length(positions)**2),ncol=length(positions))
#m<-matrix(NA, nrow = length(positions), ncol=length(positions))
### assign sparse values to matrix by indexing using a matrix:
### create the match index:
print("matching positions")
#ind1 <- match(pos1.sel, positions)
#ind2 <- match(pos2.sel, positions)
mat[cbind(as.character(pos1.sel), as.character(pos2.sel))] <- val
return (mat)
# print ("computing eigen values")
### that's it, compute the eigen values and return them only:
## does not work
# return (eigen(mat, only.values=T))
#### END!
}
############################
###dummy temporal function
func_r2_est<-function(p,bp){
p<-as.numeric(p)
bp<-as.numeric(bp)
if (length(p)>1){
c<-mat.or.vec(length(p),length(p))
diag(c)<-1
for (i in 1:(length(p)-1)){
for (j in (i+1):(length(p)))
{
if (abs(bp[i]-bp[j])<500000){
c[i,j]<-min(p[i],p[j])/max(p[i],p[j])
c[j,i]<-c[i,j]
}
}
}
}else{c<-1}
return(c)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.