# NOTE: chunk_h5file and chunk_h5d are incomplete stubs; as written they only
# validate their inputs (and open the source dataset) without doing any rechunking.
chunk_h5file <- function(in_h5file,out_h5dir,chunksize=NULL){
require(h5)
stopifnot(file.exists(in_h5file))
}
chunk_h5d <- function(in_h5file,in_groupname,in_dataname,out_h5dir,out_fname,out_groupname,out_dataname,chunksize=NULL){
require(h5)
stopifnot(!is.null(chunksize))
df <- h5file(in_h5file,'r')
dpath <- df[paste0('/',in_groupname,'/',in_dataname)]
ddim <- dim(dpath)
}
sample_h5_df <-function(mh5file,groupname,blocksize=20000,pval.cutoff=1e-5,gwasdf=NULL,tstat_df=337){
require(h5)
require(dplyr)
cut_t <- qt(p = pval.cutoff,df = tstat_df,lower.tail = F)
tdf <- read_df_h5(mh5file,groupname=groupname,subcols = c("theta","serr"))
# read_df_h5 expects integer row indices, so convert the logical filter with which()
filtervec <- which(abs(tdf$theta/tdf$serr)>cut_t)
rm(tdf)
snpdf <- read_df_h5(mh5file,groupname=groupname,subcols=c("chrom","pos")) %>% distinct
stopifnot(nrow(distinct(snpdf,chrom))==1)
ldblocks <- sort(unique(c(seq(from=min(snpdf$pos)-1,to=max(snpdf$pos)+1,by = blocksize),max(snpdf$pos)+1)))
snpdf <- mutate(snpdf,ldblock=cut(pos,breaks = ldblocks,labels = F,right=F))
sigdf <- read_df_h5(mh5file,groupname=groupname,filtervec = filtervec) %>% inner_join(snpdf,by=c("chrom","pos"))
if(!is.null(gwasdf)){
sigdf <- inner_join(sigdf,gwasdf,by=c("chrom","pos"))
}
sigdf<- mutate(sigdf,tstat=theta/serr) %>% mutate(pvale=pt(abs(tstat),tstat_df,lower.tail = F)) %>%
group_by(fgeneid,ldblock) %>% filter(pvale==min(pvale)) %>% ungroup()
gc()
return(sigdf)
}
#' Turn a matrix of diagonals into a sparse banded matrix
#'
#' @param bmat matrix of diagonals, one band per row, with the main diagonal in the first row
#' @return a symmetric sparse banded matrix (Matrix package class)
from_band <- function(bmat){
require(Matrix)
tmat <- bandSparse(n = ncol(bmat),m = ncol(bmat),k=-c(0:(nrow(bmat)-1)),diagonals = t(bmat),symmetric = T)
return(tmat)
}
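# Example (illustrative sketch, not run on source): build a small symmetric banded
# matrix from a 2 x 4 matrix of diagonals; the values below are made up.
# bm <- rbind(c(2,2,2,2),  # main diagonal
#             c(1,1,1,1))  # first off-diagonal (only the first 3 entries are used)
# from_band(bm)  # 4 x 4 symmetric sparse matrix with 2 on the diagonal, 1 on the first band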
readMATLAB <- function(matfile,group="BR"){
require(rhdf5)
matres <- from_band(h5read(matfile,name=group))
return(matres)
}
matLines <- function(gzc,nr,nc){
tL <-readLines(gzc,n = nr)
tLL <- matrix(as.integer(unlist(strsplit(tL,split = " ",fixed = T,useBytes = T))),
nrow = nr,ncol = nc,byrow = T)
return(tLL)
}
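# Example (sketch): read the first 2 rows of a whitespace-delimited integer matrix with
# 3 columns from a gzipped text file; "geno.txt.gz" is a hypothetical file name.
# gzc <- gzfile("geno.txt.gz",open = "rt")
# m <- matLines(gzc,nr = 2,nc = 3)
# close(gzc)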
#' Write covariate data to HDF5
#' @param covarf covariate file (header row, whitespace-delimited, with an ID column)
#' @param h5file output HDF5 file
#' @param chunksize chunk size for the id dataset (the default of 1 is sufficient)
#' @param deflate_level gzip compression level
write_covar_h5 <- function(covarf,h5file,chunksize=1,deflate_level=9){
require(rhdf5)
library(dplyr)
covardat <- read.table(covarf,header=T,stringsAsFactors = F)
matdat <-t(data.matrix(select(covardat,-ID)))
write_2dmat_h5(h5file,"Covardat","covariates",data = matdat)
h5createGroup(file = h5file,group = "Covarinfo")
h5createDataset(h5file,"/Covarinfo/id",
dims=c(nrow(covardat)),
storage.mode="character",size=max(nchar(covardat$ID))+1,
chunk=chunksize,level=deflate_level)
h5write(covardat$ID,file=h5file,name="/Covarinfo/id")
}
read_ind_h5 <- function(h5filename,groupname,dataname,index){
require(h5)
minind <- min(index)
maxind <- max(index)
hf <- h5file(h5filename,'r')
hd <- hf[paste0('/',groupname,'/',dataname)]
dimd <- dim(hd)
subd <- hd[,minind:maxind][,(index-minind)+1]
h5close(hf)
return(subd)
}
read_subset_h5 <- function(h5filename,groupname,dataname,index,chunk_size=NULL,chunk_num=NULL){
stopifnot(!is.null(chunk_size)||!is.null(chunk_num))
require(BBmisc)
if(!is.null(chunk_size)){
chunkind <-chunk(x = index,chunk.size = chunk_size)
}else{
chunkind <-chunk(x = index,n.chunks = chunk_num)
}
fdata <- do.call("cbind",
lapply(chunkind,read_ind_h5,h5filename=h5filename,groupname=groupname,dataname=dataname))
stopifnot(length(index)==ncol(fdata))
return(fdata)
}
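# Example (sketch): pull an arbitrary set of columns from a (hypothetical) 2D dataset
# "/dosage/genotype" in "geno.h5", reading a contiguous slab per chunk of indices.
# X <- read_subset_h5("geno.h5","dosage","genotype",index = c(5,9,12,200,210),chunk_size = 3)
# ncol(X)  # 5, one column per requested index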
copy_subset_h5 <-function(h5filename,groupname,dataname,index,chunk_size=NULL,chunk_num=NULL,newh5filename){
stopifnot(!is.null(chunk_size)||!is.null(chunk_num))
require(BBmisc)
if(!is.null(chunk_size)){
chunkind <-chunk(x = index,chunk.size = chunk_size)
}else{
chunkind <-chunk(x = index,n.chunks = chunk_num)
}
for(i in 1:length(chunkind)){
tdata <- read_ind_h5(h5filename,groupname,dataname,chunkind[[i]])
write_2dmat_h5(newh5filename,groupname,dataname,deflate_level=4,data=tdata,append=T)
}
nf <- h5file(newh5filename,mode='r')
nd <- nf[paste0('/',groupname,'/',dataname)]
stopifnot(ncol(nd)==length(index))
h5close(nf)
return(T)
}
read_2d_mat_h5 <- function(h5f,groupn,datan,rows=NULL,cols=NULL){
require(h5)
stopifnot(file.exists(h5f))
h5ff <- h5file(h5f,'r')
h5g <- h5ff[groupn]
if(is.null(rows)&&is.null(cols)){
data <- h5g[datan][,]
}else if(is.null(cols)){
data <- h5g[datan][rows,]
}else if(is.null(rows)){
data <- h5g[datan][,cols]
}else{
data <- h5g[datan][rows,cols]
}
h5close(h5ff)
return(data)
}
write_2dmat_h5 <- function(h5f,groupn,datan,chunksize=c(5000,5000),deflate_level=4,data=NULL,append=F){
require(h5)
stopifnot(!is.null(data))
# cap the requested chunk size at half the data dimensions so createDataSet does not fail
if(any(chunksize>dim(data))){
toobig <- which(chunksize>dim(data))
chunksize[toobig] <- pmax(1L,as.integer(ceiling(dim(data)[toobig]/2)))
}
hf <- h5file(h5f,mode='a')
if(!existsGroup(hf,groupname=groupn)){
grp <- createGroup(hf,groupname = groupn)
}else{
grp <- openGroup(hf,groupname = paste0("/",groupn))
}
if(existsDataSet(grp,datan)){
if(append){
ds <- openDataSet(grp,datan)
current_row <- nrow(ds)
current_col <- ncol(ds)
data_row <- nrow(data)
data_col <- ncol(data)
stopifnot(current_row==data_row)
new_dim <- c(current_row,current_col+data_col)
nds <-extendDataSet(ds,dims = new_dim)
nds[,(current_col+1):new_dim[2]] <- data
}
else{stop("Dataset already exists!")}
}else{
wdata <- createDataSet(grp,datasetname = datan,data=data,chunksize=chunksize,compression=deflate_level)
stopifnot(all(dim(wdata)==dim(data)))
}
h5close(hf)
return(T)
}
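# Example (sketch): write a matrix to "/mat/X" of a hypothetical file, append more
# columns to it, then read a block back with read_2d_mat_h5 (defined above).
# m <- matrix(rnorm(100*50),100,50)
# write_2dmat_h5("ex_mat.h5","mat","X",data = m)
# write_2dmat_h5("ex_mat.h5","mat","X",data = matrix(rnorm(100*25),100,25),append = T)
# read_2d_mat_h5("ex_mat.h5","mat","X",rows = 1:10,cols = 1:5)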
write_covar_2d <- function(covarf,h5filename,chunksize=1,deflate_level=9){
require(h5)
require(rhdf5)
require(dplyr)
covardat <- read.table(covarf,header=T,stringsAsFactors = F)
matdat <-t(data.matrix(select(covardat,-ID)))
ncovar <- ncol(matdat)
nid <- nrow(matdat)
write_2dmat_h5(h5f = h5filename,groupn = "Covardat",datan = "covariates",data=matdat,append=F)
h5createGroup(file = h5filename,group = "Covarinfo")
h5createDataset(h5filename,"/Covarinfo/id",
dims=c(nrow(covardat)),
storage.mode="character",size=max(nchar(covardat$ID))+1,
chunk=chunksize,level=deflate_level)
h5write(covardat$ID,file=h5filename,name="/Covarinfo/id")
}
write_rows_gzfile_h5 <- function(gzfilename,subset_rows,h5filename,groupname,dataname,max.size=335544320L){
library(iotools)
require(h5)
gzf <- gzfile(gzfilename,open = 'rt')
chunk_mat <- iotools::chunk.reader(gzf,sep=" ")
i <- 1
mchunk <- iotools::mstrsplit(iotools::read.chunk(chunk_mat,max.size = max.size),sep=" ",type="numeric")
while(nrow(mchunk)>0){
cat("chunk:",i,"\n")
file_rows <- i:(i+nrow(mchunk)-1)
stopifnot(length(file_rows)==nrow(mchunk))
mat_rows <- 1:nrow(mchunk)
sub_chunk <- mat_rows[which(file_rows %in% subset_rows)]
tchunk <- t(mchunk[sub_chunk,,drop=F])
cat("dim:",dim(tchunk))
if(ncol(tchunk)>0){
write_2dmat_h5(h5filename,groupn=groupname,datan=dataname,data=tchunk,append=T)
}
if(i==1){
nid <- nrow(tchunk)
}
i <- file_rows[length(file_rows)]+1
mchunk <- iotools::mstrsplit(iotools::read.chunk(chunk_mat,max.size = max.size),sep=" ",type="numeric")
}
close(gzf)
h5f <- h5file(h5filename,'r')
h5d <- h5f[paste0("/",groupname,"/",dataname)]
stopifnot(ncol(h5d)==length(subset_rows))
h5close(h5f)
return(T)
}
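# Example (sketch): copy rows 10, 20 and 30 of a (hypothetical) space-delimited, gzipped
# matrix into "/dosage/genotype" of subset.h5; each kept row is written as a column.
# write_rows_gzfile_h5("geno.txt.gz",subset_rows = c(10,20,30),
#                      h5filename = "subset.h5",groupname = "dosage",dataname = "genotype")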
write_gtex_eqtl_h5 <- function(txtfile,eqtlh5file,chunksize=500000){
require(readr)
require(tidyr)
require(dplyr)
gtex_cbf <- function(x,pos){
th5file <- eqtlh5file
annocols <-c("chrom","pos","ref","alt","b37")
nx <-separate(x,variant_id,into = annocols,sep="_",convert = T) %>% mutate(doFlip=as.integer(ref<alt),fgeneid=as.integer(gsub("ENSG([0-9]+).+","\\1",gene_id)))
annodf <- select(nx,-b37,-gene_id,-ref,-alt) %>%rename(pval.e=pval_nominal,weight=slope) %>%
mutate(chrom=ifelse(chrom=="X",23,chrom)) %>%
mutate(chrom=as.integer(chrom),
pos=as.integer(pos),
weight=ifelse(doFlip==1,-weight,weight))
rmna <- is.na(annodf$chrom)|is.na(annodf$pos)
annodf <- filter(annodf,!rmna)
write_df_h5(df = annodf,groupname = "cis_eQTL",outfile = th5file,deflate_level = 4)
}
wbdf <- read_delim_chunked(txtfile,delim = "\t",col_names=T,callback = SideEffectChunkCallback$new(gtex_cbf),chunk_size = chunksize)
gc()
return(T)
}
write_df_h5 <- function(df,groupname,outfile,deflate_level=4L,chunksize=1000L){
if(nrow(df)<chunksize){
chunksize <- nrow(df)
}
deflate_level <- as.integer(deflate_level)
require(h5)
dataname <- colnames(df)
f <-h5file(outfile,mode = 'a')
if(existsGroup(f,groupname)){
h5close(f)
res <- append_df_h5(df,groupname,outfile,deflate_level)
return(res)
}
group <- createGroup(f,groupname)
for(i in 1:length(dataname)){
dsn <- dataname[i]
td <- df[[dsn]]
tdata <- createDataSet(.Object = group,
datasetname = dsn,
type = typeof(td),
dimensions = length(td),
chunksize =chunksize,
maxdimensions = NA_integer_,
compression = deflate_level)
tdata[] <- td
}
h5close(f)
return(T)
}
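# Example (sketch): write a small data frame to the group "eqtl" of a hypothetical
# file; a second call with the same group falls through to append_df_h5 (below).
# df <- data.frame(chrom = c(1L,1L),pos = c(100L,200L),beta = c(0.1,-0.2))
# write_df_h5(df,groupname = "eqtl",outfile = "ex_df.h5")
# write_df_h5(df,groupname = "eqtl",outfile = "ex_df.h5")  # appends two more rows per column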
append_df_h5 <- function(df,groupname,outfile,deflate_level=4){
require(h5)
dataname <- colnames(df)
f <-h5file(outfile,mode = 'a')
group <- openGroup(f,groupname)
for(i in 1:length(dataname)){
dsn <- dataname[i]
td <- df[[dsn]]
if(existsDataSet(group,dsn)){
tdata <- openDataSet(group,dsn)
odim <- dim(tdata)
ntdata <- extendDataSet(tdata,odim+length(td))
ntdata[(odim+1):(odim+length(td))] <- td
}else{
h5close(f)
stop(paste0(dsn," does not exist in ",outfile))
}
}
h5close(f)
return(T)
}
read_df_h5 <- function(h5filepath,groupname=NULL,subcols=NULL,filtervec=NULL){
require(h5)
require(dplyr)
require(lazyeval)
stopifnot(file.exists(h5filepath))
f <- h5file(h5filepath,mode = 'r')
if(!is.null(groupname)){
stopifnot(existsGroup(f,groupname))
group <- openGroup(f,groupname)
dsets <- list.datasets(group)
}else{
dsets <- list.datasets(f)
}
if(!is.null(subcols)){
if(!is.null(groupname)){
subcols <- paste0("/",groupname,"/",subcols)
}else{
subcols <- paste0("/",subcols)
}
dsets <- dsets[dsets %in% subcols]
}
stopifnot(length(dsets)>0)
if(!is.null(groupname)){
dsnames <- gsub(pattern = paste0("/",groupname,"/"),"",dsets)
}else{
dsnames <-gsub("/","",dsets)
}
# Read each selected dataset. If fvec is a contiguous run of indices read just that
# hyperslab; otherwise read the whole dataset and subset in memory.
retdf <- as_data_frame(setNames(lapply(dsets,function(x,file,fvec){
if(is.null(fvec)){
return(c(file[x][]))
}else{
if(fvec[length(fvec)]==(fvec[1]+length(fvec)-1)){
return(file[x][fvec])
}else{
return(file[x][][fvec])
}
}
},file=f,fvec=filtervec),dsnames))
h5close(f)
return(retdf)
}
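# Example (sketch): read the hypothetical file written above, keeping two columns and
# only the first two rows (filtervec takes integer row indices).
# read_df_h5("ex_df.h5",groupname = "eqtl",subcols = c("chrom","pos"),filtervec = 1:2)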
.ls.objects <- function (pos = 1, pattern, order.by,
decreasing=FALSE, head=FALSE, n=5) {
napply <- function(names, fn) sapply(names, function(x)
fn(get(x, pos = pos)))
names <- ls(pos = pos, pattern = pattern)
obj.class <- napply(names, function(x) as.character(class(x))[1])
obj.mode <- napply(names, mode)
obj.type <- ifelse(is.na(obj.class), obj.mode, obj.class)
obj.prettysize <- napply(names, function(x) {
capture.output(print(object.size(x), units = "auto")) })
obj.size <- napply(names, object.size)
obj.dim <- t(napply(names, function(x)
as.numeric(dim(x))[1:2]))
vec <- is.na(obj.dim)[, 1] & (obj.type != "function")
obj.dim[vec, 1] <- napply(names, length)[vec]
out <- data.frame(obj.type, obj.size, obj.prettysize, obj.dim)
names(out) <- c("Type", "Size", "PrettySize", "Rows", "Columns")
if (!missing(order.by))
out <- out[order(out[[order.by]], decreasing=decreasing), ]
if (head)
out <- head(out, n)
out
}
# shorthand
lsos <- function(..., n=10) {
.ls.objects(..., order.by="Size", decreasing=TRUE, head=TRUE, n=n)
}
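# Example: list the 5 largest objects in the current workspace.
# lsos(n = 5)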
# approximate in-memory size, in gigabytes, of a rows x cols matrix of size-byte elements
gigsize <- function(rows,cols,size=8){
(rows/1e9*cols*size)
}
#' Read a sparse matrix from HDF5
#'
#' Reads a sparse matrix stored in Compressed Column Storage (CCS) format.
#' @template h5fun
#' @export
read_ccs_h5 <- function(h5filename,groupname,dataname="data",iname="ir",pname="jc"){
require("h5")
require("Matrix")
h5f <- h5::h5file(h5filename,mode = 'r')
h5g <- h5f[groupname]
h5attr(h5g,"Layout")
isSymmetric <- h5attr(h5g,"isSymmetric")=="TRUE"
data <- h5g[dataname][]
i <- h5g[iname][]
p <- h5g[pname][]
h5::h5close(h5f)
return(Matrix::sparseMatrix(i=i,p=p,x=data,index1 = F,symmetric = isSymmetric))
}
read_coo_h5 <- function(h5filename,groupname,dataname="data",iname="i",jname="j",giveCsparse=F){
require(h5)
require(Matrix)
h5f <- h5::h5file(h5filename,mode = 'r')
h5g <- h5f[paste0("/",groupname)]
data_dim <- h5attr(h5g,'Dimensions')
data <- h5g[dataname][]
i <- h5g[iname][]
j <- h5g[jname][]
h5::h5close(h5f)
return(Matrix::sparseMatrix(i=i+1,j=j+1,x=data,dims = data_dim,giveCsparse = giveCsparse))
}
append_h5_h5 <- function(input_h5,input_groupname,input_dataname,output_h5,output_groupname,output_dataname){
library(h5)
require(Matrix)
stopifnot(file.exists(input_h5))
cat("Opening input file\n",input_h5,"\n")
ih5f <- h5file(name = input_h5,mode = 'r')
cat("Opening input group\n")
ih5g <- ih5f[input_groupname]
ih5d <- ih5g[input_dataname]
oh5f <- h5file(output_h5,mode='a')
if(!existsGroup(oh5f,output_groupname)){
oh5g <- createGroup(oh5f,output_groupname)
}
oh5g <- oh5f[output_groupname]
if(!existsDataSet(.Object = oh5g,datasetname = output_dataname)){
h5::createDataSet(.Object = oh5g,
datasetname = output_dataname,
data=ih5d[],
chunksize=pmax(1L,as.integer(dim(ih5d)/10)),
maxdimensions = NA_integer_,
compression=as.integer(4))
}else{
# extend the existing dataset and write the new values into the extension
# (mirrors append_df_h5; assumes a one-dimensional dataset)
oh5d <- oh5g[output_dataname]
odim <- dim(oh5d)
idata <- ih5d[]
noh5d <- extendDataSet(oh5d,odim+length(idata))
noh5d[(odim+1):(odim+length(idata))] <- idata
}
h5close(ih5f)
h5close(oh5f)
gc()
return(T)
}
write_coo_h5 <- function(h5filename,spmat,groupname,dataname="data",iname="i",jname="j",compression_level=4){
require(h5)
spmat <- as(spmat,"dgTMatrix")
i <- spmat@i
j <- spmat@j
data <- spmat@x
h5f <- h5::h5file(h5filename,'a')
if(existsGroup(h5f,groupname)){
h5g <- h5f[groupname]
tid <- h5g[iname]
oli <- dim(tid)
ntidata <- extendDataSet(tid,oli+length(i))
ntidata[(oli+1):(oli+length(i))] <- i
tjd <- h5g[jname]
olj <- dim(tjd)
ntjdata <- extendDataSet(tjd,olj+length(j))
ntjdata[(olj+1):(olj+length(j))] <- j
txd <- h5g[dataname]
olx <- dim(txd)
ntxdata <- extendDataSet(txd,olx+length(data))
ntxdata[(olx+1):(olx+length(data))] <- data
}else{
h5g <- createGroup(h5f,groupname)
#Matrix::sparseMatrix(i=i+1,p=p,x=data))
h5::createAttribute(.Object = h5g,attributename = "Layout",data="COO")
h5::createAttribute(.Object = h5g,attributename = "Dimensions",data=dim(spmat))
id <- h5::createDataSet(.Object = h5g,
datasetname = iname,
data=i,
chunksize=max(1L,as.integer(length(i)/10)),
maxdimensions = NA_integer_,
compression=as.integer(compression_level))
pd <- h5::createDataSet(.Object = h5g,
datasetname = jname,
data=j,
chunksize=max(1L,as.integer(length(j)/10)),
maxdimensions = NA_integer_,
compression=as.integer(compression_level))
dd <- h5::createDataSet(.Object = h5g,
datasetname = dataname,
data=data,
chunksize=max(1L,as.integer(length(data)/10)),
maxdimensions = NA_integer_,
compression=as.integer(compression_level))
}
h5close(h5f)
return(T)
}
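# Example (sketch): round-trip a random sparse matrix through COO storage in a
# hypothetical file; read_coo_h5 (above) restores the dimensions from the attribute.
# library(Matrix)
# sm <- rsparsematrix(50,40,density = 0.1)
# write_coo_h5("ex_coo.h5",sm,groupname = "R")
# sm2 <- read_coo_h5("ex_coo.h5",groupname = "R")
# all.equal(as.matrix(sm),as.matrix(sm2))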
write_ccs_h5 <- function(h5filename,spmat,groupname,dataname="data",iname="ir",pname="jc",compression_level=8,symmetric=F){
require(h5)
h5f <- h5::h5file(h5filename,'a')
h5g <- createGroup(h5f,groupname)
#Matrix::sparseMatrix(i=i+1,p=p,x=data))
i <- spmat@i
p <- spmat@p
data <- spmat@x
h5::createAttribute(.Object = h5g,attributename = "Layout",data="CCS")
h5::createAttribute(.Object = h5g,attributename = "Dimensions",data=dim(spmat))
h5::createAttribute(.Object = h5g,attributename = "isSymmetric",data=ifelse(symmetric,"TRUE","FALSE"))
id <- h5::createDataSet(.Object = h5g,
datasetname = iname,
data=i,
chunksize=max(1L,as.integer(length(i)/10)),
maxdimensions = NA_integer_,
compression=as.integer(compression_level))
pd <- h5::createDataSet(.Object = h5g,
datasetname = pname,
data=p,
chunksize=max(1L,as.integer(length(p)/10)),
maxdimensions = NA_integer_,
compression=as.integer(compression_level))
dd <- h5::createDataSet(.Object = h5g,
datasetname = dataname,
data=data,
chunksize=max(1L,as.integer(length(data)/10)),
maxdimensions = NA_integer_,
compression=as.integer(compression_level))
h5close(h5f)
return(T)
}
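# Example (sketch): store a column-compressed (dgCMatrix) matrix and read it back from a
# hypothetical file. Because read_ccs_h5 infers the dimensions from the i and p vectors,
# the last row of the matrix should contain at least one non-zero entry.
# library(Matrix)
# sm <- rsparsematrix(60,60,density = 0.1)
# sm[60,60] <- 1
# write_ccs_h5("ex_ccs.h5",sm,groupname = "R")
# sm2 <- read_ccs_h5("ex_ccs.h5",groupname = "R")
# all.equal(as.matrix(sm),as.matrix(sm2))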
read_vec <- function(h5filename,datapath){
requireNamespace("h5")
h5f <- h5::h5file(h5filename,'r')
data <- h5f[datapath][]
h5::h5close(h5f)
return(data)
}
write_vec <- function(h5filename,vec,datapath,deflate_level=4,chunksize=NULL){
require(h5)
h5f <- h5::h5file(h5filename,'a')
if(is.null(chunksize)){
chunksize <- max(1L,as.integer(length(vec)/2))
}
h5f[datapath,compression=deflate_level,chunksize=chunksize] <- vec
h5::h5close(h5f)
return(T)
}
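# Example (sketch): write a numeric vector to a hypothetical file and read it back.
# write_vec("ex_vec.h5",rnorm(100),datapath = "samplevec")
# read_vec("ex_vec.h5","samplevec")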