R/generics.R

#' Creating a new h5Variant object
#' @param h5_name path of hdf5 file
#' @param map_name path of bed map file
#' @param overwrite overwrite files?

#' @details ...

#' @examples
#' \dontrun{
#' }
#' @author Leandro Roser \email{learoser@@gmail.com}
#' @export h5Variant


setGeneric("h5Variant", function(h5_name, map_name = NULL,
                                overwrite = FALSE, 
                                type = c("RLE", "sparse")) {
    type <- match.arg(type)
    new("h5Variant", h5_name, map_name, overwrite, type)
})


#' Get row of h5Variant object
#' @description ...
#' @param ...
#' @examples
#'
#' \dontrun{
#' }
#'
#' @author Leandro Roser \email{learoser@@gmail.com}
#' @keywords internal

setGeneric("h5v_get_row", function(object, row) {
    where <- object@h5_location
    rowline <- h5read(where, "/map", index = list(row,4:6))
    this_block <- paste0("/data/chunk_", rowline[3])
    start_read <- rowline[1]
    count_read <- rowline[2] - rowline[1]
    lengths <- h5read(where,
        name = paste0(this_block, "/lengths"),
        start = start_read,
        count = count_read)
    values <- h5read(where,
        name = paste0(this_block, "/values"),
        start = start_read,
        count = count_read)
    S4Vectors::Rle(values = values, lengths = lengths)
})


setGeneric("h5v_get_type", function(object, block) {
    object@type
})

#' Get row of h5Variant object
#' @description ...
#' @param ...
#' @examples
#'
#' \dontrun{
#' }
#'
#' @author Leandro Roser \email{learoser@@gmail.com}
#' @export

setGeneric("h5v_get_column_binary_search", function(object, block, column) {
    where <- object@h5_location
  
    this_block <- paste0("/data/chunk_", block)

    attr <- h5readAttributes(where, name = this_block)
    rowline <- h5read(where, "/map", 
            index = list(attr$row_start:attr$row_end,4:5))
    out <- list()
    for(i in seq_len(attr$row_end)){
    row_lengths <- h5read(where,
            name = paste0(this_block, "/lengths"),
            index = list(rowline[i, 1]:rowline[i, 2]))
    idx <-  binary_search_RLE(row_lengths, column)
    out[[i]] <-    h5read(where,
                          name = paste0(this_block, "/lengths"), 
                          index = idx)
    cat("Reading row ", i, "/", attr$row_end, "\r")
    }
    unlist(out)
})

setGeneric("h5v_get_column_expansion", function(object, block, column) {
    where <- object@h5_location
    
    this_block <- paste0("/data/chunk_", block)
    
    attr <- h5readAttributes(where, name = this_block)
    rowline <- h5read(where, "/map", 
        index = list(attr$row_start:attr$row_end, 4:5))
    out <- list()
    for(i in seq_len(attr$row_end)){
        row_lengths <- h5read(where,
            name = paste0(this_block, "/lengths"),
            index = list(rowline[i, 1]:rowline[i, 2]))
        row_values <- h5read(where,
            name = paste0(this_block, "/values"),
            index = list(rowline[i, 1]:rowline[i, 2]))
        temp <- as.vector(Rle(values = row_values, lengths = row_lengths))
        out[[i]] <- temp[column]
        cat("Reading row ", i, "/", attr$row_end, "\r")
    }
    do.call("rbind", out)
})


setGeneric("h5v_get_column_block_expansion", function(object, block, column, 
                                                      method = c("RleArray", 
                                                                 "matrix")) {
    block_data <- h5v_get_block(object, block, method = method)
    block_data[, column, drop = FALSE]
})


#' Get block of h5Variant object
#' @description ...
#' @param ...
#' @examples
#'
#' \dontrun{
#'
#' }
#'
#' @author Leandro Roser \email{learoser@@gmail.com}
#' @keywords internal
#'
setGeneric("h5v_get_block", function(object, block, 
                                     method = c("RleArray", "matrix")) {
    where <- object@h5_location
    this_block <- paste0("/data/chunk_", block)
    data <- h5read(where, name = this_block)
    attr <- h5readAttributes(where, name = this_block)
    block_rownames <- h5read(where, name = "rownames",
        index = list(seq(as.numeric(attr$row_start),
            as.numeric(attr$row_end), 1)))

    if(method == "RleArray") {
    out <-  Rle(values =  data$values, lengths = data$lengths)
    RleArray(unlist(IRanges::RleList(out)),
        dim = c(ncol(object), attr$nrow),
        dimnames = list(colnames(object), block_rownames))
    
    } else if(method == "matrix") {
        out <- list()
        
        rowline <- h5read(where, "/map",
            index = list(attr$row_start:attr$row_end, 4:5))
        
        start <- rowline[, 1]
        end <- rowline[, 2]
        for(i in seq_len(attr$nrow)) {
            tmp <-  S4Vectors::Rle(values =  (data$values)[start[i]:end[i]],
                lengths = (data$lengths)[start[i]:end[i]])
            out[[i]] <- as.vector(tmp)
        }
        
        out <- do.call("cbind", out)
        colnames(out) <- block_rownames
        rownames(out) <- colnames(object)
        out
    }
})


#' scan_h5Variant
#' @description ...
#' @param ...
#' @examples
#'
#' \dontrun{
#' }
#'
#' @author Leandro Roser \email{learoser@@gmail.com}
#' @export

setGeneric("h5v_scan", function(object, location,
                                  position = NULL,
                                  param = NULL,
                                  yieldSize = NA) {

    where <- object@map_location
    tbx <- open(TabixFile(where, yieldSize = yieldSize))
    on.exit(close(tbx))

    if(!is.null(param)) {
        to_scan <- param
    } else {
        if(!is.null(position)) {
            if(length(position != 2))
                stop("position must be a vector of length 2")
            to_scan <- paste(location, ":", position[1], "-", position[2])
        } else {
            to_scan <- location
        }
    }

    scanfun <- function(tbx, what) {
       if(is.null(param)) {
            scanTabix(tbx, what)
        } else {
            scanTabix(tbx, param = what)
        }
    }

    if(is.na(yieldSize)) {
        out <- scanfun(tbx, to_scan)
    } else {
        res <- list()
        i <- 1
        while(length( out <- scanfun(tbx, to_scan)) > 0) {
            res[[i]] <- out
            out <- out + 1
        }
    }
    out <- unlist(out, use.names = FALSE)
    if(length(out) != 0) {
        read.table(textConnection(out),
            header=FALSE,sep = "\t")
    } else {
        out
    }
})



#' export_map
#' @description ...
#' @param ...
#' @examples
#'
#' \dontrun{
#' }
#'
#' @author Leandro Roser \email{learoser@@gmail.com}
#' @export


setGeneric("h5v_export_map", function(h5_path,
    name = sprintf("%s.bed", gsub("\\.h5$", "", h5_path)),
    overwrite = FALSE) {
    if(file.exists(name)) {
        if(overwrite) {
            file.remove(name)
        } else {
            stop("ranges bed file exists in path but overwrite is FALSE")
        }
    }

    # TODO: use chunking
    seqnames <- h5read(h5_path, "seqnames")
    map <- h5read(h5_path, "map")
    names <- h5read(h5_path, "rownames")
    tmp <- tempfile()
    write.table(cbind(seqnames, map, names), tmp,
        row.names = FALSE, col.names = FALSE, sep="\t",
        quote = FALSE)
    bgzip(tmp, dest = name)
    file.remove(tmp)
    indexTabix(name, "bed", seq = 1, start = 2, end = 3)
    invisible(NULL)
})


#' Create a skeleton for hdf5 file

create_h5_skeleton <- function(output, blocksize, chunksize, 
                               overwrite,
                               level, rownames_size,
                               type = c("RLE", "sparse", "uncompress")) {
    
    type <- match.arg(type)
    if(file.exists(output)) {
        if(overwrite) {
            file.remove(output)
        } else stop("file exists but overwrite is FALSE")
    }
    
    h5createFile(output)
    fid <- H5Fcreate(output)
    
    h5writeAttribute.integer(blocksize, fid, "blocksize")
    h5writeAttribute.integer(chunksize, fid, "chunksize")
    
    if(type != "uncompress") {
    h5createGroup(fid, "data")
    } 
    
    map_col <- if(type == "RLE"){
        map_col <- 6
    } else if(type=="sparse"){
        map_col <- 4
    } else {
        map_col <- 3
    }
    
    h5createDataset(fid, "map",
            dims = c(0, map_col),
            maxdims = c(1E17, map_col),
            chunk = c(chunksize, map_col),
            storage.mode = "integer",
            level = level)
        
    if(type =="RLE") {
        mid <- H5Dopen(fid,"map")
        h5writeAttribute.array(c("start", "end", "row",
            "start_block", "end_block",
            "superblock"),
            mid,
            "colnames", size = 4)
        
    } else if(type == "sparse")  {
        mid <- H5Dopen(fid,"map")
        h5writeAttribute.array(c("start", "end", "row",
                              "superblock"), mid, "colnames",
                               size = 4)
    } else {
        mid <- H5Dopen(fid,"map")
        h5writeAttribute.array(c("start", "end", "row"), mid, "colnames",
            size = 4)
    }
    H5Dclose(mid)
    h5createDataset(fid, "rownames",
        dims = 0,
        maxdims = 1E17,
        chunk = chunksize,
        storage.mode = "character",
        size = rownames_size,
        level = level)
    
    H5Fclose(fid)
}


setGeneric("h5v_import_variants",
    function(data, index = NULL,
        output = sprintf("%s.h5", gsub("\\.vcf.*$",  "", data)),
        map_name = sprintf("%s.bed", gsub("\\.vcf.*$", "", data)),
        method = c("vcf", "plain"),
        blocksize = 100000,
        chunksize = 10000,
        overwrite = FALSE,
        genome = "hg19",
        encode = TRUE,
        default = "0/0",
        encoded_as = "H5T_NATIVE_USHORT",
        level = 7,
        rownames_size = 200,
        sep = " ",
        quoted = FALSE,
        has_colnames = TRUE,
        has_rownames = TRUE,
        ...) {
 
        method <- match.arg(method)
        
        create_h5_skeleton(output, blocksize, 
                           chunksize, 
                           overwrite, level, 
                           rownames_size, "RLE")

    
        if(method == "vcf") {
            if(is.null(index)) {
                index <- paste0(data, ".tbi")
            }
            tab <- open(TabixFile(data, index, yieldSize = blocksize))
            on.exit(close(tab))
            param <- ScanVcfParam(fixed = NA, info = NA, geno = c("GT"))
            initialized <- FALSE
            
            scanfun <- function() {
                vcf_yield <- readVcf(tab, genome, param = param)
                to_write <<- geno(vcf_yield)$GT
                if(!initialized) {
                    seqnames_levels <<- levels(seqnames(rowRanges(vcf_yield)))
                    
                    h5write(colnames(to_write), output, name = "colnames")
                    
                    h5write(seqnames_levels,
                        output, name = "seqlevels")
                    
                    h5createDataset(output, "seqnames",
                        dims = 0,
                        maxdims = 1E17,
                        chunk = chunksize,
                        storage.mode = "character",
                        size = max(nchar(seqnames_levels)) + 1,
                        showWarnings=FALSE)
                    
                    initialized <<- TRUE
                }
                this_rownames <<- names(vcf_yield)
                nrow_data <<- nrow(to_write)
                vcf_start <<- start(vcf_yield)
                vcf_end <<- end(vcf_yield)
                chrom <<- as.vector(seqnames(vcf_yield))
                
                if(nrow(to_write) > 0) {
                 TRUE
                } else {
                FALSE
                }
            }
            cat("Reading vcf ...\n")
            
        } else {
            this_data <- chunker(path = data, 
                sep = sep,
                quoted = quoted,
                has_colnames = has_colnames,
                has_rownames = has_rownames,
                chunksize = chunksize, 
                ...)
            
            scanfun <- function(){
                next_chunk(this_data)
            }
            cat("Reading file ...\n")
        }
        
        n <- 1
        map_start <- 1
        map_extent <- 0
      
        while(scanfun()) {
            
            map_extent <- map_extent + nrow_data
            cat(n * blocksize, " lines read    \n")
            cat("Preparing data and writing chunk ...\n")

            this_group <- paste0("data/chunk_", n)
            h5createGroup(output,  this_group)
            
            #rownames
            h5set_extent(output, "rownames", map_extent)
            h5write(this_rownames, output, "rownames",
                start = map_start)
            
            # data
            fid <- H5Fopen(output)
            did <- H5Gopen(fid, this_group)
            h5writeAttribute.integer(nrow_data, did, "nrow")
            h5writeAttribute.integer(map_start, did, "row_start")
            h5writeAttribute.integer(nrow_data + map_start - 1,
                did, "row_end")
            H5Gclose(did)
            H5Fclose(fid)
            
            # two vectors, because c(a,b)  for Rle vectors, reduce
            # the cells in the contact point of the vectors if same element
            out_values <- list()
            out_lengths <- list()
            val_len <- integer()
            for(i in seq_len(nrow_data)) {
                temp <- Rle(to_write[i, ])
                out_values[[i]] <- temp@values
                out_lengths[[i]] <- temp@lengths
                val_len[i] <- length(temp@values)
            }
            out_values <- do.call("c", out_values)
            out_lengths <- do.call("c", out_lengths)
  
            # map
            rowdata <- seq(map_start, map_start + nrow_data - 1, 1)
            end_row <- cumsum(val_len)
            start_row <- c(1, end_row[-length(end_row)] + 1)
            block <- rep(n, nrow_data)
            chunk_map <- cbind(vcf_start, vcf_end, rowdata, 
                               start_row, end_row, block)
            
            h5set_extent(output, "map", c(map_extent, 6))
            h5write(chunk_map, output, "map", start = c(map_start, 1))
            
            if(encode) {
                
                # values
                out_values <- encode_FactorVector(out_values, 
                                                  default_value = default,
                                                  key_start = 1)
                levels_values <- levels(out_values)
                
                suppressWarnings({
                    h5createDataset(output, paste0(this_group, "/values"), 
                        H5type = encoded_as,
                        dims = length(out_values), 
                        chunk = chunksize)
                    
                    h5write(out_values,
                        output, name = paste0(this_group, "/values"))
                    
                    h5createDataset(output,
                        paste0(this_group, "/lengths"), 
                        H5type = "H5T_NATIVE_ULLONG",
                        dims = length(out_values), 
                        chunk = chunksize)
                    
                    h5write(out_lengths,
                        output, name = paste0(this_group, "/lengths"))
                    
                    h5createDataset(output,
                        paste0(this_group, "/encoding"), 
                        storage.mode = "character",
                        size = max(nchar(levels_values)) + 1,
                        dims = length(levels_values))
                    
                    h5write(levels_values,
                        output, name = paste0(this_group, "/encoding"))
                })
            } else {
                
                # values
                suppressWarnings({
                    h5createDataset(output, paste0(this_group, "/values"), 
                        storage.mode = "character",
                        size = max(nchar(out_values)) + 1,
                        dims = length(out_values),
                        chunk= chunksize)
                    
                    h5write(out_values,
                        output, name = paste0(this_group, "/values"))
                    
                    h5write(out_lengths,
                        output, name = paste0(this_group, "/lengths"))
                })
            }
            
            # number of elements and rownames for the block
            
            # seqranges
            h5set_extent(output, "seqnames", map_extent)
            h5write(chrom, output, "seqnames",
                start = c(map_start))
            
            cat("Chunk ", n, "written ...         \r")
            n <- n + 1
            map_start <- map_start + nrow_data
        }
       
        cat("\nexporting bed file ...\n")
        h5v_export_map(output, name = map_name, overwrite = overwrite)
        
        out <- h5Variant(output, map_name, overwrite = overwrite)
        cat("done!\n")
        out
})



#' vcf_2_h5Variant
#' @description ...
#' @param ...
#' @examples
#'
#' \dontrun{
#' }
#'
#' @author Leandro Roser \email{learoser@@gmail.com}
#' @export

setGeneric("h5v_import_variants_sparse",
    function(data, index = NULL,
        output = sprintf("%s.h5", gsub("\\.vcf.*$",  "", data)),
        map_name = sprintf("%s.bed", gsub("\\.vcf.*$", "", data)),
        method = c("vcf", "txt"),
        blocksize = 100000,
        chunksize = 10000,
        overwrite = FALSE,
        genome = "hg19",
        default_value = "0/0",
        encoded_as = "H5T_NATIVE_USHORT",
        level = 7,
        rownames_size = 200,
        sep = " ",
        quoted = FALSE,
        has_colnames = TRUE,
        has_rownames = TRUE,
        ...) {

        method <- match.arg(method)
        
        create_h5_skeleton(output, blocksize, chunksize, 
            overwrite, level, 
            rownames_size, "sparse")
        
        
        if(method == "vcf") {
            if(is.null(index)) {
                index <- paste0(data, ".tbi")
            }
            tab <- open(TabixFile(data, index, yieldSize = blocksize))
            on.exit(close(tab))
            param <- ScanVcfParam(fixed = NA, info = NA, geno = c("GT"))
            
            initialized <- FALSE
            scanfun <- function(){
                vcf_yield <- readVcf(tab, genome, param = param)
                to_write <<- geno(vcf_yield)$GT
                h5write(colnames(to_write), output, name = "colnames")
                
                if(!initialized) {
                    seqnames_levels <- levels(seqnames(rowRanges(vcf_yield)))
                    h5write(seqnames_levels,
                        output, name = "seqlevels")
                    h5createDataset(output, "seqnames",
                        dims = 0,
                        maxdims = 1E17,
                        chunk = chunksize,
                        storage.mode = "character",
                        size = max(nchar(seqnames_levels))+1,
                        showWarnings=FALSE)
                    # added 1 for some wear reazon, without the 1
                    # size is 1 character less
                    
                    #rownames
                    initialized <<- TRUE
                }
                this_rownames <<- names(vcf_yield)
                vcf_start <<- start(vcf_yield)
                vcf_end <<- end(vcf_yield)
                chrom <<- as.vector(seqnames(vcf_yield))
                if(nrow(to_write) > 0) {
                    TRUE 
                } else {
                FALSE
                }
            }
            
        } else {
            this_data <- chunker(path = data, 
                sep = sep,
                quoted = quoted,
                has_colnames = has_colnames,
                has_rownames = has_rownames,
                chunksize = chunksize, 
                ...)
            
            scanfun <- function(){
                next_chunk(this_data)
            }
        }
        
        n <- 1
        map_start <- 1
        map_extent <- 0
        
        cat("Reading vcf ...\n")
      
        while(scanfun()) {
            
            cat(n * blocksize, " lines read    \n")
            nrow_data <- nrow(to_write)
            
            map_extent <- map_extent + nrow_data
            
            cat("Preparing data and writing chunk ...\n")
 
            h5set_extent(output, "rownames", map_extent)
            h5write(this_rownames, output, "rownames",
                start = c(map_start))
            
            this_group <- paste0("data/chunk_", n)
            h5createGroup(output,  this_group)
            
            # data
            fid <- H5Fopen(output)
            did <- H5Gopen(fid, this_group)
            h5writeAttribute.integer(nrow_data, did, "nrow")
            h5writeAttribute.integer(map_start, did, "row_start")
            h5writeAttribute.integer(nrow_data + map_start - 1,
                did, "row_end")
            # h5writeAttribute.array(this_rownames, did, "rownames",
            #     size = max(nchar(this_rownames))+1)
            H5Gclose(did)
            H5Fclose(fid)
            
            # conversion into sparse matrix
            cat("Converting into sparse matrix ...\n")
            linear_data <- encode_FactorMatrix(to_write, 
                                               default_value = default_value,
                                               key_start = 0)
            levels_data <- levels(linear_data)
            linear_data <- Matrix::Matrix(linear_data, sparse = TRUE)
            
            # values
            cat("Writing data ...\n")
            h5createDataset(output,  paste0(this_group, "/x"), 
                            H5type = encoded_as,
                            dims = length(linear_data@x), 
                            chunk = chunksize)
            h5write(linear_data@x,
                output, name = paste0(this_group, "/x"))
                
            h5write(linear_data@i,
                output, name = paste0(this_group, "/i"))
            
            h5write(linear_data@p,
                output, name = paste0(this_group, "/p"))
            
            h5write(levels_data,
                output, name = paste0(this_group, "/codes"))
            
            # map
            rowdata <- seq(map_start, map_start + nrow_data - 1, 1)
            block <- rep(n, nrow_data)
            chunk_map <- cbind(vcf_start,  vcf_end, rowdata, block)
            
            h5set_extent(output, "map", c(map_extent, 4))
            h5write(chunk_map, output, "map", start = c(map_start, 1))
            
            # number of elements and rownames for the block
            
            # seqranges
            h5set_extent(output, "seqnames", c(map_extent, NULL))
            h5write(chrom, output, "seqnames",
                start = c(map_start, NULL))
            
            cat("Chunk ", n, "written ...         \r")
            cat("\nReading vcf ...\n")
            n <- n + 1
            map_start <- map_start + nrow_data
        }
        cat("\nexporting bed file ...\n")
        h5v_export_map(output, name = map_name, overwrite = overwrite)
        
        out <- h5Variant(output, map_name, overwrite = overwrite, type = "sparse")
        cat("done!\n")
        out
})

#' vcf_2_h5Variant
#' @description ...
#' @param ...
#' @examples
#'
#' \dontrun{
#' }
#'
#' @author Leandro Roser \email{learoser@@gmail.com}
#' @export

setGeneric("h5v_import_variants_uncompress",
    function(data, index = NULL,
        output = sprintf("%s.h5", gsub("\\.vcf.*$",  "", data)),
        map_name = sprintf("%s.bed", gsub("\\.vcf.*$", "", data)),
        method = c("vcf", "txt"),
        blocksize = 100000,
        chunksize = 10000,
        overwrite = FALSE,
        genome = "hg19",
        default_value = "0/0",
        encoded_as = "H5T_NATIVE_USHORT",
        level = 7,
        rownames_size = 200,
        sep = " ",
        quoted = FALSE,
        has_colnames = TRUE,
        has_rownames = TRUE,
        ...) {
        
        
        method <- match.arg(method)
        
        
        
        create_h5_skeleton(output, blocksize, 
            chunksize, 
            overwrite, level, 
            rownames_size, "uncompress")
        
        
        if(method == "vcf") {
            if(is.null(index)) {
                index <- paste0(data, ".tbi")
            }
            tab <- open(TabixFile(data, index, yieldSize = blocksize))
            on.exit(close(tab))
            param <- ScanVcfParam(fixed = NA, info = NA, geno = c("GT"))
            
            initialized <- FALSE
            scanfun <- function(){
                vcf_yield <- readVcf(tab, genome, param = param)
                to_write <<- geno(vcf_yield)$GT
                
                if(!initialized) {
                    h5write(colnames(to_write), output, name = "colnames")
                    ncol_data <<- ncol(to_write)
                    seqnames_levels <- levels(seqnames(rowRanges(vcf_yield)))
                    h5write(seqnames_levels,
                        output, name = "seqlevels")
                    h5createDataset(output, "seqnames",
                        dims = 0,
                        maxdims = 1E17,
                        chunk = chunksize,
                        storage.mode = "character",
                        size = max(nchar(seqnames_levels))+1,
                        showWarnings=FALSE,
                        level = level)
                    h5createDataset(output,  "data", 
                        H5type = encoded_as,
                        maxdims = c(1E17,  ncol_data),
                        dims = c(0, ncol_data), 
                        chunk = c(chunksize, ncol_data),
                        level = level)
                    
                    # added 1 for some wear reazon, without the 1
                    # size is 1 character less
                    
                    #rownames
                    initialized <<- TRUE
                }
                this_rownames <<- names(vcf_yield)
                vcf_start <<- start(vcf_yield)
                vcf_end <<- end(vcf_yield)
                chrom <<- as.vector(seqnames(vcf_yield))
                if(nrow(to_write) > 0) {
                    TRUE
                } else {
                    FALSE
                }
            }
            
        } else {
            this_data <- chunker(path = data, 
                sep = sep,
                quoted = quoted,
                has_colnames = has_colnames,
                has_rownames = has_rownames,
                chunksize = chunksize, 
                ...)
            
            scanfun <- function(){
                next_chunk(this_data)
            }
        }
        
        n <- 1
        map_start <- 1
        map_extent <- 0
        
        cat("Reading vcf ...\n")
        while(scanfun()) {
            
            cat(n * blocksize, " lines read    \n")
            nrow_data <- nrow(to_write)
            
            map_extent <- map_extent + nrow_data
            
            cat("Preparing data and writing chunk ...\n")
            
            h5set_extent(output, "rownames", map_extent)
            h5write(this_rownames, output, "rownames",
                start = c(map_start))
            
            
            # conversion into sparse matrix
            cat("Writing data ...\n")
            data_output <- encode_FactorMatrix(to_write, 
                default_value = default_value,
                key_start = 0)
            levels_data <- levels(data_output)
            
            # values
            
            h5set_extent(output, "rownames", map_extent)
            h5write(this_rownames,  output, "rownames",
                start = c(map_start))
            
            h5set_extent(output, "data", dims = c(map_extent, ncol_data))
            h5write(data_output, output, "data",  start = c(map_start, 1))
            
            rowdata <- seq(map_start, map_start + nrow_data - 1, 1)
            chunk_map <- cbind(vcf_start,  vcf_end, rowdata)
            
            h5set_extent(output, "map", c(map_extent, 3))
            h5write(chunk_map, output, "map", start = c(map_start, 1))
            
            # number of elements and rownames for the block
            
            # seqranges
            h5set_extent(output, "seqnames", c(map_extent, NULL))
            h5write(chrom, output, "seqnames",
                start = c(map_start, NULL))
            
            cat("Chunk ", n, "written ...         \r")
            cat("\nReading vcf ...\n")
            n <- n + 1
            map_start <- map_start + nrow_data
        }
        cat("\nexporting bed file ...\n")
        h5v_export_map(output, name = map_name, overwrite = overwrite)
        
        out <- h5Variant(output, map_name, overwrite = overwrite, type = "uncompress")
        cat("done!\n")
        out
})

#' Get row of h5Variant object
#' @description ...
#' @param ...
#' @examples
#'
#' \dontrun{
#' }
#'
#' @author Leandro Roser \email{learoser@@gmail.com}
#' @keywords internal

setGeneric("h5v_sparse_get_column", function(object, from, to, block) {
    #extract_col <- function(z,col) 
    # {a<-z@p[col]+1; b <- z@p[col+1] ; if(a<=b) z@x[a:b] else integer(0)}
    
    where <- object@h5_location
    this_group <- paste0("/data/chunk_", block)
    
    a <- h5read(where, 
        name = paste0(this_group, "/p"), index = list(from)) + 1
    a <- as.integer(a)
    b <- h5read(where, 
        name = paste0(this_group, "/p"), index = list(to + 1))
    b <- as.integer(b)
    
    if(a<=b) { 
        # i <- h5read(where, 
        #     name = paste0(this_group, "/i"), index = list(seq(a, b, 1)))
        # out <- h5read(where, 
        #     name = paste0(this_group, "/x"), index = list(seq(a, b, 1)))
        # outmatrix <- list()
        # k  = 1
        # ci = 1
        # temp <-integer
        # while(k <= to - from + 1) {
        #     newtemp <- temp
        #     while(i[k]> i[k-1]) {
        #         newtemp[i[k]] <- x[k]
        #     }
        #     outmatrix[ci] <- newtemp
        #     ci <- ci + 1
        # }
   
    } else integer(0)
})


#' Get block of h5Variant object
#' @description ...
#' @param ...
#' @examples
#'
#' \dontrun{
#'
#' }
#'
#' @author Leandro Roser \email{learoser@@gmail.com}
#' @keywords internal
#'
setGeneric("h5v_sparse_get_block", function(object, block) {
    where <- object@h5_location
    this_block <- paste0("/data/chunk_", block)
    data <- h5read(where, name = this_block)
    attr <- h5readAttributes(where, name = this_block)
    block_rownames <- h5read(where, name = "rownames",
        index = list(seq(as.numeric(attr$row_start),
            as.numeric(attr$row_end), 1)))
    block_colnames <- h5read(where, name = "colnames")
   sparseMatrix(i = data$i, p = data$p, x = data$x, index1 = FALSE, 
                dimnames = c(block_rownames, block_colnames))
})
leandroroser/h5Variant documentation built on May 8, 2019, 3:14 a.m.