Nothing
#' Extracting part of genetic data.
#'
#' This function enables to extract (and save for later use) part of genetic data
#' read in with \link{genDataRead}.
#'
#' The genetic data from GWAS studies can be quite large, and thus the analysis
#' is time-consuming. If a user knows where they want to focus the analysis,
#' they can use this function to extract part of the entire dataset and use
#' only this part in subsequent Haplin analysis.
#'
#' @param data.in The data object (in format as the output of \link{genDataRead}).
#' @param file.out The base for the output filename (default: "my_data_part").
#' @param dir.out The path to the directory where the output files will be saved.
#' @param design The design used in the study - choose from:
#' \itemize{
#' \item \emph{triad} - (default), data includes genotypes of mother, father and child;
#' \item \emph{cc} - classical case-control;
#' \item \emph{cc.triad} - hybrid design: triads with cases and controls;
#' }.
#'
#' Any of the following can be given to narrow down the dataset:
#' @param markers Vector with numbers or names indicating which markers to choose.
#' @param indiv.ids Character vector giving IDs of individuals. \strong{CAUTION:}
#' in a standard PED file, individual IDs are not unique, so this will select
#' all individuals with given IDs.
#' @param rows Numeric vector giving the positions - this will select only these rows.
#' @param cc One or more values to choose based on case-control status ('cc' column).
#' @param sex One or more values to choose based on the 'sex' column.
#' @param overwrite Whether to overwrite the output files: if NULL (default), will prompt
#' the user to give answer; set to TRUE, will automatically overwrite any existing files;
#' and set to FALSE, will stop if the output files exist.
#' @param ... If any additional covariate data are available in \code{data.in},
#' the user can choose based on values of these (see the Examples section).
#'
#' @return A list object with three elements:
#' \itemize{
#' \item \emph{cov.data} - a \code{data.frame} with covariate data (if available in
#' the input file)
#' \item \emph{gen.data} - a list with chunks of the genetic data; the data is divided
#' column-wise, using 10,000 columns per chunk; each element of this list is a
#' \link[ff]{ff} matrix
#' \item \emph{aux} - a list with meta-data and important parameters.
#' }
#' This now contains only the selected subset of data.
#'
#' @examples
#' # The argument 'overwrite' is set to TRUE!
#' # Read the data:
#' examples.dir <- system.file( "extdata", package = "Haplin" )
#' example.file <- file.path( examples.dir, "HAPLIN.trialdata2.txt" )
#' my.gen.data.read <- genDataRead( file.in = example.file, file.out = "trial_data",
#' dir.out = tempdir( check = TRUE ), format = "haplin", allele.sep = "", n.vars = 2,
#' cov.header = c( "smoking", "sex" ), overwrite = TRUE )
#' my.gen.data.read
#' # Extract part with only men:
#' men.subset <- genDataGetPart( my.gen.data.read, design = "triad", sex = 1,
#' dir.out = tempdir( check = TRUE ), file.out = "gen_data_men_only", overwrite = TRUE )
#' men.subset
#' # Extract the part with only smoking women:
#' women.smoke.subset <- genDataGetPart( my.gen.data.read, design = "triad",
#' dir.out = tempdir( check = TRUE ), sex = 0, smoking = c( 1,2 ), overwrite = TRUE )
#' women.smoke.subset
#'
#' @section Warning:
#' No checks are performed when choosing a subset of the data - it is the user's
#' obligation to check whether the data subset contains correct number of individuals
#' (especially important when using the \code{triad} design study) and/or markers!
#'
genDataGetPart <- function(
data.in = stop( "No data given!", call. = FALSE ),
design = stop( "Design type must be given!" ),
markers,
indiv.ids,
rows,
cc,
sex,
file.out = "my_data_part",
dir.out = ".",
overwrite = NULL,
...
){
#---- checking the input params ---------------------
files.list <- f.make.out.filename( file.out = file.out, dir.out = dir.out, overwrite = overwrite )
if( !is( data.in, "haplin.data" ) ||
!all( names( data.in ) == .haplinEnv$.haplin.data.names ) ){
stop( "The input data is not in the correct format!", call. = FALSE )
}
design.list <- get( ".design.list", envir = .haplinEnv )
if( !( design %in% design.list ) ){
stop( "Given design(", design,") not recognized! Design has to be one of: ", paste( design.list, collapse = ", " ), call. = FALSE )
}
#---- done checking the input params ----------------
#----------------------------------------------------
# get all arguments
all.args <- match.call()[-1]
all.args.names <- names( all.args )
# arguments that do not define selection of data subset:
non.sel.args <- c( "data.in", "file.out", "dir.out", "overwrite", "design" )
which.non.sel.args.given <- c()
for( i in 1:length( non.sel.args ) ){
which.cur.arg <- match( non.sel.args[ i ], all.args.names )
if( !is.na( which.cur.arg ) ){
which.non.sel.args.given <- c( which.non.sel.args.given, which.cur.arg )
}
}
# arguments that define selection of data subset:
selection.args <- all.args[ -which.non.sel.args.given ]
# this is the maximum set of arguments for subset selection, based on the read in covariate data
format <- data.in$aux$info$filespecs$format
# if( format == "ped" ){
# correct.sel.args <- union( names( formals() ), colnames( data.in$cov.data )[ -(1:4) ] )
# } else { # haplin file
correct.sel.args <- union( names( formals() ), colnames( data.in$cov.data ) )
# }
correct.sel.args <- correct.sel.args[ -( match( c( non.sel.args, "..." ),
correct.sel.args ) ) ]
max.rows <- nrow( data.in$gen.data[[1]] )
all.rows <- 1:max.rows
all.markers <- sum( unlist( lapply( data.in$gen.data, ncol ) ) )
all.cols <- 1:all.markers
subset.rows <- all.rows
subset.cols <- all.cols
is.subset.cols <- FALSE
is.subset.rows <- FALSE
family <- "c"
if( design %in% c( "triad", "cc.triad" ) & format == "haplin" ){
family <- "mfc"
}
cat( "Provided arguments:\n" )
for( arg in names( selection.args ) ){
if( arg == "markers" ){
val <- eval( all.args$markers, envir = rlang::caller_env() )
print.val <- paste( val, collapse = ", " )
if( length( val ) > 20 ){
print.val <- paste( paste( head( val ), collapse = ", "), "...", paste( tail( val ), collapse = ", " ) )
}
cat( " --- chosen markers:", print.val, "\n" )
if(is.numeric(val)){
if( any( val > all.markers ) | any( val < 0 ) ){
stop( "wrong markers chosen; not in range of given data: max ", all.markers, "!\n", call. = FALSE )
}
if( identical( val, all.cols ) ){
warning( "this selection is equal to choosing all markers!", call. = FALSE )
}
}
all.column.names <- unlist(
lapply(
data.in$gen.data,
colnames
)
)
subset.cols <- f.sel.markers(
n.vars = 0,
markers = val,
family = family,
split = TRUE,
all.marker.names = all.column.names
)
is.subset.cols <- TRUE
} else if( arg == "indiv.ids" ){
val <- eval( all.args$indiv.ids, envir = rlang::caller_env() )
print.val <- paste( val, collapse = ", " )
if( length( val ) > 20 ){
print.val <- paste( paste( head( val ), collapse = ", "), "...", paste( tail( val ), collapse = ", " ) )
}
cat( " --- individual IDs:", print.val, "\n" )
if( is.null( data.in$cov.data ) |
!( "id.c" %in% colnames( data.in$cov.data ) ) ){
# this can happen when the haplin-formatted file had no extra info
stop( "There is no information on individual IDs in the data!", call. = FALSE )
}
chosen.indiv.logic <- val %in% as.character( data.in$cov.data[ ,"id.c" ] )
if( !any( chosen.indiv.logic ) ){
stop( "wrong individual IDs chosen!", call. = FALSE )
} else if( sum( chosen.indiv.logic ) == max.rows ){
warning( "this selection is equal to choosing all the individuals available in dataset!", call. = FALSE )
} else {
chosen.indiv <- do.call( c, sapply( val, function( x ){
list( which( as.character( data.in$cov.data[ ,"id.c" ] ) == x ) )
} ) )
subset.rows <- intersect( subset.rows, chosen.indiv[ !( is.na( chosen.indiv ) ) ] )
is.subset.rows <- TRUE
}
} else if( arg == "rows" ){
val <- eval( all.args$rows, envir = rlang::caller_env() )
print.val <- paste( val, collapse = ", " )
if( length( val ) > 20 ){
print.val <- paste( paste( head( val ), collapse = ", "), "...", paste( tail( val ), collapse = ", " ) )
}
cat( " --- rows chosen:", print.val, "\n" )
if( any( val < 1 ) | any( val > nrow( data.in$cov.data ) ) ){
stop( "wrong rows, not in range of given data: max ", max.rows, "\n", call. = FALSE )
}
if( identical( val, all.rows ) ){
warning( "this selection is equal to choosing all the rows available in dataset!", call. = FALSE )
} else {
subset.rows <- intersect( subset.rows, val )
is.subset.rows <- TRUE
}
} else if( !( arg %in% correct.sel.args ) ){
stop( "the given argument: ", arg, " is not recognizable!", call. = FALSE )
} else { #any other argument if additional covariate data are loaded
val <- eval( all.args[[ match( arg, names( all.args ) ) ]], envir = rlang::caller_env() )
print.val <- paste( val, collapse = ", " )
if( length( val ) > 20 ){
print.val <- paste( paste( head( val ), collapse = ", "), "...", paste( tail( val ), collapse = ", " ) )
}
cat( " --- argument: ", arg,", chosen values: ", print.val, "\n", sep = "" )
avail.vals <- unique( as.numeric( data.in$cov.data[ ,arg ] ) )
if( !all( val %in% avail.vals ) ){
stop( "wrong argument(", arg, ") chosen, available: ", paste( avail.vals, sep = " ", collapse = " " ), "!", call. = FALSE )
}
if( all( avail.vals %in% val ) ){
warning( "this selection is equal to choosing all the possible values of the given covariate (", arg, ")!", call. = FALSE )
} else {
chosen.indiv <- do.call( c, sapply( val, function( x ){
list( which( as.numeric( data.in$cov.data[ ,arg ] ) == x ) )
} ) )
subset.rows <- intersect( subset.rows, chosen.indiv )
is.subset.rows <- TRUE
}
}
}
if( is.subset.rows | is.subset.cols ){
cat( "INFO: Will select ", length( subset.rows ), " rows and ", length( subset.cols ), " columns.\n", sep = "" )
} else {
stop( "No subset selected.\n", call. = FALSE )
}
if( is.subset.cols ){
# check how many chunks will be needed
nb.cols.per.chunk <- get( ".nb.cols.per.chunk", envir = .haplinEnv )
nb.chunks <- ceiling( length( subset.cols ) / nb.cols.per.chunk )
gen.data.col.wise <- sapply( 1:nb.chunks, function( x ){
first.col <- ( nb.cols.per.chunk*( x - 1 ) + 1 )
last.col <- min( ( nb.cols.per.chunk*x ), length( subset.cols ) )
cur.cols <- subset.cols[ first.col:last.col ]
list( f.get.gen.data.cols( data.in$gen.data, cur.cols ) )
} )
} else {
gen.data.col.wise <- data.in$gen.data
}
cov.data.in <- NULL
if( is.subset.rows ){
# need to choose from both gen.data and cov.data
gen.data.col.wise <- lapply( gen.data.col.wise, function( x ){
sub <- x[ subset.rows, ]
out <- ff::ff( sub, levels = levels( sub ), dim = dim( sub ), vmode = ff::vmode( x ) )
colnames( out ) <- colnames( sub )
return( out )
})
if( !is.null( data.in$cov.data ) ){
cov.data.in <- data.in$cov.data[ subset.rows, ]
}
} else if( !is.null( data.in$cov.data ) ){
cov.data.in <- data.in$cov.data
}
data.out <- list( cov.data = cov.data.in, gen.data = gen.data.col.wise, aux = data.in$aux )
class( data.out ) <- class( data.in )
# update the marker names
if("markers" %in% names(selection.args)){
which.markers.retain <- eval(all.args$markers, envir = rlang::caller_env())
data.out$aux$marker.names <- data.in$aux$marker.names[which.markers.retain]
}
## saving the chosen part of the data
cat( "Saving data... \n" )
cur.names <- c()
for( i in 1:length( gen.data.col.wise ) ){
cur.name <- paste( get( ".gen.cols.name", envir = .haplinEnv ), i, sep = "." )
assign( cur.name, gen.data.col.wise[[i]] )
cur.names <- c( cur.names, cur.name )
}
aux <- data.out$aux
save.list <- c( cur.names, "aux" )
if( !is.null( cov.data.in ) ){
save.list <- c( save.list, "cov.data.in" )
}
ff::ffsave( list = save.list, file = file.path( dir.out, files.list$file.out.base ) )
cat( "... saved to files: ", files.list$file.out.ff, ", ", files.list$file.out.aux, "\n", sep = "" )
return( data.out )
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.