Nothing
#' Create family matrix of data missingness
#'
#' Help function that checks the family structure, based on IDs and provided data in .ped
#' file, and then creates a matrix with families, where each indiv. has TRUE/FALSE value,
#' depending on whether they have or not genetic data.
#'
#' @param data.in The data object (in format as the output of \link{genDataRead}); note
#' that the design of the data is assumed to be triad.
#' @param new.ids A list with pedIndex and ids, as generated by \link{f.check.unique.ids}.
#'
#' @return A matrix where each member of the family has TRUE or FALSE
#' (whether they have any genetic data)
#'
#' @keywords internal
create.missingness.matrix <- function(
data.in = stop( "No data given!", call. = FALSE ),
new.ids = stop( "No IDs given", call. = FALSE )
){
id <- new.ids$ids
pedIndex <- new.ids$pedIndex
check.rows.only.NAs <- function( gen.data, ind ){
sum.not.na.list <- lapply( gen.data, function( gen.el.ff ){
gen.el <- gen.el.ff[ ind, ]
which.na.level <- which( is.na( levels( gen.el ) ) )
apply( gen.el, 1, function( row ){
sum( as.numeric( row ) != which.na.level )
} )
})
if( length( sum.not.na.list ) == 1 ){
sum.not.na <- matrix( sum.not.na.list[[1]], ncol = 1 )
} else {
sum.not.na <- Reduce( cbind, sum.not.na.list )
colnames( sum.not.na ) <- NULL
}
# this tells us which rows have only NAs
rows.only.na <- rowSums( sum.not.na ) == 0
return( rows.only.na )
}
# check the fathers first - which have genetic data?
d.f <- match( pedIndex[ ,'id.father' ], id )
d.f.NAs <- check.rows.only.NAs( data.in$gen.data, ind = d.f )
ids.f.gen.data <- (id[ d.f ])[ !d.f.NAs ]
# now, for the mothers...
d.m <- match( pedIndex[ ,'id.mother' ], id )
d.m.NAs <- check.rows.only.NAs( data.in$gen.data, ind = d.m )
ids.m.gen.data <- (id[ d.m ])[ !d.m.NAs ]
# and the children
d.c <- match( pedIndex[ ,'id.child' ], id )
d.c.NAs <- check.rows.only.NAs( data.in$gen.data, ind = d.c )
ids.c.gen.data <- (id[ d.c ])[ !d.c.NAs ]
# create a matrix where each member of the family has TRUE or FALSE
# (whether they have any genetic data)
which.gen.data.fam <- as.data.frame( pedIndex, stringsAsFactors = FALSE )
which.gen.data.fam$id.child <- FALSE
which.gen.data.fam$id.father <- FALSE
which.gen.data.fam$id.mother <- FALSE
which.gen.data.fam$id.mother[
match( ids.m.gen.data, pedIndex[ , "id.mother" ] ) ] <- TRUE
which.gen.data.fam$id.father[
match( ids.f.gen.data, pedIndex[ , "id.father" ] ) ] <- TRUE
which.gen.data.fam$id.child[
match( ids.c.gen.data, pedIndex[ , "id.child" ] ) ] <- TRUE
return( which.gen.data.fam )
}
#' Getter for all full triads
#'
#' Wrapper function for \link{genDataGetPart} that returns a subset of the data containing
#' only full triads (where all, the child, the mother and the father have genetic data).
#'
#' @param data.in The data object (in format as the output of \link{genDataRead}); note
#' that the design of the data is assumed to be triad.
#' @param file.out The base for the output filename (default: "my_data_onlyTriads").
#' @param dir.out The path to the directory where the output files will be saved.
#' @param overwrite Whether to overwrite the output files: if NULL (default), will prompt
#' the user to give answer; set to TRUE, will automatically overwrite any existing files;
#' and set to FALSE, will stop if the output files exist.
#'
#' @return A list object with three elements:
#' \itemize{
#' \item \emph{cov.data} - a \code{data.frame} with covariate data (if available in
#' the input file)
#' \item \emph{gen.data} - a list with chunks of the genetic data; the data is divided
#' column-wise, using 10,000 columns per chunk; each element of this list is a
#' \link[ff]{ff} matrix
#' \item \emph{aux} - a list with meta-data and important parameters.
#' }
#' This now contains only the selected subset of data.
#'
getFullTriads <- function( data.in = stop( "No data given!", call. = FALSE ),
file.out = "my_data_onlyTriads",
dir.out = ".", overwrite = NULL ){
if( !is( data.in, "haplin.data" ) ||
!all( names( data.in ) == .haplinEnv$.haplin.data.names ) ){
stop( "The input data is not in the correct format!", call. = FALSE )
}
format <- data.in$aux$info$filespecs$format
if( format == "ped" ){
# first - get all the IDs and check family structure
new.ids <- f.check.unique.ids( data.in$cov.data )
id <- new.ids$ids
pedIndex <- new.ids$pedIndex
# check that none of the children are missing
pedIndex <- new.ids$pedIndex <- pedIndex[!is.na(pedIndex[,"id.child"]), ]
# check if there are any families that have missing members in pedIndex
any.NAs.families <- apply(pedIndex, 1, function(row){
any(is.na(row))
})
pedIndex <- new.ids$pedIndex <- pedIndex[!any.NAs.families, ]
which.gen.data.fam <- create.missingness.matrix( data.in, new.ids )
# here are IDs of the members where the full family information is available
full.triads <- apply( which.gen.data.fam[ ,-1 ], 1, all )
pedIndex.triads <- pedIndex[ full.triads, ]
# check how many families found
if( nrow( pedIndex.triads ) == 0 ){
stop( "No full triads found!", call. = FALSE )
}
full.triads.rows <- sort( c( match( pedIndex.triads[ ,'id.father' ], id ),
match( pedIndex.triads[ ,'id.mother' ], id ),
match( pedIndex.triads[ ,'id.child' ], id ) ) )
return( genDataGetPart( data.in, design = "triad",
rows = full.triads.rows,
file.out = file.out, dir.out = dir.out, overwrite = overwrite )
)
} else if( format == "haplin" ){
stop( "Not implemented yet", call. = FALSE )
} else {
stop( paste( "Unrecognized format:", format ), call. = FALSE )
}
}
#' Getter only for all dyads (child and one parent)
#'
#' Wrapper function for \link{genDataGetPart} that returns a subset of the data containing
#' only dyads (where the child and only one parent have genetic data), i.e., not triads.
#'
#' @param data.in The data object (in format as the output of \link{genDataRead}); note
#' that the design of the data is assumed to be "triad".
#' @param file.out The base for the output filename (default: "my_data_onlyDyads").
#' @param dir.out The path to the directory where the output files will be saved
#' (default: ".", the current directory).
#' @param overwrite Whether to overwrite the output files: if NULL (default), will prompt
#' the user to give answer; set to TRUE, will automatically overwrite any existing files;
#' and set to FALSE, will stop if the output files exist.
#'
#' @return A list object with three elements:
#' \itemize{
#' \item \emph{cov.data} - a \code{data.frame} with covariate data (if available in
#' the input file)
#' \item \emph{gen.data} - a list with chunks of the genetic data; the data is divided
#' column-wise, using 10,000 columns per chunk; each element of this list is a
#' \link[ff]{ff} matrix
#' \item \emph{aux} - a list with meta-data and important parameters.
#' }
#' This now contains only the selected subset of data.
#'
getDyads <- function( data.in = stop( "No data given!", call. = FALSE ),
file.out = "my_data_onlyDyads",
dir.out = ".",
overwrite = NULL ){
if( !is(data.in, "haplin.data") ||
!all( names( data.in ) == .haplinEnv$.haplin.data.names ) ){
stop( "The input data is not in the correct format!", call. = FALSE )
}
format <- data.in$aux$info$filespecs$format
if( format == "ped" ){
new.ids <- f.check.unique.ids( data.in$cov.data )
id <- new.ids$ids
pedIndex <- new.ids$pedIndex
# check that none of the children are missing
pedIndex <- new.ids$pedIndex <- pedIndex[!is.na(pedIndex[,"id.child"]), ]
# check which families have only one missing member in pedIndex
one.NA.families <- apply(pedIndex, 1, function(row){
sum(is.na(row)) == 1
})
# if there is "NA" instead of a parental ID, this is a dyad
dyads.from.pedIndex <- c()
if (any(one.NA.families)){
dyads.from.pedIndex <- pedIndex[one.NA.families, ]
}
# check if there are any families that have missing members in pedIndex
any.NAs.families <- apply(pedIndex, 1, function(row){
any(is.na(row))
})
pedIndex <- new.ids$pedIndex <- pedIndex[!any.NAs.families, ]
# now - taking care of the genetic data missingness
which.gen.data.fam <- create.missingness.matrix( data.in, new.ids )
# first, exclude the full triads
full.triads <- apply( which.gen.data.fam[ ,-1 ], 1, all )
pedIndex.no.triads <- pedIndex[ !full.triads, ]
which.gen.data.fam <- which.gen.data.fam[ !full.triads, ]
# then exclude the families with one member only
single.mem.fam <- apply( which.gen.data.fam[ ,-1 ], 1,
function( x ){
sum( x ) == 1
} )
pedIndex.dyads <- pedIndex.no.triads[ !single.mem.fam, ]
which.gen.data.fam <- which.gen.data.fam[ !single.mem.fam, ]
# remove the rows where only info from parents is available (if any such rows exist)
only.parents <- apply( which.gen.data.fam[ ,-1 ], 1,
function( x ){
! x[ "id.child" ]
} )
if( any( only.parents ) ){
pedIndex.dyads <- pedIndex.dyads[ !only.parents, ]
which.gen.data.fam <- which.gen.data.fam[ !only.parents, ]
}
# check how many dyads found
if (nrow(pedIndex.dyads) == 0 &
length(dyads.from.pedIndex) == 0){
stop( "No dyads found!", call. = FALSE )
}
if (nrow(pedIndex.dyads) == 0){
final.sel.IDs <- unique(
na.omit(as.character(
dyads.from.pedIndex[, -1]
))
)
} else if (length(dyads.from.pedIndex) == 0){
final.sel.IDs <- unique(c(
pedIndex.dyads[ which.gen.data.fam$id.father ,'id.father' ],
pedIndex.dyads[ which.gen.data.fam$id.mother ,'id.mother' ],
pedIndex.dyads[ which.gen.data.fam$id.child ,'id.child' ]
))
} else {
final.sel.IDs <- unique(c(
pedIndex.dyads[ which.gen.data.fam$id.father ,'id.father' ],
pedIndex.dyads[ which.gen.data.fam$id.mother ,'id.mother' ],
pedIndex.dyads[ which.gen.data.fam$id.child ,'id.child' ],
na.omit(as.character(
dyads.from.pedIndex[, -1]
))
))
}
# find the correct rows, based on matching the IDs from the final selection
only.dyads.rows <- sort( match( final.sel.IDs, id ) )
return( genDataGetPart( data.in, design = "triad",
rows = only.dyads.rows, file.out = file.out,
dir.out = dir.out, overwrite = overwrite )
)
} else if( format == "haplin" ){
stop( "Not implemented yet", call. = FALSE )
} else {
stop( paste( "Unrecognized format:", format ), call. = FALSE )
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.