or Case-Control Data with SNP Haplotypes

Documented in create.missingness.matrix getDyads getFullTriads

#' Create family matrix of data missingness
#' 
#' Help function that checks the family structure, based on IDs and provided data in .ped
#'   file, and then creates a matrix with families, where each indiv. has TRUE/FALSE value,
#'   depending on whether they have or not genetic data.
#' 
#' @param data.in The data object (in format as the output of \link{genDataRead}); note
#'   that the design of the data is assumed to be triad.
#' @param new.ids A list with pedIndex and ids, as generated by \link{f.check.unique.ids}.
#'
#' @return A matrix where each member of the family has TRUE or FALSE
#'   (whether they have any genetic data)
#'
#' @keywords internal
create.missingness.matrix <- function( 
	data.in = stop( "No data given!", call. = FALSE ),
	new.ids = stop( "No IDs given", call. = FALSE )
){

	id <- new.ids$ids
	pedIndex <- new.ids$pedIndex

	check.rows.only.NAs <- function( gen.data, ind ){
		sum.not.na.list <- lapply( gen.data, function( gen.el.ff ){
			gen.el <- gen.el.ff[ ind, ]
			which.na.level <- which( is.na( levels( gen.el ) ) )
			apply( gen.el, 1, function( row ){
				sum( as.numeric( row ) != which.na.level )
			} )
		})
		if( length( sum.not.na.list ) == 1 ){
			sum.not.na <- matrix( sum.not.na.list[[1]], ncol = 1 )
		} else {
			sum.not.na <- Reduce( cbind, sum.not.na.list )
			colnames( sum.not.na ) <- NULL
		}
		# this tells us which rows have only NAs
		rows.only.na <- rowSums( sum.not.na ) == 0
		return( rows.only.na )
	}

	# check the fathers first - which have genetic data?
	d.f <- match( pedIndex[ ,'id.father' ], id )
	d.f.NAs <- check.rows.only.NAs( data.in$gen.data, ind = d.f )
	ids.f.gen.data <- (id[ d.f ])[ !d.f.NAs ]
	
	# now, for the mothers...
	d.m <- match( pedIndex[ ,'id.mother' ], id )
	d.m.NAs <- check.rows.only.NAs( data.in$gen.data, ind = d.m )
	ids.m.gen.data <- (id[ d.m ])[ !d.m.NAs ]
	
	# and the children
	d.c <- match( pedIndex[ ,'id.child' ], id )
	d.c.NAs <- check.rows.only.NAs( data.in$gen.data, ind = d.c )
	ids.c.gen.data <- (id[ d.c ])[ !d.c.NAs ]
	
	# create a matrix where each member of the family has TRUE or FALSE
	#  (whether they have any genetic data)
	which.gen.data.fam <- as.data.frame( pedIndex, stringsAsFactors = FALSE )
	which.gen.data.fam$id.child <- FALSE
	which.gen.data.fam$id.father <- FALSE
	which.gen.data.fam$id.mother <- FALSE

	which.gen.data.fam$id.mother[ 
		match( ids.m.gen.data, pedIndex[ , "id.mother" ] ) ] <- TRUE
	which.gen.data.fam$id.father[ 
		match( ids.f.gen.data, pedIndex[ , "id.father" ] ) ] <- TRUE
	which.gen.data.fam$id.child[ 
		match( ids.c.gen.data, pedIndex[ , "id.child" ] ) ] <- TRUE

	return( which.gen.data.fam )
}


#' Getter for all full triads
#'
#' Wrapper function for \link{genDataGetPart} that returns a subset of the data containing 
#'  only full triads (where all, the child, the mother and the father have genetic data).
#'
#' @param data.in The data object (in format as the output of \link{genDataRead}); note
#'   that the design of the data is assumed to be triad.
#' @param file.out The base for the output filename (default: "my_data_onlyTriads").
#' @param dir.out The path to the directory where the output files will be saved.
#' @param overwrite Whether to overwrite the output files: if NULL (default), will prompt
#'   the user to give answer; set to TRUE, will automatically overwrite any existing files;
#'   and set to FALSE, will stop if the output files exist.
#'
#' @return A list object with three elements:
#'   \itemize{
#'     \item \emph{cov.data} - a \code{data.frame} with covariate data (if available in
#'        the input file)
#'     \item \emph{gen.data} - a list with chunks of the genetic data; the data is divided
#'        column-wise, using 10,000 columns per chunk; each element of this list is a
#'        \link[ff]{ff} matrix
#'     \item \emph{aux} - a list with meta-data and important parameters.
#'   }
#'   This now contains only the selected subset of data.
#'

getFullTriads <- function( data.in = stop( "No data given!", call. = FALSE ),
	file.out = "my_data_onlyTriads",
	dir.out = ".", overwrite = NULL ){

	if( !is( data.in, "haplin.data" ) ||
		!all( names( data.in ) == .haplinEnv$.haplin.data.names ) ){
		stop( "The input data is not in the correct format!", call. = FALSE )
	}
	
	format <- data.in$aux$info$filespecs$format
	
	if( format == "ped" ){
		# first - get all the IDs and check family structure
		new.ids <- f.check.unique.ids( data.in$cov.data )
		id <- new.ids$ids
		pedIndex <- new.ids$pedIndex

  	# check that none of the children are missing
  	pedIndex <- new.ids$pedIndex <- pedIndex[!is.na(pedIndex[,"id.child"]), ]
  	# check if there are any families that have missing members in pedIndex
  	any.NAs.families <- apply(pedIndex, 1, function(row){
  	  any(is.na(row))
  	})
  	pedIndex <- new.ids$pedIndex <- pedIndex[!any.NAs.families, ]
	
		which.gen.data.fam <- create.missingness.matrix( data.in, new.ids )

		# here are IDs of the members where the full family information is available
		full.triads <- apply( which.gen.data.fam[ ,-1 ], 1, all )
		pedIndex.triads <- pedIndex[ full.triads, ]
		
		# check how many families found
		if( nrow( pedIndex.triads ) == 0 ){
			stop( "No full triads found!", call. = FALSE )
		}
		
		full.triads.rows <- sort( c( match( pedIndex.triads[ ,'id.father' ], id ),
													match( pedIndex.triads[ ,'id.mother' ], id ),
													match( pedIndex.triads[ ,'id.child' ], id ) ) )
		return( genDataGetPart( data.in, design = "triad",
										rows = full.triads.rows,
										file.out = file.out, dir.out = dir.out, overwrite = overwrite )
					)
	} else if( format == "haplin" ){
		stop( "Not implemented yet", call. = FALSE )
	} else {
		stop( paste( "Unrecognized format:", format ), call. = FALSE )
	}
}

#' Getter only for all dyads (child and one parent)
#'
#' Wrapper function for \link{genDataGetPart} that returns a subset of the data containing 
#'  only dyads (where the child and only one parent have genetic data), i.e., not triads.
#'
#' @param data.in The data object (in format as the output of \link{genDataRead}); note
#'   that the design of the data is assumed to be "triad".
#' @param file.out The base for the output filename (default: "my_data_onlyDyads").
#' @param dir.out The path to the directory where the output files will be saved
#'   (default: ".", the current directory).
#' @param overwrite Whether to overwrite the output files: if NULL (default), will prompt
#'   the user to give answer; set to TRUE, will automatically overwrite any existing files;
#'   and set to FALSE, will stop if the output files exist.
#'
#' @return A list object with three elements:
#'   \itemize{
#'     \item \emph{cov.data} - a \code{data.frame} with covariate data (if available in
#'        the input file)
#'     \item \emph{gen.data} - a list with chunks of the genetic data; the data is divided
#'        column-wise, using 10,000 columns per chunk; each element of this list is a
#'        \link[ff]{ff} matrix
#'     \item \emph{aux} - a list with meta-data and important parameters.
#'   }
#'   This now contains only the selected subset of data.
#'

getDyads <- function( data.in = stop( "No data given!", call. = FALSE ),
	file.out = "my_data_onlyDyads",
	dir.out = ".",
	overwrite = NULL ){

	if( !is(data.in, "haplin.data") ||
		!all( names( data.in ) == .haplinEnv$.haplin.data.names ) ){
		stop( "The input data is not in the correct format!", call. = FALSE )
	}
	
	format <- data.in$aux$info$filespecs$format
	
	if( format == "ped" ){
		new.ids <- f.check.unique.ids( data.in$cov.data )
		id <- new.ids$ids
		pedIndex <- new.ids$pedIndex

  	# check that none of the children are missing
  	pedIndex <- new.ids$pedIndex <- pedIndex[!is.na(pedIndex[,"id.child"]), ]

  	# check which families have only one missing member in pedIndex
  	one.NA.families <- apply(pedIndex, 1, function(row){
  	  sum(is.na(row)) == 1
  	})

  	# if there is "NA" instead of a parental ID, this is a dyad
	  dyads.from.pedIndex <- c()
  	if (any(one.NA.families)){
    	dyads.from.pedIndex <- pedIndex[one.NA.families, ]
  	}
  	
  	# check if there are any families that have missing members in pedIndex
  	any.NAs.families <- apply(pedIndex, 1, function(row){
  	  any(is.na(row))
  	})
  	pedIndex <- new.ids$pedIndex <- pedIndex[!any.NAs.families, ]

  	# now - taking care of the genetic data missingness
  	which.gen.data.fam <- create.missingness.matrix( data.in, new.ids )

		# first, exclude the full triads
		full.triads <- apply( which.gen.data.fam[ ,-1 ], 1, all )
		pedIndex.no.triads <- pedIndex[ !full.triads, ]
		which.gen.data.fam <- which.gen.data.fam[ !full.triads, ]
		# then exclude the families with one member only
		single.mem.fam <- apply( which.gen.data.fam[ ,-1 ], 1,
			function( x ){
				sum( x ) == 1
				} )
		pedIndex.dyads <- pedIndex.no.triads[ !single.mem.fam, ]
		which.gen.data.fam <- which.gen.data.fam[ !single.mem.fam, ]
		# remove the rows where only info from parents is available (if any such rows exist)
		only.parents <- apply( which.gen.data.fam[ ,-1 ], 1,
			function( x ){
				! x[ "id.child" ]
				} )
		if( any( only.parents ) ){
			pedIndex.dyads <- pedIndex.dyads[ !only.parents, ]
			which.gen.data.fam <- which.gen.data.fam[ !only.parents, ]
		}
		# check how many dyads found
		if (nrow(pedIndex.dyads) == 0 &
		    length(dyads.from.pedIndex) == 0){
			stop( "No dyads found!", call. = FALSE )
		}
		if (nrow(pedIndex.dyads) == 0){
		  final.sel.IDs <- unique(
		    na.omit(as.character(
				  dyads.from.pedIndex[, -1]
				))
		  )
		} else if (length(dyads.from.pedIndex) == 0){
		  final.sel.IDs <- unique(c(
				pedIndex.dyads[ which.gen.data.fam$id.father ,'id.father' ],
				pedIndex.dyads[ which.gen.data.fam$id.mother ,'id.mother' ],
				pedIndex.dyads[ which.gen.data.fam$id.child ,'id.child' ]
			))
		} else {
		  final.sel.IDs <- unique(c(
				pedIndex.dyads[ which.gen.data.fam$id.father ,'id.father' ],
				pedIndex.dyads[ which.gen.data.fam$id.mother ,'id.mother' ],
				pedIndex.dyads[ which.gen.data.fam$id.child ,'id.child' ],
				na.omit(as.character(
				  dyads.from.pedIndex[, -1]
				))
			))
		}
		
		# find the correct rows, based on matching the IDs from the final selection
		only.dyads.rows <- sort( match( final.sel.IDs, id ) )
		return( genDataGetPart( data.in, design = "triad",
										rows = only.dyads.rows, file.out = file.out,
										dir.out = dir.out, overwrite = overwrite )
					)
	} else if( format == "haplin" ){
		stop( "Not implemented yet", call. = FALSE )
	} else {
		stop( paste( "Unrecognized format:", format ), call. = FALSE )
	}
}