R/built_kh.R

#' Built Krummhörn data files
#'
#' This function prepares SPSS data provided by the 'Krummhörn Family Reconstitution'
#' carried out by Voland et al. (e.g. Voland 2015) for being used with 'kinlab'.
#' @param source_path Character string for the file path from which SPSS source(s) (e.g. `file1`) will be read.
#' @param local_path Character string for a file path to which 'kinlab' writes produced `.RData` files.
#' @param file1 Character string for the mame of the first source file in SPSS format  (i.e. `.sav`).
#' @param file2 An optional character string for the mame of the second source file in SPSS format  (i.e. `.sav`).
#' @param silent If TRUE, a final message giving the runtime of this function is omitted. (Default is FALSE)
#' @param n_sample Number of random draws, if n_sample < number of observations. (Default is 100)
#' @param force_new Should existing files be replaced? Default is FALSE
#' @param keep_files List specific files being kept in 'local_path' in case of `force_new=TRUE`.
#' @param set_seed Optional random seed being to allow reproducibility? Default is null, no seed is set.
#' @param kinset Should kin networks be computed?
#' @keywords pedigree
#' @export
#' @examples
#' \dontrun{
#' grap_ped(1067, df_ped)
#' }

built_kh <- function(source_path = NULL, local_path = getwd(),
                     file1 = "kindermc.sav",
                     file2 = "familienmc.sav",
                     silent = FALSE,
                     n_sample = 10,
                     force_new=FALSE,
                     keep_files = NULL,
                     set_seed = NULL, 
                     kinset = FALSE){
  if (!is.null(set_seed)){set.seed(set_seed)}
t_start <- Sys.time()
if(force_new==FALSE){my_files <- list.files(local_path)} else {my_files <- paste(keep_files)}
message("## t_start: ", format(t_start, "%X"))
message("## source_path: \"", source_path, "\"")
message("## local_path: \"", local_path, "\"")
if(length(my_files)>0){
  for (i in my_files){
    message("## left unchanged: ", i)
  }
} else  message("## local_path empty.")
if (!all(c("kh_evn.RData", "kh_geo.RData" ,
       "kh_ind.RData", "kh_kin.RData" ,
       "kh_mat.RData", "kh_par.RData", "kh_ped.RData",
       "kh_src.RData", "kh_twn.RData") %in% my_files)){
###########################################################
#
#     'kh_src' IS A LIST OF SOURCE DATA:
#           [[1]] "kh_src1" 'KINDERMC.SAV', SEE DATA MANUAL
#           [[2]] "kh_src2"  'FAMILIENMC.SAV', SEE DATA MANUAL
#
#     'id' IS A UNIQUE IDENTIFIER FOR INDIVIDUALS IN [[1]]
#
###########################################################

if (!"kh_src.RData" %in% my_files){
    kh_src1 <- haven::read_sav(
      paste(source_path, file1, sep="/")
      )
#    knitr::kable(head(kh_src1[,1:9], 4))
    kh_src2 <- haven::read_sav(
      paste(source_path, file2, sep="/")
      )
#    knitr::kable(head(kh_src2[,1:9], 4))
    kh_src1$id <- 1:nrow(kh_src1)
    kh_src2 <- get_df(kh_src1, kh_src2, get_fam=TRUE)
    kh_src <- list(kh_src1, kh_src2)
#     message("     ", format(Sys.time(), "%X"), " | Now writing \"", local_path, "/kh_src.RData\"")
#     save(kh_src,
#          file = file.path(local_path, "kh_src.RData")
#          )
} else load(file.path(local_path, "kh_src.RData"))

kh_src1 <- kh_src[[1]]
kh_src2 <- kh_src[[2]]
###########################################################
#
#     'kh_ind' IS A DATAFRAME FOR INDIVIDUAL PEDIGREE DATA
#           [,1] "id" -- UNIQUE IDENTIFIER FOR INDIVIDUALS
#           [,2] "dadid" -- ID OF FATHER (0, IF FOUNDER)
#           [,2] "momid" -- ID OF MOTHER (0, IF FOUNDER)
#
###########################################################

if (!"kh_ind.RData" %in% my_files){
  kh_ind <- get_df(kh_src1, kh_src2)
  # ASSIGN INDIVIDUAL PEDIGREE MEMBERSHIP:
  kh_ind$famid <- with(kh_ind,
                       kinship2::makefamid(id, dadid, momid)
                       )
  table(kh_ind$famid)[1:10]
  table(kh_ind$famid==1)
  round(table(kh_ind$famid==1)/sum(table(kh_ind$famid==1)),2)
  # EXCLUDE INDIVIDUALS OUTSIDE THE LARGEST PEDIGREE?
#   kh_ind <- kh_ind[kh_ind$famid==1,]
#   message("     ", format(Sys.time(), "%X"), " | Now writing \"", local_path, "/kh_ind.RData\"")
#   save(kh_ind,
#        file = file.path(local_path, "kh_ind.RData")
#   )
  } else load(file.path(local_path, "kh_ind.RData"))

###########################################################
#
#     'kh_twn' IS A MATRIX WITH THREE COLUMNS:
#         [,1] 'id1' ID OF INDIVIDUAL 1
#         [,2] 'id2' ID OF INDIVIDUAL 2
#         [,3] 'code' RELATION CODE:
#               1=MONOZYGOTIC, 2=DIZYGOTIC, 3=UNKNOWN
#
###########################################################

if (!"kh_twn.RData" %in% my_files){
  kh_twn <- get_twinmat(kh_ind,
                              bstatus_ok = c("", "G")
                              )
  knitr::kable(kh_twn[1:10,])
#   message("     ", format(Sys.time(), "%X"), " | Now writing \"", local_path, "/kh_twn.RData\"")
#   save(kh_twn,
#        file = file.path(local_path, "kh_twn.RData")
#   )
  } else load(file.path(local_path, "kh_twn.RData"))

###########################################################
#
#     'kh_evn' IS A DATAFRAME FOR INDIVIDUAL EVENTS IN LONG
#              FORMAT
#
###########################################################

if (!"kh_evn.RData" %in% my_files){
  kh_evn <- get_dt_ev(kh_ind, kh_src2)
#  knitr::kable(kh_evn[1:20,])
  kh_evn$evloc <- recode_evloc(kh_evn$evloc)
#   message("     ", format(Sys.time(), "%X"), " | Now writing \"", local_path, "/kh_evn.RData\"")
#   save(kh_evn,
#        file = file.path(local_path, "kh_evn.RData")
#   )
  } else load(file.path(local_path, "kh_evn.RData"))

###########################################################
#
#     'kh_mat' IS AN ARRAY WITH THREE DIMENSIONS:
#              'id' INDIVIDUAL ID
#              'tt' TRANSITIONS 1:X
#                       [,1,] "*"       BIRTH
#                       [,2,] "#"       PARITY
#                       [,X,] "+"       DEATH
#              'vv' VARIABLES
#                       [,,1] "evdat"   NUMERIC DATE
#                       [,,2] "evid"    INDIVIDUAL
#                       [,,3] "evloc"   LOCATION
#                       [,,4] "evspc"   SPECIAL
#                       [,,5] "status"  STATUS
#
###########################################################

if (!"kh_mat.RData" %in% my_files){
  if(!"status" %in% names(kh_evn)){kh_evn$status <- 1}
  kh_mat <- get_evmat_list(kh_evn)
#   message("     ", format(Sys.time(), "%X"), " | Now writing \"", local_path, "/kh_mat.RData\"")
#   save(kh_mat,
#        file = file.path(local_path, "kh_mat.RData")
#        )
} else load(file.path(local_path, "kh_mat.RData"))


###########################################################
#
#       kh_mom IS A SAMPLE OF n MOTHERS
#
###########################################################

###########################################################
#
#     'kh_ped' IS A LIST CONTAINING SCALED PEDIGREES AS
#              RETURNED FROM 'kinship2::pedigree()'
#
#     *** COMPUTATION CAN BE VERY TIME-CONSUMING !! ***
#
###########################################################

if (!"kh_ped.RData" %in% my_files){

  set.seed(13)
  kh_mom <- kinlab::sample_kh(kh_ind, kh_src2, n = n_sample)

  message("     Start scaling of ", ifelse(n_sample>length(kh_mom),length(kh_mom), n_sample), " pedigrees at ", format(Sys.time(), "%X..."))
  if(silent==FALSE){ kh_ped <- plyr::llply(
    kh_mom, kinlab::grap_ped, kh_ind, kh_twn,
    as.numeric(dimnames(kh_mat[[1]])$id), .progress="text"
    )} else kh_ped <- plyr::llply(
      kh_mom, kinlab::grap_ped, kh_ind, kh_twn,
      as.numeric(dimnames(kh_mat[[1]])$id)
    )
  message("     ...finished at ", format(Sys.time(), "%X."))

  names(kh_ped) <- kh_mom
  message("     ", format(Sys.time(), "%X"), " | Now writing \"", local_path, "/kh_ped.RData\"")
  save(kh_ped,
       file=file.path(local_path, "kh_ped.RData")
       )
} else load(file.path(local_path, "kh_ped.RData"))

message("     kh_ped, a list of ", length(kh_ped), " pedigrees")

###########################################################
#
#
#       kh_geo -- A LIST WITH THREE COMPONENTS:
#                     - DATAFRAME HOLDING GEOPOSITIONS
#                     - MAP-OBJECTS
#                     - A CORRESPONDING DISTANCE MATRIX
#
###########################################################

if (!"kh_geo.RData" %in% my_files){
  kh_geo <- get_geo()
  message("     ", format(Sys.time(), "%X"), " | Now writing \"", local_path, "/kh_geo.RData\"")
  save(kh_geo,
       file=file.path(local_path, "kh_geo.RData")
       )
} else load(file.path(local_path, "kh_geo.RData"))

###########################################################
#
#       'kh_kin' -- A NESTED LIST OF LENGTH N INDIVIDUALS,
#                   EACH COMPONENT IS A LIST OF DATAFRAMES
#                   HOLDING KIN DATA AT A SPECIFIC DATE
#
###########################################################

if (kinset) {
  if (!"kh_kin.RData" %in% my_files){
  kh_kin <- vector("list", length(kh_ped))
  names(kh_kin) <- names(kh_ped)
  t_1 <- Sys.time()
  for (my_id in names(kh_ped)){
    if(silent==FALSE){cat("ID",
      paste0(my_id),"|", format(Sys.time(),"%H:%M:%S"), "\n"
      )}
    kh_kin[[paste(my_id)]] <- vector("list",
                               sum(kh_mat[[1]][paste(my_id),,5])
                               )
    names(kh_kin[[paste(my_id)]]) <-
      paste(as.Date(kh_mat[[1]][paste(my_id),,1][
          which(kh_mat[[1]][paste(my_id),,5]==1)],
          origin="1970-01-01")
          )
    for (my_date in paste(as.Date(kh_mat[[1]][paste(my_id),,1][
         which(kh_mat[[1]][paste(my_id),,5]==1)],
         origin="1970-01-01"))){
    if(silent==FALSE){cat("         |", format(Sys.time(), "%X"), "|",
          paste0(my_date), "\n")}
      kh_kin[[paste(my_id)]][[paste(my_date)]] <-
        try(kinlab::get_kinset(my_id, my_date,
                               kh_ped[[paste(my_id)]],
                               kh_ind, kh_src2, kh_mat[[1]],
                               kh_geo[["mat"]]
                               )
        )
      }
    if(silent==FALSE){ cat(round(which(names(kh_ped)==my_id)/length(kh_ped)*100, 2),
        "% (runtime:", round(difftime(Sys.time(), t_1), 2), attr(difftime(Sys.time(), t_1), "units"),
      ")\n------------------------------------------\n"
    )}
  }
  message("     ", format(Sys.time(), "%X"), " | Now writing \"", local_path, "/kh_kin.RData\"")
  save(kh_kin,
       file=file.path(local_path, "kh_kin.RData")
  )
} else load(file.path(local_path, "kh_kin.RData"))} else kh_kin <- NULL

if (n_sample<length(kh_ped)){
out <- restrict2sample(df_evn=kh_evn,
                       df_ind =kh_ind,
                       list_kin =kh_kin,
                       array_mat =kh_mat[[1]],
                       list_src =kh_src,
                       df_twn =kh_twn,
                       param = kh_mat[[2]])
} else {
  out <- list(evn = kh_evn, 
              src = list(kh_src1, kh_src1), 
              mat = kh_mat[[1]], 
              ind = kh_ind, 
              twn = kh_twn, 
              param = kh_mat[[2]])
}

if(file.exists(file.path(local_path, "kh_mat.RData"))){file.remove(file.path(local_path, "kh_mat.RData"))}


if (!"kh_evn.RData" %in% my_files){
kh_evn <- out[["evn"]]
save(kh_evn, file = file.path(local_path, "kh_evn.RData"))
message(paste0(format(Sys.time(), "%X"), " ...wrote \"", local_path, "/kh_evn.RData\""))
}

if (!"kh_mat.RData" %in% my_files){
kh_mat <- out[["mat"]]
save(kh_mat, file = file.path(local_path, "kh_mat.RData"))
message(paste0(format(Sys.time(), "%X"), " ...wrote \"", local_path, "/kh_mat.RData\""))
}

if (!"kh_ind.RData" %in% my_files){
kh_ind <- out[["ind"]]
save(kh_ind, file = file.path(local_path, "kh_ind.RData"))
message(paste0(format(Sys.time(), "%X"), " ...wrote \"", local_path, "/kh_ind.RData\""))
}

if (!"kh_src.RData" %in% my_files){
kh_src <- out[["src"]]
save(kh_src, file = file.path(local_path, "kh_src.RData"))
message(paste0(format(Sys.time(), "%X"), " ...wrote \"", local_path, "/kh_src.RData\""))
}

if (!"kh_twn.RData" %in% my_files){
kh_twn <- out[["twn"]]
save(kh_twn, file = file.path(local_path, "kh_twn.RData"))
message(paste0(format(Sys.time(), "%X"), " ...wrote \"", local_path, "/kh_twn.RData\""))
}

if (!"kh_par.RData" %in% my_files){
kh_par <- out[["param"]]
save(kh_par, file = file.path(local_path, "kh_par.RData"))
message(paste0(format(Sys.time(), "%X"), " ...wrote \"", local_path, "/kh_par.RData\""))
}
} else message("no action required!")<
kinlab::run_time(t_start)
}
johow/kinlab documentation built on July 5, 2019, 4:23 p.m.