require(PedigreeFromTvdData) require(dplyr) require(magrittr)
This vignette describes implementation of transformation that are described in the companion vignette on strategies entitled Pedigree Checks - Concepts and Strategies.
The vignette on the strategy defined transformations to be routines that search for pedigree records that do not fullfill certain requirement conditions. Transformations are either delete a complete pedigree record or invalidate with NA a field of the pedigree record.
Animal-IDs are the primary key. If duplicates of animal-IDs have been found, the complete record has to be deleted.
s_data_file_dup_id <- system.file(file.path("extdata","KLDAT_20171214_dup_id_100.txt"), package = "PedigreeFromTvdData") tbl_ped_dup_id <- laf_open_fwf_tvd_input(ps_input_file = s_data_file_dup_id, pb_out = TRUE) tbl_non_uni_dup_id <- check_unique_animal_id(tbl_ped_dup_id, pb_out = TRUE)
(tbl_transform_ped <- transform_unique_animal_id(ptbl_pedigree = tbl_ped_dup_id, output_check = tbl_non_uni_dup_id, pn_ani_id_col_idx = getTvdIdColsDsch()$TierIdCol, pb_out = TRUE))
The above version of the transformation function transform_unique_animal_id()
keeps only the records which have a unique ID. The records with duplicate IDs are discarded. It could well be possible that we want to keep one record of those with duplicate records. In that case, we certainly want to keep the one with the most information available.
To implement this approach of keeping one record of those with duplicate IDs, we have to inspect those records with duplicate IDs a bit closer. These records are found as a result of the check-function check_unique_animal_id()
. We run a join to see what information is available from these records.
tbl_ped_dup_id %>% inner_join(tbl_non_uni_dup_id, by = c("V12" = "Animal")) %>% select(V5,V11,V12,V16)
The above shown output points to a problem that the information available from the records with duplicate IDs can be ambiguous. For example, birthdates can be different. Without information about the parents, we cannot say which birthdate is correct. The required information about the parents can only become available when we try to merge with the pedigree in ARGUS.
If the parent are older than offspring, the parent's birthdate field will be invalidate with NA.
sDataFileName <- system.file(file.path("extdata","KLDAT_20170524_10000.txt"), package = "PedigreeFromTvdData") tbl_ped_uni_id <- laf_open_fwf_tvd_input(ps_input_file = sDataFileName, pb_out = TRUE) (l_tvd_id_col_dsch <- getTvdIdColsDsch()) (n_bd_col_idx <- getBirthdateColIdxDsch()) (tbl_fail_mother_age <- check_parent_older_offspring(ptbl_pedigree = tbl_ped_uni_id, pn_offspring_col = l_tvd_id_col_dsch$TierIdCol, pn_birthday_col = n_bd_col_idx, pn_parent_col = l_tvd_id_col_dsch$MutterIdCol))
The result of the above check give us a dataframe containing animals and parents which have wrong birthdates. According to https://stackoverflow.com/questions/27909000/set-certain-values-to-na-with-dplyr, we can use dplyr::mutate and replace. But this shows only how to do it with one ID and not how this could work with a vector of IDs.
We start with a single ID
### # select first animal-ID (s_single_id <- tbl_fail_mother_age$Animal[1]) ### # assign result dataframe and do the mutation tbl_transform_ped <- tbl_ped_uni_id %>% mutate(V11 = replace(V11, which(V12 == s_single_id), NA))
Checking whether the birthdate of the pedigree record for animal with the ID r s_single_id
was changed can be done by
tbl_transform_ped %>% filter(V12 == s_single_id) %>% select(V5,V11,V12,V16)
The above approach can be used, as soon as we can provide an index vector of all records of which the birthdate must be changed.
(vec_ani_ids <- c(tbl_fail_mother_age$Animal, tbl_fail_mother_age$Parent)) vec_ani_idx <- sapply(vec_ani_ids, function(x) which(tbl_ped_uni_id$V12 == x), USE.NAMES = FALSE) tbl_transform_ped <- tbl_ped_uni_id %>% mutate(V11 = replace(V11, vec_ani_idx, NA))
Checking the records
tbl_transform_ped %>% inner_join(tbl_fail_mother_age, by = c("V12" = "Animal")) %>% select(V12,V11) tbl_transform_ped %>% inner_join(tbl_fail_mother_age, by = c("V12" = "Parent")) %>% select(V12,V11)
Comparing the transformed pedigree records to the original records shows that birthdates are invalidated.
tbl_ped_uni_id %>% inner_join(tbl_fail_mother_age, by = c("V12" = "Animal")) %>% select(V12,V11) tbl_ped_uni_id %>% inner_join(tbl_fail_mother_age, by = c("V12" = "Parent")) %>% select(V12,V11)
(tbl_transform_ped <- transform_check_parent_older_offspring(ptbl_pedigree = tbl_ped_uni_id, output_check = tbl_fail_mother_age, pb_out = TRUE))
If the sex of the parent are inconsistent, the parent's sex field will be invalidate with NA.
sDataFileSex <- system.file(file.path("extdata","KLDAT_20170524_100_mother_sex.txt"), package = "PedigreeFromTvdData") tbl_ped_sex <- laf_open_fwf_tvd_input(ps_input_file = sDataFileSex, pb_out = TRUE) tbl_result_sex <- check_sex_tbl(ptblPedigree = tbl_ped_sex)
(tbl_transform_ped <- transform_check_sex_tbl(ptbl_pedigree = tbl_ped_sex, output_check = tbl_result_sex, pb_out = TRUE))
If the format of TVD-IDs is not correct, the complete record of an animal has to be deleted. If the parent TVD-Ids is not correct, the field TVDids will be with NA invalidated.
sData_NotCorrectId <- system.file(file.path("extdata","KLDAT_20170524_20_notcorrectId.txt"), package ="PedigreeFromTvdData") tbl_ped_correctFormat <- laf_open_fwf_tvd_input(ps_input_file = sData_NotCorrectId, pb_out = TRUE) lIdCols <- getTvdIdColsDsch() tbl_result_tvdformat <- correct_tvd_format_tbl(p_tbl_ped = tbl_ped_correctFormat, plFormatBorder = getTVDIdBorder(), plIdCols = lIdCols, pnIdCol = lIdCols$TierIdCol)
(tbl_transform_ped <- transform_correct_tvd_format_tbl(ptbl_pedigree = tbl_ped_correctFormat, output_check = tbl_result_tvdformat, pb_out = TRUE))
If the format of birthdate is not correct, the birthdate field will be invalidate with NA.
sData_NotCorrectBirthdate <- system.file(file.path("extdata","KLDAT_20170524_20_notcorrectBirthdate.txt"), package ="PedigreeFromTvdData") tbl_ped_NcorrectBirthdate <- laf_open_fwf_tvd_input(ps_input_file = sData_NotCorrectBirthdate, pb_out = TRUE) tbl_result_checkbirthdate_nC <- check_birthdate_tbl(ptblPedigree = tbl_ped_NcorrectBirthdate)
(tbl_transform_ped <- transform_check_birthdate_tbl(ptbl_pedigree = tbl_ped_NcorrectBirthdate, output_check = tbl_result_checkbirthdate_nC, pb_out = TRUE))
Tranformation after checks of uniqueness of animal-IDs
l_tvd_id_col_dsch <- getTvdIdColsDsch() n_bd_col_idx <- getBirthdateColIdxDsch() (mb.uniqId <- microbenchmark::microbenchmark( data.base = PedigreeFromTvdData::transform_unique_animal_id(ptbl_pedigree = tbl_ped_uni_id, output_check = check_unique_animal_id(ptbl_pedigree = tbl_ped_uni_id), pn_ani_id_col_idx = getTvdIdColsDsch()$TierIdCol), times = 5, unit = "s"))
Transformation after checks, if parents are older than offspring
(mb.parentOld <- microbenchmark::microbenchmark( data.base = PedigreeFromTvdData::transform_check_parent_older_offspring(ptbl_pedigree = tbl_ped_uni_id, output_check = tbl_fail_mother_age), times = 5, unit = "s"))
Tranformation by sex inconsistency
(mb.sexConsistency <- microbenchmark::microbenchmark( data.base = PedigreeFromTvdData::transform_check_sex_tbl(ptbl_pedigree = tbl_ped_uni_id, output_check = check_sex_tbl(ptblPedigree = tbl_ped_uni_id)), times = 5, unit = "s"))
Transformation by incorrect IDs format
lIdCols <- getTvdIdColsDsch() (mb.format <- microbenchmark::microbenchmark( data.base = PedigreeFromTvdData::transform_correct_tvd_format_tbl(ptbl_pedigree = tbl_ped_uni_id, output_check = correct_tvd_format_tbl(p_tbl_ped = tbl_ped_uni_id, plFormatBorder = getTVDIdBorder(), plIdCols = lIdCols, pnIdCol = lIdCols$TierIdCol)), times = 5, unit = "s"))
Transformation by incorrect format of bithdates
(mb.birthdate <- microbenchmark::microbenchmark( data.base = PedigreeFromTvdData::transform_check_birthdate_tbl(ptbl_pedigree = tbl_ped_uni_id, output_check = check_birthdate_tbl(ptblPedigree = tbl_ped_uni_id)), times = 5, unit = "s"))
Overview of results The following table shows the median times (column MedianTime) in seconds for all function that were used.
vecMethods <- c("PedigreeFromTvdData::transform_unique_animal_id", "PedigreeFromTvdData::transform_check_parent_older_offspring", "PedigreeFromTvdData::transform_check_sex_tbl", "PedigreeFromTvdData::transform_correct_tvd_format_tbl", "PedigreeFromTvdData::transform_check_birthdate_tbl") vecMedTimes <- c(mb.uniqId$time[3], mb.parentOld$time[3], mb.sexConsistency$time[3], mb.format$time[3], mb.birthdate$time[3]) nMinTime <- vecMedTimes[order(vecMedTimes)][1] dfMedTime <- data.frame(Methode = vecMethods, MedianTime = round(vecMedTimes*10^(-9), digits = 4)) knitr::kable(dfMedTime)
sessionInfo()
r paste(Sys.time(),paste0("(", Sys.info()[["user"]],")" ))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.