# doc/SNP_missing_value_imputation_process.R

## ---- include = FALSE---------------------------------------------------------
# Global knitr chunk options: merge source and output into one block and
# prefix printed output with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## ----setup, include=FALSE-----------------------------------------------------
library(SNPFastImputeMac)
library(tidyverse)
library(ggplot2)
library(ggpubr)
#library(doParallel)
#library(foreach)
#library(doRNG)

## ----vcf, echo=T--------------------------------------------------------------
# Load the example data set `full_vcf`, then print its dimensions and a
# preview of the first five rows (columns 1-7 and 9-14; column 8 is skipped).
data("full_vcf")
dim(full_vcf)
full_vcf[seq_len(5), c(seq_len(7), 9:14)]

## ----vcf2df, message=FALSE----------------------------------------------------
# Convert the VCF object with the package's vcf2df() helper.
full_df <- vcf2df(full_vcf)

## ----showdf, echo=F-----------------------------------------------------------
# Print the top-left 4 x 4 corner of the converted data.
full_df[seq_len(4), seq_len(4)]
# Number of leading columns kept for the experiments below.
sub <- 200

## ----subset-------------------------------------------------------------------
# Work on the first `sub` columns only.
SNP_orig_sub <- full_df[, seq_len(sub)]

## ----introduceNA--------------------------------------------------------------
# Build one masked copy of SNP_orig_sub per target missing ratio
# (0.05, 0.10, ..., 0.25) with NA_Generator(), and print the
# `NA_percent_generate` element of each result.
ratios <- seq(0.05, 0.25, by = 0.05)
ratios_len <- length(ratios)
# The seed is left disabled, as in the original.
# NOTE(review): NA_Generator() presumably masks at random, so results will
# differ between runs unless the seed is enabled -- confirm.
#set.seed(20980)
# Preallocate the result list and iterate with seq_len() so an empty `ratios`
# would skip the loop instead of running over c(1, 0).
SNP_NA_dfs <- vector("list", ratios_len)
for (i in seq_len(ratios_len)) {
  SNP_NA_dfs[[i]] <- NA_Generator(SNP_orig_sub, ratios[i])
  print(SNP_NA_dfs[[i]]$NA_percent_generate)
}
# Label each entry with its ratio, e.g. "missing 0.05".
names(SNP_NA_dfs) <- paste("missing", as.character(ratios))

## ----classification error calculation for different missing rate--------------
# Classification error of the imputed data for each missing ratio, scored by
# classification_error() against the original data at the positions stored
# in each NA_Generator() result.
# NOTE(review): `df_fills` is not created anywhere in this script; it is
# presumably the list of imputed data sets (one per ratio) from a vignette
# chunk that was not exported -- confirm before running.
errors <- rep(NA_real_, ratios_len)
# seq_len() keeps the loop from running when ratios_len is 0.
for (i in seq_len(ratios_len)) {
  errors[i] <- classification_error(
    SNP_orig_sub,
    df_fills[[i]],
    SNP_NA_dfs[[i]]$NP_generate_positions
  )
}

## ----calculate missing ratio vs error rate and time, message=FALSE------------
# Pair each missing ratio with its error and with its `proctimes` entry so
# the two can be plotted.
# NOTE(review): `proctimes` is not created in this script -- presumably the
# per-ratio processing times from an unexported chunk; confirm.
errors_df <- data.frame(ratios = ratios, errors = errors)
times_df <- data.frame(ratios = ratios, proctimes = proctimes)

## ----plot missing ratio vs error rate and time, echo = T, eval = T, fig.width=8, message = F, tidy = T, tidy.opts=list(width.cutoff=35)----
# Scatter plots against the missing ratio: errors (y axis 0-0.2) on the left,
# proctimes (y axis 0-20) on the right, arranged in a single row.
p1 <- ggplot(errors_df, aes(x = ratios, y = errors)) +
  geom_point() +
  ylim(c(0, 0.2))
p2 <- ggplot(times_df, aes(x = ratios, y = proctimes)) +
  geom_point() +
  ylim(c(0, 20))
ggarrange(p1, p2, nrow = 1)

## ----windows size list--------------------------------------------------------
# Window sizes to compare: 10, 20, ..., 50.
sizes <- seq(from = 10, to = 50, by = 10)
sizes_len <- length(sizes)
# Masked data from the fourth entry of SNP_NA_dfs (the fourth ratio, 0.20).
SNP_NA_df02 <- SNP_NA_dfs[[4]]$SNP_NA_df

## ----classification error calculation for different windows size--------------
# Classification error of the imputed data for each window size.
# Fix: the result vector and the loop are sized by `sizes_len`. This chunk
# was copied from the missing-rate chunk and still used `ratios_len`, which
# only worked because both lengths happen to be 5.
# Fix: the window-size experiment works on the single data set taken from
# SNP_NA_dfs[[4]] (see SNP_NA_df02), so errors are scored at that data set's
# generated positions instead of at SNP_NA_dfs[[i]]'s.
# NOTE(review): `df_fills` is not created anywhere in this script; it is
# assumed to hold one imputed copy of SNP_NA_df02 per entry of `sizes`
# -- confirm before running.
errors <- rep(NA_real_, sizes_len)
NA_positions02 <- SNP_NA_dfs[[4]]$NP_generate_positions
for (i in seq_len(sizes_len)) {
  errors[i] <- classification_error(
    SNP_orig_sub,
    df_fills[[i]],
    NA_positions02
  )
}

## ----calculate windows size vs error rate and time, message=FALSE-------------
# Pair each window size with its error and with its `proctimes` entry so the
# two can be plotted.
# NOTE(review): `proctimes` is not created in this script -- presumably the
# per-window-size processing times from an unexported chunk; confirm.
errors_df <- data.frame(sizes = sizes, errors = errors)
times_df <- data.frame(sizes = sizes, proctimes = proctimes)

## ----plot windows size vs error rate and time, echo = T, eval = T, fig.width=8, message = F, tidy = T, tidy.opts=list(width.cutoff=35)----
# Scatter plots against the window size: errors (y axis 0-0.2) on the left,
# proctimes (y axis 0-20) on the right, arranged in a single row.
p1 <- ggplot(errors_df, aes(x = sizes, y = errors)) +
  geom_point() +
  ylim(c(0, 0.2))
p2 <- ggplot(times_df, aes(x = sizes, y = proctimes)) +
  geom_point() +
  ylim(c(0, 20))
ggarrange(p1, p2, nrow = 1)
# GaoGN517/689_SNPFastImpute_Mac documentation built on Dec. 8, 2019, 12:33 a.m.