#' RAREsim
#'
#' Simulate rare variant genetic data
#'
#' @param hap_file_name Name of the haplotype file to prune (gzipped: end in .gz)
#'
#' @param MAC vector of allele count per variant that corresponds with the haplotype file
#'
#' @param expected dataframe of expected number of variants per bin - as well as bin boundaries
#'
#' @param sed_or_gsed Specify if gsed is loaded onto computer instead of sed
#'
#' @return Number of variants pruned
#'
#' @author Megan M Null, \email{megan.null@ucdenver.edu}
#'
#' @keywords RAREsim
#'
#' @export
#'
#' @importFrom utils write.table
#' @importFrom stats runif
Pruning_function <- function(hap_file_name, MAC, expected, sed_or_gsed){
rem_all<-c()
change_all <- c()
if(length(which(MAC$V1>0))<sum(expected$Expected_var)){
print('Error, not enough rare alleles')
}
MAC$num <- 1:nrow(MAC)
colnames(MAC)[1] <- 'V1'
for(k in nrow(expected):1){
tmp <- MAC[which(MAC$V1 >= expected$Lower[k] & MAC$V1 <= expected$Upper[k]),]
if(nrow(tmp) == 0){
next
}
tmp <- as.data.frame(tmp)
colnames(tmp)[1] <- 'V1'
##### if there are not any SNVs in the bin, tmp is all NA's...
n1 <- nrow(tmp)
expect <- expected$Expected_var[k]
#print(expect)
if(n1>0){
p <- expect/n1
}else{
if(expect < 1){next}
if(expect >=1){
p <- expect
}
}
if(p <= 1){
tmp$rd <- runif(n1)
rem <- tmp[c(which(tmp$rd >= p)),]
rem_all <- rbind(rem_all, rem)
}
if(p>1){
if(k==7){
if((expect-n1) >2){
print(c('Error!',n1, expect, (n1-expect)))} ## instead just skip
next}
n2 <- expect - n1 ### n2 is how many more we need
p1 <- n2/nrow(rem_all) ### this is the proportion from rem_all
rem_all$rd <- runif(nrow(rem_all))
to_change <- rem_all[which(rem_all$rd <= p1),]
mac_range <- c(expected$Lower[k]:expected$Upper[k])
to_change$new_MAC <- sample(mac_range, nrow(to_change),
replace = TRUE)
### change them within the MAC file.
#### this needs to change for the haplotype file!!!
rem_all <- rem_all[which(rem_all$rd > p1),]
change_all<- rbind(change_all, to_change)
}
}
##### here we are done with the theoretical pruning
##### now change the haplotypes
### First, the variants that change alternate alleles
command_unzip <- paste0('gunzip ', hap_file_name)
system(command_unzip)
hap_file_name1 <- as.character(hap_file_name)
hap_file_name1 <- substr(hap_file_name1, 1, (nchar(hap_file_name1)-3))
con1 <- hap_file_name1
con2 <- paste0(hap_file_name1, '_new.haps')
lin_nos <- change_all$num
if(length(change_all)>0){
new <- as.data.frame(matrix(nrow=nrow(change_all), ncol = 113770))
##### change all: MAC, row #, draw, new MAC
for(ch in 1:nrow(change_all)){
n <- change_all$num[ch] #### this is the line of interest
k <- change_all$new_MAC[ch]
#### Make sure this works on a MAC
if(sed_or_gsed == 'gsed'){
get_line <- paste0('sed -n \'', n, 'p;',
(n+1),'q\' ', hap_file_name1)
}else{
get_line <- paste0('gsed -n \'', n, 'p;',
(n+1),'q\' ', hap_file_name1)
}
get_line <- paste0('sed -n \'', n, 'p;',
(n+1),'q\' ', hap_file_name1)
ri <- system(get_line, intern = TRUE)
re <- unlist(strsplit(ri, split=" "))
re <- as.numeric(as.character(re))
#### find which have alternate alleles
aa <- which(re>0)
#### sample
aa_n <- sample(aa, k)
#### and now make a new row to insert into the file
ref <- rep(0,length(re))
ref[aa_n] <- 1
#### write to a new file
new[ch,] <- ref
#### and remove the 'orginal' line n
}
f3 = file("test_full_text2add1.txt", "w")
for (i in 1:nrow(new)){
ref1 <- paste(new[i,], collapse = ' ')
write(ref1, f3)
}
close(f3)
repl_lines = readLines("test_full_text2add1.txt")
system.time(m_update(con1, con2, repl_lines, line_nos = lin_nos))
}else{
system(paste0('cp ', con1, ' ', con2))
}
#### copy the original one if nothing to change??
#### then go in and remove the other lines
todel <- paste(rem_all$num, collapse = 'd; ')
todel <- paste0(todel,'d')
write.table(todel, 'List2delete.sed', row.names = FALSE,
col.names = FALSE, quote = FALSE)
#### delete the entire list
if(sed_or_gsed == 'gsed'){
torun <- paste0('gsed -i -f List2delete.sed ', con2)
}else{
torun <- paste0('sed -i -f List2delete.sed ', con2)
}
#### Need to make sure this worked... with the
system(torun)
system(paste0('gzip ', con2))
system(paste0('gzip ', con1))
return(nrow(rem_all))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.