R/SRS.R

Defines functions SRS

Documented in SRS

SRS <- function(data, Cmin, set_seed = TRUE, seed = 1){
  if(set_seed == T){
    set.seed(seed)
    if(Cmin > min(colSums(data))){ #remove samples with number of reads lower than Cmin
      samples_discarded <- colnames(data[,colSums(data) < Cmin, drop = F])
      cat(noquote(paste(paste(length(samples_discarded),"sample(s) discarded due to low number of counts (number of counts < Cmin): ",
                              paste(samples_discarded, collapse=', ')))))
      data<-data[,colSums(data) >= Cmin, drop = F]
      SRS(data, Cmin, set_seed, seed)
    }
    else {
      if(Cmin < 0){
        print("ERROR: Cmin < 0. Please select Cmin >= 0.")
      } else {
        if(Cmin %% 1 > 0){
          print("ERROR: SRS accepts only integers for Cmin")
        } else {
          counter = 1 #counting the loops
          for(i in seq(1, ncol(data), 1)){
            if (i == 1) {
              fixed_factor <- (data[,i]/(sum(data[,i])/Cmin))
              assign(paste(names(data)[i],sep=""),fixed_factor)
              fixed_factor_1 <- data.frame(get(names(data)[i]))
              colnames(fixed_factor_1)[i] <- names(data)[i]
            } else {
              fixed_factor <- (data[,i]/(sum(data[,i])/Cmin))
              assign(paste(names(data)[i],sep=""),fixed_factor)
              fixed_factor_1 <- cbind(fixed_factor_1, fixed_factor)
              colnames(fixed_factor_1)[{
                counter = counter + 1
              }] <- names(data)[i]
            }
          }
          
          fixed_factor_1
          
          revtrunc_fixed_factor_1 <- floor(fixed_factor_1) # floor (e.g. 1.9 will become 1)
          revtrunc_fixed_factor_1
          
          diff_counts <- Cmin-colSums(revtrunc_fixed_factor_1) #how many counts differences to the selected library size?
          diff_counts
          
          revtrunc <- function(x) { sign(x) * (x - floor(x)) }
          revtrunc_fixed_factor <- (round(revtrunc(fixed_factor_1),10000000))
          revtrunc_fixed_factor
          
          x <- as.data.frame(revtrunc_fixed_factor)
          counter = 1
          for(i in seq(1, ncol(x), 1)){
            if (i == 1) {
              if(diff_counts[i] == 0){
                fixed_factor <- revtrunc_fixed_factor_1[,i]
                assign(paste(names(data)[i],sep=""),fixed_factor)
                fixed_factor_1 <- data.frame(get(names(data)[i]))
                colnames(fixed_factor_1)[i] <- names(data)[i]
              } #if the sum of the counts in the library = Cmin
              else {
                maxN <- function(x, N=diff_counts[i]){
                  len <- length(x)
                  if(N>len){
                    warning('N greater than length(x).  Setting N=length(x)')
                    N <- length(x)
                  }
                  sort(x,partial=len-N+1)[len-N+1]
                }
                max <- which(x[,i] == maxN(x[,i]), arr.ind = TRUE)
                max
                sum(x[,i] > unique(x[,i][max]))
                normalization_value <- diff_counts[i] - sum(x[,i] > unique(x[,i][max]))
                normalization_value
                
                lowest_level_choise <- as.data.frame(which(x[,i] == unique(maxN(x[,i]))))
                lowest_level_choise
                length(t(lowest_level_choise)) #how many counts have to be sampled?
                
                if(sum(revtrunc_fixed_factor_1[,i][lowest_level_choise[,1]]) == 0){
                  
                  lowest_level <- as.numeric(as.vector(sample(as.factor(lowest_level_choise[,1]), normalization_value, replace = F)))
                  y <- as.vector(rep(0,length(x[,1]))) #create an empty vector
                  y[lowest_level] = 1 #set the randomly selected counts to 1
                  y
                  
                } #if all of the integer values of the lowest rank = 0, do random subsmpling
                else {
                  sub_int <- subset(lowest_level_choise, (revtrunc_fixed_factor_1[,i][lowest_level_choise[,1]] >= 1) == TRUE)
                  sub_int_bind <- as.data.frame(cbind(sub_int,revtrunc_fixed_factor_1[,i][sub_int[,1]]))
                  names(sub_int_bind)[1] <- "V1"
                  names(sub_int_bind)[2] <- "V2"
                  
                  sub_int_bind_ordered <- sub_int_bind[order(sub_int_bind$V2, decreasing = TRUE),]
                  sub_int_bind_ordered
                  
                  sub_int_bind_ordered_V1 <- sub_int_bind_ordered$V1
                  sub_int_bind_ordered_V2 <- sub_int_bind_ordered$V2
                  
                  if((length(unique(sub_int_bind_ordered_V2)) == 1 & length(sub_int_bind_ordered_V2)>as.vector(normalization_value))){
                    
                    lowest_level <- as.numeric(as.vector(sample(as.factor(sub_int_bind_ordered_V1), normalization_value, replace = F)))
                    y <- as.vector(rep(0,length(x[,1]))) #create an empty vector
                    y[lowest_level] = 1 #set the randomly selected counts to 1
                    y
                  } #if all of the integer values of the lowest rank are equal, do random subsampling
                  else {
                    if(length(sub_int_bind_ordered_V1)>normalization_value){
                      
                      maxN_1 <- function(x, N=normalization_value){
                        len <- length(x)
                        if(N>len){
                          warning('N greater than length(x).  Setting N=length(x)')
                          N <- length(x)
                        }
                        sort(x,partial=len-N+1)[len-N+1]
                      }
                      max_1 <- which(as.data.frame(sub_int_bind_ordered_V2)[,1] == maxN_1(as.data.frame(sub_int_bind_ordered_V2)[,1]), arr.ind = TRUE)
                      max_1
                      sum(as.data.frame(sub_int_bind_ordered_V2)[,1] > unique(as.data.frame(sub_int_bind_ordered_V2)[,1][max_1]))
                      
                      normalization_value_1 <- normalization_value - sum(as.data.frame(sub_int_bind_ordered_V2)[,1] > unique(as.data.frame(sub_int_bind_ordered_V2)[,1][max_1])) # how may values are above the maxima that is the one to be normalized?
                      normalization_value_1
                      lowest_level_choise_1 <- as.data.frame(which(as.data.frame(sub_int_bind_ordered_V2)[,1] == unique(maxN_1(as.data.frame(sub_int_bind_ordered_V2)[,1]))))
                      lowest_level_choise_1
                      
                      lowest_level <- as.numeric(as.vector(sample(as.factor(lowest_level_choise_1[,1]), normalization_value_1, replace = F)))
                      lowest_level <- sub_int_bind_ordered_V1[lowest_level]
                      lowest_level
                      
                      lowest_level_1 <- sub_int_bind_ordered_V1[(as.data.frame(sub_int_bind_ordered_V2)[,1] > unique(as.data.frame(sub_int_bind_ordered_V2)[,1][max_1]))]
                      
                      lowest_level <-c(lowest_level_1, lowest_level)
                      y <- as.vector(rep(0,length(x[,1]))) #create an empty vector
                      y[lowest_level] = 1 #set the randomly selected counts to 1
                      y
                      
                    } #if integer ranks are > normalization_value, do ranked subsampling
                    else {
                      if(length(sub_int_bind_ordered_V1)<normalization_value){
                        sub_int_zeros <- subset(lowest_level_choise, (revtrunc_fixed_factor_1[,i][lowest_level_choise[,1]] < 1) == TRUE)
                        length(t(sub_int_zeros))
                        
                        lowest_level_2 <- as.numeric(as.vector(sample(as.factor(sub_int_zeros[,1]), (normalization_value-length(sub_int_bind_ordered_V1)), replace = F)))
                        lowest_level_2
                        lowest_level_3 <- c(sub_int_bind_ordered_V1,lowest_level_2)
                        
                        y <- as.vector(rep(0,length(x[,1]))) #create an empty vector
                        y[lowest_level_3] = 1 #set the randomly selected counts to 1
                        y
                        
                      } #if integer ranks are < normalization_value, do ranked subsampling of the zero integers
                      else {
                        y <- as.vector(rep(0,length(x[,1]))) #create an empty vector
                        y[sub_int_bind_ordered_V1] = 1 #set the randomly selected counts to 1
                        y
                      } # if integer ranks are = normalization_value, sample all
                    }
                    
                    
                  }
                  
                }
                
                SRS <- revtrunc_fixed_factor_1[,i] + ceiling(x[,i] > unique(x[,i][max])) + y #sum it all u
                SRS
                sum(SRS) #verification
                assign(paste(names(data)[i],sep=""),SRS)
                fixed_factor_1 <- data.frame(get(names(data)[i]))
                colnames(fixed_factor_1)[i] <- names(data)[i]
              } #if the sum of the counts in the library > Cmin
            } #for the first libraray
            else {
              if(diff_counts[i] == 0){
                fixed_factor <- revtrunc_fixed_factor_1[,i]
                assign(paste(names(data)[i],sep=""),fixed_factor)
                fixed_factor_1 <- cbind(fixed_factor_1, fixed_factor)
                colnames(fixed_factor_1)[{
                  counter = counter + 1
                }] <- names(data)[i]
              } #if the sum of the counts in the library = Cmin
              else {
                maxN <- function(x, N=diff_counts[i]){
                  len <- length(x)
                  if(N>len){
                    warning('N greater than length(x).  Setting N=length(x)')
                    N <- length(x)
                  }
                  sort(x,partial=len-N+1)[len-N+1]
                }
                max <- which(x[,i] == maxN(x[,i]), arr.ind = TRUE)
                max
                sum(x[,i] > unique(x[,i][max]))
                normalization_value <- diff_counts[i] - sum(x[,i] > unique(x[,i][max]))
                normalization_value
                
                lowest_level_choise <- as.data.frame(which(x[,i] == unique(maxN(x[,i]))))
                lowest_level_choise
                length(t(lowest_level_choise)) #how many counts have to be sampled?
                
                if(sum(revtrunc_fixed_factor_1[,i][lowest_level_choise[,1]]) == 0){
                  
                  lowest_level <- as.numeric(as.vector(sample(as.factor(lowest_level_choise[,1]), normalization_value, replace = F)))
                  y <- as.vector(rep(0,length(x[,1]))) #create an empty vector
                  y[lowest_level] = 1 #set the randomly selected counts to 1
                  y
                  
                } #if all of the integer values of the lowest rank = 0, do random subsmpling
                else {
                  sub_int <- subset(lowest_level_choise, (revtrunc_fixed_factor_1[,i][lowest_level_choise[,1]] >= 1) == TRUE)
                  sub_int_bind <- as.data.frame(cbind(sub_int,revtrunc_fixed_factor_1[,i][sub_int[,1]]))
                  names(sub_int_bind)[1] <- "V1"
                  names(sub_int_bind)[2] <- "V2"
                  
                  sub_int_bind_ordered <- sub_int_bind[order(sub_int_bind$V2, decreasing = TRUE),]
                  sub_int_bind_ordered
                  
                  sub_int_bind_ordered_V1 <- sub_int_bind_ordered$V1
                  sub_int_bind_ordered_V2 <- sub_int_bind_ordered$V2
                  
                  if((length(unique(sub_int_bind_ordered_V2)) == 1 & length(sub_int_bind_ordered_V2)>as.vector(normalization_value))){
                    
                    lowest_level <- as.numeric(as.vector(sample(as.factor(sub_int_bind_ordered_V1), normalization_value, replace = F)))
                    y <- as.vector(rep(0,length(x[,1]))) #create an empty vector
                    y[lowest_level] = 1 #set the randomly selected counts to 1
                    y
                  } #if all of the integer values of the lowest rank are equal, do random subsampling
                  else {
                    if(length(sub_int_bind_ordered_V1)>normalization_value){
                      
                      maxN_1 <- function(x, N=normalization_value){
                        len <- length(x)
                        if(N>len){
                          warning('N greater than length(x).  Setting N=length(x)')
                          N <- length(x)
                        }
                        sort(x,partial=len-N+1)[len-N+1]
                      }
                      max_1 <- which(as.data.frame(sub_int_bind_ordered_V2)[,1] == maxN_1(as.data.frame(sub_int_bind_ordered_V2)[,1]), arr.ind = TRUE)
                      max_1
                      sum(as.data.frame(sub_int_bind_ordered_V2)[,1] > unique(as.data.frame(sub_int_bind_ordered_V2)[,1][max_1]))
                      
                      normalization_value_1 <- normalization_value - sum(as.data.frame(sub_int_bind_ordered_V2)[,1] > unique(as.data.frame(sub_int_bind_ordered_V2)[,1][max_1])) # how may values are above the maxima that is the one to be normalized?
                      normalization_value_1
                      lowest_level_choise_1 <- as.data.frame(which(as.data.frame(sub_int_bind_ordered_V2)[,1] == unique(maxN_1(as.data.frame(sub_int_bind_ordered_V2)[,1]))))
                      lowest_level_choise_1
                      
                      lowest_level <- as.numeric(as.vector(sample(as.factor(lowest_level_choise_1[,1]), normalization_value_1, replace = F)))
                      lowest_level <- sub_int_bind_ordered_V1[lowest_level]
                      lowest_level
                      
                      lowest_level_1 <- sub_int_bind_ordered_V1[(as.data.frame(sub_int_bind_ordered_V2)[,1] > unique(as.data.frame(sub_int_bind_ordered_V2)[,1][max_1]))]
                      
                      lowest_level <-c(lowest_level_1, lowest_level)
                      y <- as.vector(rep(0,length(x[,1]))) #create an empty vector
                      y[lowest_level] = 1 #set the randomly selected counts to 1
                      y
                      
                    } #if integer ranks are > normalization_value, do ranked subsampling
                    else {
                      if(length(sub_int_bind_ordered_V1)<normalization_value){
                        sub_int_zeros <- subset(lowest_level_choise, (revtrunc_fixed_factor_1[,i][lowest_level_choise[,1]] < 1) == TRUE)
                        length(t(sub_int_zeros))
                        
                        lowest_level_2 <- as.numeric(as.vector(sample(as.factor(sub_int_zeros[,1]), (normalization_value-length(sub_int_bind_ordered_V1)), replace = F)))
                        lowest_level_2
                        lowest_level_3 <- c(sub_int_bind_ordered_V1,lowest_level_2)
                        
                        y <- as.vector(rep(0,length(x[,1]))) #create an empty vector
                        y[lowest_level_3] = 1 #set the randomly selected counts to 1
                        y
                        
                      } #if integer ranks are < normalization_value, do ranked subsampling of the zero integers
                      else {
                        y <- as.vector(rep(0,length(x[,1]))) #create an empty vector
                        y[sub_int_bind_ordered_V1] = 1 #set the randomly selected counts to 1
                        y
                      } # if integer ranks are = normalization_value, sample all
                    }
                    
                    
                  }
                  
                }
                
                SRS <- revtrunc_fixed_factor_1[,i] + ceiling(x[,i] > unique(x[,i][max])) + y #sum it all up
                SRS
                sum(SRS) #verification
                assign(paste(names(data)[i],sep=""),SRS)
                fixed_factor_1 <- cbind(fixed_factor_1, SRS)
                colnames(fixed_factor_1)[{
                  counter = counter + 1
                }] <- names(data)[i]
              } #if the sum of the counts in the library > Cmin
            } #for all other libaries
          }
          set.seed(Sys.time())
          SRS_output <- fixed_factor_1
          SRS_output
        }
      }
    }
  } else { 
    if(Cmin > min(colSums(data))){
      print("ERROR: Cmin > minimum library size. Please select a Cmin that is <= the minimum library size of the dataset.")
    } else {
      if(Cmin < 0){
        print("ERROR: Cmin < 0. Please select a Cmin >= 0.")
      } else {
        if(Cmin %% 1 > 0){
          print("ERROR: Please select a Cmin without decimal places.")
        } else {
          counter = 1 #counting the loops
          for(i in seq(1, ncol(data), 1)){
            if (i == 1) {
              fixed_factor <- (data[,i]/(sum(data[,i])/Cmin))
              assign(paste(names(data)[i],sep=""),fixed_factor)
              fixed_factor_1 <- data.frame(get(names(data)[i]))
              colnames(fixed_factor_1)[i] <- names(data)[i]
            } else {
              fixed_factor <- (data[,i]/(sum(data[,i])/Cmin))
              assign(paste(names(data)[i],sep=""),fixed_factor)
              fixed_factor_1 <- cbind(fixed_factor_1, fixed_factor)
              colnames(fixed_factor_1)[{
                counter = counter + 1
              }] <- names(data)[i]
            }
          }
          
          fixed_factor_1
          
          revtrunc_fixed_factor_1 <- floor(fixed_factor_1) # floor (e.g. 1.9 will become 1)
          revtrunc_fixed_factor_1
          
          diff_counts <- Cmin-colSums(revtrunc_fixed_factor_1) #how many counts differences to the selected library size?
          diff_counts
          
          revtrunc <- function(x) { sign(x) * (x - floor(x)) }
          revtrunc_fixed_factor <- (round(revtrunc(fixed_factor_1),10000000))
          revtrunc_fixed_factor
          
          x <- as.data.frame(revtrunc_fixed_factor)
          counter = 1
          for(i in seq(1, ncol(x), 1)){
            if (i == 1) {
              if(diff_counts[i] == 0){
                fixed_factor <- revtrunc_fixed_factor_1[,i]
                assign(paste(names(data)[i],sep=""),fixed_factor)
                fixed_factor_1 <- data.frame(get(names(data)[i]))
                colnames(fixed_factor_1)[i] <- names(data)[i]
              } #if the sum of the counts in the library = Cmin
              else {
                maxN <- function(x, N=diff_counts[i]){
                  len <- length(x)
                  if(N>len){
                    warning('N greater than length(x).  Setting N=length(x)')
                    N <- length(x)
                  }
                  sort(x,partial=len-N+1)[len-N+1]
                }
                max <- which(x[,i] == maxN(x[,i]), arr.ind = TRUE)
                max
                sum(x[,i] > unique(x[,i][max]))
                normalization_value <- diff_counts[i] - sum(x[,i] > unique(x[,i][max]))
                normalization_value
                
                lowest_level_choise <- as.data.frame(which(x[,i] == unique(maxN(x[,i]))))
                lowest_level_choise
                length(t(lowest_level_choise)) #how many counts have to be sampled?
                
                if(sum(revtrunc_fixed_factor_1[,i][lowest_level_choise[,1]]) == 0){
                  
                  lowest_level <- as.numeric(as.vector(sample(as.factor(lowest_level_choise[,1]), normalization_value, replace = F)))
                  y <- as.vector(rep(0,length(x[,1]))) #create an empty vector
                  y[lowest_level] = 1 #set the randomly selected counts to 1
                  y
                  
                } #if all of the integer values of the lowest rank = 0, do random subsmpling
                else {
                  sub_int <- subset(lowest_level_choise, (revtrunc_fixed_factor_1[,i][lowest_level_choise[,1]] >= 1) == TRUE)
                  sub_int_bind <- as.data.frame(cbind(sub_int,revtrunc_fixed_factor_1[,i][sub_int[,1]]))
                  names(sub_int_bind)[1] <- "V1"
                  names(sub_int_bind)[2] <- "V2"
                  
                  sub_int_bind_ordered <- sub_int_bind[order(sub_int_bind$V2, decreasing = TRUE),]
                  sub_int_bind_ordered
                  
                  sub_int_bind_ordered_V1 <- sub_int_bind_ordered$V1
                  sub_int_bind_ordered_V2 <- sub_int_bind_ordered$V2
                  
                  if((length(unique(sub_int_bind_ordered_V2)) == 1 & length(sub_int_bind_ordered_V2)>as.vector(normalization_value))){
                    
                    lowest_level <- as.numeric(as.vector(sample(as.factor(sub_int_bind_ordered_V1), normalization_value, replace = F)))
                    y <- as.vector(rep(0,length(x[,1]))) #create an empty vector
                    y[lowest_level] = 1 #set the randomly selected counts to 1
                    y
                  } #if all of the integer values of the lowest rank are equal, do random subsampling
                  else {
                    if(length(sub_int_bind_ordered_V1)>normalization_value){
                      
                      maxN_1 <- function(x, N=normalization_value){
                        len <- length(x)
                        if(N>len){
                          warning('N greater than length(x).  Setting N=length(x)')
                          N <- length(x)
                        }
                        sort(x,partial=len-N+1)[len-N+1]
                      }
                      max_1 <- which(as.data.frame(sub_int_bind_ordered_V2)[,1] == maxN_1(as.data.frame(sub_int_bind_ordered_V2)[,1]), arr.ind = TRUE)
                      max_1
                      sum(as.data.frame(sub_int_bind_ordered_V2)[,1] > unique(as.data.frame(sub_int_bind_ordered_V2)[,1][max_1]))
                      
                      normalization_value_1 <- normalization_value - sum(as.data.frame(sub_int_bind_ordered_V2)[,1] > unique(as.data.frame(sub_int_bind_ordered_V2)[,1][max_1])) # how may values are above the maxima that is the one to be normalized?
                      normalization_value_1
                      lowest_level_choise_1 <- as.data.frame(which(as.data.frame(sub_int_bind_ordered_V2)[,1] == unique(maxN_1(as.data.frame(sub_int_bind_ordered_V2)[,1]))))
                      lowest_level_choise_1
                      
                      lowest_level <- as.numeric(as.vector(sample(as.factor(lowest_level_choise_1[,1]), normalization_value_1, replace = F)))
                      lowest_level <- sub_int_bind_ordered_V1[lowest_level]
                      lowest_level
                      
                      lowest_level_1 <- sub_int_bind_ordered_V1[(as.data.frame(sub_int_bind_ordered_V2)[,1] > unique(as.data.frame(sub_int_bind_ordered_V2)[,1][max_1]))]
                      
                      lowest_level <-c(lowest_level_1, lowest_level)
                      y <- as.vector(rep(0,length(x[,1]))) #create an empty vector
                      y[lowest_level] = 1 #set the randomly selected counts to 1
                      y
                      
                    } #if integer ranks are > normalization_value, do ranked subsampling
                    else {
                      if(length(sub_int_bind_ordered_V1)<normalization_value){
                        sub_int_zeros <- subset(lowest_level_choise, (revtrunc_fixed_factor_1[,i][lowest_level_choise[,1]] < 1) == TRUE)
                        length(t(sub_int_zeros))
                        
                        lowest_level_2 <- as.numeric(as.vector(sample(as.factor(sub_int_zeros[,1]), (normalization_value-length(sub_int_bind_ordered_V1)), replace = F)))
                        lowest_level_2
                        lowest_level_3 <- c(sub_int_bind_ordered_V1,lowest_level_2)
                        
                        y <- as.vector(rep(0,length(x[,1]))) #create an empty vector
                        y[lowest_level_3] = 1 #set the randomly selected counts to 1
                        y
                        
                      } #if integer ranks are < normalization_value, do ranked subsampling of the zero integers
                      else {
                        y <- as.vector(rep(0,length(x[,1]))) #create an empty vector
                        y[sub_int_bind_ordered_V1] = 1 #set the randomly selected counts to 1
                        y
                      } # if integer ranks are = normalization_value, sample all
                    }
                    
                    
                  }
                  
                }
                
                SRS <- revtrunc_fixed_factor_1[,i] + ceiling(x[,i] > unique(x[,i][max])) + y #sum it all u
                SRS
                sum(SRS) #verification
                assign(paste(names(data)[i],sep=""),SRS)
                fixed_factor_1 <- data.frame(get(names(data)[i]))
                colnames(fixed_factor_1)[i] <- names(data)[i]
              } #if the sum of the counts in the library > Cmin
            } #for the first libraray
            else {
              if(diff_counts[i] == 0){
                fixed_factor <- revtrunc_fixed_factor_1[,i]
                assign(paste(names(data)[i],sep=""),fixed_factor)
                fixed_factor_1 <- cbind(fixed_factor_1, fixed_factor)
                colnames(fixed_factor_1)[{
                  counter = counter + 1
                }] <- names(data)[i]
              } #if the sum of the counts in the library = Cmin
              else {
                maxN <- function(x, N=diff_counts[i]){
                  len <- length(x)
                  if(N>len){
                    warning('N greater than length(x).  Setting N=length(x)')
                    N <- length(x)
                  }
                  sort(x,partial=len-N+1)[len-N+1]
                }
                max <- which(x[,i] == maxN(x[,i]), arr.ind = TRUE)
                max
                sum(x[,i] > unique(x[,i][max]))
                normalization_value <- diff_counts[i] - sum(x[,i] > unique(x[,i][max]))
                normalization_value
                
                lowest_level_choise <- as.data.frame(which(x[,i] == unique(maxN(x[,i]))))
                lowest_level_choise
                length(t(lowest_level_choise)) #how many counts have to be sampled?
                
                if(sum(revtrunc_fixed_factor_1[,i][lowest_level_choise[,1]]) == 0){
                  
                  lowest_level <- as.numeric(as.vector(sample(as.factor(lowest_level_choise[,1]), normalization_value, replace = F)))
                  y <- as.vector(rep(0,length(x[,1]))) #create an empty vector
                  y[lowest_level] = 1 #set the randomly selected counts to 1
                  y
                  
                } #if all of the integer values of the lowest rank = 0, do random subsmpling
                else {
                  sub_int <- subset(lowest_level_choise, (revtrunc_fixed_factor_1[,i][lowest_level_choise[,1]] >= 1) == TRUE)
                  sub_int_bind <- as.data.frame(cbind(sub_int,revtrunc_fixed_factor_1[,i][sub_int[,1]]))
                  names(sub_int_bind)[1] <- "V1"
                  names(sub_int_bind)[2] <- "V2"
                  
                  sub_int_bind_ordered <- sub_int_bind[order(sub_int_bind$V2, decreasing = TRUE),]
                  sub_int_bind_ordered
                  
                  sub_int_bind_ordered_V1 <- sub_int_bind_ordered$V1
                  sub_int_bind_ordered_V2 <- sub_int_bind_ordered$V2
                  
                  if((length(unique(sub_int_bind_ordered_V2)) == 1 & length(sub_int_bind_ordered_V2)>as.vector(normalization_value))){
                    
                    lowest_level <- as.numeric(as.vector(sample(as.factor(sub_int_bind_ordered_V1), normalization_value, replace = F)))
                    y <- as.vector(rep(0,length(x[,1]))) #create an empty vector
                    y[lowest_level] = 1 #set the randomly selected counts to 1
                    y
                  } #if all of the integer values of the lowest rank are equal, do random subsampling
                  else {
                    if(length(sub_int_bind_ordered_V1)>normalization_value){
                      
                      maxN_1 <- function(x, N=normalization_value){
                        len <- length(x)
                        if(N>len){
                          warning('N greater than length(x).  Setting N=length(x)')
                          N <- length(x)
                        }
                        sort(x,partial=len-N+1)[len-N+1]
                      }
                      max_1 <- which(as.data.frame(sub_int_bind_ordered_V2)[,1] == maxN_1(as.data.frame(sub_int_bind_ordered_V2)[,1]), arr.ind = TRUE)
                      max_1
                      sum(as.data.frame(sub_int_bind_ordered_V2)[,1] > unique(as.data.frame(sub_int_bind_ordered_V2)[,1][max_1]))
                      
                      normalization_value_1 <- normalization_value - sum(as.data.frame(sub_int_bind_ordered_V2)[,1] > unique(as.data.frame(sub_int_bind_ordered_V2)[,1][max_1])) # how may values are above the maxima that is the one to be normalized?
                      normalization_value_1
                      lowest_level_choise_1 <- as.data.frame(which(as.data.frame(sub_int_bind_ordered_V2)[,1] == unique(maxN_1(as.data.frame(sub_int_bind_ordered_V2)[,1]))))
                      lowest_level_choise_1
                      
                      lowest_level <- as.numeric(as.vector(sample(as.factor(lowest_level_choise_1[,1]), normalization_value_1, replace = F)))
                      lowest_level <- sub_int_bind_ordered_V1[lowest_level]
                      lowest_level
                      
                      lowest_level_1 <- sub_int_bind_ordered_V1[(as.data.frame(sub_int_bind_ordered_V2)[,1] > unique(as.data.frame(sub_int_bind_ordered_V2)[,1][max_1]))]
                      
                      lowest_level <-c(lowest_level_1, lowest_level)
                      y <- as.vector(rep(0,length(x[,1]))) #create an empty vector
                      y[lowest_level] = 1 #set the randomly selected counts to 1
                      y
                      
                    } #if integer ranks are > normalization_value, do ranked subsampling
                    else {
                      if(length(sub_int_bind_ordered_V1)<normalization_value){
                        sub_int_zeros <- subset(lowest_level_choise, (revtrunc_fixed_factor_1[,i][lowest_level_choise[,1]] < 1) == TRUE)
                        length(t(sub_int_zeros))
                        
                        lowest_level_2 <- as.numeric(as.vector(sample(as.factor(sub_int_zeros[,1]), (normalization_value-length(sub_int_bind_ordered_V1)), replace = F)))
                        lowest_level_2
                        lowest_level_3 <- c(sub_int_bind_ordered_V1,lowest_level_2)
                        
                        y <- as.vector(rep(0,length(x[,1]))) #create an empty vector
                        y[lowest_level_3] = 1 #set the randomly selected counts to 1
                        y
                        
                      } #if integer ranks are < normalization_value, do ranked subsampling of the zero integers
                      else {
                        y <- as.vector(rep(0,length(x[,1]))) #create an empty vector
                        y[sub_int_bind_ordered_V1] = 1 #set the randomly selected counts to 1
                        y
                      } # if integer ranks are = normalization_value, sample all
                    }
                    
                    
                  }
                  
                }
                
                SRS <- revtrunc_fixed_factor_1[,i] + ceiling(x[,i] > unique(x[,i][max])) + y #sum it all up
                SRS
                sum(SRS) #verification
                assign(paste(names(data)[i],sep=""),SRS)
                fixed_factor_1 <- cbind(fixed_factor_1, SRS)
                colnames(fixed_factor_1)[{
                  counter = counter + 1
                }] <- names(data)[i]
              } #if the sum of the counts in the library > Cmin
            } #for all other libaries
          }
          
          SRS_output <- fixed_factor_1
          SRS_output
        }
      }
    }
  }
}

Try the SRS package in your browser

Any scripts or data that you put into this service are public.

SRS documentation built on March 27, 2022, 5:05 p.m.