R/call_erra.R

#' Error Analysis for 1D Electrophoresis Gels
#'
#' This function analyzes fingerprint replicate data. Replicates are repeated measurements of the same sample in one or more gels, which allows the reproducibility of the measurement to be ascertained.
#'
#' @param rep.file is the full name and path of the replicate file
#' @param work_dir is where the datasets should be stored
#' @param dnum is the file number
#' @param sd is the standard deviation needed to calculate all things, Salamon et al. (1998) suggested a value of 4 for sd
#' @param delete logical value indicating if you want to delete any pre-existing acm.par files. Default is FALSE
#' @return the parameter filed called acm.par, needed by call_acm.  This file contains information on about the measurement error present in the data.
#' @note \code{call_erra} doesn't work for unequal number of trials. This function needs to be run before call_acm.
#' @references 
#' Salamon et al. (1998) Accommodating Error Analysis in Comparison and Clustering of Molecular Fingerprints. Emerging Infectious Diseases Vol. 4, No. 2, April-June 1998
#' @references Abasci LLC. JAMES v1.0 User Documentation. 2002. 
#' @author Andrea Benedetti \email{andrea.benedetti@@mcgill.ca}
#' @author Sahir Rai Bhatnagar
#' @author XiaoFei Zhao
#' @examples
#' \dontshow{
#' (WD <- getwd())
#' if (!is.null(WD)) setwd(WD)
#' data(replicates.in)
#' write.table(replicates.in,  "replicates.in", quote=FALSE, row.names=FALSE)
#' }
#' #generate the error parameter file needed by call_acm
#' call_erra("replicates.in",  dnum=1, sd=1, delete=TRUE)
#' @export 
call_erra <- function(rep.file, work_dir=".", dnum=1, sd=4, delete=FALSE)
{	
	# Only rep.file is the real input
	# workdir: incorporated in rep.file
	# dnum:    always a constant
	# sd:      recommended to be 4 in the original article
	# delete:  always delete the old parameter file
	
	call_erra2 <- function()
	{
		if (!file.exists(work_dir)) {
			if (!dir.create(work_dir)) {
				print(paste("In directory ", getwd(), " ",                sep = "'"))
				print(paste(" - directory ", work_dir,  " does not exist ", sep = "'"))
				print(paste(" - and the program cannot create the directory ", work_dir, " ", sep = "'"))
				stop()
			}
		}
		acm.file.exists(rep.file)
		rep.file = tools::file_path_as_absolute(rep.file)
		setwd(work_dir)
		#exe.dir <- system.file(package="acm4r", "executables", "win32")
		#file.copy(from = rep.file, to = paste(exe.dir, "/erra.in",  sep = ""), overwrite = TRUE)
		#setwd(exe.dir)
		#acm.par is automatically generated
		#cmd <- paste("./erra", "erra.in", "erra.out", sd, sep = " ")
		#print (paste("executing command:", cmd, sep = " "))
		outputfilename = paste("erra.out", ".", dnum, sep = "")
		if (!delete) 
		{
			acm.file.checkdelete(outputfilename)
		}
		erra(rep.file, outputfilename, sd)
		#system(cmd, intern = TRUE, ignore.stdout = FALSE, ignore.stderr = TRUE)
		file.copy(from = paste(outputfilename, ".acm", sep=""), to = "acm.par", overwrite = TRUE)
		#file.show("acm.par", title = "parameter file generated by acm/erra")
		#file.show("erra.out", title = "error analysis file generated by acm/erra")
		#acm.file.copy(from = "acm.par", to = paste("acm.par", dnum, sep = "."), overwrite = delete)
		#acm.file.copy(from = "erra.out", to = paste("erra.out", dnum, sep = "."), overwrite = delete)
		#print(paste("call_erra finished, data is stored at ", exe.dir, ".", sep = "'"))	
		print("call_erra finished.")
	}
	old.dir <- getwd()
	tryCatch(call_erra2(), finally = setwd(old.dir))
}

# The code below is translated from C code for Align-and-Count-Method by XiaoFei Zhao

fprintf = function(filename = "", format = "", ...) {
  write(sprintf(format, ...), file = filename, append = TRUE)
}

VERSION1 <- "erra alpha version 0.2.1 written by Hugh Salamon\n"
VERSION2 <- "Copyright ?1998 The Regents of the University of California.\nAll Rights Reserved.\n"
VERSION3 <- "erra.0.2.1 15-Apr-1998: Error Analysis for 1D Electrophoresis Gels\n"

version = function(fp) {
  fprintf(fp, VERSION1);
  fprintf(fp, VERSION2);
  fprintf(fp, VERSION3);
  ### R does not have the limitations that C has, so the next three lines are not needed.
  # fprintf(fp, "LANES = %d\n", LANES);
  # fprintf(fp, "MAXBANDS = %d\n", MAXBANDS);
  # fprintf(fp, "IDENTIFIER_LENGTH = %d\n", IDENTIFIER_LENGTH);
  return (0)
}

align = function (first, second, numbands) {
  pro_err_array = 1:numbands
  sum = 0
  for (i in 1:numbands) {
    if (second[i] <= 0 || first[i] <= 0) {
      return (c(-1, pro_err_array))
    } else {
      sum = sum + log(first[i] / second[i])
    }
  }
  s = exp(sum / numbands)
  for (i in 1:numbands) {
    new = s * second[i]
    pro_err_array[i] = abs(new - first[i]) / ((new + first[i]) / 2)
  }
  c(s, pro_err_array)
}

erra = function(infilename, outfilename, sd) {
  stdin  <- stdin()
	stdout <- stdout()
	stderr <- stderr()
  # table of 0.01 significance level cutoff for
  #  proportional error reduction (s_reduction or d_reduction)
  #	empirically defined from simulationse
  table01 = c(
    0
    ,0
    ,0.545853
    ,0.707217
    ,0.786821
    ,0.829467
    ,0.859871
    ,0.878989
    ,0.895801
    ,0.907533
    ,0.917482
    ,0.923378
    ,0.930336
    ,0.936066
    ,0.940792
    ,0.944424
    ,0.948582
    ,0.950775
    ,0.953878
    ,0.956662
    ,0.958364
    )
  # table of 0.05 significance level cutoff for 
  # proportional error reduction (s_reduction or d_reduction)
  # empirically defined from simulations 
  table05 = c(
    0
    ,0 
    ,0.584046
    ,0.73282
    ,0.805951
    ,0.844601
    ,0.872386
    ,0.890113
    ,0.90525
    ,0.915821
    ,0.925033
    ,0.930465
    ,0.936637
    ,0.941961
    ,0.946195
    ,0.949619
    ,0.953305
    ,0.955293
    ,0.958113
    ,0.960667
    ,0.96228
    )
  
  UNINITIALIZED = 0
  
  logfile    = "erra.log"  
    # log file for errors                     # FILE*
  rfile      = "erra.rcm"   
    # recommendation file for MFA software    # FILE*
  infile     = infilename
    # fragment data file                      # FILE*
  outfile    = outfilename 
    # output file                             # FILE*
  acmpar     = paste(outfile, ".acm", sep="")
    # acm parameter output file               # FILE*
  sitepar    = paste(outfile, "_s.ite", sep="")
    # interr parameter output file            # FILE*
  ditepar    = paste(outfile, "_d.ite", sep="")
    # interr parameter output file            # FILE*
  
  write("", file = logfile, append = FALSE)
  write("", file = rfile, append = FALSE)
  write("", file = outfile, append = FALSE)
  write("", file = acmpar, append = FALSE)
  write("", file = sitepar, append = FALSE)
  write("", file = ditepar, append = FALSE)
  
  acmpar_name         = UNINITIALIZED           
                        # char
  itepar_s_name       = UNINITIALIZED        
                        # char
  itepar_d_name       = UNINITIALIZED        
                        # char[256]
  infile_name         = UNINITIALIZED           
                        # char[256]
  i                   = UNINITIALIZED                    
                        # int
  j                   = UNINITIALIZED                    
                        # int
  k                   = UNINITIALIZED                    
                        # int
  use_interr          = UNINITIALIZED      
                        # int
  num                 = UNINITIALIZED             
                        # int
  testnum             = UNINITIALIZED        
                        # int
  indicator           = UNINITIALIZED      
                        # int
  scanreturn          = UNINITIALIZED      
                        # int
  freturn             = UNINITIALIZED         
                        # int
  newbandnum          = 0            
                        # int
  numbands            = UNINITIALIZED        
                        # int[LANES]
  lane                = UNINITIALIZED            
                        # char[LANES][IDENTIFIER_LENGTH]
  gel                 = UNINITIALIZED             
                        # char[LANES][IDENTIFIER_LENGTH] 
  bands               = UNINITIALIZED          
                        # double[LANES][MAXBANDS + 1]
  laneident           = UNINITIALIZED       
                        # char[IDENTIFIER_LENGTH]
  line                = UNINITIALIZED            
                        # char[200]
  ## storage for reading in data
  scan_uniqueid       = UNINITIALIZED   
                        # char[IDENTIFIER_LENGTH]
  scan_gelid          = UNINITIALIZED      
                        # char[IDENTIFIER_LENGTH]
  scan_length         = UNINITIALIZED    
                        # double
  ## sd is the number of standard deviations used to control sensitivity 
  ## in acm and interr parmeters output by this program 
  # sd                  = UNINITIALIZED              
                        # double 
  s_count_sums        = 0        
                        # int
  d_count_sums        = 0          
                        # int
  array1              = UNINITIALIZED          
                        # double[MAXBANDS]
  array2              = UNINITIALIZED          
                        # double[MAXBANDS]
  pro_error_array     = UNINITIALIZED
                        # double[MAXBANDS]
  s_sums              = UNINITIALIZED     
                        # double[MAXBANDS]
  d_sums              = UNINITIALIZED     
                        # double[MAXBANDS]
  s_mean              = UNINITIALIZED     
                        # double[MAXBANDS]
  d_mean              = UNINITIALIZED     
                        # double[MAXBANDS]
  s_v_sum             = UNINITIALIZED     
                        # double[MAXBANDS]
  d_v_sum             = UNINITIALIZED     
                        # double[MAXBANDS]
  s_v                 = UNINITIALIZED     
                        # double[MAXBANDS]
  d_v                 = UNINITIALIZED     
                        # double[MAXBANDS]
  s_within_v          = UNINITIALIZED     
                        # double
  d_within_v          = UNINITIALIZED     
                        # double
  s_between_v         = 0    
                        # double
  d_between_v         = 0    
                        # double
  s_corrected_sums    = UNINITIALIZED     
                        # double
  d_corrected_sums    = UNINITIALIZED     
                        # double
  s_df_1              = UNINITIALIZED     
                        # double
  d_df_1              = UNINITIALIZED     
                        # double
  s_df_2              = UNINITIALIZED     
                        # int
  d_df_2              = UNINITIALIZED     
                        # int
  s_overall_mean      = UNINITIALIZED     
                        # double
  d_overall_mean      = UNINITIALIZED     
                        # double
  s_aligned_error_sum = 0  
                        # double
  d_aligned_error_sum = 0  
                        # double
  s_aligned_error_sum_sq = 0
                        # double
  d_aligned_error_sum_sq = 0
                        # double
  s_overall_aligned_v = 0
                        # double
  d_overall_aligned_v = 0   
                        # double
  sa_count            = 0  
                        # int
  da_count            = 0  
                        # int
  mean_frag           = UNINITIALIZED   
                        # double[MAXBANDS]
  s_F                 = UNINITIALIZED   
                        # double
  d_F                 = UNINITIALIZED   
                        # double
  s_ssd               = UNINITIALIZED   
                        # double
  d_ssd               = UNINITIALIZED   
                        # double
  prob                = UNINITIALIZED   
                        # double
  dummy               = UNINITIALIZED   
                        # double
  scale_factor        = UNINITIALIZED   
                        # double
  s_scale_factor      = 0  
                        # double
  d_scale_factor      = 0  
                        # double
  s_scale_factor_sq   = 0  
                        # double
  d_scale_factor_sq   = 0  
                        # double
  s_scale_factor_v    = 0  
                        # double
  d_scale_factor_v    = 0  
                        # double
  s_aligned_error     = UNINITIALIZED   
                        # double[MAXBANDS] 
  d_aligned_error     = UNINITIALIZED   
                        # double{MAXBANDS]
  s_overall_aligned   = 0  
                        # double
  d_overall_aligned   = 0  
                        # double
  s_reduction         = UNINITIALIZED   
                        # double
  d_reduction         = UNINITIALIZED   
                        # double
  s_red_flag          = UNINITIALIZED   
                        # int
  d_red_flag          = UNINITIALIZED   
                        # int
  entry               = UNINITIALIZED   
                        # int
  s_repeat            = 0
                        # int
  d_repeat            = 0  
                        # int
  s_flag              = UNINITIALIZED  
                        # int[LANES]
  d_flag              = UNINITIALIZED   
                        # int[LANES] 
  p_flag              = 0  
                        # int
  s_scale_count       = 0  
                        # int
  d_scale_count       = 0  
                        # int
  maxfrag             = 0 
                        # double
  diff_slide          = UNINITIALIZED   
                        # double
  diff_deviation      = UNINITIALIZED   
                        # double
  diff_increments     = UNINITIALIZED   
                        # int
  diff_hmw            = UNINITIALIZED   
                        # double
  diff_hmw_rate       = UNINITIALIZED   
                        # double
  same_slide          = UNINITIALIZED   
                        # double
  same_deviation      = UNINITIALIZED   
                        # double
  same_increments     = UNINITIALIZED   
                        # int
  same_hmw            = UNINITIALIZED   
                        # double
  same_hmw_rate       = UNINITIALIZED   
                        # double 

  version(logfile);
  fprintf(logfile, "infile:  \"%s\"\n", infile);
  fprintf(logfile, "outfile: \"%s\"\n", outfile);
  fprintf(logfile, "begin data read: any data which did not scan?\n");
  
  ########## START ERROR HANDLIGN ##########
  
  ########## END   ERROR HANDLIGN ##########
  
  repTable = read.table(infile, skip = 1, sep = "", fill = TRUE, header = FALSE, col.names = 1:5)
  #if (nrow(repTable) < 3) {
  #  print(paste("Error, the file ", infile, " is not a good replicate file."))
  #  stop(-1)
  #}
  laneCount = 0
  fragCount = 0
  laneName = ""
  testnum = -1;
  numbands = 0;
  for (i in 1:nrow(repTable)) {
    if (repTable[i, 1] != "" & repTable[i, 2] != "" & is.numeric(repTable[i, 3])) {
      fragCount = fragCount + 1
    }
    if (laneName != repTable[i, 1]) {
      laneCount = laneCount + 1
      laneName = repTable[i, 1]
      if (laneCount == 2) {
        testnum = numbands
      }
      if (laneCount > 1 & testnum != numbands) {
        fprintf(logfile, "Error: all lanes must have the same number of bands!\n");
        fprintf(logfile, "Lanes 1 has %d bands while lane %d has %d bands.\n", testnum, laneCount-1, numbands);       
        fprintf(stdout, "Error: all lanes must have the same number of bands!\n");
        fprintf(stdout, "Lanes 1 has %d bands while lane %d has %d bands.\n", testnum, laneCount-1, numbands);  
        fprintf(logfile, "EXITING DUE TO ERROR\n");
        fprintf(stderr, "EXITING DUE TO ERROR (see erra.log)\n");
        stop(8);
      }
      numbands = 0
    }
    numbands = numbands + 1
  }
  if (laneCount < 2) {
    fprintf(logfile, "There were fewer than two lanes found.\n");
    fprintf(logfile, "EXITING DUE TO ERROR\n");
    fprintf(stderr, "EXITING DUE TO ERROR (see erra.log)\n");
    stop(7);
  }
  
  workTable = matrix(0, ncol=laneCount, nrow=testnum)
  k = 1
  for (j in 1:ncol(workTable)) {
    gel[j] = as.character(repTable[k,2])
	for (i in 1:nrow(workTable)) {
      if (repTable[k,1] != "" & repTable[k,2] != "" & is.numeric(repTable[i, 3])) {
        workTable[i,j] = repTable[k,3]
        k = k + 1
      } else {
        fprintf(logfile, "%s", as.character(repTable[i,]))
      }
    }
  }
  
  fprintf(logfile, "end data read: %d lanes read\n", num);
  fprintf(logfile, "   erra error analysis starting...\n");
  
  fprintf(stderr,VERSION1);
  fprintf(stderr,VERSION2);
  fprintf(stderr,VERSION3);
  
  fprintf(stderr,"\n   erra error analysis starting, please wait...");
  
  fprintf(logfile,"   erra error analysis: calculating unaligned proportional errors ");
  fprintf(stderr,"   erra error analysis: calculating unaligned proportional errors ");
  
  # <aligned_error description="initialization of aligned_error">
  for (i in 1:nrow(workTable)) {
    s_aligned_error[i] = 0
    d_aligned_error[i] = 0
  }
  # </aligned_error>

  for (i in 1:ncol(workTable)) {
    s_flag[i] = 0
    d_flag[i] = 0
  }
  for (i in 1:nrow(workTable)) {
    mean_frag[i] = mean(workTable[i,])
  }
  s_ssd = 0
  d_ssd = 0

  for (i in 1:nrow(workTable)) {
    s_count_sums = 0
    d_count_sums = 0
    s_sums[i] = 0
    d_sums[i] = 0
    # calculate unaligned proportional error sums
    for (j in 1:ncol(workTable)) {
      for (k in (j + 1):ncol(workTable)) {
        if (j + 1 > ncol(workTable)) {
          next
        }
        if (identical(gel[j], gel[k])) {
          # print(paste("Those gels are identical: ", gel[j], ", ", gel[k]))
          if (i == 1) {
            if (s_flag[j] == 1) {
              s_repeat = s_repeat + 1
            } else {
              s_flag[j] = 1
            }
            if (identical(s_flag[k], 1)) {
              s_repeat = s_repeat + 1
            } else {
              s_flag[k] = 1
            }
          }
          s_sums[i] = s_sums[i] + abs(workTable[i,j] - workTable[i,k]) / ((workTable[i,j] + workTable[i,k]) / 2.0)
          s_count_sums = s_count_sums + 1 
        } else {
          if (i == 1) {
            if (d_flag[j] == 1) {
              d_repeat = d_repeat + 1
            } else {
              d_flag[j] = 1
            }
            if (d_flag[k] == 1) {
              d_repeat = d_repeat + 1
            } else {
              d_flag[k] = 1
            }
          }
          d_sums[i] = d_sums[i] + abs(workTable[i,j] - workTable[i,k]) / ((workTable[i,j] + workTable[i,k]) / 2.0)
          d_count_sums = d_count_sums + 1
        }
      }
    }
    # calculate unaligned proportional error means
    s_mean[i] = s_sums[i] / s_count_sums
    d_mean[i] = d_sums[i] / d_count_sums
    
    s_v_sum[i] = 0
    d_v_sum[i] = 0
    
    # calculate unaligned proportional error standard deviation
    for (j in 1:ncol(workTable)) {
      for (k in (j+1):ncol(workTable)) {
        if (k > ncol(workTable)) {
          next
        }
        if (identical(gel[j], gel[k])) {
          dummy = (abs(workTable[i,j] - workTable[i,k]) / ((workTable[i,j] + workTable[i,k]) / 2)) - s_mean[i]
          s_v_sum[i] = s_v_sum[i] + dummy * dummy
        } else {
          dummy = (abs(workTable[i,j] - workTable[i,k]) / ((workTable[i,j] + workTable[i,k]) / 2)) - d_mean[i]
          d_v_sum[i] = d_v_sum[i] + dummy * dummy
        }
      }
    }
    maxfrag = max(workTable)
    
    s_ssd = s_ssd + s_v_sum[i]
    d_ssd = d_ssd + d_v_sum[i]
    s_v[i] = s_v_sum[i] / (s_count_sums - 1)
    d_v[i] = d_v_sum[i] / (d_count_sums - 1)
  }
    
  s_within_v = s_ssd / (nrow(workTable) * (s_count_sums - 1))
  d_within_v = d_ssd / (nrow(workTable) * (d_count_sums - 1))
  
  s_overall_mean = mean(s_mean)
  d_overall_mean = mean(d_mean)

  s_between_v = sum((s_mean - s_overall_mean)^2)
  d_between_v = sum((d_mean - d_overall_mean)^2)
  
  # s_between_v = s_between_v * (double)s_count_sums / ((double)testnum - 1)
  # d_between_v = d_between_v * (double)d_count_sums / ((double)testnum - 1)
    
  # subtract 0.5 degrees of freedom for every repeated use of lane in a 
  # comparison used to calculate between-group (band) variances         
  
  s_corrected_sums = s_count_sums - s_repeat / 2
  d_corrected_sums = d_count_sums - d_repeat / 2
  
  s_between_v = s_between_v * s_corrected_sums / (nrow(workTable) - 1)
  d_between_v = d_between_v * d_corrected_sums / (nrow(workTable) - 1)
  
  s_F = s_between_v / s_within_v
  d_F = d_between_v / d_within_v
  
  s_df_2 = nrow(workTable) - 1
  d_df_2 = nrow(workTable) - 1
  # /*s_df_1 = testnum * (s_count_sums - 1)
  # d_df_1 = testnum * (d_count_sums - 1)*/
  s_df_1 = nrow(workTable) * (s_corrected_sums - 1)
  d_df_1 = nrow(workTable) * (s_corrected_sums - 1)
  
  ### <hypothesisTesting>
  
  version(outfile);
  fprintf(outfile, "\n  band   mean fragment size\n");
  for(i in 1:nrow(workTable)) {
    fprintf(outfile, "  %2d        %12.3f\n", i, mean_frag[i]);
  }
  fprintf(outfile, "\nUnaligned Proportional Error Analysis\n\n");
  fprintf(outfile, "   %d pairwise same gel lane comparisons\n", s_count_sums);
  fprintf(outfile, "   %d repeated lane uses in same gel comparisons\n", s_repeat);
  fprintf(outfile, "   %d pairwise different gel lane comparisons\n", d_count_sums);
  fprintf(outfile, "   %d repeated lane uses in different gel comparisons\n\n", d_repeat);
  fprintf(outfile, "\n   Proportional error:\n");
  fprintf(outfile, "                           band          same gel              different gel\n");
  for (i in 1:nrow(workTable)) {
    fprintf(outfile, "   sample means (variance)   %2d   %10.8f (%10.8f)   %10.8f (%10.8f)\n", 
            i, s_mean[i], s_v[i], d_mean[i], d_v[i]);
  }
  fprintf(outfile, "   overall means (variance)       %10.8f (%10.8f)   %10.8f (%10.8f)\n\n", 
          s_overall_mean, s_within_v, d_overall_mean, d_within_v);
  
  if(s_count_sums > 1)
  {
    fprintf(outfile, "   Test Hypothesis H_o: Error is proportional to fragment size.\n\n");
    fprintf(outfile, "   one-way ANOVA on proportional errors (with df corrections for non-orthogonal comparisons)\n");
    fprintf(outfile, "   same gel variances:      between = %10.8f    within = %10.8f\n", s_between_v, s_within_v);
    fprintf(outfile, "   same gel analysis of variance:       F = %f, df1 = %f, df2 = %d\n", s_F, s_df_1, s_df_2);
    prob = pbeta(s_df_2 / (s_df_2 + s_df_1 * s_F), 0.5 * s_df_2, 0.5 * s_df_1);
    fprintf(outfile, "   probability = %f\n", prob);
    if (prob < 0.05) {
      fprintf(outfile, "   H_o rejected, error is not proportional to fragment size\n\n");
      p_flag = 1;
    } else {
      fprintf(outfile, "   H_o not rejected.\n\n");
    }
  }
  
  if(d_count_sums > 1) {
    fprintf(outfile, "   Test Hypothesis H_o: Error is proportional to fragment size.\n\n");
    fprintf(outfile, "   one-way ANOVA on proportional errors (with df corrections for non-orthogonal comparisons)\n");
    fprintf(outfile, "   different gel variances: between = %10.8f    within = %10.8f\n", d_between_v, d_within_v);
    fprintf(outfile, "   different gel analysis of variance:  F = %f, df1 = %f, df2 = %d\n", d_F, d_df_1, d_df_2);
    prob = pbeta(d_df_2 / (d_df_2 + d_df_1 * d_F), 0.5 * d_df_2, 0.5 * d_df_1);
    fprintf(outfile, "   probability = %f\n", prob);
    if(prob < 0.05) {
      fprintf(outfile, "   H_o rejected, error is not proportional to fragment size\n\n");
      p_flag = 1;
    } else {
      fprintf(outfile, "   H_o not rejected.\n\n");
    }
  }
  
  ### </hypothesisTesting>
  
  # append to logfile
  write("   erra error analysis: calculating aligned proportional errors ", stderr)
  
  for (j in 1:(ncol(workTable)-1)) {
    for (k in (j+1):ncol(workTable)) {
      for (i in 1:nrow(workTable)) {
        array1[i] = workTable[i,j]
        array2[i] = workTable[i,k]
      }
      freturn = align(array1, array2, nrow(workTable))
      scale_factor = freturn[1]
      pro_error_array = freturn[2:17]
      
      if (scale_factor < 0) {
        fprintf(logfile, "function align error: possibly because alignment can be performed")
        fprintf(logfile, "                      only with non-zero fragment sizes")
        fprintf(logfile, "EXITING DUE TO ERROR")
        fprintf(stderr, "EXITING DUE TO ERROR (see erra.log)")
        stop(7)
      }
      if (scale_factor < 1) {
        scale_factor = 1 / scale_factor
      }
      if (identical(gel[j], gel[k])) {
        s_scale_count = s_scale_count + 1
        s_scale_factor = s_scale_factor + scale_factor
        s_scale_factor_sq = s_scale_factor_sq + scale_factor * scale_factor
        for (i in 1:nrow(workTable)) {
          s_aligned_error[i] = s_aligned_error[i] + pro_error_array[i]
          s_aligned_error_sum = s_aligned_error_sum + pro_error_array[i]
          s_aligned_error_sum_sq = s_aligned_error_sum_sq + pro_error_array[i] * pro_error_array[i]
          sa_count = sa_count + 1
        }
      } else {
        d_scale_count = d_scale_count + 1
        d_scale_factor = d_scale_factor + scale_factor
        d_scale_factor_sq = d_scale_factor_sq + scale_factor * scale_factor
        for (i in 1:nrow(workTable)) {
          d_aligned_error[i] = d_aligned_error[i] + pro_error_array[i]
          d_aligned_error_sum = d_aligned_error_sum + pro_error_array[i]
          d_aligned_error_sum_sq = d_aligned_error_sum_sq + pro_error_array[i] * pro_error_array[i]
          da_count = da_count + 1
        }
      }
    }
  }
  
  # s_scale_factor has sum of values, s_scale_factor_sq has sum of squared values
  # for s_scale_count values -- analagous situation for d_scale_factor, etc.     
  
  # /* calculate variance in scale_factors */
  if (s_scale_count > 1) {
    s_scale_factor_v = (s_scale_factor_sq - (s_scale_factor / s_scale_count) * s_scale_factor) / 
        (s_scale_count - 1)
  }
  if (d_scale_count > 1) {
    d_scale_factor_v = (d_scale_factor_sq - (d_scale_factor / d_scale_count) * d_scale_factor) / 
        (d_scale_count - 1)
  }
  
  # /* calcuate mean scale factors */
  s_scale_factor = s_scale_factor / s_scale_count
  d_scale_factor = d_scale_factor / d_scale_count
  
  # /* calculate the aligned error and variance in error for all band comparisons */
  s_overall_aligned_v = (s_aligned_error_sum_sq - (s_aligned_error_sum / sa_count) * s_aligned_error_sum) /
      (sa_count - 1)
  
  d_overall_aligned_v = (d_aligned_error_sum_sq - (d_aligned_error_sum / da_count) * d_aligned_error_sum) /
      (da_count - 1)
  
  s_overall_aligned = s_aligned_error_sum / sa_count
  
  d_overall_aligned = d_aligned_error_sum / da_count

  fprintf(outfile, "Mean Same Gel Scaling Factor = %6.4f (%6.4f%%)\n", s_scale_factor, 100 * (s_scale_factor - 1.0));
  fprintf(outfile, "  variance = %10.8f\n\n", s_scale_factor_v);
  fprintf(outfile, "Mean Different Gel Scaling Factor = %6.4f (%6.4f%%)\n", d_scale_factor, 100 * (d_scale_factor - 1.0));
  fprintf(outfile, "  variance = %10.8f\n\n", d_scale_factor_v);
  
  fprintf(outfile, "Aligned Proportional Error Analysis\n\n");
  fprintf(outfile, "                        band   same gel            different gel\n");
  for (i in 1:nrow(workTable)) {
    #fprintf(outfile, paste("s_aligned_scale_count: ", s_aligned_error[i]))
    s_aligned_error[i] = s_aligned_error[i] / s_scale_count
    d_aligned_error[i] = d_aligned_error[i] / d_scale_count
    fprintf(outfile, "   aligned sample means  %2d   %10.8f            %10.8f\n", i, s_aligned_error[i], d_aligned_error[i]);
  }
  fprintf(outfile, "   overall means              %10.8f            %10.8f\n", s_overall_aligned, d_overall_aligned);
  
  s_reduction = s_overall_aligned / s_overall_mean
  d_reduction = d_overall_aligned / d_overall_mean
  
  if(ncol(workTable) > 20) {
    entry = 20
  } else {
    entry = ncol(workTable)
  }
  
  fprintf(outfile, "\n   same gel reduction in error due to alignment %5.1f%%\n", 100 * (1.0  - s_reduction));
  if (s_reduction < table01[entry]) {
    fprintf(outfile,"   which represents a significant reduction in proportional error (p <= 0.01)\n");
    s_red_flag = 0
  } else if (s_reduction < table05[entry]) {
    fprintf(outfile,"   which represents a significant reduction in proportional error (p <= 0.05)\n");
    s_red_flag = 1
  } else {
    fprintf(outfile,"   which is not a clearly significant reduction in proportional error\n");
    s_red_flag = 2
    if (ncol(workTable) < 9) {
      fprintf(outfile, "\n    The number of bands in the replicate strains is not large enough to reject that this\n");
      fprintf(outfile, "    reduction in proportional error would result even in the absence of scaling error.\n");
      fprintf(outfile, "    Increasing the number of fragments in replicate genotypes is recommended.\n\n");
    }
  }
  
  fprintf(outfile, "\n   different gel reduction in error due to alignment %5.1f%%\n", 100 * (1 - d_reduction));
  if (d_reduction < table01[entry]) {
    fprintf(outfile,"   which represents a significant reduction in proportional error (p <= 0.01)\n");
    d_red_flag = 0;
  } else if (d_reduction < table05[entry]) {
    fprintf(outfile,"   which represents a significant reduction in proportional error (p <= 0.05)\n");
    d_red_flag = 1;
  } else {
    fprintf(outfile, "   which is not a clearly significant reduction in proportional error")
    d_red_flag = 2
    if (nrow(workTable) < 9) {
      fprintf(outfile, "\n    The number of bands in the replicate strains is not large enough to reject that this\n");
      fprintf(outfile, "    reduction in proportional error would result even in the absence of scaling error.\n");
      fprintf(outfile, "    Increasing the number of fragments in replicate genotypes is recommended.\n\n");
    }
  }
  
  fprintf(outfile,"\n");
  
  if(s_count_sums < 45 && d_count_sums < 45) {
    fprintf(outfile, "The small number of comparisons may make the following recommendations unreliable.\n");
    fprintf(outfile, "Increasing the number of replicate genotypes is recommended.\n\n");
  }
  
  if (p_flag == 1) {
    # /* significant evidence that error is not proportional to fragment sizes was found */
    fprintf(outfile, " ---------------------------------------------------------------------------\n");
    fprintf(outfile, "| The Align-and-Count Method is not recommended for comparison of this data.|\n");
    fprintf(outfile, "| Use Interpolated Error Method instead.                                    |\n");
    fprintf(outfile, " ---------------------------------------------------------------------------\n");
    use_interr = 1;
  } else if ((s_red_flag == 2 && d_red_flag == 2)) {
  	# /* no significant evidence for reduction of error with alignment was found */
  	if ((s_scale_factor - 1) / s_overall_mean < 0.5 && (d_scale_factor - 1) / d_overall_mean < 0.5) {
  	# /* and the scaling factors found were less than half the pre-alignment error levels */
  	  fprintf(outfile, " ---------------------------------------------------------------------------------\n");
  	  fprintf(outfile, "| The Align-and-Count Method is unlikely to be useful for comparison of this data.|\n");
  	  fprintf(outfile, "| Use Interpolated Error Method instead.                                          |\n");
  	  fprintf(outfile, " ---------------------------------------------------------------------------------\n");
  	  use_interr = 1;
  	} else {
  	# /* but at least one of the scaling factors was found to be at least half as large as the error levels */
  	  fprintf(outfile, " ----------------------------------------------------------------------\n");
  	  fprintf(outfile, "| The Align-and-Count Method may be useful for comparison of this data.|\n");
  	  fprintf(outfile, " ----------------------------------------------------------------------\n");
  	  use_interr = 1;
  	}
  } else {
	# /* proportional error and */
	# /* significant reduction in error with alignment means the Align-and-Count Method should be useful */
    fprintf(outfile, " -------------------------------------------------------------------------------\n");
    fprintf(outfile, "| The Align-and-Count Method is likely to be useful for comparison of this data.|\n");
    fprintf(outfile, " -------------------------------------------------------------------------------\n");
    use_interr = 0;
  }
  fprintf(outfile,"\n");
  
  fprintf(logfile, "   erra error analysis: errors calculated, writing files\n\n");
  fprintf(stderr, "   erra error analysis: errors calculated, writing files\n\n");
  
  if (use_interr == 1) {
    fprintf(rfile, "Interpolated Error\ninterr\n");
  } else {
    fprintf(rfile, "ACM\nacm\n");
  }
  
	# /* calculate parameters for the acm parameter file */

	diff_slide = d_scale_factor - 1.0 + sd * sqrt(d_scale_factor_v)
	diff_deviation = d_overall_aligned + sd * sqrt(d_overall_aligned_v)
	diff_increments = 2 * floor((d_scale_factor - 1) * 25 / sqrt(d_overall_aligned_v)) + 1
	diff_hmw = maxfrag * 3
	diff_hmw_rate = 0

	same_slide = s_scale_factor - 1 + sd * sqrt(s_scale_factor_v)
	same_deviation = s_overall_aligned + sd * sqrt(s_overall_aligned_v)
	same_increments = 2 * floor((s_scale_factor - 1) * 25 / sqrt(s_overall_aligned_v)) + 1
	same_hmw = maxfrag * 3
	same_hmw_rate = 0
	
  fprintf(acmpar, "This Align-and-Count Method (ACM) parameter file was automatically generated by\n");
  fprintf(acmpar, VERSION3);
  fprintf(acmpar, "The file %s was analyzed\n", infile_name);
  fprintf(acmpar, "Comments can go anywhere as long as the parameter label and value are\n");
  fprintf(acmpar, "the first two tokens, respectively, found in a line using sscanf in C.\n");
  fprintf(acmpar, "The order of parameter declarations is not restricted.  If an expected\n");
  fprintf(acmpar, "parameter is not defined, acm will exit, and errors will be recorded\n");
  fprintf(acmpar, "in acm.log.\n");
  fprintf(acmpar, "\n");
  fprintf(acmpar, "\n");
  fprintf(acmpar, "\n");
  fprintf(acmpar, "DIFF_SLIDE         %10.8f    This is the maximum scaling factor\n", diff_slide);
  fprintf(acmpar, "for relative scaling of lanes from different gels or images.\n");
  fprintf(acmpar, "\n");
  fprintf(acmpar, "DIFF_DEVIATION     %10.8f    This is the cutoff proportional deviation\n", diff_deviation);
  fprintf(acmpar, "factor for mutually closest bands to be considered a match.\n");
  fprintf(acmpar, "\n");
  fprintf(acmpar, "DIFF_INCREMENTS    %10d    This is the number of increments used in\n", diff_increments);
  fprintf(acmpar, "searching through the range given by DIFF_SLIDE.  An odd number\n");
  fprintf(acmpar, "is recommended.\n");
  fprintf(acmpar, "\n");
  fprintf(acmpar, "DIFF_HMW           %10.8f    This is the value above which a larger\n", diff_hmw);
  fprintf(acmpar, "deviation tolerance may be allowed.  (Three times the largest fragment analyzed\n");
  fprintf(acmpar, "is used in the automated generation of this file.)\n");
  fprintf(acmpar, "\n");
  fprintf(acmpar, "DIFF_HMW_RATE      %10.8f    The per unit increase in DIFF_DEVIATION\n", diff_hmw_rate);
  fprintf(acmpar, "applied for fragments larger than DIFF_HMW.  (The default is zero for this automatically\n");
  fprintf(acmpar, "generated file.)\n");
  
  fprintf(acmpar, "\n");
  fprintf(acmpar, "\n");
  fprintf(acmpar, "\n");
  fprintf(acmpar, "SAME_SLIDE         %10.8f    This is the maximum scaling factor\n", same_slide);
  fprintf(acmpar, "for relative scaling of lanes from samegels or images.\n");
  fprintf(acmpar, "\n");
  fprintf(acmpar, "SAME_DEVIATION     %10.8f    This is the cutoff proportional deviation\n", same_deviation);
  fprintf(acmpar, "factor for mutually closest bands to be considered a match.\n");
  fprintf(acmpar, "\n");
  fprintf(acmpar, "SAME_INCREMENTS    %10d    This is the number of increments used in\n", same_increments);
  fprintf(acmpar, "searching through the range given by SAME_SLIDE.  An odd number\n");
  fprintf(acmpar, "is recommended.\n");
  fprintf(acmpar, "\n");
  fprintf(acmpar, "SAME_HMW           %10.8f    This is the value above which a larger\n", same_hmw);
  fprintf(acmpar, "deviation tolerance may be allowed.  (Three times the largest fragment analyzed\n");
  fprintf(acmpar, "is used in the automated generation of this file.)\n");
  fprintf(acmpar, "\n");
  fprintf(acmpar, "SAME_HMW_RATE      %10.8f    The per unit increase in SAME_DEVIATION\n", same_hmw_rate);
  fprintf(acmpar, "applied for fragments larger than SAME_HMW.  (The default is zero for this automatically\n");
  fprintf(acmpar, "generated file.)\n");
  
  # /* write parameter files for interr (Interpolated Error Method) */
    
  for (i in 1:nrow(workTable)) {
    fprintf(sitepar,"%12.8f %10.8f\n",mean_frag[i], (s_mean[i] + sd * sqrt(s_v[i])));
    fprintf(ditepar,"%12.8f %10.8f\n",mean_frag[i], (d_mean[i] + sd * sqrt(d_v[i])));
  }
}

# erra("replicates.in", "test.out", sd = 1)

  

Try the acm4r package in your browser

Any scripts or data that you put into this service are public.

acm4r documentation built on May 1, 2019, 7:50 p.m.