R/acm.R

#' Align-and-Count Method comparisons of RFLP data
#'
#' Fragment lengths or molecular weights from pairs of lanes are compared, and
#' a number of matching bands are calculated using the Align-and-Count Method.
#' Band intensities are not used in this method.This version of acm will perform 
#' all distinct pairwise comparisons between lanes. The first lane will be compared 
#' to all following lanes. The second lane will be compared to the third, fourth, etc..
#'
#' @param input The input file format for fragment and lane information are as the file
#' example.in shows. Each line (record) should define a band. The file must be
#' sorted so that the bands from a lane are together; the file must be sorted
#' by unique lane identifier (ULI). There must be three fields (separated by
#' white space -- that is, seperated by any combination of spaces and tabs).
#' The first field must be the ULI. The second field is the gel image
#' identifier (GII). This is only critical for the first fragment read in for
#' the lane; for subsequent fragments, the second field must simply be present.
#' ULI and GII names must not contain any spaces; I strongly suggest that these
#' fields contain only letters, numbers, and underscore characters. The third
#' field should contain the fragment length or molecular weight data for the
#' fragment. Lines that cannot be parsed into these three fields will be
#' recored in acm.log. Check acm.log after each run to see the number of lanes
#' read in and to check for lines from the input file that could not be parsed.
#' Lines must be no longer than 199 characters. 
#' @param work_dir This is where the work is being done
#' @param dnum is the file number
#' @return The output from acm or acmone gives a comparison of two lanes on each line
#' of output in the form:
#' ULI ULI numbands numbands num_matches
#' The first field is identifier for the first lane in the comparison. The
#' third field is the number of bands read in for this first lane. The second
#' field is the identifier for the second lane in the comparison. The fourth
#' field is the number of band read in for this second lane. The fifth field is
#' the number of fragments found to match between the two lanes. The number of
#' matches found may be different depending on whether the lanes came from the
#' same gel image or from different gel images (see the file acm.par).
#' @references 
#' Salamon et. al (1998) Accommodating Error Analysis in Comparison and Clustering of Molecular Fingerprints. Emerging Infectious Diseases Vol. 4, No. 2, April-June 1998
#' @references Abasci LLC. JAMES v1.0 User Documentation. 2002.  
#' @author Andrea Benedetti \email{andrea.benedetti@@mcgill.ca}
#' @author Sahir Rai Bhatnagar
#' @author XiaoFei Zhao
#' @note Requires a parameter file generated by \code{\link{call_erra}}
#' @examples 
#' \dontshow{
#' (WD <- getwd())
#' if (!is.null(WD)) setwd(WD)
#' data(replicates.in)
#' write.table(replicates.in,  "replicates.in", quote=FALSE, row.names=FALSE)
#' data(experiments.in)
#' write.table(experiments.in, "experiments.in", quote=FALSE, row.names=FALSE)
#' call_erra("replicates.in",  dnum=1, sd=1, delete=TRUE)
#' }
#' #perform the basic pairwise comparison via the align and count algorithm
#' res1<-call_acm("experiments.in")
#' @export
call_acm <- function(input, work_dir=".", dnum=1)
{	
	call_acm2 <- function()
	{
		if (!file.exists(work_dir)) {
			if (!dir.create(work_dir)) {
				print(paste("In directory ", getwd(), " ",                sep = "'"))
				print(paste(" - directory ", work_dir,  " does not exist ", sep = "'"))
				print(paste(" - and the program cannot create the directory ", work_dir, " ", sep = "'"))
				stop()
			}
		}
		acm.file.exists(input)
		input = tools::file_path_as_absolute(input)
		setwd(work_dir)
		#exe.dir <- system.file(package="acm4r", "executables", "win32")
		#file.copy(from = input, to = paste(exe.dir, "/acm.in",  sep = ""), overwrite = TRUE)
		#setwd(exe.dir)
		#cmd = paste("./acm", input, "acm.out", sep = " ")
		outputname = paste("acm.out", ".", dnum, sep = "")
		outtable = acm(input, outputname)
		#print (paste("executing command:", cmd, sep = " "))
		#system(cmd, intern = TRUE, ignore.stdout = FALSE, ignore.stderr = TRUE)
		file.show(outputname, title = "Result of all pairwise matches by acm (align and count with exact matching)")
		#file.copy(from = "acm.out", to = paste("acm.out", dnum, sep = "."), overwrite = delete);
		#print(paste("call_acm finished, data is stored at ", exe.dir, ".", sep = "'"))		
		print("call_acm finished.")
		return(outtable)
	}
	old.dir <- getwd()
	tryCatch((outtable = call_acm2()), finally = setwd(old.dir))
	return(outtable);
}

# The code below is translated from C code for Align-and-Count-Method by XiaoFei Zhao

VERSION1 = "acm alpha version 0.6 written by Hugh Salamon"
VERSION2 = "Copyright  1996, 1997 The Regents of the University of California."
VERSION3 = "All Rights Reserved.           acm.0.6.1 7-Apr-1998"

fprintf = function(filename = "", format = "", ...) {
  write(sprintf(format, ...), file = filename, append = TRUE)
}

read_pars = function(logfile, fp)
# /* returns 1 if all parameters cannot be read and converted, 0 otherwise) */
{
	
	params = list(s_increments="", s_slide="", s_deviation="", s_hmw="", s_hmw_rate="",
	        d_increments="", d_slide="", d_deviation="", d_hmw="", d_hmw_rate="", retval=0)

	s_inc_tag = "SAME_INCREMENTS";
	d_inc_tag = "DIFF_INCREMENTS";
	s_sli_tag = "SAME_SLIDE";
	d_sli_tag = "DIFF_SLIDE";
	s_dev_tag = "SAME_DEVIATION";
	d_dev_tag = "DIFF_DEVIATION";
	s_hmw_tag = "SAME_HMW";
	d_hmw_tag = "DIFF_HMW";
	s_hmwr_tag = "SAME_HMW_RATE";
	d_hmwr_tag = "DIFF_HMW_RATE";

	filecontent = readLines(fp);
	
  # print(filecontent);
  
	for (i in 1:length(filecontent))
	{
		aline = unlist(strsplit(filecontent[i], "( )+"))
		tag = aline[1]
	    options(warn = -1)
        value = as.numeric(aline[2])
        options(warn = 0)
		if(length(aline) >= 2 && is.finite(value))
		{
            # print(paste("value ((", value, ")) is finite = "))
      #write("S_INC_TAG found", file = stderr())
      #print(value)
			if(s_inc_tag == tag)
			{
				params$s_increments = floor(value);
			}
			else if(d_inc_tag == tag)
			{
				params$d_increments = floor(value);
			}
			else if(s_sli_tag == tag)
			{
				params$s_slide = value;
			}
			else if(d_sli_tag == tag)
			{
				params$d_slide = value;
			}
			else if(s_dev_tag == tag)
			{
				params$s_deviation = value;
			}
			else if(d_dev_tag == tag)
			{
				params$d_deviation = value;
			}
			else if(s_hmw_tag == tag)
			{
				params$s_hmw = value;
			}
			else if(d_hmw_tag == tag)
			{
				params$d_hmw = value;
			}
			else if(s_hmwr_tag == tag)
			{
				params$s_hmw_rate = value;
			}
			else if(d_hmwr_tag == tag)
			{
				params$d_hmw_rate = value;
			}
		}
	}

	if ("" == params$s_increments) 
	{
	  params$retval = 1;
		fprintf(logfile, "SAME_INCREMENTS value not found in acm.par");
	}
	if ("" == params$d_increments) 
	{
	  params$retval = 1;
		fprintf(logfile, "DIFF_INCREMENTS value not found in acm.par");
	}
	if ("" == params$s_slide) 
	{
	  params$retval = 1;
		fprintf(logfile, "SAME_SLIDE value not found in acm.par");
	}
	if("" == params$d_slide) 
	{
	  params$retval = 1;
		fprintf(logfile, "DIFF_SLIDE value not found in acm.par");
	}
	if("" == params$s_deviation) 
	{
	  params$retval = 1;
		fprintf(logfile, "SAME_DEVIATION value not found in acm.par");
	}
	if("" == params$d_deviation) 
	{
	  params$retval = 1;
		fprintf(logfile, "DIFF_DEVIATION value not found in acm.par");
	}
	if("" == params$s_hmw) 
	{
	  params$retval = 1;
		fprintf(logfile, "SAME_HMW value not found in acm.par");
	}
	if("" == params$d_hmw) 
	{
	  params$retval = 1;
		fprintf(logfile, "DIFF_HMW value not found in acm.par");
	}
	if("" == params$s_hmw_rate) 
	{
		retval = 1;
		fprintf(logfile, "SAME_HMW_RATE value not found in acm.par");
	}
	if("" == params$d_hmw_rate) 
	{
		retval = 1;
		fprintf(logfile, "DIFF_HMW_RATE value not found in acm.par");
	}
  
	return(params);
}

showpar_samediff = function(fp, deviation, slide, increments, hmw, hmw_rate) {
  fprintf(fp, VERSION1);
  fprintf(fp, VERSION2);
  fprintf(fp, VERSION3);
  # fprintf(fp, "LANES = %d", LANES);
  # fprintf(fp, "MAXBANDS = %d", MAXBANDS);
  # fprintf(fp, "IDENTIFIER_LENGTH = %d", IDENTIFIER_LENGTH);
  # fprintf(fp, "deviation = %f", deviation);
  # fprintf(fp, "slide = %f", slide);
  # fprintf(fp, "increments = %d", increments);
  # fprintf(fp, "hmw = %f", hmw);
  # fprintf(fp, "hmw_rate = %f", hmw_rate);
  return(0);
}


slidefun_samediff = function(array1, array2, bands1, bands2, increments, slide, deviation, hmw, hmw_rate)
{
  #  /*array1: first lane, array of band sizes -- e.g. in kilobases */
  #  /*array2: second lane, array of band sizes */
  #  double new1[MAXBANDS];  /* array to hold shifted values of first lane */
  #  double new2[MAXBANDS];  /* array to hold shifted values of second lane */
  #  double adjust1, adjust2;
  #  int j, k, maxscore, tmps;
  
  UNINITIALIZED = 0
  
  new1 = rep(0, bands1)
  new2 = rep(0, bands2)
  
  #  /* should check that bands1 and bands2 are not > MAXBANDS */
    
    maxscore = 0;
  
  #  /* Lanes -- arrays -- are shifted relative to each other by multiplying */
  #  /* bands in each lane by a single constant for that lane (e.g., 1.035). */
  #  /* The new, shifted, lanes are then compared by function score.  This */
  #  /* shifting and scoring is performed increments number of times in the */
  #  /* following loop.  The maximum score calculated is reported. */
    
    j = -(increments - 1) / 2;
    while (j <= (increments - 1) / 2)
    {
      adjust1 = 1 + as.numeric(j) * as.numeric(slide) / as.numeric(increments - 1);
      adjust2 = 1 - as.numeric(j) * as.numeric(slide) / as.numeric(increments - 1);
     
      for (k in 1:bands1)
      {
        new1[k] = array1[k] * adjust1;
      }
      
      for (k in 1:bands2)
      {
        new2[k] = array2[k] * adjust2;
      }
      
      tmps = score_samediff(new1,new2, bands1, bands2, deviation, hmw, hmw_rate);
      
      if(tmps > maxscore)
      {
        maxscore = tmps;
      }
      j = j + 1;
    }
  return(maxscore);
}


score_samediff = function(bands_1, bands_2, numx, numy, deviation, hmw, hmw_rate) {
  
  # /* The returned score is the number of bands determined to match between the */
  # /* two arrays.  Mutually closest bands whose distance apart is less than */
  # /* deviation (or dev) multiplied by the mean size of the pair of mutually closest */
  # /* bands contribute +1 to the score returned. */
    
  UNINITIALIZED = 0;
  
  closestx = rep(0, numx);
  closesty = rep(0, numy);
  matches = UNINITIALIZED;
  #x = UNINITIALIZED;
  #y = UNINITIALIZED;
  best = UNINITIALIZED; 
  closest_mean = UNINITIALIZED; 
  dev = UNINITIALIZED;
  test = UNINITIALIZED;
  
  matches = 0;
  
  for (x in 1:numx)
  {
    #best = abs(bands_1[x] - bands_2[1]);
    #closestx[x] = 1;
    #for(y in 2:numy)
    #{
    #  test = abs(bands_1[x] - bands_2[y]);
    #  if (test < best)
    #  {
    #    best = test;
    #    closestx[x] = y;
    #  }
    #}
	closestx[x] = which.min(abs(bands_2 - bands_1[x]))
  }
  
  for (y in 1:numy)
  {
    #best = abs(bands_2[y] - bands_1[1]);
    #closesty[y] = 1;
    #for(x in 2:numx)
    #{
    #  test = abs(bands_2[y] - bands_1[x]);
    #  if (test < best)
    #  {
    #    best = test;
    #    closesty[y] = x;
    #  }
    #}
	closesty[y] = which.min(abs(bands_1 - bands_2[y]))
  }
  # print(closestx); print(closesty);
  for (x in 1:numx)
  {
    if (closesty[closestx[x]] == x)
    {
      closest_mean = (bands_1[x] + bands_2[closestx[x]]) / 2;
      if (closest_mean > hmw)
      {
        dev = deviation + (closest_mean - hmw) * hmw_rate;
        if (abs(bands_1[x] - bands_2[closestx[x]])/(0.5 * (bands_1[x] + bands_2[closestx[x]])) <= dev)
        {
          matches = matches + 1;
        }
      }
      else
      {
        if (abs(bands_1[x] - bands_2[closestx[x]])/(0.5 * (bands_1[x] + bands_2[closestx[x]])) <= deviation)
        {
          matches = matches + 1;
        }
      }
    }
  }
  
  return(matches);
  
}


acm <- function(infilename, outfilename)
{
  
  stderr = stderr()
  stdout = stdout()
  stdin = stdin()
  # /* parameters for matching functions: */
  # /* s_ for same gel, d_ for different gel */
    
  UNINITIALIZED = 0;
  
  s_increments = UNINITIALIZED;
  s_slide = UNINITIALIZED;
  s_deviation = UNINITIALIZED;
  s_hmw = UNINITIALIZED;
  s_hmw_rate = UNINITIALIZED;
  
  d_increments = UNINITIALIZED;
  d_slide = UNINITIALIZED;
  d_deviation = UNINITIALIZED;
  d_hmw = UNINITIALIZED;
  d_hmw_rate = UNINITIALIZED;
  
  acmpar = "acm.par";  # /* parameter_file */
  logfile = "acm.log"; # /* log file for errors */
  write("", file = logfile)
  infile = infilename;  # /* fragment data file */
  outfile = outfilename; # /* output file */
  write("", file = outfile)
    
  par_error = UNINITIALIZED;
  
  array1 = UNINITIALIZED;  #/* first lane for comparison, array of band sizes -- e.g. in kilobases */
  array2 = UNINITIALIZED;  #/* second lane for comparison, array of band sizes */
  bands1 = UNINITIALIZED; 
  bands2 = UNINITIALIZED;
  # i; j; k;
  num = UNINITIALIZED;
  comparisons = UNINITIALIZED;
  total_comparisons = UNINITIALIZED;
  progress = UNINITIALIZED;
  storescore = UNINITIALIZED;
  # indicator = UNINITIALIZED;
  # scanreturn;
  # newbandnum;
  numbands = UNINITIALIZED;
  lane = UNINITIALIZED;
  gel = UNINITIALIZED;
  bands = UNINITIALIZED;
  laneident = UNINITIALIZED;
  line = UNINITIALIZED;
  #/* storage for reading in data */
  # scan_uniqueid;
  # scan_gelid;
  # scan_length;
  
  comparisons = 0;
  # newbandnum = 0;
  
  params = read_pars(logfile, acmpar);
  par_error = params$retval;
  if (par_error != 0) {
    fprintf(stderr, "Error(s) reading acm.par, see acm.log.\n");
    stop(1);
  }
  
  s_increments = params$s_increments;
  s_slide = params$s_slide;
  s_deviation = params$s_deviation;
  s_hmw = params$s_hmw;
  s_hmw_rate = params$s_hmw_rate;
  
  d_increments = params$d_increments;
  d_slide = params$d_slide;
  d_deviation = params$d_deviation;
  d_hmw = params$d_hmw;
  d_hmw_rate = params$d_hmw_rate;
  
#print(params)
#print(paste("s_slide ============ ", s_slide))

  if (infile == "-p") {
    fprintf(stdout(), "  same gel comparisons:");
    showpar_samediff(stdout(), s_deviation, s_slide, s_increments, s_hmw, s_hmw_rate);
    showpar_samediff(logfile, s_deviation, s_slide, s_increments, s_hmw, s_hmw_rate);
    fprintf(stdout(), "\n  different gel comparisons:");
    showpar_samediff(stdout(), d_deviation, d_slide, d_increments, d_hmw, d_hmw_rate);
    showpar_samediff(logfile, d_deviation, d_slide, d_increments, d_hmw, d_hmw_rate);
    stop(1);
  }

  fprintf(logfile, "begin data read: any data which did not scan?");
  
  # START reading data
  
  intable = read.table(infile, skip = 1, sep = "", fill = TRUE, header = FALSE, col.names = 1:5)
#  if (nrow(intable) < 3) {
#    print(paste("Error, the file ", infile, " is not a good input file."))
#    stop(-1)
#  }
  laneCount = 0
  fragCount = 0
  laneName = ""
  laneNames = "";
  gel = UNINITIALIZED;
  for (i in 1:nrow(intable)) {
    if (intable[i, 1] != "" & intable[i, 2] != "" & is.numeric(intable[i, 3])) {
      fragCount = fragCount + 1
      if (laneName != intable[i, 1]) {
        laneCount = laneCount + 1
        laneName = as.character(intable[i, 1])
        gel[laneCount] = intable[i, 2]
        laneNames[laneCount] = as.character(laneName);
      }
    } else {
      # print(paste("lane  ", i, "  is incorrect!"))
    }
  }

  laneVec = matrix(-1, nrow = laneCount, ncol = 50);
  laneCount = 0;
  bandCount = 0;
  laneName = "";
  fragCountAtLane = UNINITIALIZED;
  for (i in 1:nrow(intable)) {
    if (intable[i, 1] != "" & intable[i, 2] != "" & is.numeric(intable[i, 3])) {
      if (laneName != intable[i, 1]) {
        laneName = as.character(intable[i, 1]);
        laneCount = laneCount + 1;
        bandCount = 1;
        laneVec[laneCount, 1] = as.numeric(intable[i,3]);
        # laneNames[laneCount] = laneName;
      } else {
        bandCount = bandCount + 1;
        fragCountAtLane[laneCount] = bandCount;
        laneVec[laneCount, bandCount] = as.numeric(intable[i,3]);
      }
    } else {
      fprintf(logfile, "%s", intable[i,]);
    }
  }

  # END reading data
  num = laneCount
  
  fprintf(logfile, "end data read: %d lanes read", num);
  fprintf(logfile, "   acm starting...");
  total_comparisons = num * (num - 1L) / 2L;
  fprintf(stderr,VERSION1);
  fprintf(stderr,VERSION2);
  fprintf(stderr,VERSION3);
  fprintf(stderr,"\n   acm starting, please wait...");
  matrixIndex = 1
  outtable = matrix(data = NA, nrow = (num-1) * (num) / 2,  ncol = 5)
  for(i in 1:(num-1)) {
    bands1 = fragCountAtLane[i];
    #print(bands1)
    for(k in 1:bands1) {
      array1[k] = laneVec[i,k];
    }
    # print(paste("num = ", num))
    for (j in (i + 1):num) {
      comparisons = comparisons + 1;
      if ((comparisons %% 100) == 0) {
        progress = comparisons * 100 / total_comparisons;
        fprintf(stderr,"   acm: %d of %d comparisons (approximately %5.1f percent complete)", 
                as.integer(comparisons), as.integer(total_comparisons), progress);
      }
      bands2 = fragCountAtLane[j];
      for (k in 1:bands2) {
        array2[k] = laneVec[j,k];
      }
      if(identical(gel[i], gel[j])) {
          storescore = slidefun_samediff(array1,array2,bands1,bands2,s_increments,s_slide,s_deviation,s_hmw,s_hmw_rate);
      } else {
          storescore = slidefun_samediff(array1,array2,bands1,bands2,d_increments,d_slide,d_deviation,d_hmw,d_hmw_rate);
      }
	  outtable[matrixIndex, 1] = as.character(laneNames[i])
	  outtable[matrixIndex, 2] = as.character(laneNames[j])
	  outtable[matrixIndex, 3] = as.integer(fragCountAtLane[i])
	  outtable[matrixIndex, 4] = as.integer(fragCountAtLane[j])
	  outtable[matrixIndex, 5] = as.numeric(storescore)
	  matrixIndex = matrixIndex + 1
    }
  }
  fprintf(stderr, "\n   acm: %d comparisons completed!", as.integer(total_comparisons));
  fprintf(logfile, "\n%d comparisons\n   acm completed!\n", as.integer(total_comparisons));
  fprintf(logfile, "infile:  \"%s\"", infile);
  fprintf(logfile, "outfile: \"%s\"", outfile);
  fprintf(logfile, "\n  same gel comparisons:");
  showpar_samediff(logfile, s_deviation, s_slide, s_increments, s_hmw, s_hmw_rate);
  fprintf(logfile, "\n  different gel comparisons:\n");
  showpar_samediff(logfile, d_deviation, d_slide, d_increments, d_hmw, d_hmw_rate);
  return(outtable)
  #return(0);
}

#x = read_pars(stdout(), "acm.par")
#print(x)

#acm("replicates.in", "acm_test.out")

Try the acm4r package in your browser

Any scripts or data that you put into this service are public.

acm4r documentation built on May 1, 2019, 7:50 p.m.