R/normalise_data.R

Defines functions run_RUVIII fastRUVIII make_residual_mat_several_rep save_norm_files normalise_data

Documented in normalise_data

#' @rdname normalise_data
#' @title normalise_data
#'
#' contains all the functions to normalise several Cytof datasets by removing the
#' unwanted variation between datasets arising from experimental artefacts.
#'
#' @param data Datasets before normalisation
#' @param raw_data Raw data
#' @param rep_samples Replicated samples
#' @param norm_clusters Clusters to be normalised
#' @param k Dimension of the unwanted variation
#' @param num_clusters Total number of clusters
#' @param wd_data Path to the directory containing all raw fcs files, the metadata file
#' and the panel file
#' @param dir_norm_data Directory name containing all norm fcs files, the metadata file
#' and the panel file
#' @param other_markers_fcs_file Optional to add more markers to be saved in the .fcs file not normalised
#'
#' @return Normalised metadata file
#'
#' @importFrom rsvd rsvd
#' @importFrom stats sd
#' @importFrom flowCore exprs fsApply write.FCS
#' @export
#'

normalise_data <- function(data,raw_data,rep_samples, norm_clusters, k, num_clusters,wd_data,dir_norm_data,other_markers_fcs_file=NULL){

  # Normalise the cells
  norm_cells <- run_RUVIII(raw_data, norm_clusters,k,rep_samples)
  # Output to save files
  output_dir <- file.path(wd_data, dir_norm_data)
  if (!dir.exists(output_dir)){
    dir.create(output_dir)
  }
  # Save data into fcs files
  #print(data$md)
  new_md=save_norm_files(data,norm_cells, data$fcs_raw, data$md,data$panel,output_dir,k,other_markers_fcs_file)
  # Save Metadata file
  writexl::write_xlsx(new_md, path=file.path(output_dir,"Norm_Metadata.xlsx"),
                      col_names=TRUE, format_headers = TRUE)

  writexl::write_xlsx(data$panel, path=file.path(output_dir,"Norm_Panel.xlsx"),
                      col_names=TRUE, format_headers = TRUE)
  ## Save normalisation details
  norm_setup=list(rep_samples=rep_samples,cluster_to_norm=norm_clusters,k=k,metadata_norm=new_md)
  saveRDS(norm_setup,file =file.path(output_dir,"Norm_setup.rds"))
  ## Save norm data
  saveRDS(norm_cells,file = file.path(output_dir,"Norm_data.rds"))

}

save_norm_files<- function(data,norm_cells, fcs_raw, md,panel,output_dir,k,other_markers_fcs_file){

  # All the file names
  file_ids <- rep(md$file_name, flowCore::fsApply(fcs_raw, nrow))
  #Number of files
  num_files <- length(data$fcs_raw)
  # Panel markers fullname
  length_other_markers=length(other_markers_fcs_file)
  all_fullname_markers=c("Time",other_markers_fcs_file,c(panel$fcs_colname[panel$antigen%in%data$lineage_markers],
                                                         panel$fcs_colname[panel$antigen%in%data$functional_markers]))

  fcs_raw_asinh=fsApply(fcs_raw,function(x,cofactor = 5){
    expr <- exprs(x)
    expr <- expr[,all_fullname_markers]
    expr[,all_fullname_markers[2:length(all_fullname_markers)]] <- asinh(expr[,all_fullname_markers[2:length(all_fullname_markers)]] / cofactor)
    exprs(x) <- expr
    x
  })

  # Extract how many cells per sample
  n_cells <- fsApply(fcs_raw_asinh, function(x) nrow(exprs(x)))
  # Calculate the correction vector - how much we should substract from the inds to line them up the start of the flowframe
  correction_vec <- c(0, cumsum(n_cells))

  # # Helper function which returns a modified flowframe with only inds indexed cells
  subset_flowframe <- function(flow_frame, inds){
    expr <- exprs(flow_frame)
    expr <- expr[inds, ]
    exprs(flow_frame) <- expr
    flow_frame
  }

  ### Save Norm data
  fcs_norm=fcs_raw_asinh
  new_md=md
  dir_new_md=md
  for (i in 1:num_files) {
    # The inds of the cells in the right file
    inds <- which(file_ids == md$file_name[i])
    # Line the inds up with the flowframe
    if (length(inds)>1){
      corrected_inds <- inds - correction_vec[i]
      dir_new_md$file_name[i] <- file.path(output_dir,paste("Norm_RUVIII_k",k,"_",md$file_name[i],sep=""))
      new_md$file_name[i] <- file.path(paste("Norm_RUVIII_k",k,"_",md$file_name[i],sep=""))
      tmp_exp=exprs(fcs_norm[[i]])
      tmp_exp[,all_fullname_markers[(2+length_other_markers):length(all_fullname_markers)]]= norm_cells[inds,c(data$lineage_markers,data$functional_markers)]
      exprs(fcs_norm[[i]])=tmp_exp
      write.FCS(subset_flowframe(fcs_norm[[i]], corrected_inds), dir_new_md$file_name[i])
    }
  }
  return(new_md)
}

### Functions
make_residual_mat_several_rep<- function(data,clusters, norm_clus_list,samples,rep_samples_list){
  #make_residual_mat(raw_Y,data$cluster, norm_clusters,norm_clusters_second,data$sample,rep_samples,second_rep_samples)
  res_mat=data
  res_mat[]=0
  for (r in 1:length(rep_samples_list)){
    norm_clus=norm_clus_list[[r]]
    rep_samples=rep_samples_list[[r]]
    mean_pseud=matrix(nrow = length(norm_clus),ncol=dim(data)[2])
    for (i in 1:length(norm_clus)){
      tmp=((clusters == norm_clus[i])&(samples%in%rep_samples))
      mean_pseud[i,]=colMeans(data[tmp,])
      res_mat[tmp,]<- t(apply(data[tmp,], 1, function(x) x-mean_pseud[i,]))
    }
  }
  return(res_mat)
}


fastRUVIII = function(Y, M, ctl,res_mat,k=NULL, eta=NULL, average=FALSE, fullalpha=NULL){
  # Assumes good input
  if (!(k > 0)) stop("Bad input - read the documentation")
  Y = ruv::RUV1(Y,eta,ctl)
  m = nrow(Y)
  #Y0 = fast_residop(Y, M)
  Y0=res_mat
  fullalpha = diag(rsvd(Y0)$d) %*% t(rsvd(Y0)$v)
  alpha = fullalpha[1:k,,drop=FALSE]
  ac = alpha[,ctl,drop=FALSE]
  W = Y[,ctl] %*% t(ac) %*% solve(ac %*% t(ac))
  newY = Y - W %*% alpha
  return(list(newY = newY, fullalpha=fullalpha))
}


fast_residop <- function (A, B) {
  return(A - B %*% solve(t(B) %*% B) %*% (t(B) %*% A))
}


run_RUVIII <- function(data, norm_clusters, k,rep_samples){
  raw_Y <- as.matrix(data[3:ncol(data)])
  # Standardise the input and then compensate output
  col_means <- colMeans(raw_Y)
  col_sds <- apply(raw_Y, 2, function(x) sd(x))

  for(i in 1:ncol(raw_Y)){
    raw_Y[,i] <- (raw_Y[,i] - col_means[i])/col_sds[i]
  }
  # Run the actual RUVIII
  res_mat<-make_residual_mat_several_rep(raw_Y,data$cluster, norm_clusters,data$sample,rep_samples)
  norm_Y <- fastRUVIII(Y = raw_Y, M, ctl = c(1:ncol(raw_Y)), res_mat=res_mat,k = k)$newY

  for(i in 1:ncol(norm_Y)){
    norm_Y[,i] <- norm_Y[,i]*col_sds[i] + col_means[i]
  }

  return(norm_Y)
}
mtrussart/CytofRUV documentation built on Aug. 3, 2022, 2:28 a.m.