R/dataManipulation.R

Defines functions center.and.scale.by.column center.and.scale.by.row convert_to_long remove_duplicated_rows tf_idf_transform

Documented in center.and.scale.by.column center.and.scale.by.row convert_to_long remove_duplicated_rows

#' @export
#' @import Matrix
tf_idf_transform<-function(matrix){
  idf <- log( ncol(matrix) / ( 1 + Matrix::rowSums(matrix != 0) ) )
  idf<-.sparseDiagonal(x=idf)
  tf_idf <- crossprod(matrix, idf)
  colnames(tf_idf) <- rownames(matrix)
  tf_idf_out<-t(tf_idf / sqrt( rowSums( tf_idf^2 ) ))
  return(tf_idf_out)
}


#' Removes data (collapses) with duplicated names
#'
#' @description Will collapse a matrix based on duplicated labels that have the greatest variance
#' @param matrix Input object with unique rownames
#' @param labels the non-unique labels (i.e. Gene Symbol)
#' @export
#' 

remove_duplicated_rows<-function(matrix, labels){
  rn<-labels
  if (length(make.unique(labels)) != length(unique(labels))) {
    unique_rn<-rownames(matrix)[!rn %in% rn[duplicated(rn)]]
    duplicated_rn<-rownames(matrix)[rn %in% rn[duplicated(rn)]]
    duplicated_lab<-rn[rn %in% rn[duplicated(rn)]]
    duplicated<-matrix[rn %in% rn[duplicated(rn)],]
    ts = data.table(rowV = matrixStats::rowVars(as.matrix(duplicated)), rn_D=duplicated_lab, rn_U=duplicated_rn)
    tokeep<-c(ts[ , .SD[which.max(rowV)], by = rn_D]$rn_U, unique_rn)
    matrix_out<-matrix[rownames(matrix) %in% tokeep,]
    rownames(matrix_out)<-labels[rownames(matrix) %in% tokeep]
    return(matrix_out)
  }
}

#' Converts to long data format
#'
#' @description Will convert a data object (either DESeq2 object or a matrix) to a data table for further work.  
#' @param matrix Input object with unique rownames
#' @param coldata a factor or dataframe of factors used to add information to long data table, the string "all" can be used if the class of the input
#' object is a DESeq2 dataset and all coldata information is to be copied to long data table
#' @export
#' 
convert_to_long<-function(object, coldata="all", values_to="NormalizedExpression", names_to = "sample"){
  if(class(object) %in% c("DESeqTransform", "DESeqDataSet")){
    if(coldata=="all"){
      coldata=colData(object)
    }else{
      coldata<-data.frame(coldata, row.names = colnames(object))
    }
  }
  if(class(object) %in% c("matrix")){
    if(coldata=="all"){
      stop("The 'all' feature is not supported for matrix input")
    }else{
      coldata<-data.frame(coldata, row.names = colnames(object))
    }
  }
  dt<-data.table::as.data.table(assay(object))
  dt$Gene<-rownames(object)
  dtl<-dt %>% tidyr::pivot_longer(-Gene, values_to = values_to, names_to = names_to)
  dtout<-cbind(dtl, data.table::as.data.table(coldata[match(dtl[[names_to]], rownames(coldata)),]))
  colnames(dtout)<-make.unique(colnames(dtout))
  return(dtout)
}

#' Center and scale by row
#'
#' @description Just like it sounds but old school
#' @param dataset input matrix
#' @export
#' 

center.and.scale.by.row<-function(dataset){
  expr.mean=apply(dataset,1,mean) #Mean of each column
  expr.sd=apply(dataset,1,sd) #Standard deviation of each column
  expr.norm=matrix(NA,nrow(dataset),ncol(dataset))
  for(i in 1:nrow(dataset)){
    temp=(dataset[i,]-expr.mean[i])/expr.sd[i]
    expr.norm[i,]=temp
  }
  expr.norm=as.data.frame(expr.norm)
  expr.norm=sapply(expr.norm, function(x) as.numeric(as.character(x)))
  rownames(expr.norm)=rownames(dataset) #CHECK THIS IS RIGHT
  colnames(expr.norm)=colnames(dataset) #CHECK THIS IS RIGHT
  return(expr.norm)
}

#' Center and scale by row
#'
#' @description Just like it sounds but old school
#' @param dataset input matrix
#' @export
#' 
center.and.scale.by.column<-function(dataset){
  expr.mean=apply(dataset,2,mean) #Mean of each column
  expr.sd=apply(dataset,2,sd) #Standard deviation of each column
  expr.norm=matrix(NA,nrow(dataset),ncol(dataset))
  for(i in 1:ncol(dataset)){
    temp=(dataset[,i]-expr.mean[i])/expr.sd[i]
    expr.norm[,i]=temp
  }
  expr.norm=as.data.frame(expr.norm)
  expr.norm=sapply(expr.norm, function(x) as.numeric(as.character(x)))
  rownames(expr.norm)=rownames(dataset) #CHECK THIS IS RIGHT
  colnames(expr.norm)=colnames(dataset) #CHECK THIS IS RIGHT
  return(expr.norm)
}
scfurl/seqGlue documentation built on Sept. 1, 2024, 11:20 a.m.