R/data_preprocess.R

Defines functions data_preprocess

data_preprocess <- function(Xmat = NA, Ymat = NA, feature_table_file, 
    parentoutput_dir, class_labels_file, num_replicates = 3, 
    feat.filt.thresh = NA, summarize.replicates = TRUE, summary.method = "mean", 
    all.missing.thresh = 0.5, group.missing.thresh = 0.7, 
    log2transform = TRUE, medcenter = TRUE, znormtransform = FALSE, 
    quantile_norm = TRUE, lowess_norm = FALSE, madscaling = FALSE, 
    missing.val = 0, samplermindex = NA, rep.max.missing.thresh = 0.5, 
    summary.na.replacement = "zeros", featselmethod = NA) {
    
    options(warn = -1)
    
    # read file; First row is column headers
    if (is.na(Xmat == TRUE)) {
        data_matrix <- read.table(feature_table_file, sep = "\t", 
            header = TRUE)
    } else {
        data_matrix <- Xmat
        # rm(Xmat)
    }
    # print('signal filter threshold ')
    # print(group.missing.thresh) print(missing.val)
    
    if (is.na(samplermindex) == FALSE) {
        data_matrix <- data_matrix[, -c(samplermindex)]
    }
    
    # use only unique records
    data_matrix <- unique(data_matrix)
    
    if (is.na(missing.val) == FALSE) {
        
        # print('Replacing missing values with NAs.')
        data_matrix <- replace(as.matrix(data_matrix), which(data_matrix == 
            missing.val), NA)
    }
    
    
    # print('dim of original data matrix')
    # print(dim(data_matrix))
    data_matrix_orig <- data_matrix
    
    
    snames <- colnames(data_matrix)
    
    
    
    
    
    dir.create(parentoutput_dir, showWarnings = FALSE)
    parentoutput_dir <- paste(parentoutput_dir, "/Stage1/", 
        sep = "")
    
    dir.create(parentoutput_dir, showWarnings = FALSE)
    fheader = "transformed_log2fc_threshold_"
    setwd(parentoutput_dir)
    
    # Step 1) PID or CV calculation
    if (is.na(feat.filt.thresh) == FALSE) {
        if (num_replicates > 1) {
            
            feat.eval.result = evaluate.Features(cbind(data_matrix[, 
                c(1:2)], data_matrix), num_replicates, alignment.tool = "apLCMS", 
                impute.bool = TRUE)
            
            cnames = colnames(feat.eval.result)
            feat.eval.result <- apply(feat.eval.result, 2, 
                as.numeric)
            feat.eval.result <- as.data.frame(feat.eval.result)
            feat.eval.result.mat = cbind(data_matrix[, c(1:2)], 
                feat.eval.result)
            feat.eval.outfile = paste("featureassessment_usinggoodsamples.txt", 
                sep = "")
            # write results
            write.table(feat.eval.result.mat, feat.eval.outfile, 
                sep = "\t", row.names = FALSE)
            
            # data_m<-data_m[which(as.numeric(feat.eval.result$median)<=feat.filt.thresh),]
            data_matrix <- data_matrix[which(as.numeric(feat.eval.result$median) <= 
                feat.filt.thresh), ]
        }
    }
    
    #################################################################################### 
    
    # Step 2) Average replicates
    if (summarize.replicates == TRUE) {
        if (num_replicates > 1) {
            
            data_m <- getSumreplicates(data_matrix, alignment.tool = "apLCMS", 
                numreplicates = num_replicates, numcluster = 10, 
                rep.max.missing.thresh = rep.max.missing.thresh, 
                summary.method = summary.method, summary.na.replacement, 
                missing.val = missing.val)
            
            if (summary.method == "mean") {
                # print('Replicate averaging done')
                filename <- paste("Rawdata_averaged.txt", 
                  sep = "")
            } else {
                if (summary.method == "median") {
                  # print('Replicate median summarization done')
                  filename <- paste("Rawdata_median_summarized.txt", 
                    sep = "")
                }
                
            }
            
            data_m_prenorm <- cbind(data_matrix[, c(1:2)], 
                data_m)
            
            write.table(data_m_prenorm, file = filename, 
                sep = "\t", row.names = FALSE)
            # num_samps_group[[1]]=(1/num_replicates)*num_samps_group[[1]]
            # num_samps_group[[2]]=(1/num_replicates)*num_samps_group[[2]]
        } else {
            data_m <- data_matrix[, -c(1:2)]
            if (summary.na.replacement == "zeros") {
                data_m <- replace(data_m, which(is.na(data_m) == 
                  TRUE), 0)
            } else {
                if (summary.na.replacement == "halfsamplemin") {
                  data_m <- apply(data_m, 2, function(x) {
                    naind <- which(is.na(x) == TRUE)
                    if (length(naind) > 0) {
                      x[naind] <- min(x, na.rm = TRUE)/2
                    }
                    return(x)
                  })
                } else {
                  
                  if (summary.na.replacement == "halfdatamin") {
                    
                    
                    min_val <- min(data_m, na.rm = TRUE) * 
                      0.5
                    data_m <- replace(data_m, which(is.na(data_m) == 
                      TRUE), min_val)
                    
                    # data_m<-apply(data_m,1,function(x){naind<-which(is.na(x)==TRUE);
                    # if(length(naind)>0){x[naind]<-min(x,na.rm=TRUE)/2};
                    # return(x)})
                  } else {
                    if (summary.na.replacement == "halffeaturemin") {
                      data_m <- apply(data_m, 1, function(x) {
                        naind <- which(is.na(x) == TRUE)
                        if (length(naind) > 0) {
                          x[naind] <- min(x, na.rm = TRUE)/2
                        }
                        return(x)
                      })
                      data_m <- t(data_m)
                    }
                  }
                }
                
                
            }
        }
    } else {
        data_m <- data_matrix[, -c(1:2)]
        if (summary.na.replacement == "zeros") {
            data_m <- replace(data_m, which(is.na(data_m) == 
                TRUE), 0)
        } else {
            if (summary.na.replacement == "halfsamplemin") {
                data_m <- apply(data_m, 2, function(x) {
                  naind <- which(is.na(x) == TRUE)
                  if (length(naind) > 0) {
                    x[naind] <- min(x, na.rm = TRUE)/2
                  }
                  return(x)
                })
            } else {
                
                if (summary.na.replacement == "halfdatamin") {
                  
                  
                  min_val <- min(data_m, na.rm = TRUE) * 
                    0.5
                  data_m <- replace(data_m, which(is.na(data_m) == 
                    TRUE), min_val)
                  
                  # data_m<-apply(data_m,1,function(x){naind<-which(is.na(x)==TRUE);
                  # if(length(naind)>0){x[naind]<-min(x,na.rm=TRUE)/2};
                  # return(x)})
                } else {
                  if (summary.na.replacement == "halffeaturemin") {
                    data_m <- apply(data_m, 1, function(x) {
                      naind <- which(is.na(x) == TRUE)
                      if (length(naind) > 0) {
                        x[naind] <- min(x, na.rm = TRUE)/2
                      }
                      return(x)
                    })
                    data_m <- t(data_m)
                  }
                }
            }
            
            
        }
        
    }
    
    
    data_matrix <- cbind(data_matrix[, c(1:2)], data_m)
    
    data_matrix_orig <- data_matrix
    data_subjects <- data_m
    
    ordered_labels = {
    }
    
    num_samps_group <- new("list")
    
    if (is.na(class_labels_file) == FALSE) {
        
        # print('Class labels file:') print(class_labels_file)
        
        data_matrix = {
        }
        
        if (is.na(Ymat) == TRUE) {
            classlabels <- read.table(class_labels_file, 
                sep = "\t", header = TRUE)
        } else {
            classlabels <- Ymat
        }
        
        
        classlabels <- as.data.frame(classlabels)
        colnames(classlabels) <- c("SampleID", "Class")
        f1 <- table(classlabels$SampleID)
        
        
        classlabels <- as.data.frame(classlabels)
        classlabels <- classlabels[seq(1, dim(classlabels)[1], 
            num_replicates), ]
        # print(classlabels)
        class_labels_levels <- levels(as.factor(classlabels[, 
            2]))
        bad_rows <- which(class_labels_levels == "")
        if (length(bad_rows) > 0) {
            class_labels_levels <- class_labels_levels[-bad_rows]
        }
        
        for (c in 1:length(class_labels_levels)) {
            if (c > 1) {
                data_matrix <- cbind(data_matrix, data_subjects[, 
                  which(classlabels[, 2] == class_labels_levels[c])])
            } else {
                data_matrix <- data_subjects[, which(classlabels[, 
                  2] == class_labels_levels[c])]
            }
            classlabels_index <- which(classlabels[, 2] == 
                class_labels_levels[c])
            ordered_labels <- c(ordered_labels, as.character(classlabels[classlabels_index, 
                2]))
            num_samps_group[[c]] <- length(classlabels_index)
            
        }
        
        # colnames(data_matrix)<-as.character(ordered_labels)
        data_matrix <- cbind(data_matrix_orig[, c(1:2)], 
            data_matrix)
        data_m <- as.matrix(data_matrix[, -c(1:2)])
        
    } else {
        if (is.na(Ymat) == TRUE) {
            classlabels <- rep("A", dim(data_m)[2])
            ordered_labels <- classlabels
            num_samps_group[[1]] <- dim(data_m)[2]
            class_labels_levels <- c("A")
            data_m <- as.matrix(data_matrix[, -c(1:2)])
            
        } else {
            classlabels <- Ymat
            classlabels <- as.data.frame(classlabels)
            colnames(classlabels) <- c("SampleID", "Class")
            f1 <- table(classlabels$SampleID)
            
            
            classlabels <- as.data.frame(classlabels)
            classlabels <- classlabels[seq(1, dim(classlabels)[1], 
                num_replicates), ]
            # print(classlabels)
            class_labels_levels <- levels(as.factor(classlabels[, 
                2]))
            bad_rows <- which(class_labels_levels == "")
            if (length(bad_rows) > 0) {
                class_labels_levels <- class_labels_levels[-bad_rows]
            }
            
            for (c in 1:length(class_labels_levels)) {
                # if(c>1){
                # data_matrix<-cbind(data_matrix,data_subjects[,which(classlabels[,2]==class_labels_levels[c])])
                # }else{
                # data_matrix<-data_subjects[,which(classlabels[,2]==class_labels_levels[c])]
                # }
                classlabels_index <- which(classlabels[, 
                  2] == class_labels_levels[c])
                ordered_labels <- c(ordered_labels, as.character(classlabels[classlabels_index, 
                  2]))
                num_samps_group[[c]] <- length(classlabels_index)
                
            }
            
            # colnames(data_matrix)<-as.character(ordered_labels)
            # data_matrix<-cbind(data_matrix_orig[,c(1:2)],data_matrix)
            # data_m<-as.matrix(data_matrix[,-c(1:2)])
        }
        
        
        
    }
    
    
    ################################################################################## 
    metab_zeros = {
    }
    data_clean <- {
    }
    clean_metabs <- {
    }
    # num_samps_group[[3]]<-0
    
    if (is.na(all.missing.thresh) == FALSE) {
        
        total_sigs <- apply(data_m, 1, function(x) {
            if (is.na(missing.val) == FALSE) {
                return(length(which(x > missing.val)))
            } else {
                return(length(which(is.na(x) == FALSE)))
            }
        })
        
        
        
        total_sig_thresh <- dim(data_m)[2] * all.missing.thresh
        
        total_good_metabs <- which(total_sigs > total_sig_thresh)
        if (length(total_good_metabs) > 0) {
            data_m <- data_m[total_good_metabs, ]
            data_matrix <- data_matrix[total_good_metabs, 
                ]
            # print(paste('Dimension of data matrix after overall
            # ',all.missing.thresh,'% signal threshold
            # filtering',sep='')) print(paste('Dimension of data
            # matrix after using overall ',100*all.missing.thresh, '%
            # signal criteria for filtering:'),sep='')
            # print(dim(data_matrix))
        } else {
            stop(paste("None of the metabolites have signal in ", 
                all.missing.thresh * 100, "% of samples", 
                sep = ""))
        }
    }
    
    # print(dim(data_m)) Step 3) Remove features if signal is
    # not detected in at least 70% of samples in either of
    # the groups
    
    if (is.na(group.missing.thresh) == FALSE) {
        
        if (length(class_labels_levels) == 2) {
            
            sig_thresh_groupA <- group.missing.thresh * num_samps_group[[1]]
            sig_thresh_groupB <- group.missing.thresh * num_samps_group[[2]]
            
            for (metab_num in 1:dim(data_matrix)[1]) {
                # print(missing.val)
                if (is.na(missing.val) == FALSE) {
                  
                  num_sigsA <- length(which(data_m[metab_num, 
                    1:num_samps_group[[1]]] > missing.val))
                  
                  
                  num_sigsB <- length(which(data_m[metab_num, 
                    (num_samps_group[[1]] + 1):(num_samps_group[[1]] + 
                      num_samps_group[[2]])] > missing.val))
                  
                } else {
                  
                  num_sigsA <- length(which(is.na(data_m[metab_num, 
                    1:num_samps_group[[1]]]) == FALSE))
                  num_sigsB <- length(which(is.na(data_m[metab_num, 
                    (num_samps_group[[1]] + 1):(num_samps_group[[1]] + 
                      num_samps_group[[2]])]) == FALSE))
                  
                }
                
                if ((num_sigsA >= sig_thresh_groupA) || (num_sigsB >= 
                  sig_thresh_groupB)) {
                  clean_metabs <- c(clean_metabs, metab_num)
                }
                
            }
            
            # print(length(clean_metabs))
        } else {
            if (length(class_labels_levels) == 3) {
                
                
                sig_thresh_groupA <- group.missing.thresh * 
                  num_samps_group[[1]]
                sig_thresh_groupB <- group.missing.thresh * 
                  num_samps_group[[2]]
                sig_thresh_groupC <- group.missing.thresh * 
                  num_samps_group[[3]]
                
                for (metab_num in 1:dim(data_matrix)[1]) {
                  if (is.na(missing.val) == FALSE) {
                    num_sigsA <- length(which(data_m[metab_num, 
                      1:num_samps_group[[1]]] > missing.val))
                    num_sigsB <- length(which(data_m[metab_num, 
                      (num_samps_group[[1]] + 1):(num_samps_group[[1]] + 
                        num_samps_group[[2]])] > missing.val))
                    num_sigsC <- length(which(data_m[metab_num, 
                      (num_samps_group[[1]] + num_samps_group[[2]] + 
                        1):(num_samps_group[[1]] + num_samps_group[[2]] + 
                        num_samps_group[[3]])] > missing.val))
                  } else {
                    
                    num_sigsA <- length(which(is.na(data_m[metab_num, 
                      1:num_samps_group[[1]]]) == FALSE))
                    num_sigsB <- length(which(is.na(data_m[metab_num, 
                      (num_samps_group[[1]] + 1):(num_samps_group[[1]] + 
                        num_samps_group[[2]])]) == FALSE))
                    num_sigsC <- length(which(is.na(data_m[metab_num, 
                      (num_samps_group[[1]] + num_samps_group[[2]] + 
                        1):(num_samps_group[[1]] + num_samps_group[[2]] + 
                        num_samps_group[[3]])]) == FALSE))
                    
                  }
                  
                  if ((num_sigsA >= sig_thresh_groupA) || 
                    (num_sigsB >= sig_thresh_groupB) || (num_sigsC >= 
                    sig_thresh_groupC)) {
                    clean_metabs <- c(clean_metabs, metab_num)
                  }
                  
                }
            } else {
                if (length(class_labels_levels) > 2) {
                  
                  
                  
                  for (metab_num in 1:dim(data_m)[1]) {
                    for (c in 1:length(class_labels_levels)) {
                      if (is.na(missing.val) == FALSE) {
                        
                        num_cursig <- length(which(data_m[metab_num, 
                          which(ordered_labels == class_labels_levels[c])] > 
                          missing.val))
                        
                      } else {
                        
                        num_cursig <- length(which(is.na(data_m[metab_num, 
                          which(ordered_labels == class_labels_levels[c])]) == 
                          FALSE))
                        
                        
                      }
                      
                      sig_thresh_cur <- length(which(ordered_labels == 
                        class_labels_levels[c])) * group.missing.thresh
                      if (num_cursig >= sig_thresh_cur) {
                        clean_metabs <- c(clean_metabs, metab_num)
                        break  #for(i in 1:4){if(i==3){break}else{#print(i)}}
                        
                      }
                      
                    }
                  }
                } else {
                  
                  
                  
                  if (length(class_labels_levels) == 1) {
                    num_samps_group[[1]] <- num_samps_group[[1]]
                    
                    
                    sig_thresh_groupA <- group.missing.thresh * 
                      num_samps_group[[1]]
                    
                    
                    for (metab_num in 1:dim(data_matrix)[1]) {
                      if (is.na(missing.val) == FALSE) {
                        num_sigsA <- length(which(data_m[metab_num, 
                          1:num_samps_group[[1]]] > missing.val))
                        
                      } else {
                        
                        num_sigsA <- length(which(is.na(data_m[metab_num, 
                          1:num_samps_group[[1]]]) == FALSE))
                      }
                      
                      if ((num_sigsA >= sig_thresh_groupA)) {
                        clean_metabs <- c(clean_metabs, metab_num)
                      }
                      
                    }
                  }
                }
            }
        }
        
        if (length(clean_metabs) > 0) {
            # print(length(clean_metabs))
            
            
            
            data_m <- data_m[clean_metabs, ]
            data_matrix <- data_matrix[clean_metabs, ]
            
            # print(paste('Dimension of data matrix after using
            # group-wise ',100*group.missing.thresh, '% signal
            # criteria for filtering:'),sep='')
            # print(dim(data_matrix))
            
            
            
            # print('here 2') print(data_m[1:10,1:5])
            
        }
        
    }
    # print('before') print(data_m[1:10,1:10]) Step 4) Data
    # transformation and normalization
    
    if (log2transform == TRUE) {
        data_m <- log2(data_m + 1)
        
        # print('log scale') print(data_m[1:10,1:10])
    }
    
    if (quantile_norm == TRUE) {
        data_m <- normalizeQuantiles(data_m)
        # print('quant norm') print(data_m[1:10,1:10])
    }
    
    if (lowess_norm == TRUE) {
        data_m <- normalizeCyclicLoess(data_m)
        # print('lowess')
    }
    
    data_m_prescaling <- data_m
    
    if (medcenter == TRUE) {
        colmedians = apply(data_m, 1, function(x) {
            median(x, na.rm = TRUE)
        })
        data_m = sweep(data_m, 1, colmedians)
        
        
    }
    if (znormtransform == TRUE) {
        data_m <- scale(t(data_m))
        data_m <- t(data_m)
    }
    
    
    if (madscaling == TRUE) {
        colmedians = apply(data_m, 2, function(x) {
            median(x, na.rm = TRUE)
        })
        
        Y = sweep(data_m, 2, colmedians)
        mad <- apply(abs(Y), 2, function(x) {
            median(x, na.rm = TRUE)
        })
        const <- prod(mad)^(1/length(mad))
        scale.normalized <- sweep(data_m, 2, const/mad, "*")
        data_m <- scale.normalized
    }
    
    
    # print('after') print(data_m[1:10,1:10]) Use this if
    # first column is gene/metabolite name for apLCMS:
    # data_matrix_temp<-data_matrix[,c(1:2)]
    data_matrix <- cbind(data_matrix[, c(1:2)], data_m)
    
    # print(dim(data_matrix)) print(dim(data_m))
    
    data_m <- as.data.frame(data_m)
    
    num_rows <- dim(data_m)[1]
    num_columns <- dim(data_m)[2]
    
    # print('num rows is ') print(num_rows) for apLCMS:
    rnames <- paste("mzid_", seq(1, num_rows), sep = "")
    rownames(data_m) = rnames
    
    mzid_mzrt <- data_matrix[, c(1:2)]
    colnames(mzid_mzrt) <- c("mz", "time")
    rownames(mzid_mzrt) = rnames
    write.table(mzid_mzrt, file = "mzid_mzrt.txt", sep = "\t", 
        row.names = FALSE)
    
    # filename<-paste('Classlabels_file.txt',sep='')
    # write.table(classlabels,
    # file=filename,sep='\t',row.names=FALSE)
    
    filename <- paste("Normalized_sigthreshfilt_averaged_data.txt", 
        sep = "")
    data_matrix <- cbind(data_matrix[, c(1:2)], data_m)
    write.table(data_matrix, file = filename, sep = "\t", 
        row.names = FALSE)
    data_matrix_prescaling <- cbind(data_matrix[, c(1:2)], 
        data_m_prescaling)
    return(list(data_matrix_afternorm_scaling = data_matrix, 
        data_matrix_prescaling = data_matrix_prescaling))
    # return(data_matrix)
}
yufree/xMSannotator documentation built on Oct. 31, 2022, 12:20 a.m.