# exec/loadGeneFamilies.R

# Attach the GeneFamilies package this script relies on. library() is used
# rather than require() so that a missing installation aborts the script
# immediately with a clear error, instead of a warning followed by a
# confusing failure on the first call into the package.
library(GeneFamilies)
# Set the 'mc.cores' option to whatever getMcCores() reports.
options(mc.cores = getMcCores())

# Usage banner and input format description. It is emitted unconditionally,
# on every run, via message() (i.e. on stderr).
message("USAGE: Rscript path/2/GeneFamilies/exec/loadGeneFamilies.R path/2/GeneFamilies/data mcl_output.txt mcl_table.tsv")
message("EXPECTED INPUT FORMATS:")
message(" - mcl_output.txt is the textual output generated by the markob clustering tool mcl used on the all vs all sequence similarity search results. A single cluster (family) is expected per line and gene members to be separated by TAB.")
message(" - mcl_table a TAB separated table with the following header:\nid A.thaliana A.lyrata C.rubella C.hirsuta A.arabicum B.rapa E.salsugineum S.parvula")

# Positional command line arguments:
#   [[1]] output directory in which 'families.RData' is written
#   [[2]] path to the mcl output text file
#   [[3]] path to the TAB separated mcl table
input.args <- commandArgs(trailingOnly = TRUE)

# Fail early with an explicit message instead of a bare
# 'subscript out of bounds' error when arguments are missing.
if (length(input.args) < 3L) {
    stop("Expected 3 arguments (output data directory, mcl_output.txt, mcl_table.tsv), but got ", 
        length(input.args), ".", call. = FALSE)
}

# Parse the mcl clustering output and derive its list representation.
families.genes.df <- readMclOutput(input.args[[2]])
families.lst <- mclDataFrameAsList(families.genes.df)

# Read the per-family table: one character id column followed by eight
# numeric columns. Quoting and comment characters are disabled so that field
# content is taken literally; empty fields become NA.
families.df <- read.table(input.args[[3]], sep = "\t", header = TRUE, stringsAsFactors = FALSE, 
    comment.char = "", quote = "", na.strings = "", colClasses = c("character", rep("numeric", 
        8)))

# Family size is the row-wise total over the eight numeric columns (2 to 9).
# rowSums() is the vectorised equivalent of apply(..., 1, sum); like sum(),
# it yields NA for a row that contains an NA.
families.df$size <- rowSums(families.df[, 2:9])

# Save parsed data:
save(families.genes.df, families.lst, families.df, file = file.path(input.args[[1]], 
    "families.RData"))
# asishallab/GeneFamilies documentation built on May 22, 2023, 11:30 a.m.