Analyses/deep_sequencing/5.Prepare_for_phyloscanner_v4.R

# prepare files to run phyloscanner

# shiver output files were run in individual directories per ID (shiver should
# be run in individual directory per ID), but in the cluster.

# for the cluster analysis, I moved the results from shiver to a directory
#named "../phyloscanner"


library(DescTools)
library(stringr)

#prefix for path names
#prefix <- paste(getwd(), "/", sep = "")
prefix <- getwd()


reps <- list.files(list.files(prefix, full.names=T), full.names=T)

for(i in 1:length(reps)){

  phyloscanner_dir <- paste(reps[i], "shiver_results", sep = "/")

  # Create input csv file for phyloscanner ----
  #list remap_bam files
  bam_files = list.files(phyloscanner_dir, pattern="*._remap.bam$")
  ref_files = list.files(phyloscanner_dir, pattern="*.remap_ref.fasta")


  if(length(bam_files) != length(ref_files)){
    stop("length of bam_files has to be equal to length of ref_files")
  }

  # create dataframe of bam and ref files

  input_files <- data.frame(bam = bam_files, ref_fasta = ref_files)

  # check that ID names in bam matches ID names in ref_fasta

  input_files["bam_ids"] <- unlist(lapply(input_files$bam, function(x) str_split(x, pattern = "_")[[1]][2]))
  input_files["ref_ids"] <- unlist(lapply(input_files$ref_fasta, function(x) str_split(x, pattern = "_")[[1]][2]))

  # check that all IDs in order are equal to each other
  if(any(input_files$bam_ids == input_files$ref_ids) == FALSE){
    stop("All or some IDs between bam and ref.fasta files do not match")
  }

  input_files["IDs"] <- paste("ID", input_files$bam_ids, sep = "_")

  #paste full file name path to bam and ref.fasta files
  input_files["bam"] <- paste(phyloscanner_dir, input_files$bam, sep = "/")
  input_files["ref_fasta"] <- paste(phyloscanner_dir, input_files$ref_fasta, sep = "/")

  #final dataframe
  input_files <- input_files[c(1,2,5)]

  #save csv file that will be used as input file for phyloscanner
  filename <- paste(phyloscanner_dir, "BamsRefsAndIDs.csv", sep = "/")
  write.table(x =  input_files, file = filename, quote = FALSE, sep = ",",
              row.names = FALSE, col.names = FALSE)
}
thednainus/HIVepisimAnalysis documentation built on Sept. 21, 2023, 7:32 a.m.